In [1]:
spark

## Processing `Comments.xml`

In [2]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType
from datetime import datetime

In [3]:
# Dataset path 
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-samples/'
dataset_comments = f"{dataset_bucket}Comments-sample.xml"

In [4]:
# Load the data as an RDD
rdd = spark.sparkContext.textFile(dataset_comments)

parsed_rdd = rdd.map(lambda row: row.strip()) \
   .filter(lambda row: row.startswith("<row")) \
   .filter(lambda row: "UserDisplayName=" not in row) \
   .map(lambda row: row[4:-3]) \
   .map(lambda row: row.strip()) \
   .map(lambda row: (int(row.split('"')[1]), int(row.split('"')[3]), int(row.split('"')[5]), row.split('"')[7], datetime.strptime(row.split('"')[9], "%Y-%m-%dT%H:%M:%S.%f"), int(row.split('"')[11]), row.split('"')[13])) 


In [5]:
parsed_rdd.count()

                                                                                

1989

In [6]:
# Define the schema for the DataFrame
my_schema = StructType([
    StructField("Id", LongType()),
    StructField("PostId", LongType()),
    StructField("Score", LongType()),
    StructField("Text", StringType()),
    StructField("CreationDate", TimestampType()),
    StructField("UserId", LongType()),
    StructField("ContentLicense", StringType())
])

# Convert the RDD to a DataFrame
df = parsed_rdd.toDF(my_schema)

In [7]:
df.count()

                                                                                

1989

In [8]:
df.printSchema()

root
 |-- Id: long (nullable = true)
 |-- PostId: long (nullable = true)
 |-- Score: long (nullable = true)
 |-- Text: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- UserId: long (nullable = true)
 |-- ContentLicense: string (nullable = true)



In [9]:
# Dataset path 
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-samples-processed/'
output_folder_name = f"{output_bucket}Comments-sample-parquet"

# save dataframe as csv
df.write \
  .format('parquet') \
  .option('header', True) \
  .mode('overwrite') \
  .save(output_folder_name)

                                                                                

## Verifying the data by reading from S3

In [10]:
df = spark.read \
         .option("header", True) \
         .option("inferSchema", True) \
         .parquet(output_folder_name)

In [11]:
df.printSchema()

root
 |-- Id: long (nullable = true)
 |-- PostId: long (nullable = true)
 |-- Score: long (nullable = true)
 |-- Text: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- UserId: long (nullable = true)
 |-- ContentLicense: string (nullable = true)



In [12]:
df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+---------+--------+-----+--------------------+--------------------+--------+--------------+
|       Id|  PostId|Score|                Text|        CreationDate|  UserId|ContentLicense|
+---------+--------+-----+--------------------+--------------------+--------+--------------+
|131809104|74679059|    0|Thanks for this, ...|2022-12-04 17:35:...| 2451763|  CC BY-SA 4.0|
|131809105|74679059|    0|That being said, ...|2022-12-04 17:35:...| 2451763|  CC BY-SA 4.0|
|131809107|74671427|    0|I follow the step...|2022-12-04 17:35:...|20432719|  CC BY-SA 4.0|
|131809108|74606162|    0|Your answer could...|2022-12-04 17:35:...|      -1|  CC BY-SA 4.0|
|131809109|74679006|    0|I want the data t...|2022-12-04 17:35:...|19874197|  CC BY-SA 4.0|
|131809110|74677453|    0|see https://stack...|2022-12-04 17:35:...|10802527|  CC BY-SA 4.0|
|131809111|74679042|    0|I added the &quot...|2022-12-04 17:35:...|20431208|  CC BY-SA 4.0|
|131809113|74678997|    0|What would be the...|2022-12-04 17:35:...| 2

                                                                                

In [13]:
df.count()

                                                                                

1989