In [1]:
# Display the Spark session object. `spark` is never assigned in the visible
# cells - presumably it is injected by the notebook kernel; TODO confirm.
spark

## Processing `Comments.xml`

In [1]:
import html
from datetime import datetime

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType

In [2]:
# S3 prefix holding the raw dump, and the comments file located under it
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw'
dataset_comments = "/".join([dataset_bucket, "Comments.xml"])

In [66]:
def _to_comment_tuple(parts):
    """Build one comment record from a row already split on double quotes.

    Args:
        parts: The 15 pieces produced by splitting a row's attribute string
            on '"'. The quoted attribute values sit at the odd indices and
            are read positionally as Id, PostId, Score, Text, CreationDate,
            UserId, ContentLicense.

    Returns:
        A 7-tuple (int, int, int, str, datetime, int, str) in that order.

    Raises:
        ValueError: if a numeric field is not an integer or the date does
            not match the expected format.
    """
    return (
        int(parts[1]),                                        # Id
        int(parts[3]),                                        # PostId
        int(parts[5]),                                        # Score
        # The attribute value is still XML-escaped (&quot;, &amp;, &#xA; ...);
        # decode it so the stored text is the actual comment body.
        html.unescape(parts[7]),                              # Text
        # Parsed as a naive datetime.
        # NOTE(review): presumably the dump uses UTC - confirm the Spark
        # session time zone matches.
        datetime.strptime(parts[9], "%Y-%m-%dT%H:%M:%S.%f"),  # CreationDate
        int(parts[11]),                                       # UserId
        parts[13],                                            # ContentLicense
    )


# Read the XML dump line by line; each element is expected on its own line.
rdd = spark.sparkContext.textFile(dataset_comments)

parsed_rdd = (
    rdd.map(lambda row: row.strip())
       # Keep only data rows (skips the XML declaration and wrapper tags).
       .filter(lambda row: row.startswith("<row"))
       # Rows carrying UserDisplayName are dropped: the extra attribute
       # would break the positional layout relied on below.
       .filter(lambda row: "UserDisplayName=" not in row)
       # Drop the leading "<row" and the last three characters
       # (presumably " />"), leaving only the attribute string.
       .map(lambda row: row[4:-3].strip())
       # Split once per row instead of once per extracted field.
       .map(lambda row: row.split('"'))
       # 7 quoted values -> 14 quote characters -> exactly 15 pieces.
       .filter(lambda parts: len(parts) == 15)
       .map(_to_comment_tuple)
)


In [68]:
# Explicit schema for the comment tuples; field order mirrors the tuple layout
# produced by the parsing step.
my_schema = StructType([
    StructField(column, spark_type)
    for column, spark_type in (
        ("Id", LongType()),
        ("PostId", LongType()),
        ("Score", LongType()),
        ("Text", StringType()),
        ("CreationDate", TimestampType()),
        ("UserId", LongType()),
        ("ContentLicense", StringType()),
    )
])

# Turn the parsed RDD into a DataFrame using that schema
df = parsed_rdd.toDF(schema=my_schema)

In [69]:
# Print the DataFrame's column names, types and nullability as a sanity check
df.printSchema()

root
 |-- Id: long (nullable = true)
 |-- PostId: long (nullable = true)
 |-- Score: long (nullable = true)
 |-- Text: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- UserId: long (nullable = true)
 |-- ContentLicense: string (nullable = true)



In [70]:
# Destination on S3 for the processed (Parquet) copy of the comments
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed'
output_folder_name = "/".join([output_bucket, "Comments-parquet"])

# Save the DataFrame as Parquet, replacing any previous output at this path.
# No 'header' option is set: it is a CSV-only setting, and Parquet stores the
# column names and types in the file metadata.
df.write \
  .format('parquet') \
  .mode('overwrite') \
  .save(output_folder_name)

                                                                                

## Verifying the data by reading from S3

In [71]:
# Read the Parquet output back from S3. Parquet files carry their own schema,
# so the CSV-style 'header' / 'inferSchema' options are not needed.
# Note: this rebinds `df` to the data read back from S3.
df = spark.read.parquet(output_folder_name)

[Stage 32:>                                                         (0 + 1) / 1]                                                                                

In [72]:
# Preview the first 20 rows, with long cell values truncated for display
df.show(n=20, truncate=True)

[Stage 33:>                                                         (0 + 1) / 1]

+-------+-------+-----+--------------------+--------------------+------+--------------+
|     Id| PostId|Score|                Text|        CreationDate|UserId|ContentLicense|
+-------+-------+-----+--------------------+--------------------+------+--------------+
|3289318|3192701|    0|@ckv - area51 is ...|2010-07-07 07:31:...|  1583|  CC BY-SA 2.5|
|3289319|3183199|    0|@Alex: that optim...|2010-07-07 07:31:...| 84270|  CC BY-SA 2.5|
|3289321|1573241|    1|I also see this p...|2010-07-07 07:32:...|108238|  CC BY-SA 2.5|
|3289323|3192624|    0|Do'h!!. I see wha...|2010-07-07 07:32:...|342817|  CC BY-SA 2.5|
|3289324|3192156|    1|All the solutions...|2010-07-07 07:32:...|270287|  CC BY-SA 2.5|
|3289325|3192701|    1|@Oded: he means t...|2010-07-07 07:32:...|130758|  CC BY-SA 2.5|
|3289326|3192196|    0|-1: Doesn't follo...|2010-07-07 07:32:...|239916|  CC BY-SA 2.5|
|3289327|3192719|    0|I also applied th...|2010-07-07 07:32:...|295264|  CC BY-SA 2.5|
|3289329|3181607|    0|I don't k

                                                                                