In [2]:
# Display the active SparkSession. It is not created anywhere in this notebook,
# so it is presumably supplied by the kernel/environment - TODO confirm.
spark

## Processing `Comments.xml`

In [1]:
import html
from datetime import datetime

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType

In [2]:
# S3 prefix holding the raw 2023 dataset files
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw/2023'
# Full path of the comments XML file under that prefix
dataset_comments = "/".join((dataset_bucket, "Comments.xml"))

In [3]:
# Load the XML file as an RDD of text lines; the later parsing steps treat each
# line independently and keep only those starting with "<row".
rdd = spark.sparkContext.textFile(dataset_comments)


In [4]:
def row_parser(row):
    """Parse the attribute text of one ``<row ... />`` element into a tuple.

    Parameters
    ----------
    row : str
        The inside of a row element with the ``<row`` prefix and ``/>`` suffix
        already removed, e.g. ``Id="10" PostId="45651" ... ContentLicense="CC BY-SA 2.5"``.

    Returns
    -------
    tuple
        Always eight values, ordered as: Id, PostId, Score, Text, CreationDate,
        UserDisplayName, UserId, ContentLicense. Attributes missing from the
        row are ``None``. ``CreationDate`` is a ``datetime``; every other value
        is a whitespace-stripped string with XML entities decoded.

    Raises
    ------
    ValueError
        If ``CreationDate`` is present but is not formatted as
        ``YYYY-MM-DDTHH:MM:SS`` with optional fractional seconds.
    """
    fields = [
        "Id=",
        "PostId=",
        "Score=",
        "Text=",
        "CreationDate=",
        "UserDisplayName=",
        "UserId=",
        "ContentLicense=",
    ]

    # dict preserves insertion order, so values() comes out in `fields` order.
    row_field = dict.fromkeys(fields, None)

    # Splitting on '"' alternates between 'Key=' tokens and attribute values.
    # This relies on values being double-quoted with embedded quotes escaped
    # as &quot;. The final piece (text after the last closing quote) is dropped.
    row_list = [piece.strip() for piece in row.split('"')[:-1]]

    # Pair keys with values; zip() ignores a dangling key that has no value
    # instead of raising IndexError.
    for key, raw_value in zip(row_list[0::2], row_list[1::2]):
        if key not in row_field:
            # Unknown attribute: skip it so the tuple always has eight values
            # and keeps matching the DataFrame schema.
            continue

        # Decode XML entities (&lt; &amp; &quot; &#xA; ...) only after the
        # split, so a decoded quote cannot be mistaken for a delimiter.
        value = html.unescape(raw_value)

        if key == 'CreationDate=':
            try:
                row_field[key] = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%f")
            except ValueError:
                # Accept timestamps written without fractional seconds.
                row_field[key] = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S")
        else:
            row_field[key] = value

    return tuple(row_field.values())

In [5]:
# Keep only the lines holding a <row .../> element (surrounding whitespace removed)
row_lines = rdd.map(lambda line: line.strip()) \
   .filter(lambda line: line.startswith("<row"))

# Cut off the leading "<row" (4 chars) and the last 3 chars of the element,
# trim the remainder, and hand the attribute text to row_parser
parsed_rdd = row_lines.map(lambda line: row_parser(line[4:-3].strip()))

In [6]:
# Count the parsed rows; this number is the baseline the DataFrame counts
# further down are compared against.
parsed_rdd.count()

                                                                                

88222951

In [7]:
# Schema for the parsed comments: every column is read as a string except
# CreationDate, which row_parser already returns as a datetime.
comments_schema = (
    StructType()
    .add("Id", StringType())
    .add("PostId", StringType())
    .add("Score", StringType())
    .add("Text", StringType())
    .add("CreationDate", TimestampType())
    .add("UserDisplayName", StringType())
    .add("UserId", StringType())
    .add("ContentLicense", StringType())
)

# Turn the RDD of tuples into a DataFrame with that schema
df = parsed_rdd.toDF(comments_schema)

In [8]:
# Confirm the DataFrame carries the column names and types declared in comments_schema
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- PostId: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- UserDisplayName: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- ContentLicense: string (nullable = true)



In [9]:
# Preview the first rows to eyeball the parsed values
df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+---+------+-----+--------------------+--------------------+---------------+-------+--------------+
| Id|PostId|Score|                Text|        CreationDate|UserDisplayName| UserId|ContentLicense|
+---+------+-----+--------------------+--------------------+---------------+-------+--------------+
| 10| 45651|    6|It will help if y...|2008-09-06 13:38:...|           null|    242|  CC BY-SA 2.5|
| 12| 47428|    3|One of the things...|2008-09-06 13:51:...|           null|   4642|  CC BY-SA 2.5|
| 14| 47481|    0|I agree, both Cod...|2008-09-06 14:15:...|           null|   4642|  CC BY-SA 2.5|
| 15| 47373|    0|Just wanted to me...|2008-09-06 14:30:...|           null|   2495|  CC BY-SA 2.5|
| 16| 47497|    1|Indeed, the only ...|2008-09-06 14:42:...|           null|   4642|  CC BY-SA 2.5|
| 18| 47513|    2|This advice goes ...|2008-09-06 15:02:...|           null|   2515|  CC BY-SA 2.5|
| 19| 47466|    2|It was bad when p...|2008-09-06 15:11:...|           null|   2515|  CC BY-SA 2.5|


                                                                                

In [10]:
# Convert the id/score columns, which were parsed as strings, to integers
for numeric_column in ('Id', 'PostId', 'Score', 'UserId'):
    df = df.withColumn(numeric_column, F.col(numeric_column).cast('int'))

df.count()

                                                                                

88222951

In [5]:
# Output path for the processed (Parquet) copy of the comments
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed/2023'
output_folder_name = f"{output_bucket}/Comments-parquet"

# Save the DataFrame as Parquet, replacing any previous output at that path.
# Parquet files carry their own schema, so the CSV-only 'header' option is
# not needed here.
df.write \
  .format('parquet') \
  .mode('overwrite') \
  .save(output_folder_name)

## Verifying the data by reading from S3

In [3]:
# Read the Parquet output back from S3. Column names and types come from the
# Parquet files themselves, so the CSV-only 'header' / 'inferSchema' options
# are not needed.
df = spark.read \
         .parquet(output_folder_name)

                                                                                

In [4]:
# Row count of the data read back from S3; it should equal the count taken
# before writing (88222951).
df.count()

                                                                                

88222951