In [1]:
# Display the active SparkSession (assumed to be pre-created by the notebook environment — TODO confirm).
spark

## Processing `Posts.xml`

In [1]:
import html
from datetime import datetime

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType

In [3]:
# Dataset path
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw/2023/'
# `dataset_bucket` already ends with '/', so trim it before joining; otherwise
# the URI contains '//' (an empty path segment) in front of the file name.
# NOTE: despite its name, `dataset_comments` points at Posts.xml; the name is
# kept because later cells reference it.
dataset_comments = f"{dataset_bucket.rstrip('/')}/Posts.xml"

In [4]:
# Read the XML dump as plain text: one RDD element per line of the file.
rdd = spark.sparkContext.textFile(name=dataset_comments)

In [163]:
# Column order of the tuple returned by row_parser. It has to stay aligned with
# the schema the parsed rows are loaded into.
_POST_FIELDS = (
    'Id',
    'PostTypeId',
    'ParentId',
    'AcceptedAnswerId',
    'CreationDate',
    'DeletionDate',
    'Score',
    'ViewCount',
    'Body',
    'OwnerUserId',
    'OwnerDisplayName',
    'LastEditorUserId',
    'LastEditorDisplayName',
    'LastEditDate',
    'LastActivityDate',
    'Title',
    'Tags',
    'AnswerCount',
    'CommentCount',
    'FavoriteCount',
    'ClosedDate',
    'CommunityOwnedDate',
    'ContentLicense',
)

# Attributes whose values are converted to datetime objects.
_POST_TIMESTAMP_FIELDS = frozenset((
    'ClosedDate',
    'CreationDate',
    'LastEditDate',
    'LastActivityDate',
    'CommunityOwnedDate',
    'DeletionDate',
))


def _parse_timestamp(value):
    """Parse an ISO-like timestamp, with or without fractional seconds ('' -> None)."""
    if not value:
        return None
    try:
        return datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%f")
    except ValueError:
        return datetime.strptime(value, "%Y-%m-%dT%H:%M:%S")


def row_parser(row):
    """Turn the attribute list of one ``<row .../>`` element into a tuple.

    Parameters
    ----------
    row : str
        The attributes of a row element, e.g. ``Id="4" PostTypeId="1"``, with
        the surrounding ``<row`` / ``/>`` already removed.

    Returns
    -------
    tuple
        One value per entry of ``_POST_FIELDS``, in that order (23 items).
        Attributes missing from the row are ``None``, timestamp attributes are
        ``datetime`` objects, and everything else is a ``str``.

    Raises
    ------
    ValueError
        If a timestamp attribute is non-empty and matches neither
        ``%Y-%m-%dT%H:%M:%S.%f`` nor ``%Y-%m-%dT%H:%M:%S``.
    """
    parsed = dict.fromkeys(_POST_FIELDS)

    # Splitting on '"' yields name=, value, name=, value, ..., <tail>.
    # A literal double quote cannot occur inside a well-formed attribute value
    # (it would be written as &quot;), so the pieces alternate reliably.
    parts = row.split('"')

    for idx in range(0, len(parts) - 1, 2):
        name = parts[idx].strip().rstrip('=').rstrip()

        # Ignore attributes that are not part of the expected column list;
        # adding them would make the tuple longer than the schema.
        if name not in parsed:
            continue

        # Values are whitespace-trimmed and then XML-unescaped, so
        # &lt; &gt; &amp; &quot; and numeric references such as &#xA; are decoded.
        value = html.unescape(parts[idx + 1].strip())

        if name in _POST_TIMESTAMP_FIELDS:
            parsed[name] = _parse_timestamp(value)
        else:
            parsed[name] = value

    return tuple(parsed[name] for name in _POST_FIELDS)

In [164]:
# Keep only the <row ...> lines, cut each one down to its attribute list and
# parse it into a tuple with row_parser.
parsed_rdd = (
    rdd
    .map(lambda line: line.strip())
    .filter(lambda line: line.startswith("<row"))
    # Drop the leading "<row" (4 chars) and the last 3 chars (presumably " />").
    .map(lambda element: element[4:-3])
    .map(lambda attributes: attributes.strip())
    .map(row_parser)
)

In [165]:
# Schema of the parsed rows: one field per tuple position produced by row_parser.
# Date columns are timestamps; every other column is loaded as a string.
schema_post = StructType([
    StructField(column_name, column_type())
    for column_name, column_type in (
        ('Id', StringType),
        ('PostTypeId', StringType),
        ('ParentId', StringType),
        ('AcceptedAnswerId', StringType),
        ('CreationDate', TimestampType),
        ('DeletionDate', TimestampType),
        ('Score', StringType),
        ('ViewCount', StringType),
        ('Body', StringType),
        ('OwnerUserId', StringType),
        ('OwnerDisplayName', StringType),
        ('LastEditorUserId', StringType),
        ('LastEditorDisplayName', StringType),
        ('LastEditDate', TimestampType),
        ('LastActivityDate', TimestampType),
        ('Title', StringType),
        ('Tags', StringType),
        ('AnswerCount', StringType),
        ('CommentCount', StringType),
        ('FavoriteCount', StringType),
        ('ClosedDate', TimestampType),
        ('CommunityOwnedDate', TimestampType),
        ('ContentLicense', StringType),
    )
])

In [166]:
# Build the DataFrame from the parsed tuples using the explicit schema.
df = parsed_rdd.toDF(schema=schema_post)

In [167]:
# Print each column's name, type and nullability.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- PostTypeId: string (nullable = true)
 |-- ParentId: string (nullable = true)
 |-- AcceptedAnswerId: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- DeletionDate: timestamp (nullable = true)
 |-- Score: string (nullable = true)
 |-- ViewCount: string (nullable = true)
 |-- Body: string (nullable = true)
 |-- OwnerUserId: string (nullable = true)
 |-- OwnerDisplayName: string (nullable = true)
 |-- LastEditorUserId: string (nullable = true)
 |-- LastEditorDisplayName: string (nullable = true)
 |-- LastEditDate: timestamp (nullable = true)
 |-- LastActivityDate: timestamp (nullable = true)
 |-- Title: string (nullable = true)
 |-- Tags: string (nullable = true)
 |-- AnswerCount: string (nullable = true)
 |-- CommentCount: string (nullable = true)
 |-- FavoriteCount: string (nullable = true)
 |-- ClosedDate: timestamp (nullable = true)
 |-- CommunityOwnedDate: timestamp (nullable = true)
 |-- ContentLicense: string (n

In [168]:
# Number of rows in the DataFrame.
df.count()

                                                                                

58329356

In [169]:
# Columns loaded as strings that hold integer values; cast each one in place
# (withColumn keeps the column's name and position).
integer_columns = [
    'Id',
    'PostTypeId',
    'ParentId',
    'AcceptedAnswerId',
    'Score',
    'ViewCount',
    'OwnerUserId',
    'LastEditorUserId',
    'AnswerCount',
    'CommentCount',
    'FavoriteCount',
]

for column_name in integer_columns:
    df = df.withColumn(column_name, F.col(column_name).cast('int'))

# Row count after the casts.
df.count()

                                                                                

58329356

In [12]:
# Output location for the processed posts
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed/2023'
output_folder_name = f"{output_bucket}/Post-parquet"

# Save the DataFrame as Parquet, replacing any previous output at that path.
# No 'header' option is set: it is a CSV setting and does not apply to Parquet.
df.write \
  .format('parquet') \
  .mode('overwrite') \
  .save(output_folder_name)

# Preview the first rows of the DataFrame.
df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+---+-----------------+------+--------------------+--------------------+------+---------------+-------+--------------------+--------------+
| Id|PostHistoryTypeId|PostId|        RevisionGUID|        CreationDate|UserId|UserDisplayName|Comment|                Text|ContentLicense|
+---+-----------------+------+--------------------+--------------------+------+---------------+-------+--------------------+--------------+
|  6|                2|     7|c30df0f4-a2d9-426...|2008-07-31 22:17:...|     9|           null|   null|The explicit cast...|  CC BY-SA 2.5|
| 12|                1|    17|0421fb42-a29a-4cb...|2008-08-01 05:09:...|     2|           null|   null|Binary Data in MYSQL|  CC BY-SA 2.5|
| 13|                3|    17|0421fb42-a29a-4cb...|2008-08-01 05:09:...|     2|           null|   null|&lt;database&gt;&...|  CC BY-SA 2.5|
| 14|                2|    17|0421fb42-a29a-4cb...|2008-08-01 05:09:...|     2|           null|   null|How do I store bi...|  CC BY-SA 2.5|
| 16|               

                                                                                