In [1]:
spark

## Processing `PostHistory.xml`

In [35]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType
from datetime import datetime

In [36]:
# Dataset path 
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw/'
dataset_comments = f"{dataset_bucket}/PostHistory.xml"

In [37]:
rdd = spark.sparkContext.textFile(dataset_comments)


In [41]:
def row_parser(row):
    
    fields = [
                "Id=",
                "PostHistoryTypeId=",
                "PostId=",
                "RevisionGUID=",
                "CreationDate=",
                "UserId=",
                "UserDisplayName=",
                "Comment=",
                "Text=",
                "ContentLicense=",
            ]
    
    row_field = dict.fromkeys(fields, None)
    row_list = [ i.strip() for i in row.split('"')[:-1] ]
    
    for i in range(0, len(row_list), 2):
        if row_list[i] == 'CreationDate=':
            row_field[row_list[i]] = datetime.strptime(row_list[i+1], "%Y-%m-%dT%H:%M:%S.%f")
        
        else:
            row_field[row_list[i]] = row_list[i+1]
        
    
    return tuple(row_field.values())

In [42]:
parsed_rdd = rdd.map(lambda row: row.strip()) \
   .filter(lambda row: row.startswith("<row")) \
   .map(lambda row: row[4:-3]) \
   .map(lambda row: row.strip()) \
   .map(row_parser)

In [43]:
# Define the schema for the DataFrame
schema_posthistory = StructType([
    StructField("Id", StringType()),
    StructField("PostHistoryTypeId", StringType()),
    StructField("PostId", StringType()),
    StructField("RevisionGUID", StringType()),
    StructField("CreationDate", TimestampType()),
    StructField("UserId", StringType()),
    StructField("UserDisplayName", StringType()),
    StructField("Comment", StringType()),
    StructField("Text", StringType()),
    StructField("ContentLicense", StringType())
])


In [44]:
# Convert the RDD to a DataFrame
df = parsed_rdd.toDF(schema_posthistory)

In [45]:
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- PostHistoryTypeId: string (nullable = true)
 |-- PostId: string (nullable = true)
 |-- RevisionGUID: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- UserId: string (nullable = true)
 |-- UserDisplayName: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- ContentLicense: string (nullable = true)



In [46]:
df.show()

[Stage 39:>                                                         (0 + 1) / 1]

+---+-----------------+------+--------------------+--------------------+------+---------------+-------+--------------------+--------------+
| Id|PostHistoryTypeId|PostId|        RevisionGUID|        CreationDate|UserId|UserDisplayName|Comment|                Text|ContentLicense|
+---+-----------------+------+--------------------+--------------------+------+---------------+-------+--------------------+--------------+
|  6|                2|     7|c30df0f4-a2d9-426...|2008-07-31 22:17:...|     9|           null|   null|The explicit cast...|  CC BY-SA 2.5|
| 12|                1|    17|0421fb42-a29a-4cb...|2008-08-01 05:09:...|     2|           null|   null|Binary Data in MYSQL|  CC BY-SA 2.5|
| 13|                3|    17|0421fb42-a29a-4cb...|2008-08-01 05:09:...|     2|           null|   null|&lt;database&gt;&...|  CC BY-SA 2.5|
| 14|                2|    17|0421fb42-a29a-4cb...|2008-08-01 05:09:...|     2|           null|   null|How do I store bi...|  CC BY-SA 2.5|
| 16|               

                                                                                

In [47]:
df = df \
    .withColumn('Id', F.col('Id').cast('int')) \
    .withColumn('PostHistoryTypeId', F.col('PostHistoryTypeId').cast('int')) \
    .withColumn('PostId', F.col('PostId').cast('int')) \
    .withColumn('UserId', F.col('UserId').cast('int')) 

df.count()

                                                                                

117255126

In [None]:

# Convert the RDD to a DataFrame
df = parsed_rdd.toDF(schema_posthistory)

# Dataset path 
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed'
output_folder_name = f"{output_bucket}/PostHistory-parquet"

# save dataframe as csv
df.write \
  .format('parquet') \
  .option('header', True) \
  .mode('overwrite') \
  .save(output_folder_name)

df.show()