In [1]:
# Echo the `spark` session object. It is not created anywhere in this notebook,
# so it is presumably injected by the kernel — TODO confirm for the target environment.
spark

## Processing `Votes.xml`

In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType
from datetime import datetime

In [2]:
# Location of the raw dataset and of the Votes file inside it
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw'
dataset_file = dataset_bucket + "/Votes.xml"

In [3]:
# Load Votes.xml as an RDD of text lines (one element per line of the file)
rdd = spark.sparkContext.textFile(dataset_file)


In [None]:
def row_parser(row):
    """Parse the attribute text of one Votes ``<row>`` element into a fixed-width tuple.

    Parameters
    ----------
    row : str
        The attribute part of a row with the ``<row`` prefix and the closing
        ``/>`` already removed, e.g.
        ``Id="1" PostId="1" VoteTypeId="2" CreationDate="2008-07-31T00:00:00.000"``.

    Returns
    -------
    tuple
        ``(Id, PostId, VoteTypeId, UserId, CreationDate, BountyAmount)``.
        ``CreationDate`` is a ``datetime``; the other values are the raw
        attribute strings. Attributes missing from the row are ``None``.

    Raises
    ------
    ValueError
        If ``CreationDate`` does not match ``%Y-%m-%dT%H:%M:%S.%f``.
    """
    # Attribute names found in Votes rows, in output order.
    fields = [
        "Id=",
        "PostId=",
        "VoteTypeId=",
        "UserId=",
        "CreationDate=",
        "BountyAmount=",
    ]

    row_field = dict.fromkeys(fields, None)

    # Splitting on '"' alternates name pieces (' PostId=') and value pieces ('1');
    # the final piece after the last closing quote is dropped.
    row_list = [piece.strip() for piece in row.split('"')[:-1]]

    # zip() pairs names with values and tolerates a dangling name without a value.
    for key, value in zip(row_list[0::2], row_list[1::2]):
        if key not in row_field:
            # Skip unknown attributes so the tuple always has len(fields) entries.
            continue
        if key == "CreationDate=":
            row_field[key] = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%f")
        else:
            row_field[key] = value

    return tuple(row_field.values())

In [4]:
# Keep only the <row ...> lines and reduce each one to its bare attribute text
parsed_rdd = (
    rdd.map(lambda line: line.strip())
       .filter(lambda line: line.startswith("<row"))
       # drop the 4-character "<row" prefix and the last 3 characters (presumably " />")
       .map(lambda line: line[4:-3])
       .map(lambda line: line.strip())
)

In [5]:
# Count the <row> records kept by the filter (an action: forces a full pass over the file)
parsed_rdd.count()

                                                                                

224535501

In [8]:
# Peek at one parsed record to see the attribute layout
parsed_rdd.take(1)

                                                                                

['Id="1" PostId="1" VoteTypeId="2" CreationDate="2008-07-31T00:00:00.000"']

In [10]:
# A minimal vote record (four attributes) used to see how many pieces a split on '"' yields
row = 'Id="1" PostId="1" VoteTypeId="2" CreationDate="2008-07-31T00:00:00.000"'

# Each attribute contributes two quotes: 4 attributes -> 8 quotes -> 9 pieces
len(row.split('"'))

9

In [16]:
# Six attributes -> 12 quotes -> 13 pieces. If this count equals the total row count,
# no record carries more than six attributes.
parsed_rdd.filter(lambda row: len(row.split('"')) <= 13).count()

                                                                                

224535501

In [17]:
# Show one record that splits into exactly 13 pieces, i.e. one with six attributes
parsed_rdd.filter(lambda row: len(row.split('"')) == 13).take(1)

                                                                                

['Id="1472821" PostId="98225" VoteTypeId="8" UserId="4234" CreationDate="2009-01-27T00:00:00.000" BountyAmount="350"']

In [50]:
def _vote_record(row):
    """Turn one row's attribute text into ``(Id, PostId, VoteTypeId, CreationDate)``.

    Attributes are looked up by name rather than by position, because some
    rows carry extra attributes (e.g. ``UserId``) ahead of ``CreationDate``,
    which shifts every positional index after them.

    Parameters
    ----------
    row : str
        Attribute text such as ``Id="1" PostId="1" VoteTypeId="2" CreationDate="..."``.

    Returns
    -------
    tuple
        Three ints followed by the creation date as the raw string.

    Raises
    ------
    KeyError
        If one of the four required attributes is absent.
    ValueError
        If an id attribute is not an integer.
    """
    # Split once: even positions are names (' PostId='), odd positions are values.
    pieces = row.split('"')
    attrs = {name.strip(): value for name, value in zip(pieces[0::2], pieces[1::2])}
    return (
        int(attrs['Id=']),
        int(attrs['PostId=']),
        int(attrs['VoteTypeId=']),
        attrs['CreationDate='],
    )


# Keep only <row ...> lines, strip the element wrapper, and convert each to a typed tuple
parsed_rdd = rdd.map(lambda row: row.strip()) \
   .filter(lambda row: row.startswith("<row")) \
   .map(lambda row: row[4:-3]) \
   .map(lambda row: row.strip()) \
   .map(_vote_record)

In [51]:
# Re-count after the tuple conversion; this runs the whole map chain on every row,
# so a row that fails conversion would raise here
parsed_rdd.count()

                                                                                

224535501

In [52]:
# Column layout for the parsed vote tuples: three long ids plus the creation date kept as text
schema_votes = (
    StructType()
    .add("Id", LongType())
    .add("PostId", LongType())
    .add("VoteTypeId", LongType())
    .add("CreationDate", StringType())
)

# Build the DataFrame from the tuple RDD using that schema
df = parsed_rdd.toDF(schema_votes)

In [53]:
# Print the DataFrame's column names and types to check them against schema_votes
df.printSchema()

root
 |-- Id: long (nullable = true)
 |-- PostId: long (nullable = true)
 |-- VoteTypeId: long (nullable = true)
 |-- CreationDate: string (nullable = true)



In [54]:
# Output location for the processed dataset
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed'
output_folder_name = f"{output_bucket}/Votes-parquet"

# Save the DataFrame as Parquet, replacing any previous output in that folder.
# No 'header' option is set: that is a CSV setting and has no meaning for Parquet.
df.write \
  .format('parquet') \
  .mode('overwrite') \
  .save(output_folder_name)

                                                                                