In [1]:
# Echo the `spark` object so the notebook displays it. No visible cell creates
# it — presumably the SparkSession pre-created by the notebook kernel (confirm).
spark

## Processing `Badges.xml`

In [20]:
import re
from datetime import datetime
from html import unescape

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType, BooleanType

In [29]:
# Location of the raw dump and of the Badges.xml file inside it.
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw'
dataset_badges = f"{dataset_bucket}/Badges.xml"

# Backward-compatible alias: the path used to be stored under this (misleading)
# name, and the cell that loads the file still reads it.
dataset_comments = dataset_badges

In [30]:
# Load the Badges.xml file as an RDD of text; downstream cells treat each
# element as one line of the file. Despite its name, `dataset_comments` holds
# the Badges.xml path.
rdd = spark.sparkContext.textFile(dataset_comments)


In [44]:
# Output columns, in the order expected by the DataFrame schema.
_BADGE_ATTRIBUTES = ("Id", "UserId", "Name", "Date", "Class", "TagBased")

# One `name="value"` attribute. XML attribute values cannot contain a raw
# double quote (it is written as &quot;), so `[^"]*` captures the whole value.
_ATTRIBUTE_PATTERN = re.compile(r'([A-Za-z_][\w.:-]*)\s*=\s*"([^"]*)"')

# Timestamp layouts accepted for the Date attribute, most specific first.
_DATE_FORMATS = ("%Y-%m-%dT%H:%M:%S.%f", "%Y-%m-%dT%H:%M:%S")


def _parse_badge_date(value):
    """Parse an ISO-like timestamp, with or without fractional seconds."""
    for date_format in _DATE_FORMATS:
        try:
            return datetime.strptime(value, date_format)
        except ValueError:
            continue
    raise ValueError(f"Unsupported Date value: {value!r}")


def row_parser(row):
    """Parse the attributes of one Badges ``<row>`` element into a tuple.

    Parameters
    ----------
    row : str
        Text holding ``name="value"`` attribute pairs. Either the bare
        attribute list or the complete ``<row ... />`` line is accepted.

    Returns
    -------
    tuple
        Always six items, ordered ``(Id, UserId, Name, Date, Class, TagBased)``.
        ``Date`` is a ``datetime``, ``TagBased`` is a ``bool``, the rest are
        ``str``. An attribute missing from the row yields ``None``.

    Raises
    ------
    ValueError
        If the ``Date`` attribute matches none of the supported formats.
    """
    row_field = dict.fromkeys(_BADGE_ATTRIBUTES)

    for name, raw_value in _ATTRIBUTE_PATTERN.findall(row):
        # Skip attributes we do not export: adding them would widen the tuple
        # and make it incompatible with the fixed six-column schema.
        if name not in row_field:
            continue

        # Attribute values are entity-encoded in XML (&amp;, &lt;, &quot;, ...).
        value = unescape(raw_value.strip())

        if name == "Date":
            row_field[name] = _parse_badge_date(value)
        elif name == "TagBased":
            row_field[name] = value.lower() == "true"
        else:
            row_field[name] = value

    return tuple(row_field[name] for name in _BADGE_ATTRIBUTES)

In [45]:
# Keep only the lines that start with "<row", drop their first 4 and last 3
# characters (the "<row" prefix and, presumably, the " />" suffix), and turn
# the remaining attribute text into one tuple per badge.
parsed_rdd = (
    rdd
    .map(lambda line: line.strip())
    .filter(lambda line: line.startswith("<row"))
    .map(lambda line: row_parser(line[4:-3].strip()))
)

# Action: runs the pipeline and displays the number of parsed rows.
parsed_rdd.count()

                                                                                

47062506

In [46]:
# Schema of the Badges DataFrame: (column name, Spark type) pairs listed in
# the same order as the tuples produced by the row parser.
schema_badges = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("Id", StringType()),
        ("UserId", StringType()),
        ("Name", StringType()),
        ("Date", TimestampType()),
        ("Class", StringType()),
        ("TagBased", BooleanType()),
    )
])


In [41]:
# Convert the RDD of parsed tuples to a DataFrame; tuple positions are matched
# to the fields of schema_badges in order.
df = parsed_rdd.toDF(schema_badges)

In [42]:
# Print the column names and types of df.
# NOTE(review): the saved output below lists TagBased as string while
# schema_badges declares BooleanType — the output looks stale; re-run the cell.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- UserId: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Class: string (nullable = true)
 |-- TagBased: string (nullable = true)



In [43]:
# Preview the first rows of df as a text table.
df.show()

[Stage 33:>                                                         (0 + 1) / 1]

+-----+------+-------+--------------------+-----+--------+
|   Id|UserId|   Name|                Date|Class|TagBased|
+-----+------+-------+--------------------+-----+--------+
|82946|  3718|Teacher|2008-09-15 08:55:...|    3|   False|
|82947|   994|Teacher|2008-09-15 08:55:...|    3|   False|
|82949|  3893|Teacher|2008-09-15 08:55:...|    3|   False|
|82950|  4591|Teacher|2008-09-15 08:55:...|    3|   False|
|82951|  5196|Teacher|2008-09-15 08:55:...|    3|   False|
|82952|  2635|Teacher|2008-09-15 08:55:...|    3|   False|
|82953|  1113|Teacher|2008-09-15 08:55:...|    3|   False|
|82954|  4182|Teacher|2008-09-15 08:55:...|    3|   False|
|82955|   164|Teacher|2008-09-15 08:55:...|    3|   False|
|82956|   652|Teacher|2008-09-15 08:55:...|    3|   False|
|82957|  5246|Teacher|2008-09-15 08:55:...|    3|   False|
|82958|   509|Teacher|2008-09-15 08:55:...|    3|   False|
|82959|   670|Teacher|2008-09-15 08:55:...|    3|   False|
|82960|  5024|Teacher|2008-09-15 08:55:...|    3|   Fals

                                                                                

In [47]:
# The identifier columns are parsed as strings; cast each of them to int.
for id_column in ('Id', 'UserId'):
    df = df.withColumn(id_column, F.col(id_column).cast('int'))

# Action: displays the row count after the casts.
df.count()

                                                                                

47062506

In [19]:

# Output location of the processed Badges dataset.
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed'
output_folder_name = f"{output_bucket}/Badges-parquet"

# Save the DataFrame as Parquet, replacing any previous output.
# `df` is intentionally NOT rebuilt from parsed_rdd here: doing so would throw
# away the Id/UserId integer casts applied in the previous cell and write those
# columns as strings. The CSV-only 'header' option is dropped as well, since it
# has no meaning for the Parquet format.
df.write \
  .format('parquet') \
  .mode('overwrite') \
  .save(output_folder_name)

# Preview the rows that were written.
df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+-----+------+-------+--------------------+-----+--------+
|   Id|UserId|   Name|                Date|Class|TagBased|
+-----+------+-------+--------------------+-----+--------+
|82946|  3718|Teacher|2008-09-15 08:55:...|    3|    null|
|82947|   994|Teacher|2008-09-15 08:55:...|    3|    null|
|82949|  3893|Teacher|2008-09-15 08:55:...|    3|    null|
|82950|  4591|Teacher|2008-09-15 08:55:...|    3|    null|
|82951|  5196|Teacher|2008-09-15 08:55:...|    3|    null|
|82952|  2635|Teacher|2008-09-15 08:55:...|    3|    null|
|82953|  1113|Teacher|2008-09-15 08:55:...|    3|    null|
|82954|  4182|Teacher|2008-09-15 08:55:...|    3|    null|
|82955|   164|Teacher|2008-09-15 08:55:...|    3|    null|
|82956|   652|Teacher|2008-09-15 08:55:...|    3|    null|
|82957|  5246|Teacher|2008-09-15 08:55:...|    3|    null|
|82958|   509|Teacher|2008-09-15 08:55:...|    3|    null|
|82959|   670|Teacher|2008-09-15 08:55:...|    3|    null|
|82960|  5024|Teacher|2008-09-15 08:55:...|    3|    nul

                                                                                