In [1]:
# Display the active Spark session object.
# NOTE(review): `spark` is not created anywhere in this notebook; presumably the
# kernel provides it -- confirm before running on a fresh environment.
spark

## Processing `Users.xml`

In [7]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType
from datetime import datetime

In [2]:
# Dataset path
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw/2023/'
# The bucket prefix already ends with '/', so strip it before joining;
# otherwise the resulting path contains an accidental '//' segment.
# NOTE: despite its name, this variable holds the location of Users.xml.
dataset_comments = f"{dataset_bucket.rstrip('/')}/Users.xml"

In [3]:
def row_parser(row):
    """Parse the attribute text of one ``<row ... />`` element into a tuple.

    Parameters
    ----------
    row : str
        The attributes of a single row with the surrounding ``<row`` and
        ``/>`` already removed, e.g. ``Id="1" Reputation="10"``.

    Returns
    -------
    tuple
        Always thirteen values, in the order of ``fields`` below. Attributes
        that are absent from the row are ``None``. ``CreationDate`` and
        ``LastAccessDate`` are converted to ``datetime``; every other value is
        the attribute string exactly as it appears in the row (surrounding
        whitespace stripped, XML entities such as ``&lt;`` left untouched).

    Raises
    ------
    ValueError
        If a date attribute matches neither supported timestamp layout.
    IndexError
        If the row has an attribute name without a quoted value (unbalanced
        double quotes).
    """
    fields = [
                "Id=",
                "Reputation=",
                "CreationDate=",
                "DisplayName=",
                "LastAccessDate=",
                "WebsiteUrl=",
                "Location=",
                "AboutMe=",
                "Views=",
                "UpVotes=",
                "DownVotes=",
                "ProfileImageUrl=",
                "AccountId="
            ]
    date_fields = ("LastAccessDate=", "CreationDate=")

    # Insertion order of the dict defines the order of the returned tuple.
    row_field = dict.fromkeys(fields, None)

    # Splitting on '"' alternates attribute names ('Id=') and their values;
    # the final piece (text after the closing quote) is discarded.
    row_list = [ i.strip() for i in row.split('"')[:-1] ]

    for i in range(0, len(row_list), 2):
        name = row_list[i]
        value = row_list[i+1]

        # Ignore attributes that are not part of the schema. Storing them
        # would add new keys and make the tuple longer than thirteen values.
        if name not in row_field:
            continue

        if name in date_fields:
            try:
                value = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%f")
            except ValueError:
                # Accept timestamps written without fractional seconds.
                value = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S")

        row_field[name] = value

    return tuple(row_field.values())

In [4]:
# Load the raw file as an RDD of text lines (one element per line of the file).
rdd = spark.sparkContext.textFile(dataset_comments)


In [5]:
# Keep only the "<row" lines and turn each one into a tuple of field values.
parsed_rdd = (
    rdd
    .map(lambda line: line.strip())
    .filter(lambda line: line.startswith("<row"))
    # Drop the first four and last three characters (the "<row" opener and the
    # element terminator), trim again, then hand the attribute text to the parser.
    .map(lambda element: element[4:-3].strip())
    .map(row_parser)
)


In [9]:
# Count the parsed rows; this runs the whole parsing pipeline over the file.
parsed_rdd.count()

                                                                                

19942787

In [10]:
# Schema for the users DataFrame: the two date columns are timestamps and every
# other column is read as a string. Column order matches the tuples in the RDD.
schema_users = StructType([
    StructField(
        column,
        TimestampType() if column in ("CreationDate", "LastAccessDate") else StringType(),
    )
    for column in (
        "Id",
        "Reputation",
        "CreationDate",
        "DisplayName",
        "LastAccessDate",
        "WebsiteUrl",
        "Location",
        "AboutMe",
        "Views",
        "UpVotes",
        "DownVotes",
        "ProfileImageUrl",
        "AccountId",
    )
])

In [11]:
# Convert the RDD of parsed tuples to a DataFrame using the explicit schema,
# so column names and types come from `schema_users` rather than being inferred.
df = parsed_rdd.toDF(schema_users)

In [12]:
# Print the column names and types of the freshly created DataFrame.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- Reputation: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- DisplayName: string (nullable = true)
 |-- LastAccessDate: timestamp (nullable = true)
 |-- WebsiteUrl: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- AboutMe: string (nullable = true)
 |-- Views: string (nullable = true)
 |-- UpVotes: string (nullable = true)
 |-- DownVotes: string (nullable = true)
 |-- ProfileImageUrl: string (nullable = true)
 |-- AccountId: string (nullable = true)



In [13]:
# Preview the first rows; long cell values are truncated in the printed table.
df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+-----+----------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+------+-------+---------+---------------+---------+
|   Id|Reputation|        CreationDate|    DisplayName|      LastAccessDate|          WebsiteUrl|            Location|             AboutMe| Views|UpVotes|DownVotes|ProfileImageUrl|AccountId|
+-----+----------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+------+-------+---------+---------------+---------+
|-1014|         1|2023-02-17 19:52:...|     R Language|2023-02-17 19:52:...|                null|                null|&lt;p&gt;A collec...|     0|      0|        0|           null|     null|
|-1013|         1|2023-02-17 19:25:...|          CI/CD|2023-02-17 19:25:...|                null|                null|&lt;p&gt;A collec...|     0|      0|        0|           null|     null|
|-1012|         1|2023-02-15 23:24:...|      

                                                                                

In [14]:
# Convert the numeric columns, which were read as strings, to integers.
for numeric_column in ('Id', 'Reputation', 'Views', 'UpVotes', 'DownVotes', 'AccountId'):
    df = df.withColumn(numeric_column, F.col(numeric_column).cast('int'))

# Row count after the casts.
df.count()

                                                                                

19942787

In [135]:
# Print the schema again to check the column types after the integer casts.
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- Reputation: integer (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- DisplayName: string (nullable = true)
 |-- LastAccessDate: timestamp (nullable = true)
 |-- WebsiteUrl: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- AboutMe: string (nullable = true)
 |-- Views: integer (nullable = true)
 |-- UpVotes: integer (nullable = true)
 |-- DownVotes: integer (nullable = true)
 |-- ProfileImageUrl: string (nullable = true)
 |-- AccountId: integer (nullable = true)



In [136]:
# Preview the first rows of the DataFrame after the integer casts.
df.show()

[Stage 78:>                                                         (0 + 1) / 1]

+-----+----------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+------+-------+---------+--------------------+---------+
|   Id|Reputation|        CreationDate|     DisplayName|      LastAccessDate|          WebsiteUrl|            Location|             AboutMe| Views|UpVotes|DownVotes|     ProfileImageUrl|AccountId|
+-----+----------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+------+-------+---------+--------------------+---------+
|-1011|         1|2022-11-09 20:49:...|             AWS|2022-11-09 20:49:...|                null|                null|&lt;p&gt;Amazon W...|     0|      0|        0|                null|     null|
|-1010|         1|2022-10-25 19:18:...| Microsoft Azure|2022-10-25 19:18:...|                null|                null|&lt;p&gt;Azure Co...|     0|      0|        0|                null|     null|
|-1009|        

                                                                                

In [16]:
# Output path
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed/2023'
output_folder_name = f"{output_bucket}/Users-parquet"

# Save the DataFrame as Parquet, replacing any existing output at this path.
# No 'header' option is set: it is a CSV setting, and Parquet files carry
# their own schema.
df.write \
  .format('parquet') \
  .mode('overwrite') \
  .save(output_folder_name)

# Preview the first rows of the DataFrame that was just written.
df.show()


[Stage 6:>                                                          (0 + 1) / 1]

+-----+----------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+------+-------+---------+---------------+---------+
|   Id|Reputation|        CreationDate|    DisplayName|      LastAccessDate|          WebsiteUrl|            Location|             AboutMe| Views|UpVotes|DownVotes|ProfileImageUrl|AccountId|
+-----+----------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+------+-------+---------+---------------+---------+
|-1014|         1|2023-02-17 19:52:...|     R Language|2023-02-17 19:52:...|                null|                null|&lt;p&gt;A collec...|     0|      0|        0|           null|     null|
|-1013|         1|2023-02-17 19:25:...|          CI/CD|2023-02-17 19:25:...|                null|                null|&lt;p&gt;A collec...|     0|      0|        0|           null|     null|
|-1012|         1|2023-02-15 23:24:...|      

                                                                                