In [1]:
# Display the active SparkSession. `spark` is not created anywhere in this
# notebook, so it is presumably injected by the kernel — TODO confirm.
spark

## Processing `Tags.xml`

In [53]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType
from datetime import datetime

In [54]:
# Input location of the raw dataset on S3.
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw/'
# NOTE: despite its name, `dataset_comments` points at Tags.xml; the name is
# kept because later cells reference it.
# rstrip('/') prevents a double slash ("raw//Tags.xml"), since the bucket
# prefix above already ends with '/'.
dataset_comments = f"{dataset_bucket.rstrip('/')}/Tags.xml"

In [55]:
# Load the XML file as an RDD of text lines (one element per line of the file).
rdd = spark.sparkContext.textFile(name=dataset_comments)

In [56]:
def parse_tag_attributes(attr_text):
    """Convert the attribute text of one ``<row ... />`` element into a tuple.

    Parameters
    ----------
    attr_text : str
        The inside of a ``<row>`` element, e.g.
        ``'Id="1" TagName=".net" Count="42" ExcerptPostId="7" WikiPostId="8"'``.

    Returns
    -------
    tuple
        ``(Id, TagName, Count, ExcerptPostId, WikiPostId)`` where ``Id`` and
        ``Count`` are ints, ``TagName`` is a str, and the two post ids are ints
        or ``None`` when the attribute is absent from the row.

    Raises
    ------
    KeyError
        If ``Id``, ``TagName`` or ``Count`` is missing.
    ValueError
        If a numeric attribute does not hold an integer.
    """
    # Splitting on '"' alternates between ' Name=' fragments (even indexes)
    # and attribute values (odd indexes); zip() drops the trailing remainder.
    pieces = attr_text.split('"')
    attrs = {
        name.strip().rstrip("="): value
        for name, value in zip(pieces[0::2], pieces[1::2])
    }

    def optional_int(key):
        # Optional attributes are looked up by name, so a row carrying only
        # one of them (or extra attributes) is still parsed correctly.
        value = attrs.get(key)
        return int(value) if value is not None else None

    return (
        int(attrs["Id"]),
        attrs["TagName"],
        int(attrs["Count"]),
        optional_int("ExcerptPostId"),
        optional_int("WikiPostId"),
    )


# Keep only the <row ... /> lines, strip the leading "<row" and trailing " />",
# then parse the attributes by name. (Checking the character length of the row
# string to detect optional attributes does not work: the optional post ids
# would always be dropped.)
parsed_rdd = (
    rdd.map(lambda row: row.strip())
    .filter(lambda row: row.startswith("<row"))
    .map(lambda row: row[4:-3])
    .map(lambda row: row.strip())
    .map(parse_tag_attributes)
)

In [57]:
# Run the parsing pipeline and report how many <row> records it produced.
parsed_rdd.count()

[Stage 24:>                                                         (0 + 2) / 2]                                                                                

64155

In [61]:
# Schema of the parsed tag tuples: every column is nullable (the default);
# the numeric columns are stored as 64-bit integers.
schema_tags = (
    StructType()
    .add("Id", LongType())
    .add("TagName", StringType())
    .add("Count", LongType())
    .add("ExcerptPostId", LongType())
    .add("WikiPostId", LongType())
)

# Build the DataFrame from the RDD of tuples using the explicit schema above.
df = parsed_rdd.toDF(schema=schema_tags)

In [62]:
# Print the column names, types and nullability of `df`.
df.printSchema()

root
 |-- Id: long (nullable = true)
 |-- TagName: string (nullable = true)
 |-- Count: long (nullable = true)
 |-- ExcerptPostId: long (nullable = true)
 |-- WikiPostId: long (nullable = true)



In [63]:
# Output location for the processed data on S3.
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed'
output_folder_name = f"{output_bucket}/Tags-parquet"

# Save the DataFrame as Parquet, replacing any previous output at that path.
# No 'header' option is set: it is a CSV setting and has no effect on Parquet.
df.write \
  .format('parquet') \
  .mode('overwrite') \
  .save(output_folder_name)

                                                                                

## Verifying the data by reading from S3

In [65]:
# Read the Parquet output back from S3 to verify the write.
# Parquet files store their own schema, so the CSV-only 'header' and
# 'inferSchema' options are not needed here.
df = spark.read.parquet(output_folder_name)

[Stage 30:>                                                         (0 + 1) / 1]                                                                                

In [66]:
# Print the first rows of the DataFrame that was read back from S3.
df.show()

+-----+-------------------+-----+-------------+----------+
|   Id|            TagName|Count|ExcerptPostId|WikiPostId|
+-----+-------------------+-----+-------------+----------+
|97196|      chef-template|   21|         null|      null|
|97197|          openbadge|   15|         null|      null|
|97199|               rhom|    2|         null|      null|
|97200|     kendo-dropdown|  353|         null|      null|
|97202|    easy-thumbnails|   58|         null|      null|
|97203|          todataurl|  202|         null|      null|
|97208|            go-flag|   19|         null|      null|
|97211|         stripe.net|  133|         null|      null|
|97212|             midori|   26|         null|      null|
|97213|              bento|   14|         null|      null|
|97215|             fedext|   38|         null|      null|
|97218|       browser-link|   84|         null|      null|
|97222|      atomicinteger|  116|         null|      null|
|97223|revealing-prototype|   11|         null|      nul

[Stage 31:>                                                         (0 + 1) / 1]                                                                                