In [1]:
# Display the Spark session object.
# NOTE(review): `spark` is not created anywhere in this notebook; presumably the
# kernel injects it. Confirm before running on a plain Python kernel.
spark

## Processing `Tags.xml`

In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType
from datetime import datetime

In [2]:
# S3 location of the raw dump and of the Tags.xml file inside it.
# NOTE(review): the variable is called `dataset_comments` although it points at
# Tags.xml; the name is kept because later cells reference it.
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw'
dataset_comments = "/".join((dataset_bucket, "Tags.xml"))

In [3]:
# Read Tags.xml as plain text lines; the XML is parsed by hand in the cells below.
rdd = spark.sparkContext.textFile(dataset_comments)

In [7]:
def row_parser(row):
    """Extract the Tags attributes from the attribute text of one ``<row>`` element.

    Parameters
    ----------
    row : str
        The attribute portion of a row, e.g.
        ``Id="1" TagName=".net" Count="326206"``, with the surrounding
        ``<row`` / ``/>`` markup already removed by the caller.

    Returns
    -------
    tuple
        Always five items, ordered ``(Id, TagName, Count, ExcerptPostId,
        WikiPostId)``. Each item is the attribute's string value, or ``None``
        when the attribute is absent from the row.

    Notes
    -----
    Attributes that are not listed in ``fields`` are ignored so that the
    result always has the same width, whatever the row contains.
    """
    fields = (
        "Id=",
        "TagName=",
        "Count=",
        "ExcerptPostId=",
        "WikiPostId=",
    )

    row_field = dict.fromkeys(fields, None)

    # Splitting on '"' alternates attribute names ('Id=') and values ('1');
    # the final piece is whatever follows the last closing quote, so drop it.
    tokens = [token.strip() for token in row.split('"')[:-1]]

    # zip() pairs each name with its value and silently discards a dangling
    # name that has no value, instead of raising IndexError.
    for name, value in zip(tokens[0::2], tokens[1::2]):
        # Only record known attributes: an unexpected one must not add a key,
        # otherwise the returned tuple would be wider than five values.
        if name in row_field:
            row_field[name] = value

    # Emit values in the declared field order.
    return tuple(row_field[name] for name in fields)

In [8]:
# Turn the raw text lines into one tuple of attribute values per <row> element.
parsed_rdd = (
    rdd
    .map(lambda line: line.strip())
    # Keep only row elements; this skips every line that does not start with "<row".
    .filter(lambda line: line.startswith("<row"))
    # Drop the leading "<row" (4 chars) and the last 3 chars
    # (presumably the closing " />" -- verify against the dump).
    .map(lambda element: element[4:-3])
    .map(lambda attributes: attributes.strip())
    .map(row_parser)
)

parsed_rdd.count()

                                                                                

64155

In [9]:
# Schema for the parsed rows: every column starts out as a nullable string,
# in the same order as the tuples produced by row_parser.
schema_tags = StructType([
    StructField(column_name, StringType())
    for column_name in ("Id", "TagName", "Count", "ExcerptPostId", "WikiPostId")
])

# Build the DataFrame from the RDD of tuples using that schema.
df = parsed_rdd.toDF(schema_tags)

In [10]:
# Check the column names and types right after conversion (all strings at this point).
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- TagName: string (nullable = true)
 |-- Count: string (nullable = true)
 |-- ExcerptPostId: string (nullable = true)
 |-- WikiPostId: string (nullable = true)



In [11]:
# Preview the parsed rows to confirm the attributes landed in the right columns.
df.show()

[Stage 4:>                                                          (0 + 1) / 1]

+---+----------+-------+-------------+----------+
| Id|   TagName|  Count|ExcerptPostId|WikiPostId|
+---+----------+-------+-------------+----------+
|  1|      .net| 326206|      3624959|   3607476|
|  2|      html|1156034|      3673183|   3673182|
|  3|javascript|2453736|      3624960|   3607052|
|  4|       css| 779112|      3644670|   3644669|
|  5|       php|1451338|      3624936|   3607050|
|  8|         c| 390095|      3624961|   3607013|
|  9|        c#|1571218|      3624962|   3607007|
| 10|       c++| 783573|      3624963|   3606997|
| 12|      ruby| 226594|      3624964|   3607043|
| 14|      lisp|   6834|      3656743|   3656742|
| 16|    python|2071403|      3624965|   3607014|
| 17|      java|1878099|      3624966|   3607018|
| 18|     regex| 254923|      3624967|   3607017|
| 19|       xml| 211315|      3624968|   3607588|
| 21|     mysql| 655007|      3624969|   3607033|
| 22|       sql| 650214|      3625226|   3607304|
| 23|      tsql|  71604|      4777787|   4777786|


                                                                                

In [12]:
# The numeric attributes were parsed as strings; cast each one to int in place.
# TagName is left as a string.
for numeric_column in ('Id', 'Count', 'ExcerptPostId', 'WikiPostId'):
    df = df.withColumn(numeric_column, F.col(numeric_column).cast('int'))

df.count()

                                                                                

64155

In [13]:
# Output location for the processed Tags data.
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed'
output_folder_name = f"{output_bucket}/Tags-parquet"

# Save the DataFrame as Parquet, replacing any previous output at this path.
# The CSV-only 'header' option was dropped: Parquet files carry their own
# schema, so the option had no effect on what is written.
df.write \
  .mode('overwrite') \
  .parquet(output_folder_name)

## Verifying the data by reading from S3

In [14]:
# Read the Parquet output back to verify the write.
# The 'header' and 'inferSchema' options were dropped: they are CSV reader
# options, and Parquet supplies column names and types from the file itself.
# Note that this rebinds `df` to the data read from storage.
df = spark.read.parquet(output_folder_name)

                                                                                

In [15]:
# Display up to 100 rows of the data read back from storage as a visual check.
df.show(100)

[Stage 9:>                                                          (0 + 1) / 1]

+---+--------------------+-------+-------------+----------+
| Id|             TagName|  Count|ExcerptPostId|WikiPostId|
+---+--------------------+-------+-------------+----------+
|  1|                .net| 326206|      3624959|   3607476|
|  2|                html|1156034|      3673183|   3673182|
|  3|          javascript|2453736|      3624960|   3607052|
|  4|                 css| 779112|      3644670|   3644669|
|  5|                 php|1451338|      3624936|   3607050|
|  8|                   c| 390095|      3624961|   3607013|
|  9|                  c#|1571218|      3624962|   3607007|
| 10|                 c++| 783573|      3624963|   3606997|
| 12|                ruby| 226594|      3624964|   3607043|
| 14|                lisp|   6834|      3656743|   3656742|
| 16|              python|2071403|      3624965|   3607014|
| 17|                java|1878099|      3624966|   3607018|
| 18|               regex| 254923|      3624967|   3607017|
| 19|                 xml| 211315|      

                                                                                