In [1]:
# Display the SparkSession bound to `spark`.
# NOTE(review): `spark` is never created in this notebook — presumably injected by the kernel; confirm.
spark

## Processing `Tags.xml`

In [2]:
import html
from datetime import datetime

import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType

In [3]:
# Dataset path
# NOTE(review): despite its name, `dataset_comments` holds the path of Tags.xml,
# not of a comments file.
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw'
dataset_comments = f"{dataset_bucket}/Tags.xml"

In [4]:
# Load the Tags.xml file as an RDD of text lines (one element per line of the file).
rdd = spark.sparkContext.textFile(dataset_comments)

In [5]:
def row_parser(row):
    """Parse the attribute string of one Tags.xml ``<row>`` element.

    Args:
        row: The text found between ``<row`` and ``/>``, for example
            ``Id="1" TagName=".net" Count="326206"``.

    Returns:
        A 5-tuple ordered as (Id, TagName, Count, ExcerptPostId, WikiPostId).
        Each item is the attribute value as a string with XML entities
        decoded, or ``None`` when the attribute is absent from the row.
        Attributes other than these five are ignored, so the tuple always
        has exactly five items.
    """
    fields = [
        "Id=",
        "TagName=",
        "Count=",
        "ExcerptPostId=",
        "WikiPostId=",
    ]

    row_field = dict.fromkeys(fields, None)

    # Splitting on '"' yields alternating `Name=` / value tokens; whatever
    # follows the last quote is discarded by the [:-1] slice.
    row_list = [token.strip() for token in row.split('"')[:-1]]

    # Pairing the even and odd slices drops an unpaired trailing name instead
    # of raising IndexError on a truncated row.
    for name, value in zip(row_list[0::2], row_list[1::2]):
        # Only known attributes are stored; an unexpected attribute would
        # otherwise add a sixth key and produce a tuple wider than the schema.
        if name in row_field:
            row_field[name] = html.unescape(value)

    return tuple(row_field[name] for name in fields)

In [6]:
# Keep only the lines that hold a <row ...> element.
row_lines = rdd.map(str.strip).filter(lambda line: line.startswith("<row"))

# Drop the first 4 and last 3 characters of each line, trim the remainder,
# and hand the attribute string to row_parser.
attribute_strings = row_lines.map(lambda line: line[4:-3].strip())
parsed_rdd = attribute_strings.map(row_parser)

parsed_rdd.count()

                                                                                

64155

In [7]:
# Schema for the DataFrame: every attribute is loaded as a string column,
# listed in the same order as the tuples produced by row_parser.
tag_columns = ["Id", "TagName", "Count", "ExcerptPostId", "WikiPostId"]
schema_tags = StructType([StructField(name, StringType()) for name in tag_columns])

# Build the DataFrame from the parsed RDD using that schema
df = parsed_rdd.toDF(schema_tags)

In [8]:
# Print the column names and types of the freshly built DataFrame.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- TagName: string (nullable = true)
 |-- Count: string (nullable = true)
 |-- ExcerptPostId: string (nullable = true)
 |-- WikiPostId: string (nullable = true)



In [9]:
# Preview the first rows of the DataFrame.
df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+---+----------+-------+-------------+----------+
| Id|   TagName|  Count|ExcerptPostId|WikiPostId|
+---+----------+-------+-------------+----------+
|  1|      .net| 326206|      3624959|   3607476|
|  2|      html|1156034|      3673183|   3673182|
|  3|javascript|2453736|      3624960|   3607052|
|  4|       css| 779112|      3644670|   3644669|
|  5|       php|1451338|      3624936|   3607050|
|  8|         c| 390095|      3624961|   3607013|
|  9|        c#|1571218|      3624962|   3607007|
| 10|       c++| 783573|      3624963|   3606997|
| 12|      ruby| 226594|      3624964|   3607043|
| 14|      lisp|   6834|      3656743|   3656742|
| 16|    python|2071403|      3624965|   3607014|
| 17|      java|1878099|      3624966|   3607018|
| 18|     regex| 254923|      3624967|   3607017|
| 19|       xml| 211315|      3624968|   3607588|
| 21|     mysql| 655007|      3624969|   3607033|
| 22|       sql| 650214|      3625226|   3607304|
| 23|      tsql|  71604|      4777787|   4777786|


                                                                                

In [10]:
# Cast the numeric columns from string to integer, one column at a time.
for int_column in ('Id', 'Count', 'ExcerptPostId', 'WikiPostId'):
    df = df.withColumn(int_column, F.col(int_column).cast('int'))

df.count()

                                                                                

64155

In [11]:
# Print the schema again to check the column types after the casts.
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- TagName: string (nullable = true)
 |-- Count: integer (nullable = true)
 |-- ExcerptPostId: integer (nullable = true)
 |-- WikiPostId: integer (nullable = true)



In [12]:
# Preview the first rows of the DataFrame after the casts.
df.show()

[Stage 5:>                                                          (0 + 1) / 1]

+---+----------+-------+-------------+----------+
| Id|   TagName|  Count|ExcerptPostId|WikiPostId|
+---+----------+-------+-------------+----------+
|  1|      .net| 326206|      3624959|   3607476|
|  2|      html|1156034|      3673183|   3673182|
|  3|javascript|2453736|      3624960|   3607052|
|  4|       css| 779112|      3644670|   3644669|
|  5|       php|1451338|      3624936|   3607050|
|  8|         c| 390095|      3624961|   3607013|
|  9|        c#|1571218|      3624962|   3607007|
| 10|       c++| 783573|      3624963|   3606997|
| 12|      ruby| 226594|      3624964|   3607043|
| 14|      lisp|   6834|      3656743|   3656742|
| 16|    python|2071403|      3624965|   3607014|
| 17|      java|1878099|      3624966|   3607018|
| 18|     regex| 254923|      3624967|   3607017|
| 19|       xml| 211315|      3624968|   3607588|
| 21|     mysql| 655007|      3624969|   3607033|
| 22|       sql| 650214|      3625226|   3607304|
| 23|      tsql|  71604|      4777787|   4777786|


                                                                                

In [13]:
# Show the rows with the largest Count values first.
df.sort(F.col('Count').desc()).show()

[Stage 6:>                                                          (0 + 2) / 2]

+-----+----------+-------+-------------+----------+
|   Id|   TagName|  Count|ExcerptPostId|WikiPostId|
+-----+----------+-------+-------------+----------+
|    3|javascript|2453736|      3624960|   3607052|
|   16|    python|2071403|      3624965|   3607014|
|   17|      java|1878099|      3624966|   3607018|
|    9|        c#|1571218|      3624962|   3607007|
|    5|       php|1451338|      3624936|   3607050|
| 1386|   android|1393144|      3625001|   3607484|
|    2|      html|1156034|      3673183|   3673182|
|  820|    jquery|1031377|      3625262|   3607053|
|   10|       c++| 783573|      3624963|   3606997|
|    4|       css| 779112|      3644670|   3644669|
|58338|       ios| 677317|      4536664|   4536663|
|   21|     mysql| 655007|      3624969|   3607033|
|   22|       sql| 650214|      3625226|   3607304|
| 4452|         r| 473435|      3625322|   3607736|
|46426|   node.js| 450209|      4238969|   4238968|
|92497|   reactjs| 430730|     16880335|  16880334|
|  114|    a



In [14]:
# Output location for the processed Tags dataset
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed'
output_folder_name = f"{output_bucket}/Tags-parquet"

# Save the DataFrame as Parquet, replacing any existing output at this path.
# The 'header' option was removed: it is a CSV option and has no effect on
# Parquet, which stores the schema inside the files.
df.write \
  .format('parquet') \
  .mode('overwrite') \
  .save(output_folder_name)

                                                                                

## Verifying the data by reading from S3

In [15]:
# Read the Parquet output back from S3. Column names and types come from the
# Parquet files themselves, so the CSV-only 'header' and 'inferSchema'
# options were removed.
df = spark.read.parquet(output_folder_name)

In [16]:
# Preview the first 100 rows of the DataFrame read back from Parquet.
df.show(100)

[Stage 9:>                                                          (0 + 1) / 1]                                                                                

+-----+--------------------+-----+-------------+----------+
|   Id|             TagName|Count|ExcerptPostId|WikiPostId|
+-----+--------------------+-----+-------------+----------+
|97196|       chef-template|   21|         null|      null|
|97197|           openbadge|   15|     19533613|  19533612|
|97199|                rhom|    2|         null|      null|
|97200|      kendo-dropdown|  353|     23653918|  23653917|
|97202|     easy-thumbnails|   58|     19540558|  19540557|
|97203|           todataurl|  202|     35613258|  35613257|
|97208|             go-flag|   19|     45148577|  45148576|
|97211|          stripe.net|  133|     19545780|  19545779|
|97212|              midori|   26|     19545676|  19545675|
|97213|               bento|   14|         null|      null|
|97215|              fedext|   38|     19547016|  19547015|
|97218|        browser-link|   84|     19552377|  19552376|
|97222|       atomicinteger|  116|     67507147|  67507146|
|97223| revealing-prototype|   11|      

In [17]:
# Print the schema of the DataFrame read back from Parquet.
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- TagName: string (nullable = true)
 |-- Count: integer (nullable = true)
 |-- ExcerptPostId: integer (nullable = true)
 |-- WikiPostId: integer (nullable = true)



In [18]:
# Show the rows with the largest Count values first.
df.sort(F.col('Count').desc()).show()

                                                                                

+-----+----------+-------+-------------+----------+
|   Id|   TagName|  Count|ExcerptPostId|WikiPostId|
+-----+----------+-------+-------------+----------+
|    3|javascript|2453736|      3624960|   3607052|
|   16|    python|2071403|      3624965|   3607014|
|   17|      java|1878099|      3624966|   3607018|
|    9|        c#|1571218|      3624962|   3607007|
|    5|       php|1451338|      3624936|   3607050|
| 1386|   android|1393144|      3625001|   3607484|
|    2|      html|1156034|      3673183|   3673182|
|  820|    jquery|1031377|      3625262|   3607053|
|   10|       c++| 783573|      3624963|   3606997|
|    4|       css| 779112|      3644670|   3644669|
|58338|       ios| 677317|      4536664|   4536663|
|   21|     mysql| 655007|      3624969|   3607033|
|   22|       sql| 650214|      3625226|   3607304|
| 4452|         r| 473435|      3625322|   3607736|
|46426|   node.js| 450209|      4238969|   4238968|
|92497|   reactjs| 430730|     16880335|  16880334|
|  114|    a

In [20]:
# Keyword spellings to look for among tag names.
list_of_keywords = ['aws', 'AWS', 'amazon', 'Amazon', 'amz']


In [21]:
# Keep tags whose name starts with 'aws' or is exactly equal to one of the
# keywords (isin is an exact match, not a prefix or substring test).
# Parentheses replace the backslash continuations; the original statement
# ended with a dangling "\" that continued onto a blank line.
df_tags_with_amazon = (
    df.select(F.col("Id"), F.col("TagName"), F.col("Count"))
      .filter(F.col("TagName").startswith('aws') | F.col("TagName").isin(list_of_keywords))
)

In [22]:
# Show up to 230 of the selected tags, largest Count first.
df_tags_with_amazon.orderBy(F.col("Count").desc()).show(230)



+------+--------------------+-----+
|    Id|             TagName|Count|
+------+--------------------+-----+
|108737|          aws-lambda|28291|
|113256|     aws-api-gateway| 6989|
|107211|             aws-sdk| 5948|
| 99166|             aws-cli| 4076|
|  1688|              amazon| 4054|
|130293|         aws-amplify| 3866|
|128181|            aws-glue| 3554|
|133995|             aws-cdk| 2804|
|130125|         aws-appsync| 1674|
|130040|         aws-fargate| 1499|
|114449|    aws-codepipeline| 1400|
|123484|       aws-codebuild| 1342|
|123710|  aws-step-functions| 1261|
|115200|             aws-iot| 1167|
|132737|      aws-serverless| 1108|
|108181|     aws-code-deploy| 1028|
|133429|             aws-sam|  856|
|135716|aws-application-l...|  803|
|115018|  aws-security-group|  765|
|132552| aws-secrets-manager|  731|
|132248|   aws-load-balancer|  716|
|125258|          aws-sdk-js|  609|
| 92026|        aws-opsworks|  543|
|112389|        aws-java-sdk|  528|
|114185|      aws-codecommit

                                                                                