In [1]:
# Echo the `spark` object. No cell shown here creates it, so it is presumably
# the SparkSession pre-defined by the notebook kernel — TODO confirm.
spark

## Processing `Tags.xml`

In [1]:
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, LongType
from datetime import datetime

In [2]:
# S3 prefix that holds the raw input files.
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw/2023'
# NOTE(review): despite its name, `dataset_comments` is the path of Tags.xml.
# The name is left unchanged because the cell that loads the RDD reads it.
dataset_comments = "/".join((dataset_bucket, "Tags.xml"))

In [3]:
# Load the file as an RDD of text lines (one element per line of Tags.xml).
rdd = spark.sparkContext.textFile(dataset_comments)

In [4]:
def row_parser(row):
    """Parse the attribute text of one ``<row .../>`` element into a tuple.

    Parameters
    ----------
    row : str
        Attribute section of a row with the surrounding ``<row`` / ``/>``
        markup already removed, e.g.
        ``Id="1" TagName=".net" Count="329455"``.

    Returns
    -------
    tuple
        Always exactly five items, ordered as
        ``(Id, TagName, Count, ExcerptPostId, WikiPostId)``. Each item is the
        attribute's string value, or ``None`` when the row does not carry
        that attribute.

    Raises
    ------
    IndexError
        If the text contains an attribute name without a matching quoted
        value (odd number of pieces after splitting on ``"``).
    """
    fields = [
        "Id=",
        "TagName=",
        "Count=",
        "ExcerptPostId=",
        "WikiPostId=",
    ]

    row_field = dict.fromkeys(fields, None)

    # Splitting on '"' yields alternating pieces: 'Id=', '1', ' TagName=',
    # '.net', ..., ''. The final piece (text after the last quote) is dropped.
    row_list = [piece.strip() for piece in row.split('"')[:-1]]

    for i in range(0, len(row_list), 2):
        key = row_list[i]
        value = row_list[i + 1]
        # Ignore attributes that are not declared in `fields`. Storing them
        # would add a sixth dict entry and make the tuple longer than the
        # five-column schema it is later loaded into.
        if key in row_field:
            row_field[key] = value

    # Build the result from `fields` so the order and length are fixed.
    return tuple(row_field[name] for name in fields)

In [5]:
# Turn each <row .../> line into a tuple of attribute values.
parsed_rdd = (
    rdd
    .map(lambda line: line.strip())
    # Only the data rows are kept; every other line of the file is skipped.
    .filter(lambda line: line.startswith("<row"))
    # Cut the first 4 characters ("<row") and the last 3 characters
    # (presumably the closing " />" — confirm), then trim what is left.
    .map(lambda line: line[4:-3].strip())
    .map(row_parser)
)

parsed_rdd.count()

                                                                                

64465

In [6]:
# All five columns are declared as strings here because row_parser returns
# the attribute values as text (or None).
schema_tags = StructType([
    StructField(column_name, StringType())
    for column_name in ("Id", "TagName", "Count", "ExcerptPostId", "WikiPostId")
])

# Build the DataFrame from the parsed tuples using that schema.
df = parsed_rdd.toDF(schema=schema_tags)

In [7]:
# Print the column names and types of the newly created DataFrame.
df.printSchema()

root
 |-- Id: string (nullable = true)
 |-- TagName: string (nullable = true)
 |-- Count: string (nullable = true)
 |-- ExcerptPostId: string (nullable = true)
 |-- WikiPostId: string (nullable = true)



In [8]:
# Preview the first rows of the DataFrame.
df.show()

[Stage 1:>                                                          (0 + 1) / 1]

+---+----------+-------+-------------+----------+
| Id|   TagName|  Count|ExcerptPostId|WikiPostId|
+---+----------+-------+-------------+----------+
|  1|      .net| 329455|      3624959|   3607476|
|  2|      html|1167742|      3673183|   3673182|
|  3|javascript|2479947|      3624960|   3607052|
|  4|       css| 787138|      3644670|   3644669|
|  5|       php|1456271|      3624936|   3607050|
|  8|         c| 393884|      3624961|   3607013|
|  9|        c#|1583879|      3624962|   3607007|
| 10|       c++| 789699|      3624963|   3606997|
| 12|      ruby| 227478|      3624964|   3607043|
| 14|      lisp|   6865|      3656743|   3656742|
| 16|    python|2113196|      3624965|   3607014|
| 17|      java|1889767|      3624966|   3607018|
| 18|     regex| 256791|      3624967|   3607017|
| 19|       xml| 212440|      3624968|   3607588|
| 21|     mysql| 658506|      3624969|   3607033|
| 22|       sql| 656848|      3625226|   3607304|
| 23|      tsql|  72061|      4777787|   4777786|


                                                                                

In [9]:
# Re-type the numeric columns as integers; TagName is passed through as-is.
# Column order is kept the same as before.
df = df.select(
    F.col('Id').cast('int').alias('Id'),
    F.col('TagName'),
    F.col('Count').cast('int').alias('Count'),
    F.col('ExcerptPostId').cast('int').alias('ExcerptPostId'),
    F.col('WikiPostId').cast('int').alias('WikiPostId'),
)

df.count()

                                                                                

64465

In [11]:
# Print the schema again to check the column types after the integer casts.
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- TagName: string (nullable = true)
 |-- Count: integer (nullable = true)
 |-- ExcerptPostId: integer (nullable = true)
 |-- WikiPostId: integer (nullable = true)



In [10]:
# Preview the first rows after the integer casts.
df.show()

[Stage 5:>                                                          (0 + 1) / 1]

+---+----------+-------+-------------+----------+
| Id|   TagName|  Count|ExcerptPostId|WikiPostId|
+---+----------+-------+-------------+----------+
|  1|      .net| 329455|      3624959|   3607476|
|  2|      html|1167742|      3673183|   3673182|
|  3|javascript|2479947|      3624960|   3607052|
|  4|       css| 787138|      3644670|   3644669|
|  5|       php|1456271|      3624936|   3607050|
|  8|         c| 393884|      3624961|   3607013|
|  9|        c#|1583879|      3624962|   3607007|
| 10|       c++| 789699|      3624963|   3606997|
| 12|      ruby| 227478|      3624964|   3607043|
| 14|      lisp|   6865|      3656743|   3656742|
| 16|    python|2113196|      3624965|   3607014|
| 17|      java|1889767|      3624966|   3607018|
| 18|     regex| 256791|      3624967|   3607017|
| 19|       xml| 212440|      3624968|   3607588|
| 21|     mysql| 658506|      3624969|   3607033|
| 22|       sql| 656848|      3625226|   3607304|
| 23|      tsql|  72061|      4777787|   4777786|


                                                                                

In [11]:
# Show the rows with the largest Count first.
df.orderBy(F.desc('Count')).show()



+-----+----------+-------+-------------+----------+
|   Id|   TagName|  Count|ExcerptPostId|WikiPostId|
+-----+----------+-------+-------------+----------+
|    3|javascript|2479947|      3624960|   3607052|
|   16|    python|2113196|      3624965|   3607014|
|   17|      java|1889767|      3624966|   3607018|
|    9|        c#|1583879|      3624962|   3607007|
|    5|       php|1456271|      3624936|   3607050|
| 1386|   android|1400026|      3625001|   3607484|
|    2|      html|1167742|      3673183|   3673182|
|  820|    jquery|1033113|      3625262|   3607053|
|   10|       c++| 789699|      3624963|   3606997|
|    4|       css| 787138|      3644670|   3644669|
|58338|       ios| 679677|      4536664|   4536663|
|   21|     mysql| 658506|      3624969|   3607033|
|   22|       sql| 656848|      3625226|   3607304|
| 4452|         r| 483254|      3625322|   3607736|
|46426|   node.js| 457999|      4238969|   4238968|
|92497|   reactjs| 446452|     16880335|  16880334|
|  114|    a

                                                                                

In [13]:
# Number of rows in the DataFrame.
df.count()

                                                                                

64465

In [14]:
# Output location for the processed tags.
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed/2023'
output_folder_name = f"{output_bucket}/Tags-parquet"

# Save the DataFrame as Parquet, replacing anything already at that path.
# The CSV-only 'header' option is not set: it does not apply to Parquet,
# which stores column names and types in the files themselves.
df.write \
  .format('parquet') \
  .mode('overwrite') \
  .save(output_folder_name)

                                                                                

## Verifying the data by reading from S3

In [15]:
# Read the Parquet output back for verification. Parquet files carry their
# own schema, so the CSV-only 'header' and 'inferSchema' options are not used.
# Note: this rebinds `df` to the data loaded from S3.
df = spark.read.parquet(output_folder_name)

[Stage 12:>                                                         (0 + 1) / 1]                                                                                

In [16]:
# Preview up to 100 rows of the data read back from S3.
df.show(100)

[Stage 13:>                                                         (0 + 1) / 1]

+-----+--------------------+-----+-------------+----------+
|   Id|             TagName|Count|ExcerptPostId|WikiPostId|
+-----+--------------------+-----+-------------+----------+
|97665|                sidr|   52|     19815160|  19815159|
|97667|              deedle|  271|     19796305|  19796304|
|97671|  nsincrementalstore|   16|         null|      null|
|97673|       compound-type|    6|         null|      null|
|97679|        managed-file|    7|         null|      null|
|97681|            pushapps|   12|     19948129|  19948128|
|97682|    multichoiceitems|   70|     23291906|  23291905|
|97683|           sttwitter|   60|     19821491|  19821490|
|97684|            weighttp|    2|     47309724|  47309723|
|97686|    samsung-touchwiz|   36|     19947960|  19947959|
|97689|                juzu|    2|     19810458|  19810457|
|97691|         ldap-client|   29|     19810586|  19810585|
|97693|         semantic-ui| 2734|     20095688|  20095687|
|97694|   uikit-transitions|   18|      

                                                                                

In [17]:
# Print the schema of the data read back from Parquet.
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- TagName: string (nullable = true)
 |-- Count: integer (nullable = true)
 |-- ExcerptPostId: integer (nullable = true)
 |-- WikiPostId: integer (nullable = true)



In [18]:
# Show the reloaded rows with the largest Count first.
df.orderBy(F.desc('Count')).show()



+-----+----------+-------+-------------+----------+
|   Id|   TagName|  Count|ExcerptPostId|WikiPostId|
+-----+----------+-------+-------------+----------+
|    3|javascript|2479947|      3624960|   3607052|
|   16|    python|2113196|      3624965|   3607014|
|   17|      java|1889767|      3624966|   3607018|
|    9|        c#|1583879|      3624962|   3607007|
|    5|       php|1456271|      3624936|   3607050|
| 1386|   android|1400026|      3625001|   3607484|
|    2|      html|1167742|      3673183|   3673182|
|  820|    jquery|1033113|      3625262|   3607053|
|   10|       c++| 789699|      3624963|   3606997|
|    4|       css| 787138|      3644670|   3644669|
|58338|       ios| 679677|      4536664|   4536663|
|   21|     mysql| 658506|      3624969|   3607033|
|   22|       sql| 656848|      3625226|   3607304|
| 4452|         r| 483254|      3625322|   3607736|
|46426|   node.js| 457999|      4238969|   4238968|
|92497|   reactjs| 446452|     16880335|  16880334|
|  114|    a

                                                                                

In [20]:
# Tag names that the filter cell matches exactly via isin().
# NOTE(review): the match is case-sensitive and the tag names visible in the
# outputs are all lowercase, so 'AWS' / 'Amazon' may never match — confirm.
list_of_keywords = ['aws', 'AWS', 'amazon', 'Amazon', 'amz']


In [21]:
# Keep Id / TagName / Count for tags whose name starts with 'aws' or is
# exactly equal to one of the entries in list_of_keywords.
# NOTE(review): isin() is an exact match, so a tag name that merely begins
# with 'amazon' is not selected by this filter — confirm that is intended.
# The chain is parenthesised so no trailing line-continuation is left dangling.
df_tags_with_amazon = (
    df.select(F.col("Id"), F.col("TagName"), F.col("Count"))
      .filter(F.col("TagName").startswith('aws') | F.col("TagName").isin(list_of_keywords))
)

In [22]:
# Show up to 230 of the selected tags, largest Count first.
df_tags_with_amazon.orderBy(F.desc(F.col("Count"))).show(230)



+------+--------------------+-----+
|    Id|             TagName|Count|
+------+--------------------+-----+
|108737|          aws-lambda|28291|
|113256|     aws-api-gateway| 6989|
|107211|             aws-sdk| 5948|
| 99166|             aws-cli| 4076|
|  1688|              amazon| 4054|
|130293|         aws-amplify| 3866|
|128181|            aws-glue| 3554|
|133995|             aws-cdk| 2804|
|130125|         aws-appsync| 1674|
|130040|         aws-fargate| 1499|
|114449|    aws-codepipeline| 1400|
|123484|       aws-codebuild| 1342|
|123710|  aws-step-functions| 1261|
|115200|             aws-iot| 1167|
|132737|      aws-serverless| 1108|
|108181|     aws-code-deploy| 1028|
|133429|             aws-sam|  856|
|135716|aws-application-l...|  803|
|115018|  aws-security-group|  765|
|132552| aws-secrets-manager|  731|
|132248|   aws-load-balancer|  716|
|125258|          aws-sdk-js|  609|
| 92026|        aws-opsworks|  543|
|112389|        aws-java-sdk|  528|
|114185|      aws-codecommit

                                                                                