In [1]:
spark

In [7]:
import pyspark.sql.functions as F
import pyspark.sql.window as W

# Loading the Data 

In [8]:
# Dataset path 
dataset_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed/2023'

# All folders 
votes_dataset_file = f"{dataset_bucket}/Votes-parquet"
badges_dataset_file = f"{dataset_bucket}/Badges-parquet"
comments_dataset_file = f"{dataset_bucket}/Comments-parquet"
post_history_dataset_file = f"{dataset_bucket}/PostHistory-parquet"
post_dataset_file = f"{dataset_bucket}/Post-parquet"
tags_dataset_file = f"{dataset_bucket}/Tags-parquet"
users_dataset_file = f"{dataset_bucket}/Users-parquet"


In [9]:
# Votes
df_votes = spark.read \
         .option("header", True) \
         .option("inferSchema", True) \
         .parquet(votes_dataset_file)

# Badges
df_badges = spark.read \
         .option("header", True) \
         .option("inferSchema", True) \
         .parquet(badges_dataset_file)

# Comments
df_comments = spark.read \
         .option("header", True) \
         .option("inferSchema", True) \
         .parquet(comments_dataset_file)

# PostHistory 
df_post_history = spark.read \
         .option("header", True) \
         .option("inferSchema", True) \
         .parquet(post_history_dataset_file)

# Post
df_post = spark.read \
         .option("header", True) \
         .option("inferSchema", True) \
         .parquet(post_dataset_file)

# Tags
df_tags = spark.read \
         .option("header", True) \
         .option("inferSchema", True) \
         .parquet(tags_dataset_file)

# Users
df_users = spark.read \
         .option("header", True) \
         .option("inferSchema", True) \
         .parquet(users_dataset_file)

                                                                                

# Validating the `loaded` Data

## 1. `Votes` Data 

In [10]:
print(f"No. of Records in the Votes table : {df_votes.count()}")



No. of Records in the Votes table : 228077281


                                                                                

In [11]:
df_votes.show(100)

[Stage 17:>                                                         (0 + 1) / 1]

+--------+--------+----------+------+-------+-----+-------------------+-------------+----------+------------+
|      Id|  PostId|VoteTypeId|UserId|TagName|Count|       CreationDate|ExcerptPostId|WikiPostId|BountyAmount|
+--------+--------+----------+------+-------+-----+-------------------+-------------+----------+------------+
|70827674|24502486|         2|  null|   null| null|2014-07-01 00:00:00|         null|      null|        null|
|70827675| 7197327|         2|  null|   null| null|2014-07-01 00:00:00|         null|      null|        null|
|70827676| 5367068|         2|  null|   null| null|2014-07-01 00:00:00|         null|      null|        null|
|70827677|24340423|         2|  null|   null| null|2014-07-01 00:00:00|         null|      null|        null|
|70827678|24489826|         1|  null|   null| null|2014-07-01 00:00:00|         null|      null|        null|
|70827679|24489826|         2|  null|   null| null|2014-07-01 00:00:00|         null|      null|        null|
|70827680|

                                                                                

## 2. `Badges` Data

In [12]:
print(f"No. of Records in the Badges table : {df_badges.count()}")



No. of Records in the Badges table : 48022288


                                                                                

In [13]:
df_badges.show(100)

[Stage 21:>                                                         (0 + 1) / 1]

+-----+------+-------+--------------------+-----+--------+
|   Id|UserId|   Name|                Date|Class|TagBased|
+-----+------+-------+--------------------+-----+--------+
|82946|  3718|Teacher|2008-09-15 08:55:...|    3|   false|
|82947|   994|Teacher|2008-09-15 08:55:...|    3|   false|
|82949|  3893|Teacher|2008-09-15 08:55:...|    3|   false|
|82950|  4591|Teacher|2008-09-15 08:55:...|    3|   false|
|82951|  5196|Teacher|2008-09-15 08:55:...|    3|   false|
|82952|  2635|Teacher|2008-09-15 08:55:...|    3|   false|
|82953|  1113|Teacher|2008-09-15 08:55:...|    3|   false|
|82954|  4182|Teacher|2008-09-15 08:55:...|    3|   false|
|82955|   164|Teacher|2008-09-15 08:55:...|    3|   false|
|82956|   652|Teacher|2008-09-15 08:55:...|    3|   false|
|82957|  5246|Teacher|2008-09-15 08:55:...|    3|   false|
|82958|   509|Teacher|2008-09-15 08:55:...|    3|   false|
|82959|   670|Teacher|2008-09-15 08:55:...|    3|   false|
|82960|  5024|Teacher|2008-09-15 08:55:...|    3|   fals

                                                                                

## 3. `Comments` Data

In [14]:
print(f"No. of Records in the Comments table : {df_comments.count()}")



No. of Records in the Comments table : 88222951


                                                                                

In [15]:
df_comments.show(100)

[Stage 25:>                                                         (0 + 1) / 1]

+-------+-------+-----+--------------------+--------------------+---------------+------+--------------+
|     Id| PostId|Score|                Text|        CreationDate|UserDisplayName|UserId|ContentLicense|
+-------+-------+-----+--------------------+--------------------+---------------+------+--------------+
|2687987|2669420|    0|Ok, I'm coming fr...|2010-04-19 17:53:...|           null|267075|  CC BY-SA 2.5|
|2687989|2669610|    0|Could you define ...|2010-04-19 17:53:...|           null| 10174|  CC BY-SA 2.5|
|2687990|2669627|    0|Please, indicate ...|2010-04-19 17:53:...|           null|308903|  CC BY-SA 2.5|
|2687994|2659117|    0|thanks, the wikip...|2010-04-19 17:54:...|           null|297353|  CC BY-SA 2.5|
|2687996|2669629|    0|Which part doesn'...|2010-04-19 17:54:...|           null|187606|  CC BY-SA 2.5|
|2687998|2669207|    0|It should, yes. W...|2010-04-19 17:54:...|           null| 29676|  CC BY-SA 2.5|
|2688002| 696453|    0|This answer works...|2010-04-19 17:54:...

                                                                                

## 4. `Post History` Data

In [16]:
print(f"No. of Records in the PostHistory table : {df_post_history.count()}")



No. of Records in the PostHistory table : 156055380


                                                                                

In [17]:
df_post_history.show(100)

[Stage 29:>                                                         (0 + 1) / 1]

+---------+-----------------+--------+--------------------+--------------------+--------+---------------+--------------------+--------------------+--------------+
|       Id|PostHistoryTypeId|  PostId|        RevisionGUID|        CreationDate|  UserId|UserDisplayName|             Comment|                Text|ContentLicense|
+---------+-----------------+--------+--------------------+--------------------+--------+---------------+--------------------+--------------------+--------------+
|289111469|                3|75632632|5bae3186-9bde-491...|2023-03-03 23:17:...|10876372|           null|                null|&lt;r&gt;&lt;dply...|  CC BY-SA 4.0|
|289111472|                5|75632589|5ccd2b80-c2e7-41f...|2023-03-03 23:17:...|  523612|           null|add more interest...|This is a curious...|  CC BY-SA 4.0|
|289111474|                5|75553692|90062ba6-44c7-463...|2023-03-03 23:18:...|  472495|           null|           Trim chat|![Picture of Arra...|  CC BY-SA 4.0|
|289111475|           

                                                                                

## 5. `Tags` Data 

In [18]:
print(f"No. of Records in the Tags table : {df_tags.count()}")

No. of Records in the Tags table : 64465


In [19]:
df_tags.show(100)

[Stage 33:>                                                         (0 + 1) / 1]

+-----+--------------------+-----+-------------+----------+
|   Id|             TagName|Count|ExcerptPostId|WikiPostId|
+-----+--------------------+-----+-------------+----------+
|97665|                sidr|   52|     19815160|  19815159|
|97667|              deedle|  271|     19796305|  19796304|
|97671|  nsincrementalstore|   16|         null|      null|
|97673|       compound-type|    6|         null|      null|
|97679|        managed-file|    7|         null|      null|
|97681|            pushapps|   12|     19948129|  19948128|
|97682|    multichoiceitems|   70|     23291906|  23291905|
|97683|           sttwitter|   60|     19821491|  19821490|
|97684|            weighttp|    2|     47309724|  47309723|
|97686|    samsung-touchwiz|   36|     19947960|  19947959|
|97689|                juzu|    2|     19810458|  19810457|
|97691|         ldap-client|   29|     19810586|  19810585|
|97693|         semantic-ui| 2734|     20095688|  20095687|
|97694|   uikit-transitions|   18|      

                                                                                

## 6. `Users` Data

In [20]:
print(f"No. of Records in the Users table : {df_users.count()}")



No. of Records in the Users table : 19942787


                                                                                

In [21]:
df_users.show(100)

[Stage 37:>                                                         (0 + 1) / 1]

+--------+----------+--------------------+-------------------------------------+--------------------+--------------------+-------------+----------------------------+-----+-------+---------+---------------+---------+
|      Id|Reputation|        CreationDate|                          DisplayName|      LastAccessDate|          WebsiteUrl|     Location|                     AboutMe|Views|UpVotes|DownVotes|ProfileImageUrl|AccountId|
+--------+----------+--------------------+-------------------------------------+--------------------+--------------------+-------------+----------------------------+-----+-------+---------+---------------+---------+
|18872189|         1|2022-04-20 01:35:...|                                 Mark|2022-04-20 01:35:...|                null|         null|                        null|    0|      0|        0|           null| 25012836|
|18872190|         1|2022-04-20 01:35:...|                             MSFTMish|2023-02-08 19:06:...|                null|         null|

                                                                                

## 7. `Post` Data

In [22]:
print(f"No. of Records in the Posts table : {df_post.count()}")



No. of Records in the Posts table : 58329356


                                                                                

In [23]:
df_post.show(100)

[Stage 41:>                                                         (0 + 1) / 1]

+------+----------+--------+----------------+--------------------+------------+-----+---------+--------------------+-----------+----------------+----------------+---------------------+--------------------+--------------------+--------------------+--------------------+-----------+------------+-------------+----------+------------------+--------------+
|    Id|PostTypeId|ParentId|AcceptedAnswerId|        CreationDate|DeletionDate|Score|ViewCount|                Body|OwnerUserId|OwnerDisplayName|LastEditorUserId|LastEditorDisplayName|        LastEditDate|    LastActivityDate|               Title|                Tags|AnswerCount|CommentCount|FavoriteCount|ClosedDate|CommunityOwnedDate|ContentLicense|
+------+----------+--------+----------------+--------------------+------------+-----+---------+--------------------+-----------+----------------+----------------+---------------------+--------------------+--------------------+--------------------+--------------------+-----------+------------+-

                                                                                

# Data Analysis 

### Q1: No. of Posts which has `AWS or Amazon` in the `tag`

In [24]:
df_tags.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- TagName: string (nullable = true)
 |-- Count: integer (nullable = true)
 |-- ExcerptPostId: integer (nullable = true)
 |-- WikiPostId: integer (nullable = true)



In [25]:
list_of_keywords = ['AWS', 'amazon', 'Amazon', 'amz']

df_tags_with_amazon = df_tags.select(F.col("Id"), F.col("TagName"), F.col("Count"), F.col("ExcerptPostId"), F.col("WikiPostId")) \
                       .filter(F.col("TagName").startswith('aws') |
                               F.col("TagName").startswith('amazon') |
                               F.col("TagName").isin(list_of_keywords)) \
                       .withColumnRenamed('Count', "No_of_Posts")

In [26]:
n = df_tags_with_amazon.count()

                                                                                

In [27]:
n

394

In [28]:
window = W.Window.orderBy(F.desc(F.col('No_of_Posts')))
df_tags_with_amazon_with_rank = df_tags_with_amazon.withColumn("Rank_Number", F.row_number().over(window))

In [29]:
df_tags_with_amazon_with_rank.show(n, truncate=False)

23/03/29 15:29:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/29 15:29:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/29 15:29:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/29 15:29:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/29 15:29:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 45:>                                                         (0 + 2) / 2]

+------+----------------------------------+-----------+-------------+----------+-----------+
|Id    |TagName                           |No_of_Posts|ExcerptPostId|WikiPostId|Rank_Number|
+------+----------------------------------+-----------+-------------+----------+-----------+
|33388 |amazon-web-services               |148966     |5063862      |5063861   |1          |
|11444 |amazon-s3                         |48707      |4994052      |4994051   |2          |
|12375 |amazon-ec2                        |35842      |5123215      |5123214   |3          |
|108737|aws-lambda                        |29194      |27496557     |27496556  |4          |
|76578 |amazon-dynamodb                   |13332      |8940831      |8940830   |5          |
|105747|amazon-elastic-beanstalk          |9083       |24799042     |24799041  |6          |
|89537 |amazon-redshift                   |8251       |15304712     |15304711  |7          |
|76486 |aws-cloudformation                |7885       |8940855      |8

                                                                                

In [172]:

# Dataset path 
output_bucket = 's3://stackoverflow-dataset-2023/dataset/query_results/'
output_folder_name = f"{output_bucket}/01_Most_Popular_Service_tagCount"

df_tags_with_amazon_with_rank.repartition(1) \
                            .write \
                            .format('csv') \
                            .option('header', True) \
                            .mode('overwrite') \
                            .save(output_folder_name)

23/03/13 19:27:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/13 19:27:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/13 19:27:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/13 19:27:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/13 19:27:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/13 19:27:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
          

In [30]:
df_tags_with_amazon_with_rank.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- TagName: string (nullable = true)
 |-- No_of_Posts: integer (nullable = true)
 |-- ExcerptPostId: integer (nullable = true)
 |-- WikiPostId: integer (nullable = true)
 |-- Rank_Number: integer (nullable = false)



In [31]:
# For reporting 

df_tags_with_amazon_with_rank.select(F.col('Id'), F.col('TagName'), F.col('No_of_Posts'), F.col('Rank_Number')) \
                             .show(n, truncate=False)

23/03/29 15:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/29 15:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/29 15:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/29 15:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/03/29 15:32:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 47:>                                                         (0 + 2) / 2]

+------+----------------------------------+-----------+-----------+
|Id    |TagName                           |No_of_Posts|Rank_Number|
+------+----------------------------------+-----------+-----------+
|33388 |amazon-web-services               |148966     |1          |
|11444 |amazon-s3                         |48707      |2          |
|12375 |amazon-ec2                        |35842      |3          |
|108737|aws-lambda                        |29194      |4          |
|76578 |amazon-dynamodb                   |13332      |5          |
|105747|amazon-elastic-beanstalk          |9083       |6          |
|89537 |amazon-redshift                   |8251       |7          |
|76486 |aws-cloudformation                |7885       |8          |
|113256|aws-api-gateway                   |7216       |9          |
|105719|amazon-cognito                    |6922       |10         |
|42168 |amazon-rds                        |6523       |11         |
|107211|aws-sdk                           |6060 

                                                                                

### Q2: `Posthistory` and `Tags` 

In [90]:
df_post_history.show(5)

[Stage 135:>                                                        (0 + 1) / 1]

+---------+-----------------+--------+--------------------+--------------------+------+----------------+--------------------+--------------------+--------------+
|       Id|PostHistoryTypeId|  PostId|        RevisionGUID|        CreationDate|UserId| UserDisplayName|             Comment|                Text|ContentLicense|
+---------+-----------------+--------+--------------------+--------------------+------+----------------+--------------------+--------------------+--------------+
|146175008|                5|42248528|ebad5287-e4ae-454...|2017-05-23 10:29:...|  null|URL Rewriter Bot|replaced http://s...|## Background ##&...|  CC BY-SA 3.0|
|146175009|                5| 2176210|0aa181ad-5b55-40d...|2017-05-23 10:29:...|  null|URL Rewriter Bot|replaced http://s...|You are fetching ...|  CC BY-SA 3.0|
|146175010|                5| 1031042|c860038f-254c-483...|2017-05-23 10:29:...|  null|URL Rewriter Bot|replaced http://s...|A bit more specif...|  CC BY-SA 3.0|
|146175011|                5

                                                                                

In [190]:
df_post_history = df_post_history.drop('RevisionGUID', 'UserId', 'UserDisplayName','ContentLicense')

In [178]:
# No. of questions based on PostType

df_post_history.groupBy(F.col('PostHistoryTypeId')).count().orderBy(F.desc("count")).show()



+-----------------+--------+
|PostHistoryTypeId|   count|
+-----------------+--------+
|                2|44652246|
|                5|26071104|
|                1|17433109|
|                3|17416584|
|                6| 3715289|
|               24| 2878366|
|                4| 2688549|
|               10|  793338|
|               13|  371647|
|               12|  366970|
|               33|  180070|
|               34|  178925|
|                8|  115613|
|               16|  110856|
|               50|  107951|
|               11|   50866|
|               19|   34702|
|                7|   25880|
|               36|   23492|
|                9|   21029|
+-----------------+--------+
only showing top 20 rows



                                                                                

In [179]:
df_post_history_filtered = df_post_history.filter(F.col("PostHistoryTypeId") \
                                           .isin([1, 4]))

In [180]:
df_post_history_filtered.show(5)

[Stage 251:>                                                        (0 + 1) / 1]

+---------+-----------------+--------+--------------------+-------+--------------------+
|       Id|PostHistoryTypeId|  PostId|        CreationDate|Comment|                Text|
+---------+-----------------+--------+--------------------+-------+--------------------+
|146175423|                1|44132268|2017-05-23 10:29:...|   null|How to properly a...|
|146177276|                1|44132271|2017-05-23 10:29:...|   null|Unknown pattern c...|
|146181789|                1|44132274|2017-05-23 10:29:...|   null|write data to tex...|
|146184521|                1|44132277|2017-05-23 10:30:...|   null|SQL How to select...|
|146191028|                1|44132280|2017-05-23 10:30:...|   null|NumberFormatExcep...|
+---------+-----------------+--------+--------------------+-------+--------------------+
only showing top 5 rows



                                                                                

In [191]:
df_post_history_and_tags = df_post_history.join(df_tags_with_amazon, 
                                                df_post_history.Id == df_tags_with_amazon.ExcerptPostId) 

In [192]:
df_post_history_and_tags = df_post_history_filtered.join(df_tags_with_amazon.drop('Id'),
                                                         df_tags_with_amazon.ExcerptPostId == df_post_history_filtered.Id)

In [193]:
df_post_history_and_tags = df_tags_with_amazon.join(df_post_history_filtered,
                                                    df_tags_with_amazon.ExcerptPostId == df_post_history_filtered.Id)

In [None]:
df_post_history_filtered

#### Final merged data (`Posthistory` and `Tags`)

In [112]:
df_post_history_and_tags_merged = df_post_history_and_tags_1 \
                                  .union(df_post_history_and_tags_2)\
                                  .orderBy(F.col('PostId'))
                         

In [115]:
df_post_history_and_tags_merged.show()



+--------+-----------------+-------+--------------------+--------------------+-------+---------------+--------------------+--------------------+--------------+-------+------+-------------+----------+
|      Id|PostHistoryTypeId| PostId|        RevisionGUID|        CreationDate| UserId|UserDisplayName|             Comment|                Text|ContentLicense|TagName| Count|ExcerptPostId|WikiPostId|
+--------+-----------------+-------+--------------------+--------------------+-------+---------------+--------------------+--------------------+--------------+-------+------+-------------+----------+
| 7534882|                5|3606997|8ff5db4c-b495-45f...|2010-08-31 07:44:...| 151292|           null|deleted 122 chara...|###New to C++?&#x...|  CC BY-SA 2.5|    c++|783573|      3624963|   3606997|
| 7534871|                5|3606997|de0ff197-3eef-48d...|2010-08-31 07:44:...| 168225|           null|added 109 charact...|##External FAQs&#...|  CC BY-SA 2.5|    c++|783573|      3624963|   3606997|


                                                                                

In [116]:
df_post_history_and_tags_merged.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- PostHistoryTypeId: integer (nullable = true)
 |-- PostId: integer (nullable = true)
 |-- RevisionGUID: string (nullable = true)
 |-- CreationDate: timestamp (nullable = true)
 |-- UserId: integer (nullable = true)
 |-- UserDisplayName: string (nullable = true)
 |-- Comment: string (nullable = true)
 |-- Text: string (nullable = true)
 |-- ContentLicense: string (nullable = true)
 |-- TagName: string (nullable = true)
 |-- Count: integer (nullable = true)
 |-- ExcerptPostId: integer (nullable = true)
 |-- WikiPostId: integer (nullable = true)



In [124]:
# All posts with AWS/Amazon Tag 
list_of_keywords = ['AWS', 'amazon', 'Amazon', 'amz']

# List of cols to be printed
my_cols = ['PostId', F.date_format(F.col('CreationDate'), 'yyyy-MM-dd').alias('Date'), 'Comment', 'TagName', 'Text']

df_post_history_and_tags_merged.select(*my_cols) \
                               .filter(F.col('Comment').isNotNull()) \
                               .filter(F.col("TagName").startswith('aws') |
                                       F.col("TagName").startswith('amazon') |
                                       F.col("TagName").isin(list_of_keywords)) \
                               .orderBy(F.desc(F.col('Date'))) \
                               .show()



+--------+----------+--------------------+-------------------+--------------------+
|  PostId|      Date|             Comment|            TagName|                Text|
+--------+----------+--------------------+-------------------+--------------------+
|56874634|2019-07-03|added 98 characte...|    aws-step-config|`StepConfig` : Sp...|
|56874633|2019-07-03|added 219 charact...|    aws-step-config|                null|
|56874633|2019-07-03|added 219 charact...|    aws-step-config|&#xD;&#xA;&#xD;&#...|
|56874634|2019-07-03|added 98 characte...|    aws-step-config|                null|
|43577346|2019-06-26|added 48 characte...|  amazon-quicksight|Amazon QuickSight...|
|43577346|2019-06-26|remove plagiarise...|  amazon-quicksight|                    |
|43577346|2019-06-26|Rollback to [0736...|  amazon-quicksight|Amazon QuickSight...|
|43577346|2019-06-26|remove plagiarise...|  amazon-quicksight|&lt;!-- previous ...|
|47639438|2019-06-24|Proposed by 73041...|        aws-fargate|              

                                                                                

#### Saving the merged data in S3

In [113]:
# Dataset path 
output_bucket = 's3://stackoverflow-dataset-2023/dataset/raw-processed/merged/'
output_folder_name = f"{output_bucket}/PostHistory-and-Tags-parquet"

In [114]:
df_post_history_and_tags_merged.write \
      .format('parquet') \
      .option('header', True) \
      .mode('overwrite') \
      .save(output_folder_name)

                                                                                

### Q3: `Votes` and  `Posthistory` and `Tags`