In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,substring,desc

In [3]:
spark = SparkSession.builder.master('local').appName('Rank Variance Per Country').getOrCreate()

In [4]:
df_fb_comments = spark.read.options(header='True', InferSchema='True').csv('fb_comments_count.csv')
df_fb_active_users = spark.read.options(header='True', InferSchema='True').csv('fb_active_users.csv')
df_fb_comments.show(10)
df_fb_active_users.show(10)

+-------+----------+------------------+
|user_id|created_at|number_of_comments|
+-------+----------+------------------+
|     18|29-12-2019|                 1|
|     25|21-12-2019|                 1|
|     78|04-01-2020|                 1|
|     37|01-02-2020|                 1|
|     41|23-12-2019|                 1|
|     99|02-02-2020|                 1|
|     21|28-12-2019|                 1|
|     18|31-01-2020|                 1|
|     37|11-02-2020|                 1|
|     58|26-01-2020|                 1|
+-------+----------+------------------+
only showing top 10 rows

+-------+----------------+------+----------+
|user_id|            name|status|   country|
+-------+----------------+------+----------+
|     33|     Amanda Leon|  open| Australia|
|     27| Jessica Farrell|  open|Luxembourg|
|     18|   Wanda Ramirez|  open|       USA|
|     50|   Samuel Miller|closed|    Brazil|
|     16|      Jacob York|  open| Australia|
|     25|Natasha Bradford|closed|       USA|
|     34|

In [46]:
##joining both tables to get country name together
df_fb_comments = df_fb_comments.withColumnRenamed("user_id","userID")
df_merged = df_fb_active_users.join(df_fb_comments, df_fb_active_users.user_id == df_fb_comments.userID, 'left')
df_merged = df_merged.withColumn("time_period", substring('created_at', 1,7))
#df_merged.show(5)

In [47]:
##calculating comment sum per user based on created_at and country
df_comment_sum = df_merged.groupBy('country','time_period').sum('number_of_comments').sort(desc('sum(number_of_comments)'))
#df_comment_sum.show()

In [87]:
##filtering out 2019-12 and 2020-01
df_tp_filtered_dec_2019 = df_comment_sum.filter((col("time_period") == '2019-12'))
df_tp_filtered_jan_2020 = df_comment_sum.filter((col("time_period") == '2020-01'))
df_tp_filtered_dec_2019 = df_tp_filtered_dec_2019.withColumnRenamed("sum(number_of_comments)","total_comments_december")
df_tp_filtered_jan_2020 = df_tp_filtered_jan_2020.withColumnRenamed("sum(number_of_comments)","total_comments_jan")
#df_tp_filtered_dec_2019.show()
#df_tp_filtered_jan_2020.show()

In [94]:
#join above both df 
df_tp_filtered_dec_2019 = df_tp_filtered_dec_2019.withColumnRenamed("time_period","tp")
df_tp_filtered_dec_2019 = df_tp_filtered_dec_2019.withColumnRenamed("country","country_name")
df_comparsion = df_tp_filtered_jan_2020.join(df_tp_filtered_dec_2019, df_tp_filtered_dec_2019.country_name == df_tp_filtered_jan_2020.country, 'left').selectExpr('Country','tp','total_comments_december','time_period','total_comments_jan').sort(desc('total_comments_jan'))
#df_comparsion.show()

In [95]:
##filtering out countries with raised comments
df_comparsion = df_comparsion.na.fill(value=0,subset=["total_comments_december"])
df_comparsion = df_comparsion.na.fill(value=0,subset=["total_comments_jan"])
df_comparsion = df_comparsion.withColumn("status", col('total_comments_jan') - col('total_comments_december'))
df_comparsion = df_comparsion.filter(df_comparsion.status > 0).sort(desc('status')) .show()

+----------+
|   country|
+----------+
|      Mali|
|   Denmark|
|Luxembourg|
| Australia|
+----------+

