In [35]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,desc
from pyspark.sql import functions as F

In [36]:
# Reuse the active Spark session if one exists; otherwise start one named for this analysis.
spark = (
    SparkSession.builder
    .appName('Call Declines')
    .getOrCreate()
)

In [37]:
# Load the call log; the file has a header row and column types are inferred by Spark.
df_call_input = (
    spark.read
    .option('header', 'True')
    .option('InferSchema', 'True')
    .csv('rc_calls.csv')
)

# Load the user table with the same reader settings.
df_status_input = (
    spark.read
    .option('header', 'True')
    .option('InferSchema', 'True')
    .csv('rc_users.csv')
)

# Preview both inputs (show() prints the first 20 rows by default).
for preview_frame in (df_call_input, df_status_input):
    preview_frame.show()

+-------+-------------------+-------+
|user_id|               date|call_id|
+-------+-------------------+-------+
|   1218|2020-04-19 01:06:46|      0|
|   1554|2020-03-01 16:51:01|      1|
|   1857|2020-03-29 07:06:13|      2|
|   1525|2020-03-07 02:01:12|      3|
|   1271|2020-04-28 21:39:12|      4|
|   1181|2020-03-18 04:49:36|      5|
|   1950|2020-04-12 23:57:03|      6|
|   1339|2020-04-11 02:15:43|      7|
|   1910|2020-03-21 08:56:38|      8|
|   1093|2020-03-07 15:47:46|      9|
|   1859|2020-04-25 13:55:44|     10|
|   1079|2020-04-17 16:38:00|     11|
|   1519|2020-04-15 12:14:22|     12|
|   1854|2020-04-25 19:59:22|     13|
|   1968|2020-03-16 21:19:39|     14|
|   1891|2020-03-30 23:11:06|     15|
|   1575|2020-03-14 15:21:45|     16|
|   1162|2020-04-06 18:39:32|     17|
|   1503|2020-04-01 18:31:36|     18|
|   1884|2020-04-08 08:44:19|     19|
+-------+-------------------+-------+
only showing top 20 rows

+-------+--------+----------+
|user_id|  status|company_id|
+-

In [38]:
# Keep only the calls dated within March 2020 (both bounds inclusive), ordered by date.
# NOTE(review): the earlier comment said "March to April", but the bounds below cover
# March only - confirm the intended window before widening it.
df_filtered_calls = (
    df_call_input
    .filter(F.col('date').between('2020-03-01 00:00:00', '2020-03-31 23:59:59'))
    .select('user_id', 'date', 'call_id')
    .orderBy('date')
)

# Short alias for cells that refer to the filtered calls as `df_filtered`; without it
# that name is never bound and a fresh-kernel run raises NameError.
df_filtered = df_filtered_calls

# Keep only the users whose status is "inactive".
df_filtered_inactive_users = (
    df_status_input
    .filter(F.col('status') == "inactive")
    .select('user_id', 'status', 'company_id')
)

df_filtered_calls.show(20)
df_filtered_inactive_users.show(20)

+-------+-------------------+-------+
|user_id|               date|call_id|
+-------+-------------------+-------+
|   1554|2020-03-01 16:51:01|      1|
|   1181|2020-03-02 17:07:11|     22|
|   1525|2020-03-04 14:44:47|     21|
|   1525|2020-03-07 02:01:12|      3|
|   1093|2020-03-07 15:47:46|      9|
|   1162|2020-03-08 06:47:32|     36|
|   1854|2020-03-10 10:04:19|     20|
|   1910|2020-03-11 08:33:37|     39|
|   1859|2020-03-13 23:52:34|     25|
|   1575|2020-03-14 15:21:45|     16|
|   1968|2020-03-16 21:19:39|     14|
|   1950|2020-03-17 11:17:04|     34|
|   1181|2020-03-18 04:49:36|      5|
|   1884|2020-03-20 14:41:15|     33|
|   1910|2020-03-21 08:56:38|      8|
|   1854|2020-03-28 00:35:52|     27|
|   1857|2020-03-29 07:06:13|      2|
|   1503|2020-03-29 11:17:26|     23|
|   1891|2020-03-30 23:11:06|     15|
+-------+-------------------+-------+

+-------+--------+----------+
|user_id|  status|company_id|
+-------+--------+----------+
|   1554|inactive|         1|
|   1

In [39]:
# Inner-join inactive users to their March 2020 calls so each call row carries the
# caller's company_id. The calls frame is referenced by the name it is actually bound
# to (`df_filtered_calls`), and joining on the column name rather than an equality
# expression leaves a single, unambiguous `user_id` column in the result.
df_output = df_filtered_inactive_users.join(df_filtered_calls, on='user_id', how='inner')

In [40]:
# Count the joined call rows per company, largest count first.
# The result gets its own name: show() returns None, so assigning its result back to
# `df_output` would discard the joined frame and make this cell fail when re-run.
# `company_id` is a secondary sort key so that ties come out in a stable order.
df_company_call_counts = (
    df_output
    .groupBy('company_id')
    .count()
    .withColumnRenamed('count', 'total_count')
    .sort(desc('total_count'), col('company_id'))
)
df_company_call_counts.show()

+----------+-----------+
|company_id|total_count|
+----------+-----------+
|         2|          3|
|         1|          1|
|         3|          1|
+----------+-----------+

