In [77]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,substring,when,asc

In [6]:
spark = SparkSession.builder.appName('New And Existing Users').getOrCreate()

In [7]:
df_input = spark.read.options(header='True', InferSchema='True').csv('fact_events.csv')
#df_input.show()

In [8]:
##creating period column with YYYY-MM
df_period = df_input.withColumn("period", substring('time_id',1,7))
df_period.show()

+---+----------+----------+----------------+---------+-------------------+--------+-------+
| id|   time_id|   user_id|     customer_id|client_id|         event_type|event_id| period|
+---+----------+----------+----------------+---------+-------------------+--------+-------+
|  1|2020-02-28|3668-QPYBK|          Sendit|  desktop|       message sent|       3|2020-02|
|  2|2020-02-28|7892-POOKP|       Connectix|   mobile|      file received|       2|2020-02|
|  3|2020-04-03|9763-GRSKD|          Zoomit|  desktop|video call received|       7|2020-04|
|  4|2020-04-02|9763-GRSKD|       Connectix|  desktop|video call received|       7|2020-04|
|  5|2020-02-06|9237-HQITU|          Sendit|  desktop|video call received|       7|2020-02|
|  6|2020-02-27|8191-XWSZG|       Connectix|  desktop|      file received|       2|2020-02|
|  7|2020-04-03|9237-HQITU|       Connectix|  desktop|video call received|       7|2020-04|
|  8|2020-03-01|9237-HQITU|       Connectix|   mobile|   message received|      

In [135]:
df_period.filter(df_period.user_id == '4190-MFLUW').show()

+---+----------+----------+----------------+---------+-------------------+--------+-------+
| id|   time_id|   user_id|     customer_id|client_id|         event_type|event_id| period|
+---+----------+----------+----------------+---------+-------------------+--------+-------+
|  9|2020-04-02|4190-MFLUW|       Connectix|   mobile|video call received|       7|2020-04|
| 40|2020-03-25|4190-MFLUW|       Connectix|   mobile|      file received|       2|2020-03|
| 47|2020-04-06|4190-MFLUW|       Connectix|   mobile|          file sent|       1|2020-04|
| 63|2020-03-07|4190-MFLUW|Electric Gravity|  desktop|       message sent|       3|2020-03|
| 64|2020-03-05|4190-MFLUW|       Connectix|  desktop|          file sent|       1|2020-03|
| 75|2020-03-02|4190-MFLUW|          Sendit|   mobile|       message sent|       3|2020-03|
|126|2020-03-25|4190-MFLUW|       Connectix|  desktop| video call started|       6|2020-03|
+---+----------+----------+----------------+---------+-------------------+------

In [50]:
##getting new user purchase
df_period.createOrReplaceTempView("fb")
df_most_flagged_video = spark.sql("select period,user_id,rnk from "+ " (select period,user_id, ROW_NUMBER() OVER ( PARTITION BY period,user_id ORDER BY period,user_id) as rnk " +" FROM fb) tmp where rnk=1  ")
df_most_flagged_video.groupBy('period').count().show()

+-------+-----+
| period|count|
+-------+-----+
|2020-02|   13|
|2020-03|   17|
|2020-04|   14|
+-------+-----+



In [95]:
##getting existing users purchase
df_existing_user = df_period.select('user_id','period').distinct()
df_existing_user = df_existing_user.withColumn("February",  when(df_existing_user.period =='2020-02', df_existing_user.user_id).otherwise('In-Active'))
df_existing_user = df_existing_user.withColumn("March",  when(df_existing_user.period =='2020-03', df_existing_user.user_id).otherwise('In-Active'))
df_existing_user = df_existing_user.withColumn("April",  when(df_existing_user.period =='2020-04', df_existing_user.user_id).otherwise('In-Active'))
df_existing_user.orderBy('period').show()

+----------+-------+----------+----------+---------+
|   user_id| period|  February|     March|    April|
+----------+-------+----------+----------+---------+
|1452-KIOVK|2020-02|1452-KIOVK| In-Active|In-Active|
|3668-QPYBK|2020-02|3668-QPYBK| In-Active|In-Active|
|3655-SNQYZ|2020-02|3655-SNQYZ| In-Active|In-Active|
|7892-POOKP|2020-02|7892-POOKP| In-Active|In-Active|
|8191-XWSZG|2020-02|8191-XWSZG| In-Active|In-Active|
|7469-LKBCI|2020-02|7469-LKBCI| In-Active|In-Active|
|6713-OKOMC|2020-02|6713-OKOMC| In-Active|In-Active|
|7795-CFOCW|2020-02|7795-CFOCW| In-Active|In-Active|
|7590-VHVEG|2020-02|7590-VHVEG| In-Active|In-Active|
|9237-HQITU|2020-02|9237-HQITU| In-Active|In-Active|
|5129-JLPIS|2020-02|5129-JLPIS| In-Active|In-Active|
|0280-XJGEX|2020-02|0280-XJGEX| In-Active|In-Active|
|9305-CDSKC|2020-02|9305-CDSKC| In-Active|In-Active|
|6388-TABGU|2020-03| In-Active|6388-TABGU|In-Active|
|9305-CDSKC|2020-03| In-Active|9305-CDSKC|In-Active|
|5129-JLPIS|2020-03| In-Active|5129-JLPIS|In-A

In [133]:
df_existing_user.filter(df_period.user_id == '6388-TABGU').show()

+----------+-------+---------+----------+---------+
|   user_id| period| February|     March|    April|
+----------+-------+---------+----------+---------+
|6388-TABGU|2020-03|In-Active|6388-TABGU|In-Active|
+----------+-------+---------+----------+---------+



In [132]:
df_existing_user.createOrReplaceTempView("fb2")
df_feb_new_user = spark.sql("select '2020-02' as time_period,user_id FROM fb2 where february!='In-Active'").show(50)
df_march_new_user = spark.sql("select '2020-03' as time_period,user_id FROM fb2 where march not in (select february from fb2) ").show(50)
df_april_new_user = spark.sql("select '2020-04' as time_period,user_id FROM fb2 where march not in (select april from fb2) ").show(50)
df_merged = df_feb_new_user.union(df_march_new_user).union(df_april_new_user).show(50)
#df_new_user_count = df_merged.groupBy('period').count().show()

+-----------+----------+
|time_period|   user_id|
+-----------+----------+
|    2020-02|3668-QPYBK|
|    2020-02|3655-SNQYZ|
|    2020-02|7795-CFOCW|
|    2020-02|9237-HQITU|
|    2020-02|9305-CDSKC|
|    2020-02|1452-KIOVK|
|    2020-02|6713-OKOMC|
|    2020-02|7590-VHVEG|
|    2020-02|5129-JLPIS|
|    2020-02|0280-XJGEX|
|    2020-02|7892-POOKP|
|    2020-02|8191-XWSZG|
|    2020-02|7469-LKBCI|
+-----------+----------+

+-----------+----------+
|time_period|   user_id|
+-----------+----------+
|    2020-03|6388-TABGU|
|    2020-03|4190-MFLUW|
|    2020-03|8091-TTVAX|
|    2020-03|4183-MYFRB|
|    2020-03|5575-GNVDE|
+-----------+----------+

+-----------+----------+
|time_period|   user_id|
+-----------+----------+
|    2020-04|6388-TABGU|
|    2020-04|0280-XJGEX|
|    2020-04|6713-OKOMC|
|    2020-04|7590-VHVEG|
|    2020-04|1452-KIOVK|
+-----------+----------+



AttributeError: 'NoneType' object has no attribute 'union'