In [332]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import lag
from pyspark.sql.functions import collect_list,when
from pyspark.sql.functions import array_distinct

In [333]:
# Get (or create) the Spark session shared by every cell below.
spark = (
    SparkSession.builder
    .appName('Marketing Campaign Success')
    .getOrCreate()
)

In [348]:
# Load the campaign purchases CSV: the first row is treated as the header and
# column types are inferred from the data. Then preview the rows.
df_input = (
    spark.read
    .option('header', 'True')
    .option('InferSchema', 'True')
    .csv('marketing_campaign.csv')
)
df_input.show()

+-------+----------+----------+--------+-----+
|user_id|created_at|product_id|quantity|price|
+-------+----------+----------+--------+-----+
|     10|2019-01-01|       101|       3|   55|
|     10|2019-01-02|       119|       5|   29|
|     10|2019-03-31|       111|       2|  149|
|     11|2019-01-02|       105|       3|  234|
|     11|2019-03-31|       120|       3|   99|
|     12|2019-01-02|       112|       2|  200|
|     12|2019-03-31|       110|       2|  299|
|     13|2019-01-05|       113|       1|   67|
|     13|2019-03-31|       118|       3|   35|
|     14|2019-01-06|       109|       5|  199|
|     14|2019-01-06|       107|       2|   27|
|     14|2019-03-31|       112|       3|  200|
|     15|2019-01-08|       105|       4|  234|
|     15|2019-01-09|       110|       4|  299|
|     15|2019-03-31|       116|       2|  499|
|     16|2019-01-10|       113|       2|   67|
|     16|2019-03-31|       107|       4|   27|
|     17|2019-01-11|       116|       2|  499|
|     17|2019

In [335]:
# Purchases made after each user's first purchase date.
# Rows are ranked per user by created_at; every row on the earliest date gets
# rank 1, so keeping rn > 1 leaves only later-day purchases. Users who bought
# only on their first day contribute no rows here and therefore do not count.
df_input.createOrReplaceTempView("campaign")
later_days_sql = (
    "select user_id,created_at,product_id,rn from "
    + " (select  user_id,created_at,product_id, "
    + "RANK() OVER (PARTITION BY user_id ORDER BY created_at ASC) as rn "
    + " FROM campaign) tmp"
    + " where rn >1 "
)
df_second_purchase = spark.sql(later_days_sql)
#df_second_purchase.show()

In [336]:
# Purchases made on each user's first purchase date.
# Rows are ranked per user by created_at; rn = 1 selects every row that shares
# the user's earliest date (RANK gives ties the same rank).
df_input.createOrReplaceTempView("campaign")
first_day_sql = (
    "select user_id,created_at,product_id,rn from "
    + " (select  user_id,created_at,product_id, "
    + "RANK() OVER (PARTITION BY user_id ORDER BY created_at ASC) as rn "
    + " FROM campaign) tmp"
    + " where rn=1"
)
df_first_purchase = spark.sql(first_day_sql)
#df_first_purchase.show()

In [337]:
# Collect each user's product_ids into an array and de-duplicate it, once for
# first-day purchases and once for later-day purchases. The user_id column is
# renamed on each side (first_user_id / sec_user_id) so the two frames can be
# joined without ambiguous column names.
#
# NOTE: columns are referenced through F.col because a bare `col` is never
# imported in this notebook; using it raises NameError on a fresh kernel.
first_grouped_df = (
    df_first_purchase
    .groupby('user_id')
    .agg(collect_list('product_id').alias("first_product_list"))
    .select(F.col("first_product_list"), F.col("user_id").alias("first_user_id"))
)
first_grouped_df = first_grouped_df.withColumn("first_product", array_distinct("first_product_list"))

second_grouped_df = (
    df_second_purchase
    .groupby('user_id')
    .agg(collect_list('product_id').alias("sec_product_list"))
    .select(F.col("sec_product_list"), F.col("user_id").alias("sec_user_id"))
)
second_grouped_df = second_grouped_df.withColumn("sec_product", array_distinct("sec_product_list"))

In [338]:
# Pair each user's first-day product array with their later-day product array.
# The inner join keeps only users that appear in both frames, i.e. users who
# purchased on at least one day after their first.
df_combined = (
    first_grouped_df
    .join(
        second_grouped_df,
        on=first_grouped_df.first_user_id == second_grouped_df.sec_user_id,
        how='inner',
    )
    .select('first_user_id', 'first_product', 'sec_product')
)

In [339]:
# Flag a user "Active" when at least one product bought after the first day
# was NOT among the products bought on the first day; otherwise "InActive".
#
# Why not `first_product == sec_product`: array equality is order-sensitive,
# and the arrays come from collect_list, whose element order is not guaranteed
# after a shuffle, so identical product sets could compare unequal. Equality
# also marks a user Active when their later purchases are only a subset of the
# first-day products. array_except is order-insensitive and handles both cases.
new_product_ids = F.array_except(df_combined.sec_product, df_combined.first_product)
df_output = df_combined.withColumn(
    "Status_Flag",
    when(F.size(new_product_ids) > 0, "Active").otherwise("InActive"),
)
#df_output.show()

In [340]:
# Keep only the rows whose Status_Flag is 'Active'.
df_output = df_output.where(F.col("Status_Flag") == 'Active')

In [341]:
# Number of distinct users remaining in df_output.
df_output.select('first_user_id').dropDuplicates().count()

23