In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,lit

In [5]:
spark = SparkSession.builder.master('local').appName('Campaign Name').getOrCreate()

In [6]:
df_input = spark.read.options(header='True', InferSchema='True',  quote = "\"", escape = "\"").csv('events.csv')
df_input.count()

387567

In [7]:
##filtering out following events(app_lauch,add_to_cart,create_order,payment_suceess)
df_event_app_launch = df_input.filter(col('event_name') == 'app_launch').selectExpr('city','event_name','campaign','customer_user_id').orderBy('city').distinct()
df_event_add_to_cart = df_input.filter(col('event_name') == 'add_to_cart').selectExpr('city','event_name','campaign','customer_user_id').orderBy('city').distinct()
df_event_create_order = df_input.filter(col('event_name') == 'create_order').selectExpr('city','event_name','campaign','customer_user_id').orderBy('city').distinct()
df_event_payment_success = df_input.filter(col('event_name') == 'payment_success').selectExpr('city','event_name','campaign','customer_user_id').orderBy('city').distinct()
#df_event_app_launch.count()
#df_event_add_to_cart.count()
#df_event_create_order.count()
#df_event_payment_success.count()


In [12]:
##taking count of each event based on city
df_event_app_launch.createOrReplaceTempView("app_lauch")
df_event_add_to_cart.createOrReplaceTempView("add_to_cart")
df_event_create_order.createOrReplaceTempView("create_order")
df_event_payment_success.createOrReplaceTempView("payment_success")
df_event_count_app_launch = spark.sql("select city as al_city,event_name,campaign,count(*) as app_lauch_count from  app_lauch where city='Hyderabad' group by city,event_name,campaign order by city,event_name ")
df_event_count_add_to_cart = spark.sql("select city atc_city ,event_name,count(*) as add_to_cart_count from  add_to_cart where city='Hyderabad' group by city,event_name order by city,event_name ")
df_event_count_create_order = spark.sql("select city as co_city,event_name,count(*) as create_order_count from  create_order where city='Hyderabad' group by city,event_name order by city,event_name ")
df_event_count_payment_success = spark.sql("select city as ps_city,event_name,count(*) as payment_success_count from  payment_success where city='Hyderabad' group by city,event_name order by city,event_name ")

df_event_count_app_launch.show(43)

+---------+----------+--------------------+---------------+
|  al_city|event_name|            campaign|app_lauch_count|
+---------+----------+--------------------+---------------+
|Hyderabad|app_launch|                None|             17|
|Hyderabad|app_launch|App Purchase_iOS_HYD|             21|
|Hyderabad|app_launch|Newspaper Inserts...|              1|
|Hyderabad|app_launch|                  NA|           1457|
|Hyderabad|app_launch|ET-007-App-Purcha...|            196|
|Hyderabad|app_launch|App Installs_iOS_HYD|            141|
|Hyderabad|app_launch|ET-006-TOF-Broad-...|              2|
|Hyderabad|app_launch|Founders Notes - ...|              1|
|Hyderabad|app_launch|App Installs_Andr...|            319|
|Hyderabad|app_launch|ET-003-TOF-LAL-Pu...|              1|
|Hyderabad|app_launch|Womens Day SMS ca...|              1|
|Hyderabad|app_launch|Womens Day SMS ca...|              6|
|Hyderabad|app_launch|ET-001-App-Purcha...|            940|
|Hyderabad|app_launch|Feb 2021 Install .

In [254]:
##merging above datasets and dispalying count values
df_merged = df_event_count_app_launch.join(df_event_count_add_to_cart, df_event_count_app_launch.al_city == df_event_count_add_to_cart.atc_city, 'inner').join(df_event_count_create_order, df_event_count_app_launch.al_city == df_event_count_create_order.co_city, 'inner').join(df_event_count_payment_success, df_event_count_app_launch.al_city == df_event_count_payment_success.ps_city, 'inner').selectExpr('al_city','app_lauch_count','add_to_cart_count','create_order_count','payment_success_count')
#df_merged.show()

In [255]:
1216/2377

0.5115692048801009

In [256]:
2000/3905 , 1305/3905, 906/3905

(0.5121638924455826, 0.33418693982074266, 0.23201024327784892)

In [None]:
df_merged

In [257]:
##to convert into percentage
df_merged_percentage = df_merged.withColumn("app_lauch_percentage", lit(100))
df_merged_percentage = df_merged_percentage.withColumn("add_to_cart_percentage", (col('add_to_cart_count')/ col('app_lauch_count')) *100 )
df_merged_percentage = df_merged_percentage.withColumn("create_order_percentage", (col('create_order_count')/ col('app_lauch_count')) *100 )
df_merged_percentage = df_merged_percentage.withColumn("payment_success_percentage", (col('payment_success_count')/ col('app_lauch_count')) *100 )
#df_merged_percentage.show()

In [258]:
##formatting based on the required output
df_merged_percentage.createOrReplaceTempView("event_master_table")
df_output = spark.sql("select  al_city as city ,concat(app_lauch_count, '(', app_lauch_percentage, ')%') as app_lauch,concat(add_to_cart_count, '(', add_to_cart_percentage, ')%') as add_to_cart,concat(create_order_count, '(', create_order_percentage, ')%') as create_order, concat(payment_success_count, '(', payment_success_percentage, ')%') as payment_success from  event_master_table")
df_output.show()

+---------+----------+--------------------+--------------------+--------------------+
|     city| app_lauch|         add_to_cart|        create_order|     payment_success|
+---------+----------+--------------------+--------------------+--------------------+
|Hyderabad|3905(100)%|2000(51.216389244...|1305(33.418693982...|906(23.2010243277...|
+---------+----------+--------------------+--------------------+--------------------+

