In [187]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, when, count
from pyspark.sql.window import Window
from pyspark.sql.functions import lag

In [188]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Acceptance Rate By Date")
    .getOrCreate()
)

In [189]:
# Load the friend-request events: the first CSV row is the header and
# column types are inferred from the data.
df_input = spark.read.csv('fb_friend_requests.csv', header=True, inferSchema=True)

In [190]:
# Split the events by their `action` value: acceptances vs. sent requests.
df_accepted = df_input.where(F.col('action') == 'accepted')
df_sent = df_input.where(F.col('action') == 'sent')

In [191]:
# Preview both subsets, accepted first and then sent.
for df_subset in (df_accepted, df_sent):
    df_subset.show()

+--------------+----------------+----------+--------+
|user_id_sender|user_id_receiver|      date|  action|
+--------------+----------------+----------+--------+
|     ad4943sdz|      948ksx123d|06-01-2020|accepted|
|   fffkfld9499|     993lsldidif|10-01-2020|accepted|
|    fg503kdsdd|       ofp049dkd|10-01-2020|accepted|
|    r4gfgf2344|      234ddr4545|11-01-2020|accepted|
|    dfdfxf9483|      9djjjd9283|15-01-2020|accepted|
+--------------+----------------+----------+--------+

+--------------+----------------+----------+------+
|user_id_sender|user_id_receiver|      date|action|
+--------------+----------------+----------+------+
|     ad4943sdz|      948ksx123d|04-01-2020|  sent|
|    dfdfxf9483|      9djjjd9283|04-01-2020|  sent|
|    fg503kdsdd|       ofp049dkd|04-01-2020|  sent|
|    hh643dfert|      847jfkf203|04-01-2020|  sent|
| ffdfff4234234|     lpjzjdi4949|06-01-2020|  sent|
|   fffkfld9499|     993lsldidif|06-01-2020|  sent|
|    r4gfgf2344|      234ddr4545|06-01-2020| 

In [235]:
## Build one string key per row, "<user_id_sender>_<user_id_receiver>", and attach it
## to each subset under a side-specific column name so the join can tell them apart.
pair_key = F.concat(F.col('user_id_sender'), F.lit('_'), F.col('user_id_receiver'))
df_accepted_concat = df_accepted.withColumn('accepted_concat', pair_key)
df_sent_concat = df_sent.withColumn('sent_concat', pair_key)

In [236]:
## Left-join every sent request to the acceptance with the same sender/receiver key.
## Requests without a match keep a NULL accepted_concat; the date kept is the sent date.
pair_matches = df_sent_concat.sent_concat == df_accepted_concat.accepted_concat
df_joined = (
    df_sent_concat
    .join(df_accepted_concat, pair_matches, 'left')
    .select(
        df_sent_concat['user_id_sender'],
        df_sent_concat['user_id_receiver'],
        df_sent_concat['date'],
        df_accepted_concat['accepted_concat'],
    )
)

In [237]:
## Number of joined rows per sent date, exposed as (date1, total_count).
df_total = (
    df_joined
    .groupBy('date')
    .agg(F.count(F.lit(1)).alias('total_count'))
    .withColumnRenamed('date', 'date1')
)

In [238]:
## Number of joined rows per sent date that have a matching acceptance,
## exposed as (date2, accepted_count).
## NOTE: this rebinds df_joined, which afterwards only has date2 and accepted_count,
## so the join cell must be re-run before this cell can be run again.
df_joined = (
    df_joined
    .where(F.col('accepted_concat').isNotNull())
    .groupBy('date')
    .agg(F.count(F.lit(1)).alias('accepted_count'))
    .withColumnRenamed('date', 'date2')
)

In [239]:
## Combine the per-date totals with the per-date accepted counts.
## The join starts from df_total (every date that has sent requests) and is a LEFT join,
## so a date on which nothing was accepted is kept instead of being dropped; its
## missing accepted_count is replaced by 0, which yields an acceptance rate of 0.
## Output columns are unchanged: date2, accepted_count, total_count.
df_output = (
    df_total
    .join(df_joined, df_total.date1 == df_joined.date2, 'left')
    .select(
        col('date1').alias('date2'),
        F.coalesce(col('accepted_count'), F.lit(0)).alias('accepted_count'),
        col('total_count'),
    )
)

In [240]:
## Acceptance rate per date: accepted_count / total_count.
## The value is a fraction between 0 and 1, despite the "percentage" column name.
acceptance_ratio = F.col('accepted_count') / F.col('total_count')
df_output = df_output.withColumn('percentage_acceptance', acceptance_ratio)

In [241]:
# date2 is a 'dd-MM-yyyy' string, so ordering by the raw text would sort by day-of-month
# first and misorder dates across months and years. Order by the parsed date instead;
# the raw string is a tie-breaker that keeps the order deterministic for any value
# that does not parse.
(
    df_output
    .select('date2', 'percentage_acceptance')
    .orderBy(F.to_date(F.col('date2'), 'dd-MM-yyyy'), 'date2')
    .show()
)

+----------+---------------------+
|     date2|percentage_acceptance|
+----------+---------------------+
|04-01-2020|                 0.75|
|06-01-2020|   0.6666666666666666|
+----------+---------------------+

