In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,substring

In [3]:
# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Distance Per Dollar')
    .getOrCreate()
)

In [8]:
# Load the raw request log: the first CSV line supplies the column names and
# Spark is asked to infer each column's type from the data.
df_input = (
    spark.read
    .option('header', 'True')
    .option('InferSchema', 'True')
    .csv('uber_request_logs.csv')
)
# Preview the first five rows.
df_input.show(5)

+----------+------------+--------------+------------------+-------------+-------------------------+
|request_id|request_date|request_status|distance_to_travel|monetary_cost|driver_to_client_distance|
+----------+------------+--------------+------------------+-------------+-------------------------+
|         1|  2020-01-09|       success|             70.59|         6.56|                    14.36|
|         2|  2020-01-24|       success|             93.36|        22.68|                     19.9|
|         3|  2020-02-08|          fail|             51.24|        11.39|                    21.32|
|         4|  2020-02-23|       success|             61.58|         8.04|                    44.26|
|         5|  2020-03-09|       success|             25.04|         7.19|                     1.74|
+----------+------------+--------------+------------------+-------------+-------------------------+
only showing top 5 rows



In [30]:
## Distance travelled per dollar for each request row, tagged with its month.
# distance_to_cost = distance_to_travel / monetary_cost.
# time_period is the first 7 characters of request_date
# (e.g. '2020-01' for '2020-01-09').
# Built as a single chain so that no intermediate frame is stored under the
# name df_monthly_average; that name is left for the monthly aggregation cell.
df_distance_per_dollar = (
    df_input
    .withColumn("distance_to_cost", col('distance_to_travel') / col('monetary_cost'))
    .orderBy('request_date')
    .withColumn("time_period", substring('request_date', 1, 7))
    .select('request_date', 'distance_to_cost', 'time_period')
)

In [31]:
## Average distance_to_cost per month.
# "period" is the first 7 characters of request_date (the YYYY-MM part);
# rows are grouped on it, averaged, sorted by period, and the generated
# "avg(distance_to_cost)" column is renamed to "avg_distance_to_cost".
df_monthly_average = (
    df_distance_per_dollar
    .withColumn("period", substring('request_date', 1, 7))
    .groupBy('period')
    .avg('distance_to_cost')
    .orderBy('period')
    .withColumnRenamed("avg(distance_to_cost)", "avg_distance_to_cost")
)

In [34]:
## Attach each month's average to every per-request row.
# Inner join on the month key (time_period on the per-request frame, period on
# the monthly frame), keep the three result columns, and sort by request_date.
df_output = (
    df_distance_per_dollar
    .join(
        df_monthly_average,
        df_distance_per_dollar['time_period'] == df_monthly_average['period'],
        'inner',
    )
    .select('request_date', 'distance_to_cost', 'avg_distance_to_cost')
    .orderBy('request_date')
)
df_output.show()

+------------+-------------------+--------------------+
|request_date|   distance_to_cost|avg_distance_to_cost|
+------------+-------------------+--------------------+
|  2020-01-09| 10.760670731707318|   7.438536424054718|
|  2020-01-24|  4.116402116402116|   7.438536424054718|
|  2020-02-08|  4.498683055311677|   6.078943517705589|
|  2020-02-23|  7.659203980099503|   6.078943517705589|
|  2020-03-09| 3.4826147426981917|   6.609897114938839|
|  2020-03-24|  9.737179487179487|   6.609897114938839|
|  2020-04-08| 1.9267139479905437|   3.116035545423843|
|  2020-04-23|  4.305357142857143|   3.116035545423843|
|  2020-05-08| 14.017326732673268|   7.580134837808106|
|  2020-05-23| 1.1429429429429432|   7.580134837808106|
|  2020-06-07| 12.347560975609756|   7.376940758685239|
|  2020-06-22| 2.4063205417607225|   7.376940758685239|
|  2020-07-07| 0.8299549549549549|   2.853939465781571|
|  2020-07-22|  4.877923976608187|   2.853939465781571|
|  2020-08-06| 2.0395794681508965|  13.652941907