In [1]:
import pyspark
from pyspark.sql import SparkSession, types
from pyspark.sql import functions as F
from pyspark.sql.functions import sha2, concat_ws

In [2]:
# Create (or reuse) the Spark session for this notebook, connected to the
# Spark master at the URL below and registered under the app name 'test'.
spark = (
    SparkSession.builder
    .master("spark://192.168.86.51:7077")
    .appName('test')
    .getOrCreate()
)

In [3]:
# Evaluate the session's SparkContext as the cell's last expression so the
# notebook displays it.
spark.sparkContext

In [4]:
# Load the green-taxi trips from every parquet path matching data/pq/green/*/*.
df_green = spark.read.parquet('data/pq/green/*/*')

In [5]:
# Strip the 'lpep_' prefix from the pickup/dropoff timestamp columns so the
# green data exposes plain 'pickup_datetime' / 'dropoff_datetime' names.
df_green = (
    df_green
    .withColumnRenamed('lpep_pickup_datetime', 'pickup_datetime')
    .withColumnRenamed('lpep_dropoff_datetime', 'dropoff_datetime')
)

In [6]:
# Register the DataFrame as the temp view "green" so spark.sql() queries can
# refer to it by that name.
df_green.createOrReplaceTempView("green")

In [7]:
# Hourly revenue for green taxis: rows with pickup_datetime >= '2020-01-01'
# are grouped by PULocationID (aliased revenue_zone) and by the pickup time
# truncated to the hour (revenue_hour); each group gets the sum of
# total_amount and a row count.
green_revenue_sql = """
                        SELECT 
                            PULocationID AS revenue_zone,
                            date_trunc('hour', pickup_datetime) AS revenue_hour, 
                            SUM(total_amount) AS amount,
                            COUNT(*) AS number_records
                        FROM green
                        WHERE pickup_datetime >= '2020-01-01'
                        GROUP BY 
                        	revenue_zone,
                        	revenue_hour
                    """
df_green_revenue = spark.sql(green_revenue_sql)

In [8]:
# Preview five aggregated rows, printed one field per line (vertical layout).
df_green_revenue.show(vertical=True, n=5)

-RECORD 0-----------------------------
 revenue_zone   | 152                 
 revenue_hour   | 2020-01-29 18:00:00 
 amount         | 46.82               
 number_records | 6                   
-RECORD 1-----------------------------
 revenue_zone   | 41                  
 revenue_hour   | 2020-01-19 12:00:00 
 amount         | 768.7800000000002   
 number_records | 57                  
-RECORD 2-----------------------------
 revenue_zone   | 130                 
 revenue_hour   | 2020-01-05 20:00:00 
 amount         | 256.33000000000004  
 number_records | 16                  
-RECORD 3-----------------------------
 revenue_zone   | 166                 
 revenue_hour   | 2020-01-24 13:00:00 
 amount         | 685.4200000000001   
 number_records | 33                  
-RECORD 4-----------------------------
 revenue_zone   | 69                  
 revenue_hour   | 2020-01-16 12:00:00 
 amount         | 75.42               
 number_records | 5                   
only showing top 5 rows



In [9]:
# Persist the green hourly revenue table as parquet; mode='overwrite' replaces
# any output already present at this path.
df_green_revenue.write.parquet('data/report/revenue/green/', mode='overwrite')

In [10]:
# Load the yellow-taxi trips from every parquet path matching data/pq/yellow/*/*.
df_yellow = spark.read.parquet('data/pq/yellow/*/*')

In [11]:
# Strip the 'tpep_' prefix from the pickup/dropoff timestamp columns so the
# yellow data uses the same 'pickup_datetime' / 'dropoff_datetime' names as
# the green data.
df_yellow = (
    df_yellow
    .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime')
    .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime')
)

In [12]:
# Register the DataFrame as the temp view "yellow" so spark.sql() queries can
# refer to it by that name.
df_yellow.createOrReplaceTempView("yellow")

In [13]:
# Hourly revenue for yellow taxis: rows with pickup_datetime >= '2020-01-01'
# are grouped by PULocationID (aliased revenue_zone) and by the pickup time
# truncated to the hour (revenue_hour); each group gets the sum of
# total_amount and a row count.
yellow_revenue_sql = """
                        SELECT 
                            PULocationID AS revenue_zone,
                            date_trunc('hour', pickup_datetime) AS revenue_hour, 
                            SUM(total_amount) AS amount,
                            COUNT(*) AS number_records
                        FROM yellow
                        WHERE pickup_datetime >= '2020-01-01'
                        GROUP BY 
                        	revenue_zone,
                        	revenue_hour
                    """
df_yellow_revenue = spark.sql(yellow_revenue_sql)

In [14]:
# Preview five aggregated rows, printed one field per line (vertical layout).
df_yellow_revenue.show(vertical=True,n=5)

-RECORD 0-----------------------------
 revenue_zone   | 226                 
 revenue_hour   | 2020-01-13 07:00:00 
 amount         | 1327.1900000000003  
 number_records | 57                  
-RECORD 1-----------------------------
 revenue_zone   | 233                 
 revenue_hour   | 2020-01-24 16:00:00 
 amount         | 3437.3299999999986  
 number_records | 172                 
-RECORD 2-----------------------------
 revenue_zone   | 166                 
 revenue_hour   | 2020-01-24 19:00:00 
 amount         | 1145.1600000000003  
 number_records | 85                  
-RECORD 3-----------------------------
 revenue_zone   | 238                 
 revenue_hour   | 2020-01-17 22:00:00 
 amount         | 2514.87             
 number_records | 158                 
-RECORD 4-----------------------------
 revenue_zone   | 41                  
 revenue_hour   | 2020-01-31 16:00:00 
 amount         | 593.5500000000001   
 number_records | 47                  
only showing top 5 rows



In [15]:
# Persist the yellow hourly revenue table as parquet; mode='overwrite' replaces
# any output already present at this path.
df_yellow_revenue.write.parquet('data/report/revenue/yellow/', mode='overwrite')

In [16]:
# Prefix the metric columns with the taxi colour so the two revenue tables can
# be joined without 'amount' / 'number_records' colliding. The key columns
# (revenue_hour, revenue_zone) keep their names.
df_green_revenue_tmp = (
    df_green_revenue
    .withColumnRenamed('amount', 'green_amount')
    .withColumnRenamed('number_records', 'green_number_records')
)
df_yellow_revenue_tmp = (
    df_yellow_revenue
    .withColumnRenamed('amount', 'yellow_amount')
    .withColumnRenamed('number_records', 'yellow_number_records')
)

In [17]:
# NOTE(review): this repeats the green revenue write made earlier in the
# notebook (same DataFrame, same path, same mode), so it rewrites the same
# report a second time. Possibly a leftover cell; confirm whether it can be
# removed.
df_green_revenue.write.parquet('data/report/revenue/green/', mode='overwrite')

In [18]:
# Combine green and yellow revenue on (revenue_hour, revenue_zone). The outer
# join keeps hour/zone pairs that appear in only one of the two tables.
df_join = df_green_revenue_tmp.join(
    df_yellow_revenue_tmp,
    on=['revenue_hour', 'revenue_zone'],
    how='outer',
)

In [19]:
# Print a preview of the joined table. Rows that exist on only one side of the
# outer join show NULL in the other side's columns.
df_join.show()

+-------------------+------------+------------+--------------------+------------------+---------------------+
|       revenue_hour|revenue_zone|green_amount|green_number_records|     yellow_amount|yellow_number_records|
+-------------------+------------+------------+--------------------+------------------+---------------------+
|2020-01-01 04:00:00|           1|        NULL|                NULL|              94.8|                    1|
|2020-01-01 05:00:00|           1|        NULL|                NULL|              40.3|                    1|
|2020-01-01 11:00:00|           1|        NULL|                NULL|              90.3|                    1|
|2020-01-03 19:00:00|           1|        NULL|                NULL|            284.32|                    3|
|2020-01-04 07:00:00|           1|        NULL|                NULL|            198.76|                    2|
|2020-01-04 13:00:00|           1|        NULL|                NULL| 98.44999999999999|                    2|
|2020-01-0

In [20]:
# Persist the joined green/yellow revenue table as parquet.
# mode='overwrite' matches the other report writes in this notebook and makes
# the cell re-runnable: without it the writer raises an error as soon as the
# target path already exists (i.e. on every run after the first).
# Run this while df_join is still the in-memory join result; Spark refuses to
# overwrite a path that the DataFrame being written is itself read from.
df_join.write.parquet('data/report/revenue/total', mode='overwrite')

In [21]:
# Re-bind df_join to the data read back from the parquet report directory
# 'data/report/revenue/total/'; later cells use this stored copy.
df_join = spark.read.parquet('data/report/revenue/total/')

In [24]:
# Show the DataFrame's repr (column names and types) as the cell output.
df_join

DataFrame[revenue_hour: timestamp, revenue_zone: int, green_amount: double, green_number_records: bigint, yellow_amount: double, yellow_number_records: bigint]

In [22]:
# Load the zone lookup table from the 'zones/' parquet directory.
df_zones = spark.read.parquet('zones/')

In [25]:
# Attach the zone lookup columns to each revenue row by matching revenue_zone
# against the lookup's LocationID. No `how` is passed, so DataFrame.join uses
# its default join type.
df_results = df_join.join(
    df_zones,
    df_join.revenue_zone == df_zones.LocationID,
)

In [28]:
# Print a preview of the revenue rows together with their zone lookup columns.
df_results.show()

+-------------------+------------+------------+--------------------+------------------+---------------------+----------+-------+--------------+------------+
|       revenue_hour|revenue_zone|green_amount|green_number_records|     yellow_amount|yellow_number_records|LocationID|Borough|          Zone|service_zone|
+-------------------+------------+------------+--------------------+------------------+---------------------+----------+-------+--------------+------------+
|2020-01-01 07:00:00|           1|        NULL|                NULL|              0.31|                    1|         1|    EWR|Newark Airport|         EWR|
|2020-01-01 12:00:00|           1|        NULL|                NULL|            266.66|                    3|         1|    EWR|Newark Airport|         EWR|
|2020-01-02 01:00:00|           1|        NULL|                NULL|             84.36|                    1|         1|    EWR|Newark Airport|         EWR|
|2020-01-02 16:00:00|           1|        NULL|           

In [29]:
# Drop both sides of the join key (LocationID and revenue_zone) and persist
# the rest as parquet.
# mode='overwrite' matches the report writes earlier in this notebook and makes
# the cell re-runnable: without it the writer raises an error as soon as
# 'tmp/revenue-zones/' already exists.
(
    df_results
    .drop('LocationID', 'revenue_zone')
    .write
    .parquet('tmp/revenue-zones/', mode='overwrite')
)