In [1]:
# Locate the local Spark installation before importing pyspark.
import findspark
findspark.init()   # presumably makes pyspark importable in this kernel — TODO confirm against findspark docs
findspark.find()   # last expression: the location found is displayed as the cell output

'/usr/local/lib/python3.10/dist-packages/pyspark'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build (or reuse, via getOrCreate) the SparkSession used by the whole notebook.
spark = (
    SparkSession.builder
    .appName("TaxiOperationsDataFrameApp")
    .master("local[4]")
    # Dynamic allocation and adaptive query execution are both switched off
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.sql.adaptive.enabled", "false")
    .getOrCreate()
)

# SparkContext handle, used below for the RDD example
sc = spark.sparkContext

# Last expression: render the session summary as the cell output
spark

23/08/21 08:09:03 WARN Utils: Your hostname, pupil-a resolves to a loopback address: 127.0.1.1; using 167.235.141.210 instead (on interface eth0)
23/08/21 08:09:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/21 08:09:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Inject a CSS rule into the notebook page so that <pre> blocks keep
# "white-space: pre" (no wrapping) when rendered.
from IPython.display import *
display(HTML("<style>pre { white-space: pre !important; }</style>"))

### Create DataFrame - Option 1.a: From RDD

In [4]:
# Create RDD

# Sample employee records; each row is [Id, Name, Salary]
data = [
    [1, "Neha", 10000],
    [2, "Steve", 20000],
    [3, "Kari", 30000],
    [4, "Ivan", 40000],
    [5, "Mohit", 50000],
]

# Turn the local Python collection into an RDD
employeesRdd = sc.parallelize(data)

In [5]:
# Create DataFrame and show content
# No column names are supplied here, so the recorded output shows the
# generated names _1, _2, _3.

employeesDF = employeesRdd.toDF()

employeesDF.show()

                                                                                

+---+-----+-----+
| _1|   _2|   _3|
+---+-----+-----+
|  1| Neha|10000|
|  2|Steve|20000|
|  3| Kari|30000|
|  4| Ivan|40000|
|  5|Mohit|50000|
+---+-----+-----+



In [6]:
# Define column names for DataFrame
# toDF(...) with names returns a DataFrame with the columns renamed
# positionally to Id, Name, Salary (see the recorded output).

employeesDF = employeesDF.toDF("Id", "Name", "Salary")

employeesDF.show()

+---+-----+------+
| Id| Name|Salary|
+---+-----+------+
|  1| Neha| 10000|
|  2|Steve| 20000|
|  3| Kari| 30000|
|  4| Ivan| 40000|
|  5|Mohit| 50000|
+---+-----+------+



In [7]:
# Print DataFrame schema
# The recorded output shows Id and Salary as long and Name as string.

employeesDF.printSchema()

root
 |-- Id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Salary: long (nullable = true)



### Create DataFrame - Option 1.b: From data collection

In [8]:
# Create DataFrame from collection

# First argument: an RDD or a local collection.
# Second argument: the schema, given here as a DDL-style string; a plain list
# of column names such as ["Id", "Name", "Salary"] can be passed instead.
employeesDF = spark.createDataFrame(
    data,
    schema="Id: long, Name: string, Salary: long",
)

employeesDF.show()

+---+-----+------+
| Id| Name|Salary|
+---+-----+------+
|  1| Neha| 10000|
|  2|Steve| 20000|
|  3| Kari| 30000|
|  4| Ivan| 40000|
|  5|Mohit| 50000|
+---+-----+------+



### Create DataFrame - Option 2: Read a File

In [11]:
# Read YellowTaxis csv file to create DataFrame
#
# The path is a raw string (r"...") so the backslashes of the Windows path are
# kept literally instead of forming invalid escape sequences such as "\S" or "\D".
# NOTE(review): the recorded output of this cell is a URISyntaxException for this
# Windows-style path — confirm where the data files live on the machine running
# the notebook.

yellowTaxiDF = (
    spark
    .read
    .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
)

IllegalArgumentException: java.net.URISyntaxException: Relative path in absolute URI: C:%5CSparkCourse%5CDataFiles%5CRaw%5CYellowTaxis_202210.csv

In [10]:
# Display DataFrame content
# NOTE(review): the recorded NameError appears to come from the failed read in
# the preceding cell, which left yellowTaxiDF undefined.

yellowTaxiDF.show()

NameError: name 'yellowTaxiDF' is not defined

In [11]:
# Take column names from file header row
#
# The path is a raw string so the Windows backslashes are not parsed as
# (invalid) escape sequences.

yellowTaxiDF = (
    spark
    .read
    .option("header", "true")   # first row supplies the column names
    .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
)

# With only the header option set, every column is reported as string
yellowTaxiDF.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)



In [12]:
# Use tab delimiter to read GreenTaxis file
#
# The path is a raw string so the Windows backslashes are not parsed as
# (invalid) escape sequences. The delimiter stays a normal string because
# "\t" is meant to be an actual tab character.

greenTaxiDF = (
    spark
    .read
    .option("header", "true")
    .option("delimiter", "\t")
    # The * wildcard matches every GreenTaxis_ file in the folder
    .csv(r"C:\SparkCourse\DataFiles\Raw\GreenTaxis_*.csv")
)

greenTaxiDF.show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorId|lpep_pickup_datetime|lpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2|2022-10-01T06:08:...| 2022-10-01T06:21:...|            1.0|         2.47|       1.0|                 N|         256|         225|         1.0|       11.5|  0.5|    0.5|      2.5

### Read JSON File

In [13]:
# Read PaymentTypes JSON file
#
# The path is a raw string so the Windows backslashes are not parsed as
# (invalid) escape sequences.

paymentTypesDF = (
    spark
    .read
    .json(r"C:\SparkCourse\DataFiles\Raw\PaymentTypes.json")
)

# Alternative form using format()/load(), kept for reference:
#paymentTypesDF = (
#                     spark
#                       .read
#                       .format("json")
#                       .load(r"C:\SparkCourse\DataFiles\Raw\PaymentTypes.json")
#                 )

paymentTypesDF.show()

+-----------+-------------+
|PaymentType|PaymentTypeID|
+-----------+-------------+
|Credit Card|            1|
|       Cash|            2|
|  No Charge|            3|
|    Dispute|            4|
|    Unknown|            5|
|Voided Trip|            6|
+-----------+-------------+



### Schema Option 1 - No schema inference or definition

<i>Check for jobs</i>

In [14]:
# Read YellowTaxis csv file to create DataFrame
# (no schema inference and no schema definition: all columns come back as string)
#
# The path is a raw string so the Windows backslashes are not parsed as
# (invalid) escape sequences.

yellowTaxiDF = (
    spark
    .read
    .option("header", "true")
    .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
)

yellowTaxiDF.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)
 |-- airport_fee: string (nullable = true)



### Schema Option 2 - Infer schema

<i>Check for jobs</i>

In [15]:
# Read YellowTaxis csv file, and create DataFrame by inferring the schema
#
# The path is a raw string so the Windows backslashes are not parsed as
# (invalid) escape sequences.

yellowTaxiDF = (
    spark
    .read
    .option("header", "true")
    .option("inferSchema", "true")   # column types are derived from the data
    .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
)

# The recorded output now shows integer / double / timestamp columns
yellowTaxiDF.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



### Schema Option 3 - Define schema & apply

<i>Check for jobs</i>

In [16]:
# Create schema for Yellow Taxi Data
#
# One (column name, Spark type) pair per column, listed in the same order as
# the columns of the file; every field is declared nullable.
# NOTE(review): the two datetime columns are named lpep_* here, while the file
# header printed earlier shows tpep_*. Later cells reference the lpep_* names,
# so confirm before renaming.

yellowTaxiSchema = StructType([
    StructField(fieldName, fieldType, True)
    for fieldName, fieldType in [
        ("VendorId",              IntegerType()),
        ("lpep_pickup_datetime",  TimestampType()),
        ("lpep_dropoff_datetime", TimestampType()),
        ("passenger_count",       DoubleType()),
        ("trip_distance",         DoubleType()),
        ("RatecodeID",            DoubleType()),
        ("store_and_fwd_flag",    StringType()),
        ("PULocationID",          IntegerType()),
        ("DOLocationID",          IntegerType()),
        ("payment_type",          IntegerType()),
        ("fare_amount",           DoubleType()),
        ("extra",                 DoubleType()),
        ("mta_tax",               DoubleType()),
        ("tip_amount",            DoubleType()),
        ("tolls_amount",          DoubleType()),
        ("improvement_surcharge", DoubleType()),
        ("total_amount",          DoubleType()),
        ("congestion_surcharge",  DoubleType()),
        ("airport_fee",           DoubleType()),
    ]
])

In [17]:
# Read YellowTaxis csv file, and create DataFrame by applying the schema
#
# The path is a raw string so the Windows backslashes are not parsed as
# (invalid) escape sequences.

yellowTaxiDF = (
    spark
    .read
    .option("header", "true")
    .schema(yellowTaxiSchema)   # column names and types come from the schema defined above
    .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
)

yellowTaxiDF.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



### Define schema for JSON file

In [18]:
# Read JSON file
#
# The path is a raw string so the Windows backslashes are not parsed as
# (invalid) escape sequences.
# NOTE: the recorded output of this cell is an AnalysisException about the
# _corrupt_record column; the next cell reads the same file with the
# "multiline" option instead.

taxiBasesDF = (
    spark
    .read
    .json(r"C:\SparkCourse\DataFiles\Raw\TaxiBases.json")
)

taxiBasesDF.show()

AnalysisException: 
Since Spark 2.3, the queries from raw JSON/CSV files are disallowed when the
referenced columns only include the internal corrupt record column
(named _corrupt_record by default). For example:
spark.read.schema(schema).csv(file).filter($"_corrupt_record".isNotNull).count()
and spark.read.schema(schema).csv(file).select("_corrupt_record").show().
Instead, you can cache or save the parsed results and then send the same query.
For example, val df = spark.read.schema(schema).csv(file).cache() and then
df.filter($"_corrupt_record".isNotNull).count().
      

In [19]:
# Read multiline JSON file
#
# The path is a raw string so the Windows backslashes are not parsed as
# (invalid) escape sequences.

taxiBasesDF = (
    spark
    .read
    .option("multiline", "true")
    .json(r"C:\SparkCourse\DataFiles\Raw\TaxiBases.json")
)

# truncate=False prints the full cell contents instead of shortening them
taxiBasesDF.show(truncate=False)

+-----------------------------------------------------------+----------+--------------------------------------+------------------------------------------------+--------------+------------+----------------+--------+---------------------------+
|Address                                                    |Date      |Entity Name                           |GeoLocation                                     |License Number|SHL Endorsed|Telephone Number|Time    |Type of Base               |
+-----------------------------------------------------------+----------+--------------------------------------+------------------------------------------------+--------------+------------+----------------+--------+---------------------------+
|{636, NEW YORK, 10001, NY, WEST   28 STREET}               |08/15/2019|VIER-NY,LLC                           |{40.75273, (40.75273, -74.006408), -74.006408}  |B02865        |No          |6466657536      |18:03:31|BLACK CAR BASE             |
|{131, BRONX, 10468, NY, KIN

In [20]:
# Check the complex structure of JSON data
# The recorded output shows Address and GeoLocation as nested structs.

taxiBasesDF.printSchema()

root
 |-- Address: struct (nullable = true)
 |    |-- Building: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- Postcode: long (nullable = true)
 |    |-- State: string (nullable = true)
 |    |-- Street: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Entity Name: string (nullable = true)
 |-- GeoLocation: struct (nullable = true)
 |    |-- Latitude: double (nullable = true)
 |    |-- Location: string (nullable = true)
 |    |-- Longitude: double (nullable = true)
 |-- License Number: string (nullable = true)
 |-- SHL Endorsed: string (nullable = true)
 |-- Telephone Number: long (nullable = true)
 |-- Time: string (nullable = true)
 |-- Type of Base: string (nullable = true)



In [21]:
# Define schema for JSON data
#
# Flat fields first, followed by the two nested structs (Address, GeoLocation).
# Every field is nullable. Postcode, Latitude and Longitude are declared as
# strings here.

taxiBasesSchema = StructType([
    StructField("License Number", StringType(), nullable=True),
    StructField("Entity Name", StringType(), nullable=True),
    StructField("Telephone Number", LongType(), nullable=True),
    StructField("SHL Endorsed", StringType(), nullable=True),
    StructField("Type of Base", StringType(), nullable=True),

    # Nested struct: postal address of the base
    StructField(
        "Address",
        StructType([
            StructField("Building", StringType(), nullable=True),
            StructField("Street", StringType(), nullable=True),
            StructField("City", StringType(), nullable=True),
            StructField("State", StringType(), nullable=True),
            StructField("Postcode", StringType(), nullable=True),
        ]),
        nullable=True,
    ),

    # Nested struct: coordinates of the base
    StructField(
        "GeoLocation",
        StructType([
            StructField("Latitude", StringType(), nullable=True),
            StructField("Longitude", StringType(), nullable=True),
            StructField("Location", StringType(), nullable=True),
        ]),
        nullable=True,
    ),
])

In [22]:
# Read JSON file by applying nested schema
#
# The path is a raw string so the Windows backslashes are not parsed as
# (invalid) escape sequences.

taxiBasesDF = (
    spark
    .read
    .option("multiline", "true")
    .schema(taxiBasesSchema)   # columns appear in the order given by the schema
    .json(r"C:\SparkCourse\DataFiles\Raw\TaxiBases.json")
)

taxiBasesDF.show(truncate=False)

+--------------+--------------------------------------+----------------+------------+---------------------------+-----------------------------------------------------------+------------------------------------------------+
|License Number|Entity Name                           |Telephone Number|SHL Endorsed|Type of Base               |Address                                                    |GeoLocation                                     |
+--------------+--------------------------------------+----------------+------------+---------------------------+-----------------------------------------------------------+------------------------------------------------+
|B02865        |VIER-NY,LLC                           |6466657536      |No          |BLACK CAR BASE             |{636, WEST   28 STREET, NEW YORK, NY, 10001}               |{40.75273, -74.006408, (40.75273, -74.006408)}  |
|B02634        |VETERANS RADIO DISPATCHER CORP.       |7183647878      |No          |LIVERY BASE            

### Analyze Data

In [23]:
# Summary statistics (count, mean, stddev, min, max in the recorded output)
# for the two columns that the accuracy check below filters on.
yellowTaxiAnalyzedDF = yellowTaxiDF.describe("passenger_count", "trip_distance")

yellowTaxiAnalyzedDF.show()

+-------+------------------+-----------------+
|summary|   passenger_count|    trip_distance|
+-------+------------------+-----------------+
|  count|           3542392|          3675412|
|   mean|1.3846934500755421|6.206976298167358|
| stddev|0.9302303297407405|640.8236808320215|
|    min|               0.0|              0.0|
|    max|               9.0|        389678.46|
+-------+------------------+-----------------+



### Clean Data

#### 1. Accuracy Check: Filter inaccurate data

In [24]:
# Row count before the operation
print(f"Before operation = {yellowTaxiDF.count()}")

# Keep only trips with at least one passenger and a positive distance.
# The two predicates show both styles: a SQL string and a Column expression.
yellowTaxiDF = (
    yellowTaxiDF
    .where("passenger_count > 0")
    .filter(col("trip_distance") > 0.0)
)

# Row count after the operation
print(f"After operation = {yellowTaxiDF.count()}")

Before operation = 3675412
After operation = 3422296


#### 2.a. Completeness Check: Drop rows with nulls

In [25]:
# Row count before the operation
print(f"Before operation = {yellowTaxiDF.count()}")

# Drop rows using the 'all' mode of na.drop
# (per the PySpark docs, a row is removed only when every column is null)
yellowTaxiDF = yellowTaxiDF.na.drop('all')

# Row count after the operation
print(f"After operation = {yellowTaxiDF.count()}")

Before operation = 3422296
After operation = 3422296


#### 2.b. Completeness Check: Replace nulls with default values

In [26]:
# Default values used to replace nulls, keyed by column name.
# The keys are spelled exactly like the schema columns (RatecodeID, not
# RateCodeID) so the fill does not depend on case-insensitive column resolution.
defaultValueMap = {'payment_type': 5, 'RatecodeID': 1}


# Replace nulls in the listed columns with their default value
yellowTaxiDF = (
    yellowTaxiDF
    .na.fill(defaultValueMap)
)

#### 3. Uniqueness Check: Drop duplicate rows

In [27]:
# Row count before the operation
print(f"Before operation = {yellowTaxiDF.count()}")

# Remove rows that are exact duplicates across all columns
yellowTaxiDF = yellowTaxiDF.dropDuplicates()

# Row count after the operation
print(f"After operation = {yellowTaxiDF.count()}")

Before operation = 3422296
After operation = 3422295


#### 4. Timeliness Check: Remove records outside the bound

In [28]:
# Row count before the operation
print(f"Before operation = {yellowTaxiDF.count()}")

# Keep trips picked up on or after 2022-10-01 and dropped off before 2022-11-01
yellowTaxiDF = yellowTaxiDF.where(
    "lpep_pickup_datetime >= '2022-10-01' AND lpep_dropoff_datetime < '2022-11-01'"
)

# Row count after the operation
print(f"After operation = {yellowTaxiDF.count()}")

Before operation = 3422295
After operation = 3393897


### Chain all cleanup operations together

In [None]:
# Default values used to replace nulls, keyed by column name.
# The keys are spelled exactly like the schema columns (RatecodeID, not
# RateCodeID) so the fill does not depend on case-insensitive column resolution.
defaultValueMap = {'payment_type': 5, 'RatecodeID': 1}

# Read file
# (raw string: the Windows backslashes are not parsed as invalid escape sequences)
yellowTaxiDF = (
    spark
    .read
    .option("header", "true")
    .schema(yellowTaxiSchema)
    .csv(r"C:\SparkCourse\DataFiles\Raw\YellowTaxis_202210.csv")
)

# Cleanup data by applying data quality checks
yellowTaxiDF = (
    yellowTaxiDF

    # 1. Accuracy: at least one passenger and a positive distance
    .where("passenger_count > 0")
    .filter(col("trip_distance") > 0.0)

    # 2.a. Completeness: drop rows using the 'all' mode
    .na.drop('all')

    # 2.b. Completeness: replace nulls with default values
    .na.fill(defaultValueMap)

    # 3. Uniqueness: drop duplicate rows
    .dropDuplicates()

    # 4. Timeliness: keep only trips inside the 2022-10 bounds
    .where("lpep_pickup_datetime >= '2022-10-01' AND lpep_dropoff_datetime < '2022-11-01'")
)

# Display the count after operation
print("After operation = " + str(yellowTaxiDF.count()))

In [29]:
# Schema after cleanup: in the recorded output RatecodeID and payment_type
# (the two columns given default values) are reported as nullable = false.
yellowTaxiDF.printSchema()

root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = false)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = false)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



### Transform Data

#### 1. Select limited columns

In [30]:
# Select only limited columns. The select list mixes several ways of
# referring to a column: plain name strings, col(), column() and attribute
# access on the DataFrame.
yellowTaxiDF = (
    yellowTaxiDF
    .select(
        "VendorID",
        col("passenger_count").cast(IntegerType()),     # cast to integer
        column("trip_distance").alias("TripDistance"),  # rename while selecting
        yellowTaxiDF.lpep_pickup_datetime,
        "lpep_dropoff_datetime",
        "PUlocationID",
        "DOlocationID",
        "RatecodeID",
        "total_amount",
        "payment_type",
    )

    # Don't run, since airport_fee has not been selected above
    # .drop("airport_fee")
)

yellowTaxiDF.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- PUlocationID: integer (nullable = true)
 |-- DOlocationID: integer (nullable = true)
 |-- RatecodeID: double (nullable = false)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = false)



#### 2. Rename columns

In [31]:
# Old column name -> new column name, applied in this order
columnRenames = {
    "passenger_count": "PassengerCount",
    "lpep_pickup_datetime": "PickupTime",
    "lpep_dropoff_datetime": "DropTime",
    "PUlocationID": "PickupLocationId",
    "DOlocationID": "DropLocationId",
    "total_amount": "TotalAmount",
    "payment_type": "PaymentType",
}

# Apply the renames one after another
for oldName, newName in columnRenames.items():
    yellowTaxiDF = yellowTaxiDF.withColumnRenamed(oldName, newName)

yellowTaxiDF.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- PassengerCount: integer (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- RatecodeID: double (nullable = false)
 |-- TotalAmount: double (nullable = true)
 |-- PaymentType: integer (nullable = false)



#### 3.a. Create derived columns - TripYear, TripMonth, TripDay

In [32]:
# Create derived columns for year, month and day.
# Three ways of adding a column are shown: withColumn(), a SQL expression via
# expr(), and a function call with alias() inside select().
yellowTaxiDF = (
    yellowTaxiDF
    .withColumn("TripYear", year(col("PickupTime")))
    .select(
        "*",                                             # keep all existing columns
        expr("month(PickupTime) AS TripMonth"),
        dayofmonth(col("PickupTime")).alias("TripDay"),
    )
)

yellowTaxiDF.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- PassengerCount: integer (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- RatecodeID: double (nullable = false)
 |-- TotalAmount: double (nullable = true)
 |-- PaymentType: integer (nullable = false)
 |-- TripYear: integer (nullable = true)
 |-- TripMonth: integer (nullable = true)
 |-- TripDay: integer (nullable = true)



#### 3.b. Create derived column - TripTimeInMinutes

In [33]:
# Option 1
# Trip duration in minutes: difference of the two unix timestamps (seconds)
# divided by 60 and rounded. `round` here is the name in scope after the
# wildcard import from pyspark.sql.functions, so it operates on Columns.
yellowTaxiDF = yellowTaxiDF.withColumn(
    "TripTimeInMinutes",
    round(
        (unix_timestamp(col("DropTime")) - unix_timestamp(col("PickupTime"))) / 60
    ),
)

yellowTaxiDF.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- PassengerCount: integer (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- RatecodeID: double (nullable = false)
 |-- TotalAmount: double (nullable = true)
 |-- PaymentType: integer (nullable = false)
 |-- TripYear: integer (nullable = true)
 |-- TripMonth: integer (nullable = true)
 |-- TripDay: integer (nullable = true)
 |-- TripTimeInMinutes: double (nullable = true)



In [34]:
# Option 2
# Same calculation as Option 1, built up from named Column expressions.

# Duration in seconds
tripTimeInSecondsExpr = (
    unix_timestamp(col("DropTime"))
    - unix_timestamp(col("PickupTime"))
)

# Duration in minutes, rounded
tripTimeInMinutesExpr = round(tripTimeInSecondsExpr / 60)

yellowTaxiDF = yellowTaxiDF.withColumn("TripTimeInMinutes", tripTimeInMinutesExpr)

yellowTaxiDF.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- PassengerCount: integer (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- RatecodeID: double (nullable = false)
 |-- TotalAmount: double (nullable = true)
 |-- PaymentType: integer (nullable = false)
 |-- TripYear: integer (nullable = true)
 |-- TripMonth: integer (nullable = true)
 |-- TripDay: integer (nullable = true)
 |-- TripTimeInMinutes: double (nullable = true)



#### 3.c. Create derived column - TripType

In [35]:
# Conditional column: "SharedTrip" when RatecodeID equals 6, otherwise "SoloTrip"
tripTypeColumn = when(col("RatecodeID") == 6, "SharedTrip").otherwise("SoloTrip")

# Add TripType, then drop RatecodeID
yellowTaxiDF = (
    yellowTaxiDF
    .withColumn("TripType", tripTypeColumn)
    .drop("RatecodeID")
)

yellowTaxiDF.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- PassengerCount: integer (nullable = true)
 |-- TripDistance: double (nullable = true)
 |-- PickupTime: timestamp (nullable = true)
 |-- DropTime: timestamp (nullable = true)
 |-- PickupLocationId: integer (nullable = true)
 |-- DropLocationId: integer (nullable = true)
 |-- TotalAmount: double (nullable = true)
 |-- PaymentType: integer (nullable = false)
 |-- TripYear: integer (nullable = true)
 |-- TripMonth: integer (nullable = true)
 |-- TripDay: integer (nullable = true)
 |-- TripTimeInMinutes: double (nullable = true)
 |-- TripType: string (nullable = false)



### Check Execution Plans

In [36]:
# Print the execution plans for the transformed DataFrame; the recorded
# output of the "extended" mode starts with the parsed logical plan.
yellowTaxiDF.explain( mode = "extended" )

# Other modes - simple, codegen, cost, formatted

== Parsed Logical Plan ==
Project [VendorID#550, PassengerCount#1230, TripDistance#1218, PickupTime#1241, DropTime#1252, PickupLocationId#1263, DropLocationId#1274, TotalAmount#1285, PaymentType#1296, TripYear#1307, TripMonth#1319, TripDay#1320, TripTimeInMinutes#1349, TripType#1364]
+- Project [VendorID#550, PassengerCount#1230, TripDistance#1218, PickupTime#1241, DropTime#1252, PickupLocationId#1263, DropLocationId#1274, RatecodeID#1025, TotalAmount#1285, PaymentType#1296, TripYear#1307, TripMonth#1319, TripDay#1320, TripTimeInMinutes#1349, CASE WHEN (RatecodeID#1025 = cast(6 as double)) THEN SharedTrip ELSE SoloTrip END AS TripType#1364]
   +- Project [VendorID#550, PassengerCount#1230, TripDistance#1218, PickupTime#1241, DropTime#1252, PickupLocationId#1263, DropLocationId#1274, RatecodeID#1025, TotalAmount#1285, PaymentType#1296, TripYear#1307, TripMonth#1319, TripDay#1320, round((cast((unix_timestamp(DropTime#1252, yyyy-MM-dd HH:mm:ss, Some(Asia/Calcutta), false) - unix_timestamp

In [None]:
# Not covered in the video: run this action and then look at the resulting plan
# in Spark (presumably the Spark UI - confirm against the course material).
# n=20 / truncate=True are show()'s defaults, spelled out for clarity.
yellowTaxiDF.show(n=20, truncate=True)

### Read JSON file using schema

In [None]:
# Explicit schema for the taxi bases JSON file: five flat fields followed by two
# nested objects (Address and GeoLocation). Every field is declared nullable.
taxiBasesSchema = StructType([
    StructField("License Number",   StringType(), True),
    StructField("Entity Name",      StringType(), True),
    StructField("Telephone Number", LongType(),   True),
    StructField("SHL Endorsed",     StringType(), True),
    StructField("Type of Base",     StringType(), True),

    # Nested object: all address parts are read as strings
    StructField(
        "Address",
        StructType([
            StructField(addressPart, StringType(), True)
            for addressPart in ("Building", "Street", "City", "State", "Postcode")
        ]),
        True,
    ),

    # Nested object: coordinates are read as strings as well
    StructField(
        "GeoLocation",
        StructType([
            StructField(geoPart, StringType(), True)
            for geoPart in ("Latitude", "Longitude", "Location")
        ]),
        True,
    ),
])

In [39]:
# Load the taxi bases JSON file using the explicit schema instead of inference.
# NOTE(review): the location is an absolute Windows path; adjust it to wherever
# the course data files live in your environment.
taxiBasesDF = (
    spark
        .read

        # Allow a single JSON record to span multiple lines
        .option("multiline", "true")

        .schema(taxiBasesSchema)

        # Raw string: in a normal literal "\S", "\D", "\R" and "\T" are invalid
        # escape sequences, which Python reports with a warning.
        .json(r"C:\SparkCourse\DataFiles\Raw\TaxiBases.json")
)

# truncate=False prints full cell contents, including the nested structs
taxiBasesDF.show(truncate=False)

+--------------+--------------------------------------+----------------+------------+---------------------------+-----------------------------------------------------------+------------------------------------------------+
|License Number|Entity Name                           |Telephone Number|SHL Endorsed|Type of Base               |Address                                                    |GeoLocation                                     |
+--------------+--------------------------------------+----------------+------------+---------------------------+-----------------------------------------------------------+------------------------------------------------+
|B02865        |VIER-NY,LLC                           |6466657536      |No          |BLACK CAR BASE             |{636, WEST   28 STREET, NEW YORK, NY, 10001}               |{40.75273, -74.006408, (40.75273, -74.006408)}  |
|B02634        |VETERANS RADIO DISPATCHER CORP.       |7183647878      |No          |LIVERY BASE            

### Extract nested fields from JSON

In [40]:
# Flatten taxiBasesDF: pick top-level and nested fields (dot notation reaches
# into the Address / GeoLocation structs) and give each one a flat column name.
taxiBasesFlatDF = taxiBasesDF.select(
    *[
        col(sourceField).alias(flatName)
        for sourceField, flatName in [
            ("License Number",        "BaseLicenseNumber"),
            ("Entity Name",           "EntityName"),

            ("Address.Building",      "AddressBuilding"),
            ("Address.Street",        "AddressStreet"),
            ("Address.City",          "AddressCity"),
            ("Address.State",         "AddressState"),
            ("Address.Postcode",      "AddressPostCode"),

            ("GeoLocation.Latitude",  "GeoLatitude"),
            ("GeoLocation.Longitude", "GeoLongitude"),
        ]
    ]
)

taxiBasesFlatDF.show()

+-----------------+--------------------+---------------+--------------------+-------------+------------+---------------+-----------+------------+
|BaseLicenseNumber|          EntityName|AddressBuilding|       AddressStreet|  AddressCity|AddressState|AddressPostCode|GeoLatitude|GeoLongitude|
+-----------------+--------------------+---------------+--------------------+-------------+------------+---------------+-----------+------------+
|           B02865|         VIER-NY,LLC|            636|    WEST   28 STREET|     NEW YORK|          NY|          10001|   40.75273|  -74.006408|
|           B02634|VETERANS RADIO DI...|            131|    KINGSBRIDGE ROAD|        BRONX|          NY|          10468|   40.86927|   -73.90281|
|           B80094|      ALPHA VAN LINE|         115-54|          238 STREET|       ELMONT|          NY|          11003|  40.693473|  -73.724446|
|           B02677|A.T.B. CAR AND LI...|            866|     NEW LOTS AVENUE|     BROOKLYN|          NY|          11208|  40

### Aggregate Data

In [41]:
# Per (pickup, drop) location pair: average trip duration and total amount,
# listed from the highest PickupLocationId downwards.
yellowTaxiDFReport = (
    yellowTaxiDF
    .groupBy("PickupLocationId", "DropLocationId")
    .agg(
        avg("TripTimeInMinutes").alias("AvgTripTime"),
        # sum here is pyspark.sql.functions.sum (brought in by the star import),
        # not Python's builtin sum
        sum("TotalAmount").alias("SumAmount"),
    )
    .orderBy(desc("PickupLocationId"))
)

yellowTaxiDFReport.show()

+----------------+--------------+------------------+------------------+
|PickupLocationId|DropLocationId|       AvgTripTime|         SumAmount|
+----------------+--------------+------------------+------------------+
|             265|           236|              68.0|            366.51|
|             265|            91|              33.0|              49.8|
|             265|            88|              66.0|              70.0|
|             265|            93|43.333333333333336|            167.55|
|             265|           264| 4.592592592592593|2377.5399999999995|
|             265|           163|              49.0|            568.46|
|             265|           216|              66.0|            126.85|
|             265|            64|               5.0|               9.3|
|             265|           134|              35.0|              30.0|
|             265|            19|               5.0|              60.3|
|             265|           126|              53.0|            

### Exercises

1. Based on PickupTime, add a new column containing the name of the day of the week (Monday to Sunday).

2. Based on PickupTime, add a new column containing the name of the month (January to December).

3. Based on PickupTime, add a new column containing the last day of that month.

In [None]:
# Exercise 1: Answer



In [None]:
# Exercise 2: Answer



In [None]:
# Exercise 3: Answer



In [42]:
# Bring the DataFrame down to 4 partitions before writing it out
yellowTaxiDF = yellowTaxiDF.coalesce(numPartitions=4)

# Last expression of the cell: shows the partition count as the cell result
yellowTaxiDF.rdd.getNumPartitions()

4

### Save data in CSV format to storage

In [43]:
# Write yellowTaxiDF to storage as CSV with a header row.
# NOTE(review): the target is an absolute Windows path; adjust for your environment.
(
    yellowTaxiDF
            .write

            .option("header", "true")

            # PickupTime / DropTime are timestamp columns, so the pattern must be
            # supplied through timestampFormat; dateFormat only applies to date columns.
            .option("timestampFormat", "yyyy-MM-dd HH:mm:ss.S")

            .mode("overwrite")    # Options - Append, ErrorIfExists, Ignore, Overwrite

            # Raw string: "\S", "\D", "\O" and "\Y" are invalid escape sequences
            # in a normal string literal.
            .csv(r"C:\SparkCourse\DataFiles\Output\YellowTaxisOutput.csv")
)

### Save data in Parquet format to storage

In [44]:
# Write yellowTaxiDF to storage as Parquet.
# The CSV-only options "header" and "dateFormat" were removed: Parquet stores the
# schema and typed values itself, so those options had no effect on this write.
# NOTE(review): the target is an absolute Windows path; adjust for your environment.
(
    yellowTaxiDF
            .write

            .mode("overwrite")

            # Raw string: "\S", "\D", "\O" and "\Y" are invalid escape sequences
            # in a normal string literal.
            .parquet(r"C:\SparkCourse\DataFiles\Output\YellowTaxisOutput.parquet")
)

### Save partitioned data in Parquet format to storage

In [45]:
# Write yellowTaxiDF to storage as Parquet, partitioned by vendor.
# The CSV-only options "header" and "dateFormat" were removed: Parquet stores the
# schema and typed values itself, so those options had no effect on this write.
# NOTE(review): the target is an absolute Windows path; adjust for your environment.
(
    yellowTaxiDF
            .write

            # One sub-directory per distinct vendor value
            .partitionBy("VendorId")

            .mode("overwrite")

            # Raw string: "\S", "\D", "\O" and "\Y" are invalid escape sequences
            # in a normal string literal.
            .parquet(r"C:\SparkCourse\DataFiles\Output\YellowTaxisPartitionedOutput.parquet")
)

### Check performance between non-partitioned & partitioned datasets

#### 1. Performance check: Run query on non-partitioned dataset

In [46]:
# Run query on non-partitioned dataset:
# average TotalAmount per pickup location for vendor 1.
# NOTE(review): the source is an absolute Windows path; adjust for your environment.

outputDF = (
                spark
                    .read

                    # Raw string: "\S", "\D", "\O" and "\Y" are invalid escape
                    # sequences in a normal string literal.
                    .parquet(r"C:\SparkCourse\DataFiles\Output\YellowTaxisOutput.parquet")

                    .where("VendorId = 1")

                    .groupBy("PickupLocationId")
                    .agg(avg("TotalAmount"))
           )

outputDF.show()

+----------------+------------------+
|PickupLocationId|  avg(TotalAmount)|
+----------------+------------------+
|             148|19.995832255282632|
|             243| 25.97614035087719|
|              31| 26.21666666666667|
|             137| 17.32748269082856|
|              85| 27.50609756097561|
|             251|            61.825|
|              65|25.368939828080265|
|             255|22.516063492063477|
|              53| 39.64958333333333|
|             133|33.552083333333336|
|              78|              34.1|
|             108| 49.70862068965516|
|             155|            38.944|
|             211| 18.86426906623371|
|              34|28.052105263157895|
|             193| 19.01872246696034|
|             101|        36.5390625|
|             126| 26.52592592592593|
|              81| 37.83181818181818|
|             210| 46.57599999999999|
+----------------+------------------+
only showing top 20 rows



#### 2. Performance check: Run query on partitioned dataset

In [47]:
# Run the same query on the partitioned dataset:
# average TotalAmount per pickup location for vendor 1.
# The filter is on the column the dataset was partitioned by.
# NOTE(review): the source is an absolute Windows path; adjust for your environment.
outputDF = (
                spark
                    .read

                    # Raw string: "\S", "\D", "\O" and "\Y" are invalid escape
                    # sequences in a normal string literal.
                    .parquet(r"C:\SparkCourse\DataFiles\Output\YellowTaxisPartitionedOutput.parquet")

                    .where("VendorId = 1")

                    .groupBy("PickupLocationId")
                    .agg(avg("TotalAmount"))
           )

outputDF.show()

+----------------+------------------+
|PickupLocationId|  avg(TotalAmount)|
+----------------+------------------+
|             148|19.995832255282664|
|             243| 25.97614035087719|
|              31| 26.21666666666667|
|             137| 17.32748269082855|
|              85| 27.50609756097561|
|             251|            61.825|
|              65|25.368939828080272|
|              53| 39.64958333333333|
|             255|22.516063492063484|
|             133|33.552083333333336|
|              78|              34.1|
|             108| 49.70862068965516|
|             155|            38.944|
|             211| 18.86426906623374|
|             193| 19.01872246696034|
|              34|28.052105263157895|
|             101|        36.5390625|
|             126| 26.52592592592593|
|              81| 37.83181818181818|
|              28| 35.15681818181818|
+----------------+------------------+
only showing top 20 rows

