In [1]:
from pyspark.sql import SparkSession

# Create a spark session (which will run spark jobs)
spark = (
    SparkSession.builder.appName("MAST30034 Tutorial 1")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

24/08/18 10:24:13 WARN Utils: Your hostname, coldbrew.local resolves to a loopback address: 127.0.0.1; using 172.16.119.23 instead (on interface en0)
24/08/18 10:24:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/08/18 10:24:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
sdf_all = spark.read.parquet('../data/landing/tlc_data/')

                                                                                

In [3]:
num_rows = sdf_all.count()
print("Number of rows",num_rows)
sdf_all.printSchema()



Number of rows 121257739
root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- originating_base_num: string (nullable = true)
 |-- request_datetime: timestamp_ntz (nullable = true)
 |-- on_scene_datetime: timestamp_ntz (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- tolls: double (nullable = true)
 |-- bcf: double (nullable = true)
 |-- sales_tax: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)
 |-- tips: double (nullable = true)
 |-- driver_pay: double (nullable = true)
 |-- shared_request_flag: string (nullable = true)
 |-- shared_match_flag: string (nullab

                                                                                

In [4]:
sdf_all.show(5)

                                                                                

+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------------+-----------------+------------------+----------------+--------------+
|hvfhs_license_num|dispatching_base_num|originating_base_num|   request_datetime|  on_scene_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+--

In [5]:
from pyspark.sql import functions as F

# Create a dictionary to hold the counts of missing values (written by ChatGPT)
missing_counts = {col: sdf_all.filter(sdf_all[col].isNull()).count() for col in sdf_all.columns}

# Display the counts
for column, count in missing_counts.items():
    print(f"Column {column} has {count} missing values.")



Column hvfhs_license_num has 0 missing values.
Column dispatching_base_num has 0 missing values.
Column originating_base_num has 32186718 missing values.
Column request_datetime has 0 missing values.
Column on_scene_datetime has 32186330 missing values.
Column pickup_datetime has 0 missing values.
Column dropoff_datetime has 0 missing values.
Column PULocationID has 0 missing values.
Column DOLocationID has 0 missing values.
Column trip_miles has 0 missing values.
Column trip_time has 0 missing values.
Column base_passenger_fare has 0 missing values.
Column tolls has 0 missing values.
Column bcf has 0 missing values.
Column sales_tax has 0 missing values.
Column congestion_surcharge has 0 missing values.
Column airport_fee has 0 missing values.
Column tips has 0 missing values.
Column driver_pay has 0 missing values.
Column shared_request_flag has 0 missing values.
Column shared_match_flag has 0 missing values.
Column access_a_ride_flag has 0 missing values.
Column wav_request_flag has

                                                                                

In [6]:
sdf_all = sdf_all.drop('originating_base_num', 'on_scene_datetime')

In [7]:
# Identifying invalid data
num_rows_after_drop = sdf_all.filter((F.col('trip_miles') > 0) & (F.col('trip_time') > 0) & (F.col('tips') >= 0)).count()
print("Number of rows dropping",num_rows-num_rows_after_drop)
print("Percentage of rows dropping", (num_rows-num_rows_after_drop)/num_rows)



Number of rows dropping 17959
Percentage of rows dropping 0.00014810601078418592


                                                                                

In [8]:
sdf_all.write.mode('overwrite').parquet('../data/raw/tlc_data')

                                                                                

In [9]:
pluto_all = spark.read.parquet('../data/landing/pluto_data/')
pluto_all.printSchema()
print("Number of rows",pluto_all.count())
pluto_all.show(5)

root
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- assesstot: string (nullable = true)


                                                                                

Number of rows 859012
+----------+-----------+--------------+
|  latitude|  longitude|     assesstot|
+----------+-----------+--------------+
|40.8993514|-73.8669496| 7792200.00000|
|40.8878889|-73.8594210| 4099950.00000|
|40.9023086|-73.8542356| 3050100.00000|
|40.8957548|-73.8461911|24497847.00000|
|40.6690273|-73.7577478|24963300.00000|
+----------+-----------+--------------+
only showing top 5 rows


In [10]:
# Create a dictionary to hold the counts of missing values (written by ChatGPT)
missing_counts = {col: pluto_all.filter(pluto_all[col].isNull()).count() for col in pluto_all.columns}

# Display the counts
for column, count in missing_counts.items():
    print(f"Column {column} has {count} missing values.")

Column latitude has 1635 missing values.
Column longitude has 1635 missing values.
Column assesstot has 327 missing values.


In [11]:
pluto_all = pluto_all.dropna()
print("Number of rows",pluto_all.count())

Number of rows 857051


In [12]:
from pyspark.sql.types import DoubleType

# Convert the columns to the correct data type
pluto_all = pluto_all.withColumn("latitude", pluto_all["latitude"].cast(DoubleType()))
pluto_all = pluto_all.withColumn("longitude", pluto_all["longitude"].cast(DoubleType()))
pluto_all = pluto_all.withColumn("assesstot", pluto_all["assesstot"].cast(DoubleType()))

In [13]:
# Identifying invalid data
pluto_all = pluto_all.filter((F.col('assesstot') > 0))
print("Number of rows",pluto_all.count())

Number of rows 853692


                                                                                

In [14]:
pluto_all.write.mode('overwrite').parquet('../data/raw/pluto_data')

                                                                                