In [84]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session that runs every job in this notebook.
session_options = {
    # Evaluate DataFrames eagerly in the REPL / notebook
    "spark.sql.repl.eagerEval.enabled": True,
    # Cache parquet metadata
    "spark.sql.parquet.cacheMetadata": "true",
    # Use UTC as the session time zone
    "spark.sql.session.timeZone": "Etc/UTC",
}

session_builder = SparkSession.builder.appName("MAST30034 Tutorial 1")
for option_key, option_value in session_options.items():
    session_builder = session_builder.config(option_key, option_value)

spark = session_builder.getOrCreate()

In [85]:
# Load every parquet file found under the landing directory into one DataFrame
sdf_all = spark.read.format('parquet').load('../data/landing/tlc_data/')

In [86]:
# Print each column's name, data type and nullability as read from the parquet files
sdf_all.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)


In [87]:
# Preview the first five rows to eyeball the values in each column
sdf_all.show(n=5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2024-03-01 00:18:51|  2024-03-01 00:23:45|              0|          1.3|         1|                 N|         142|         239|           1|        8.6|  3.5|    0.5|       2.

In [88]:
from pyspark.sql import functions as F

# Count the missing (null) values of every column in a single aggregation job,
# rather than launching one filter(...).count() scan of the data per column.
# F.when(...) yields 1 for a null cell and null otherwise; F.count only counts
# non-null values, so each aggregate is the number of nulls in that column.
null_count_exprs = [
    F.count(F.when(sdf_all[name].isNull(), 1)).alias(name)
    for name in sdf_all.columns
]

# The aggregation returns exactly one row; turn it into {column name: null count}
missing_counts = sdf_all.agg(*null_count_exprs).first().asDict()

# Display the counts
for column, count in missing_counts.items():
    print(f"Column {column} has {count} missing values.")

                                                                                

Column VendorID has 0 missing values.
Column tpep_pickup_datetime has 0 missing values.
Column tpep_dropoff_datetime has 0 missing values.
Column passenger_count has 1532766 missing values.
Column trip_distance has 0 missing values.
Column RatecodeID has 1532766 missing values.
Column store_and_fwd_flag has 1532766 missing values.
Column PULocationID has 0 missing values.
Column DOLocationID has 0 missing values.
Column payment_type has 0 missing values.
Column fare_amount has 0 missing values.
Column extra has 0 missing values.
Column mta_tax has 0 missing values.
Column tip_amount has 0 missing values.
Column tolls_amount has 0 missing values.
Column improvement_surcharge has 0 missing values.
Column total_amount has 0 missing values.
Column congestion_surcharge has 1532766 missing values.
Column Airport_fee has 1532766 missing values.


In [89]:
# Record the starting row count; it is the baseline for the drop report
num_rows = sdf_all.count()

# Drop rows containing nulls (dropna with its default arguments), then recount
sdf_all = sdf_all.dropna()
num_rows_after_drop = sdf_all.count()

rows_dropped = num_rows - num_rows_after_drop
pct_dropped = rows_dropped / num_rows * 100

report_lines = [
    "Drop Report",
    f"Number of rows before dropping: {num_rows}",
    f"Number of rows after dropping: {num_rows_after_drop}",
    f"Number of rows dropped: {rows_dropped}",
    f"Percentage of rows dropped: {pct_dropped:.2f}%",
]
print("\n".join(report_lines))



Drop Report
Number of rows before dropping: 28371384
Number of rows after dropping: 26838618
Number of rows dropped: 1532766
Percentage of rows dropped: 5.40%


                                                                                

In [90]:
# Remove invalid trips: keep only rows with at least one passenger,
# a positive fare and a positive trip distance
has_passengers = F.col('passenger_count') > 0
has_positive_fare = F.col('fare_amount') > 0
has_positive_distance = F.col('trip_distance') > 0

sdf_all = sdf_all.where(has_passengers & has_positive_fare & has_positive_distance)
num_rows_after_drop = sdf_all.count()

# num_rows still holds the count taken before any rows were removed,
# so the figures below are cumulative across all cleaning steps so far
rows_dropped = num_rows - num_rows_after_drop
pct_dropped = rows_dropped / num_rows * 100

report_lines = [
    "Drop Report",
    f"Number of rows before dropping: {num_rows}",
    f"Number of rows after dropping: {num_rows_after_drop}",
    f"Number of rows dropped: {rows_dropped}",
    f"Cum. Percentage of rows dropped: {pct_dropped:.2f}%",
]
print("\n".join(report_lines))

                                                                                

Drop Report
Number of rows before dropping: 28371384
Number of rows after dropping: 25838880
Number of rows dropped: 2532504
Cum. Percentage of rows dropped: 8.93%


In [ ]:
# Sanity check on the cleaned data.
# The previous cell already filtered sdf_all down to rows with a positive
# passenger count, fare and trip distance, so repeating that filter and its
# report here would change nothing. Instead, verify the invariant explicitly:
# no row with a non-positive value in any of those columns may remain.
num_invalid_rows = sdf_all.filter(
    (F.col('passenger_count') <= 0)
    | (F.col('fare_amount') <= 0)
    | (F.col('trip_distance') <= 0)
).count()

assert num_invalid_rows == 0, f"{num_invalid_rows} invalid rows remain after filtering"

In [91]:
# Persist the cleaned trips as parquet, replacing any output left by an earlier run
sdf_all.write.parquet('../data/raw/tlc_data', mode='overwrite')

24/08/07 23:10:16 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
24/08/07 23:10:28 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                