<a href="https://colab.research.google.com/github/felolivee/DVA-NYC_Congestion/blob/main/DVA_Project_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#download from Kaggle
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d jeffsinsel/nyc-fhvhv-data

Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/jeffsinsel/nyc-fhvhv-data
License(s): CC0-1.0
Downloading nyc-fhvhv-data.zip to /content
100% 17.8G/17.8G [03:34<00:00, 106MB/s]
100% 17.8G/17.8G [03:34<00:00, 89.2MB/s]


In [2]:
#unzip files in nyc-fhvhv-data from Kaggle
! unzip nyc-fhvhv-data.zip

Archive:  nyc-fhvhv-data.zip
  inflating: data_dictionary_trip_records_hvfhs.pdf  
  inflating: fhvhv_tripdata_2019-02.parquet  
  inflating: fhvhv_tripdata_2019-03.parquet  
  inflating: fhvhv_tripdata_2019-04.parquet  
  inflating: fhvhv_tripdata_2019-05.parquet  
  inflating: fhvhv_tripdata_2019-06.parquet  
  inflating: fhvhv_tripdata_2019-07.parquet  
  inflating: fhvhv_tripdata_2019-08.parquet  
  inflating: fhvhv_tripdata_2019-09.parquet  
  inflating: fhvhv_tripdata_2019-10.parquet  
  inflating: fhvhv_tripdata_2019-11.parquet  
  inflating: fhvhv_tripdata_2019-12.parquet  
  inflating: fhvhv_tripdata_2020-01.parquet  
  inflating: fhvhv_tripdata_2020-02.parquet  
  inflating: fhvhv_tripdata_2020-03.parquet  
  inflating: fhvhv_tripdata_2020-04.parquet  
  inflating: fhvhv_tripdata_2020-05.parquet  
  inflating: fhvhv_tripdata_2020-06.parquet  
  inflating: fhvhv_tripdata_2020-07.parquet  
  inflating: fhvhv_tripdata_2020-08.parquet  
  inflating: fhvhv_tripdata_2020-09.parquet

In [3]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
import glob

# initialize spark
spark = SparkSession.builder.appName("NYC_Rides").getOrCreate()

# list of all parquet files from content folder
parquet_files = glob.glob('/content/*.parquet')

removed_summary = {}
combined_df = None

for file in parquet_files:
    # read parquet files get features
    df = spark.read.parquet(file).select(
        "Pickup_datetime", "DropOff_datetime", "PULocationID",
        "DOLocationID", "base_passenger_fare", "trip_miles"
    )
    initial_count = df.count()

    # clean data
    df_clean = df.filter(
        (F.col("Hvfhs_license_num").isNotNull()) &
        (F.col("Pickup_datetime").isNotNull()) &
        (F.col("DropOff_datetime").isNotNull()) &
        (F.col("PULocationID").isNotNull()) &
        (F.col("DOLocationID").isNotNull()) &
        (F.col("base_passenger_fare").isNotNull()) &
        (F.col("trip_miles").isNotNull()) &
        (F.col("base_passenger_fare") > 0) &
        (F.col("trip_miles") > 0)
    )
    clean_count = df_clean.count()
    removed_summary[file] = initial_count - clean_count

    # combine clean DFs into one big DF
    if combined_df is None:
        combined_df = df_clean
    else:
        combined_df = combined_df.union(df_clean)

# check how many of which file were removed
print("Removal Summary:", removed_summary)

Removal Summary: {'/content/fhvhv_tripdata_2020-12.parquet': 19708, '/content/fhvhv_tripdata_2021-03.parquet': 31595, '/content/fhvhv_tripdata_2020-09.parquet': 20484, '/content/fhvhv_tripdata_2022-04.parquet': 23523, '/content/fhvhv_tripdata_2020-11.parquet': 17521, '/content/fhvhv_tripdata_2019-10.parquet': 66056, '/content/fhvhv_tripdata_2019-04.parquet': 89449, '/content/fhvhv_tripdata_2020-06.parquet': 14997, '/content/fhvhv_tripdata_2022-06.parquet': 29895, '/content/fhvhv_tripdata_2021-11.parquet': 42494, '/content/fhvhv_tripdata_2021-01.parquet': 17927, '/content/fhvhv_tripdata_2019-07.parquet': 729452, '/content/fhvhv_tripdata_2021-04.parquet': 41745, '/content/fhvhv_tripdata_2021-02.parquet': 31628, '/content/fhvhv_tripdata_2021-10.parquet': 40021, '/content/fhvhv_tripdata_2022-01.parquet': 28014, '/content/fhvhv_tripdata_2019-06.parquet': 58065, '/content/fhvhv_tripdata_2021-06.parquet': 55170, '/content/fhvhv_tripdata_2022-02.parquet': 29331, '/content/fhvhv_tripdata_2020-0

In [4]:
# calculate how many total rows of data were removed
tot = 0
for key,val in removed_summary.items():
  tot += int(val)

count = combined_df.count()
percent_removed = tot/count * 100

print("{}% was removed from a total of {} rides in 46 months".format(percent_removed, count))

0.7151187129558313% was removed from a total of 739995179 rides in 46 months


In [5]:
# show schema
combined_df.printSchema()

root
 |-- Pickup_datetime: timestamp_ntz (nullable = true)
 |-- DropOff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- base_passenger_fare: double (nullable = true)
 |-- trip_miles: double (nullable = true)

