This file is for grouping and merging all yellow, green, fhv and fhvhv data.

This file calculates the sum of passengers picked up and dropped off each hour in each location for each transport type and adds the two together. 

It then merges the per-file results and sums them again by hour and location. The aggregation is done in two passes (once per file, then once more on the merged results) because there is not enough RAM on my PC to load all the datasets at the same time.


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [2]:
# Cleaned trip files to process: one file each for yellow, green and fhv,
# followed by the 20 numbered fhvhv files (suffixes 0-19).
parquet_files = [
    "all_yellow_tripdata_cleaned.parquet",
    "all_green_tripdata_cleaned.parquet",
    "all_fhv_tripdata_cleaned.parquet",
    *(f"all_fhvhv_tripdata_cleaned_{part}.parquet" for part in range(20)),
]



In [3]:

def _hourly_passenger_totals(trips, time_col, loc_col):
    """Sum ``passenger_count`` per (hour, location) for one side of the trip.

    Parameters
    ----------
    trips : pandas.DataFrame
        Trip records whose ``time_col`` has already been floored to the hour.
    time_col : str
        Name of the timestamp column to group on (pickup or dropoff).
    loc_col : str
        Name of the location column to group on (pickup or dropoff).

    Returns
    -------
    pandas.DataFrame
        Columns ``datetime``, ``location`` and ``passenger_count``.
    """
    return (
        trips.groupby([time_col, loc_col])['passenger_count']
        .sum()
        .reset_index()
        .rename(columns={time_col: "datetime", loc_col: "location"})
    )


for parquet_file in parquet_files:
    print("grouping and merging", parquet_file)

    # Convert straight to pandas without keeping a name bound to the Arrow
    # table, so the table can be freed as soon as the conversion is done
    # (memory is the limiting factor for this notebook).
    df = pq.read_table(parquet_file).to_pandas()

    print(df.columns)

    # Truncate both timestamps to the start of their hour so that trips can be
    # bucketed per hour.
    df['pickup_datetime'] = df['pickup_datetime'].dt.floor(freq='h')
    df['dropoff_datetime'] = df['dropoff_datetime'].dt.floor(freq='h')

    # Get sum of passengers by hour by pickup location
    df_pickup = _hourly_passenger_totals(df, 'pickup_datetime', 'pickup_loc')

    # Get sum of passengers by hour by dropoff location
    df_dropoff = _hourly_passenger_totals(df, 'dropoff_datetime', 'dropoff_loc')

    # Stack the pickup and dropoff totals. They are NOT added together here;
    # the later merge step sums rows that share the same (datetime, location).
    # The index is reset so the file does not carry the duplicated labels that
    # concat would otherwise leave behind.
    merged_df = (
        pd.concat([df_pickup, df_dropoff])
        .dropna()
        .astype({'location': 'int64'})
        .reset_index(drop=True)
    )

    # parquet_file[:-8] strips the trailing ".parquet" (8 characters).
    merged_df.to_parquet(f"{parquet_file[:-8]}_grouped.parquet")
    print(merged_df.shape)



grouping and merging all_yellow_tripdata_cleaned.parquet
Index(['pickup_datetime', 'dropoff_datetime', 'pickup_loc', 'dropoff_loc',
       'passenger_count'],
      dtype='object')
(7386549, 3)
grouping and merging all_green_tripdata_cleaned.parquet
Index(['pickup_datetime', 'dropoff_datetime', 'pickup_loc', 'dropoff_loc',
       'passenger_count'],
      dtype='object')
(2015761, 3)
grouping and merging all_fhv_tripdata_cleaned.parquet
Index(['pickup_datetime', 'dropoff_datetime', 'pickup_loc', 'dropoff_loc',
       'passenger_count'],
      dtype='object')
(9054343, 3)
grouping and merging all_fhvhv_tripdata_cleaned_0.parquet
Index(['pickup_datetime', 'dropoff_datetime', 'pickup_loc', 'dropoff_loc',
       'passenger_count'],
      dtype='object')
(1013577, 3)
grouping and merging all_fhvhv_tripdata_cleaned_1.parquet
Index(['pickup_datetime', 'dropoff_datetime', 'pickup_loc', 'dropoff_loc',
       'passenger_count'],
      dtype='object')
(884805, 3)
grouping and merging all_fhvhv_tr

In [4]:
# Per-file outputs written by the grouping step: the ".parquet" suffix
# (8 characters) is replaced with "_grouped.parquet".
grouped_files = [f"{parquet_file[:-8]}_grouped.parquet" for parquet_file in parquet_files]

# Running (datetime, location) totals across all files. Starts as None rather
# than an empty DataFrame so that no empty placeholder frame is ever passed to
# pd.concat.
combined_df = None
for grouped_file in grouped_files:
    # No name is kept for the Arrow table, so it can be released right after
    # the conversion to pandas.
    df = pq.read_table(grouped_file).to_pandas().dropna()
    print(df.dtypes)

    # Fold this file into the running totals and re-aggregate immediately, so
    # the accumulator never holds more than one row per (datetime, location)
    # plus the rows of the file currently being merged.
    combined_df = (
        (df if combined_df is None else pd.concat([combined_df, df]))
        .groupby(['datetime', 'location'])['passenger_count']
        .sum()
        .reset_index()
    )
    print(combined_df.head())
    print(combined_df.shape)

if combined_df is None:
    # No input files at all: fall back to an empty frame, as before.
    combined_df = pd.DataFrame()
    
    


datetime           datetime64[us]
location                    int64
passenger_count             int64
dtype: object
    datetime  location  passenger_count
0 2021-01-01         4               11
1 2021-01-01         7                4
2 2021-01-01        11                1
3 2021-01-01        13                5
4 2021-01-01        17                3
(4828648, 3)
datetime           datetime64[us]
location                    int64
passenger_count             int64
dtype: object
    datetime  location  passenger_count
0 2021-01-01         4               11
1 2021-01-01         7                9
2 2021-01-01        11                1
3 2021-01-01        13                5
4 2021-01-01        17                4
(5012259, 3)
datetime           datetime64[us]
location                    int64
passenger_count             int64
dtype: object
    datetime  location  passenger_count
0 2021-01-01         3                4
1 2021-01-01         4               11
2 2021-01-01         7    

In [5]:
# Sanity check: show any rows that are exact duplicates of an earlier row.
combined_df.loc[combined_df.duplicated()]

Unnamed: 0,datetime,location,passenger_count


In [6]:
# Number of distinct hourly timestamps in the combined data.
combined_df["datetime"].nunique()

29929

In [7]:
# Number of distinct location IDs in the combined data.
combined_df["location"].nunique()

265

In [8]:
# Save the combined hourly totals as parquet, without the DataFrame index.
combined_df.to_parquet(
    "all_tlc_data_cleaned.parquet",
    engine="pyarrow",
    index=False,
)

In [9]:
# Also export the combined hourly totals as CSV, without the DataFrame index.
combined_df.to_csv(
    "all_tlc_data_cleaned.csv",
    index=False,
)