In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [2]:
parquet_file = "all_yellow_tripdata_cleaned.parquet"
parquet_reader = pq.ParquetFile(parquet_file)

In [3]:
table = pq.read_table(parquet_file)
df = table.to_pandas()

In [4]:
df.columns

Index(['pickup_datetime', 'dropoff_datetime', 'pickup_loc', 'dropoff_loc',
       'passenger_count'],
      dtype='object')

In [6]:
df[df['passenger_count'].isnull()]

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count


In [7]:
df.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count
0,2021-01-01 00:30:10,2021-01-01 00:36:12,142,43,1
1,2021-01-01 00:51:20,2021-01-01 00:52:19,238,151,1
2,2021-01-01 00:43:30,2021-01-01 01:11:06,132,165,1
3,2021-01-01 00:31:49,2021-01-01 00:48:21,68,33,1
4,2021-01-01 00:16:29,2021-01-01 00:24:30,224,68,1


In [8]:
df.dtypes

pickup_datetime     datetime64[us]
dropoff_datetime    datetime64[us]
pickup_loc                   int64
dropoff_loc                  int64
passenger_count              int64
dtype: object

In [14]:
df_sample = df.sample(n=100000)

In [16]:
df['pickup_datetime'] = df['pickup_datetime'].dt.floor(freq='h')
df['dropoff_datetime'] = df['dropoff_datetime'].dt.floor(freq='h')
df.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count
0,2021-01-01,2021-01-01 00:00:00,142,43,1
1,2021-01-01,2021-01-01 00:00:00,238,151,1
2,2021-01-01,2021-01-01 01:00:00,132,165,1
3,2021-01-01,2021-01-01 00:00:00,68,33,1
4,2021-01-01,2021-01-01 00:00:00,224,68,1


In [24]:
df_pickup = pd.DataFrame(df.groupby(['pickup_datetime', 'pickup_loc'])['passenger_count'].sum().reset_index())
col_names = {
    "pickup_datetime": "datetime",
    "pickup_loc": "location"
}
df_pickup.rename(columns=col_names, inplace=True)

In [28]:
df_pickup.head()

Unnamed: 0,datetime,location,passenger_count
0,2021-01-01,4,3
1,2021-01-01,7,2
2,2021-01-01,13,2
3,2021-01-01,17,1
4,2021-01-01,24,6


In [29]:
df_dropoff = pd.DataFrame(df.groupby(['dropoff_datetime', 'dropoff_loc'])['passenger_count'].sum().reset_index())
col_names = {
    "dropoff_datetime": "datetime",
    "dropoff_loc": "location"
}
df_dropoff.rename(columns=col_names, inplace=True)

In [30]:
df_dropoff.head()

Unnamed: 0,datetime,location,passenger_count
0,2021-01-01,4,8
1,2021-01-01,7,2
2,2021-01-01,11,1
3,2021-01-01,13,3
4,2021-01-01,17,2


In [33]:
grouped_merged = pd.concat([df_pickup,df_dropoff])
grouped_merged.shape

(6998834, 3)

In [37]:
grouped_merged[grouped_merged.isna().any(axis=1)]

Unnamed: 0,datetime,location,passenger_count


In [38]:
grouped_merged.to_parquet("all_yellow_tripdata_grouped.parquet", engine='pyarrow', index=False)

# Green taxidata

In [40]:
parquet_file = "all_green_tripdata_cleaned.parquet"
parquet_reader = pq.ParquetFile(parquet_file)

In [41]:
table = pq.read_table(parquet_file)
df = table.to_pandas()

In [42]:
df.columns

Index(['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'PULocationID',
       'DOLocationID', 'passenger_count'],
      dtype='object')

In [43]:
df[df['passenger_count'].isnull()]

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count


In [44]:
df.head()

Unnamed: 0,lpep_pickup_datetime,lpep_dropoff_datetime,PULocationID,DOLocationID,passenger_count
0,2021-01-01 00:15:56,2021-01-01 00:19:52,43,151,1.0
1,2021-01-01 00:25:59,2021-01-01 00:34:44,166,239,1.0
2,2021-01-01 00:45:57,2021-01-01 00:51:55,41,42,1.0
3,2021-01-01 00:16:36,2021-01-01 00:16:40,265,265,3.0
4,2021-01-01 00:19:14,2021-01-01 00:19:21,265,265,1.0


In [45]:
df.dtypes

lpep_pickup_datetime     datetime64[us]
lpep_dropoff_datetime    datetime64[us]
PULocationID                      int32
DOLocationID                      int32
passenger_count                 float64
dtype: object

In [None]:
df_sample = df.sample(n=100000)

In [None]:
df['pickup_datetime'] = df['pickup_datetime'].dt.floor(freq='h')
df['dropoff_datetime'] = df['dropoff_datetime'].dt.floor(freq='h')
df.head()

Unnamed: 0,pickup_datetime,dropoff_datetime,pickup_loc,dropoff_loc,passenger_count
0,2021-01-01,2021-01-01 00:00:00,142,43,1
1,2021-01-01,2021-01-01 00:00:00,238,151,1
2,2021-01-01,2021-01-01 01:00:00,132,165,1
3,2021-01-01,2021-01-01 00:00:00,68,33,1
4,2021-01-01,2021-01-01 00:00:00,224,68,1


In [None]:
df_pickup = pd.DataFrame(df.groupby(['pickup_datetime', 'pickup_loc'])['passenger_count'].sum().reset_index())
col_names = {
    "pickup_datetime": "datetime",
    "pickup_loc": "location"
}
df_pickup.rename(columns=col_names, inplace=True)

In [None]:
df_pickup.head()

Unnamed: 0,datetime,location,passenger_count
0,2021-01-01,4,3
1,2021-01-01,7,2
2,2021-01-01,13,2
3,2021-01-01,17,1
4,2021-01-01,24,6


In [None]:
df_dropoff = pd.DataFrame(df.groupby(['dropoff_datetime', 'dropoff_loc'])['passenger_count'].sum().reset_index())
col_names = {
    "dropoff_datetime": "datetime",
    "dropoff_loc": "location"
}
df_dropoff.rename(columns=col_names, inplace=True)

In [None]:
df_dropoff.head()

Unnamed: 0,datetime,location,passenger_count
0,2021-01-01,4,8
1,2021-01-01,7,2
2,2021-01-01,11,1
3,2021-01-01,13,3
4,2021-01-01,17,2


In [None]:
grouped_merged = pd.concat([df_pickup,df_dropoff])
grouped_merged.shape

(6998834, 3)

In [None]:
grouped_merged[grouped_merged.isna().any(axis=1)]

Unnamed: 0,datetime,location,passenger_count


In [None]:
grouped_merged.to_parquet("all_green_tripdata_grouped.parquet", engine='pyarrow', index=False)