# Libraries installation (via "pip")

In [1]:
%pip install -r ../requirements.txt

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


# Imports Section

In [2]:
import pandas as pd
import numpy as np

# --------------------------

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Training Dataframe preparation

In [3]:
df_train = pd.read_parquet('../data/fhv_tripdata_2021-01.parquet')

# Calculation of the duration
df_train['duration'] = df_train.dropOff_datetime - df_train.pickup_datetime
df_train.duration = df_train.duration.apply(lambda td: td.total_seconds() / 60)
df_train.head(10)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,,71.0,,B00037,9.05
6,B00037,2021-01-01 00:18:12,2021-01-01 00:30:04,,91.0,,B00037,11.866667
7,B00037,2021-01-01 00:36:15,2021-01-01 00:45:08,,39.0,,B00037,8.883333
8,B00037,2021-01-01 00:55:04,2021-01-01 01:13:02,,37.0,,B00037,17.966667
9,B00037,2021-01-01 00:48:40,2021-01-01 01:12:02,,39.0,,B00037,23.366667


In [4]:
# Number of records 
original_dataset_size = df_train.shape
print (f'(Original dataset shape: Rows ({original_dataset_size[0]}) x Columns ({original_dataset_size[1]})')

(Original dataset shape: Rows (1154112) x Columns (8)


In [5]:
# Average trip duration
avg_duration = df_train.duration.mean()
print (f'Original dataset avg duration (min) = {avg_duration}')

Original dataset avg duration (min) = 19.167224093791006


In [6]:
# Filter one hour trips
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <= 60)]
print (f'Records dropped (<1 min or >60 min) ) {original_dataset_size[0] - df_train.shape[0]}')

# Missing values calculation
df_train['PUlocationID'] = df_train['PUlocationID'].replace(np.NaN, -1.0)
df_train['DOlocationID'] = df_train['DOlocationID'].replace(np.NaN, -1.0)

# df_missing = df[(df['PUlocationID'] == -1.0) | (df['DOlocationID'] == -1.0)]
df_missing = df_train[df_train['PUlocationID'] == -1.0]
df_filter = df_train[(df_train['PUlocationID'] != -1.0) & (df_train['DOlocationID'] != -1.0)]

fraction = len(df_missing) / len(df_train)

print (f'Original {df_train.shape[0]} - missing {df_missing.shape[0]} ({len(df_missing) / len(df_train)}%) - Ready-to-go {df_filter.shape[0]} ({len(df_filter) / len (df_train)}%)')

Records dropped (<1 min or >60 min) ) 44286
Original 1109826 - missing 927008 (0.8352732770722617%) - Ready-to-go 171670 (0.15468190509142873%)


In [7]:
# One-hot encoding
categorical = ['PUlocationID', 'DOlocationID']
# numerical = ['trip_distance']

df_train[categorical] = df_train[categorical].astype(str)
train_dicts = df_train[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)  

target = 'duration'
y_train = df_train[target].values

In [22]:
# Linear regression model (default params)
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mse_train = mean_squared_error(y_train, y_pred, squared=False)
print(f'MSE (train) = {mse_train}')

MSE (train) = 10.528519428392489


# Validation dataset

In [14]:
# Validation dataset (load)
df_val = pd.read_parquet('../data/fhv_tripdata_2021-02.parquet')

In [15]:
df_val.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037


In [24]:
# Calculation of the duration
df_val['duration'] = df_val.dropOff_datetime - df_val.pickup_datetime
df_val.duration = df_val.duration.apply(lambda td: td.total_seconds() / 60)

# Filter one hour trips
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <= 60)]

# Missing values calculation
# df_val['PUlocationID'] = df_val['PUlocationID'].replace(np.NaN, -1.0)
# df_val['DOlocationID'] = df_val['DOlocationID'].replace(np.NaN, -1,0)
df_val = df_val[(df_val['PUlocationID'] != np.NaN) & (df_val['DOlocationID'] != np.NaN)]


df_val[categorical] = df_val[categorical].astype(str)
val_dicts = df_val[categorical].to_dict(orient='records')

In [25]:
# dv = DictVectorizer()
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

y_val = df_val[target].values

In [26]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mse_val = mean_squared_error(y_val, y_pred, squared=False)
print(f'MSE (val) = {mse_val}')


MSE (val) = 11.237154387710653
