In [1]:
import pandas as pd

In [2]:
import pickle

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
# Load the FHV trip records: January 2021 into `df`, February 2021 into `df_feb`.
df, df_feb = map(
    pd.read_parquet,
    (
        './data/fhv_tripdata_2021-01.parquet',
        './data/fhv_tripdata_2021-02.parquet',
    ),
)

In [6]:
# Row count of the raw January frame
print("Number of records in Jan 2021 FHV data: ", len(df))

Number of records in Jan 2021 FHV data:  1154112


In [7]:
# Preview the January frame; the notebook display elides the middle rows.
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037
...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285


In [8]:
# Trip duration in minutes (drop-off minus pick-up) for the January frame.
# The conversion uses the vectorized `.dt.total_seconds()` accessor instead of
# calling a Python lambda once per row.
df['duration'] = (df.dropOff_datetime - df.pickup_datetime).dt.total_seconds() / 60

In [9]:
# Trip duration in minutes (drop-off minus pick-up) for the February frame.
# The conversion uses the vectorized `.dt.total_seconds()` accessor instead of
# calling a Python lambda once per row.
df_feb['duration'] = (
    df_feb.dropOff_datetime - df_feb.pickup_datetime
).dt.total_seconds() / 60

In [10]:
# Mean trip duration (minutes) over the January frame
print("Average duration in Jan 2021 FHV: ", df.duration.mean())

Average duration in Jan 2021 FHV:  19.1672240937939


In [11]:
# Count of missing values in each column of the January frame
df.isna().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               958267
DOlocationID               162220
SR_Flag                   1154112
Affiliated_base_number        885
duration                        0
dtype: int64

In [12]:
# Share of January trips with no pickup location ID. This is computed from the
# frame itself rather than from hand-copied counts, so it stays correct if the
# input data changes. `.isna().mean()` is (number of nulls) / (number of rows).
print("Fraction of missing values :", df['PUlocationID'].isna().mean())

Fraction of missing values : 0.8303067639882438


In [13]:
# Summary statistics of the January durations, including upper-tail percentiles,
# rendered with five decimals so large values are not shown in scientific notation.
df['duration'].describe(
    percentiles=[0.25, 0.50, 0.75, 0.95, 0.98, 0.99]
).map("{0:.5f}".format)

count    1154112.00000
mean          19.16722
std          398.69216
min            0.01667
25%            7.76667
50%           13.40000
75%           22.28333
95%           47.25000
98%           66.13333
99%           90.30000
max       423371.05000
Name: duration, dtype: object

In [14]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive);
# `.copy()` detaches each result from the frame it was sliced from.
df = df[df['duration'].between(1, 60)].copy()
df_feb = df_feb[df_feb['duration'].between(1, 60)].copy()

In [15]:
# Columns used as categorical model features
categorical = [
    'PUlocationID',
    'DOlocationID',
]

In [16]:
# Convert the categorical columns to strings in both monthly frames;
# all other columns keep their dtypes.
df = df.astype({column: str for column in categorical})
df_feb = df_feb.astype({column: str for column in categorical})

In [17]:
# One dict per January trip, holding only the categorical columns
train_dicts = df[categorical].to_dict('records')

# Fit the DictVectorizer on the training records and encode them in one step
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [18]:
# Number of feature columns learned by the fitted vectorizer
print("Dimensionality after OHE:", len(dv.feature_names_))

Dimensionality after OHE: 525


In [20]:
target = 'duration'
y_train = df[target].values

# Fit a linear regression on the vectorized January features
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE is the square root of the MSE. It is computed explicitly instead of via
# `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
print(f'RMSE: {round(mean_squared_error(y_train, y_pred) ** 0.5, 4)} Train')

RMSE: 10.5285 Train


In [21]:
# Encode the February trips with the vectorizer fitted on January (transform
# only, no refit) and score the already-trained model on them.
val_dicts = df_feb[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df_feb[target].values
y_pred_val = lr.predict(X_val)

# RMSE is the square root of the MSE. It is computed explicitly instead of via
# `squared=False`, which scikit-learn deprecated in 1.4 and removed in 1.6.
print(f'RMSE: {round(mean_squared_error(y_val, y_pred_val) ** 0.5, 4)} Validation')

RMSE: 11.0143 Validation
