## setup

In [37]:
!python -V

Python 3.10.2


In [38]:
import pandas as pd

In [39]:
import os
import pickle

In [40]:
import seaborn as sns
import matplotlib.pyplot as plt

In [41]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

## Checking the dataframe and answering basic questions

In [42]:
# Raw trip records from the file named for 2022-01. Later cells add a column to
# df_yellow and re-assign it after filtering, so re-run this cell to get the
# unmodified frame back.
df_yellow = pd.read_parquet('./data/yellow_tripdata_2022-01.parquet')

In [43]:
# Q1: number of columns
print(df_yellow.shape)
print("number of columns:", len(df_yellow.columns))


(2463931, 19)
number of columns: 19


In [44]:
# Row count before any filtering, kept for the Q3 retention ratio
before_rows = len(df_yellow)

In [45]:
# Trip duration in minutes: drop-off minus pick-up, converted with the
# vectorised .dt accessor instead of a per-row Python lambda.
df_yellow['duration'] = (
    df_yellow.tpep_dropoff_datetime - df_yellow.tpep_pickup_datetime
).dt.total_seconds() / 60

In [47]:
# Q2: standard deviation of duration (minutes, computed before outlier removal)
print("stddev of duraition: ", df_yellow['duration'].std())

stddev of duraition:  46.44530513776499


In [48]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive)
df_yellow = df_yellow.loc[
    (df_yellow['duration'] >= 1) & (df_yellow['duration'] <= 60)
]

In [49]:
# Row count after the duration filter, to compare against before_rows
after_rows = len(df_yellow)
print(df_yellow.shape)

(2421440, 20)


In [50]:
# Q3: fraction of records after dropping the outliers
# NOTE: despite the "%" in the label, the printed value is a plain ratio
# (after_rows divided by before_rows), not a percentage.
print("% retained:", after_rows/before_rows)

% retained: 0.9827547930522406


In [51]:
# Convert the two categorical location-ID columns to strings so that
# DictVectorizer one-hot encodes them instead of treating them as numbers.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
# astype with a column mapping returns a new frame, so nothing is assigned
# onto the filtered slice (which would raise SettingWithCopyWarning).
df_yellow = df_yellow.astype({col: str for col in categorical})

In [52]:
# One dict per trip, holding only the categorical columns
train_dicts = df_yellow[categorical].to_dict(orient='records')

# Learn the feature vocabulary and encode the records in a single pass
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)


In [53]:
# Q4: dimensionality after OHE
# (rows, columns) of the matrix returned by dv.fit_transform; the second
# entry is the number of encoded features
X_train.shape

(2421440, 515)

In [54]:
# Regression target: the duration column, as a plain NumPy array
target = 'duration'
y_train = df_yellow[target].to_numpy()

In [55]:
# Baseline: ordinary least squares on the one-hot encoded location features
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [56]:
# RMSE on train for Linear Regression Model
y_pred = lr.predict(X_train)
# Take the root of the MSE explicitly: the squared=False flag is deprecated
# in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_train, y_pred) ** 0.5

6.986190135967352

## training and validation set

In [57]:
def read_dataframe(filename):
    """Load a trip-record file and prepare it for duration modelling.

    Steps performed:
      1. Read the file (``.parquet``, or ``.csv`` with the two ``tpep_*``
         timestamp columns parsed as datetimes).
      2. Add a ``duration`` column: drop-off minus pick-up, in minutes.
      3. Keep only rows whose duration is between 1 and 60 minutes inclusive.
      4. Cast ``PULocationID`` and ``DOLocationID`` to ``str``.

    Parameters
    ----------
    filename : str or path-like
        Path to the input file; the extension selects the reader.

    Returns
    -------
    pandas.DataFrame
        An independent copy containing the filtered rows.

    Raises
    ------
    ValueError
        If the extension is neither ``.parquet`` nor ``.csv``.
    """
    path = str(filename)
    datetime_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']

    if path.endswith('.parquet'):
        df = pd.read_parquet(path)
    elif path.endswith('.csv'):
        df = pd.read_csv(path, parse_dates=datetime_cols)
    else:
        # Fail loudly instead of hitting an unbound `df` further down.
        raise ValueError(
            f"unsupported file type: {path!r} (expected .parquet or .csv)"
        )

    # Vectorised conversion of the timedelta to minutes.
    df['duration'] = (
        df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    ).dt.total_seconds() / 60

    # .copy() makes the filtered rows an independent frame, so the column
    # assignment below does not write onto a slice of the original.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [58]:
# Train on the 2022-01 file and validate on the 2022-02 file
df_train = read_dataframe('./data/yellow_tripdata_2022-01.parquet')
df_val = read_dataframe('./data/yellow_tripdata_2022-02.parquet')

In [59]:
categorical = ['PULocationID', 'DOLocationID']

# One dict per trip for each split
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# The vocabulary is fitted on the training split only; the validation
# split is encoded with that same fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [60]:
# Targets for both splits, as plain NumPy arrays
target = 'duration'
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [61]:
# Q6: RMSE on validation
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Take the root of the MSE explicitly: the squared=False flag is deprecated
# in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5

7.7863895211068

In [62]:
# Persist the fitted vectorizer together with the model so that the same
# feature encoding can be reapplied when the model is loaded.
os.makedirs('models', exist_ok=True)  # open() does not create missing directories
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [63]:
# Lasso (L1-regularised) alternative with alpha=0.01. Note that this rebinds
# `lr`, which until now held the LinearRegression model.
lr = Lasso(alpha=0.01)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Take the root of the MSE explicitly: the squared=False flag is deprecated
# in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5