In [27]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

In [30]:
# Feature columns handed to the DictVectorizer further down.
# 'PU_DO' is the combined pickup/drop-off key built in a later cell.
numerical = ['trip_distance']
categorical = ['PU_DO']

In [3]:
def read_dataFrame(file_name: str) -> pd.DataFrame:
    """Load a green-taxi parquet file and prepare it for duration modelling.

    Steps:
      1. Read the parquet file.
      2. Add a ``duration`` column: drop-off minus pickup time, in minutes.
      3. Keep only trips strictly longer than 1 and strictly shorter than
         60 minutes.
      4. Cast ``PULocationID`` and ``DOLocationID`` to ``str`` so they can be
         concatenated into / treated as categorical features.

    The function does not read any notebook-level configuration, so it
    behaves the same regardless of the order in which cells were executed.

    Parameters
    ----------
    file_name : str
        Path to the parquet file.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of the data with the extra ``duration`` column.
    """
    df = pd.read_parquet(file_name)

    # Vectorised timedelta -> minutes conversion (no per-row lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the result from the unfiltered frame, so the column
    # assignment below does not raise SettingWithCopyWarning.
    df = df[(df.duration > 1) & (df.duration < 60)].copy()

    location_columns = ['PULocationID', 'DOLocationID']
    df[location_columns] = df[location_columns].astype(str)
    return df

In [4]:
# First file becomes the training frame, second file the validation frame.
df_train, df_val = map(
    read_dataFrame,
    ('./data/green_tripdata_2021-01.parquet', './data/green_tripdata_2021-02.parquet'),
)

In [5]:
# (rows, columns) of the training and validation frames.
df_train.shape, df_val.shape

((73797, 21), (61827, 21))

In [14]:
# Combine pickup and drop-off zone into one categorical feature, e.g. "43_151".
# The IDs are cast to str right here: `+ "_"` only works on string columns, and
# this cell should not depend on an earlier cell having converted them.
# `assign` returns a new frame, so re-running the cell is safe and no
# SettingWithCopyWarning is raised.
df_train = df_train.assign(
    PU_DO=df_train['PULocationID'].astype(str) + "_" + df_train['DOLocationID'].astype(str)
)
df_val = df_val.assign(
    PU_DO=df_val['PULocationID'].astype(str) + "_" + df_val['DOLocationID'].astype(str)
)
df_train

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration,PU_DO
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.50,...,0.00,0.00,,0.3,6.80,2.0,1.0,0.00,3.933333,43_151
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.00,...,2.81,0.00,,0.3,16.86,1.0,1.0,2.75,8.750000,166_239
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.00,...,1.00,0.00,,0.3,8.30,1.0,1.0,0.00,5.966667,41_42
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.00,...,0.00,0.00,,0.3,9.30,2.0,1.0,0.00,7.083333,168_75
7,2,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1.0,75,75,6.0,0.45,3.50,...,0.96,0.00,,0.3,5.76,1.0,1.0,0.00,2.316667,75_75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76513,2,2021-01-31 21:38:00,2021-01-31 22:16:00,,,81,90,,17.63,56.23,...,0.00,6.12,,0.3,65.40,,,,38.000000,81_90
76514,2,2021-01-31 22:43:00,2021-01-31 23:21:00,,,35,213,,18.36,46.66,...,12.20,6.12,,0.3,65.28,,,,38.000000,35_213
76515,2,2021-01-31 22:16:00,2021-01-31 22:27:00,,,74,69,,2.50,18.95,...,0.00,0.00,,0.3,22.00,,,,11.000000,74_69
76516,2,2021-01-31 23:10:00,2021-01-31 23:37:00,,,168,215,,14.48,48.87,...,0.00,6.12,,0.3,58.04,,,,27.000000,168_215


In [16]:
# Turn each row into a {feature: value} record for both splits.
train_dicts, val_dicts = (
    frame[categorical + numerical].to_dict(orient='records')
    for frame in (df_train, df_val)
)

# The vectorizer is fitted on the training records only; the validation
# records are transformed with that fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [17]:
# Show the repr of both feature matrices.
(X_train, X_val)

(<73797x13194 sparse matrix of type '<class 'numpy.float64'>'
 	with 147594 stored elements in Compressed Sparse Row format>,
 <61827x13194 sparse matrix of type '<class 'numpy.float64'>'
 	with 118417 stored elements in Compressed Sparse Row format>)

In [18]:
# Name of the column to predict, and its values for each split as arrays.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

# Linear Regression

In [28]:
# Fit ordinary least squares on the training matrix and score it on validation.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# RMSE. The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE gives
# the same number on every version.
np.sqrt(mean_squared_error(y_val, y_pred))



7.701665746109168

# Lasso

In [22]:
# Fit a Lasso model (alpha=0.001) and score it on validation.
# Note that this rebinds `lr` and `y_pred`.
lr = Lasso(alpha=0.001)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# RMSE. The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE gives
# the same number on every version.
np.sqrt(mean_squared_error(y_val, y_pred))



9.165563184531326

# Ridge

In [26]:
# Fit a Ridge model (alpha=0.001) and score it on validation.
# Note that this rebinds `lr` and `y_pred`.
lr = Ridge(alpha=0.001)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# RMSE. The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE gives
# the same number on every version.
np.sqrt(mean_squared_error(y_val, y_pred))



7.4576924799959

In [29]:
# Persist the fitted vectorizer together with the model as a single tuple.
# NOTE(review): `lr` holds whichever estimator was assigned to that name most
# recently in this kernel, while the file name suggests plain linear
# regression — confirm the intended model is the one being saved.
model_path = Path('model/lin_reg.bin')
# Create the output directory if needed so a fresh checkout does not fail here.
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as f_out:
    pickle.dump((dv, lr), f_out)