In [25]:
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
from matplotlib import pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso

In [4]:
# Use seaborn's 'darkgrid' style for figures drawn by later cells.
sns.set_style('darkgrid')

In [5]:
# Raw monthly trip files: January and February 2021.
jan_path = '../../data/raw/green_tripdata_2021-01.parquet'
feb_path = '../../data/raw/green_tripdata_2021-02.parquet'

df_jan = pd.read_parquet(jan_path)
df_feb = pd.read_parquet(feb_path)

In [6]:
# Column the model is trained to predict (created by process_data).
target = 'duration'

# Columns that process_data casts to str before vectorization.
cat_features = ['PULocationID', 'DOLocationID']
# Columns used as plain numeric inputs.
num_features = ['trip_distance']

# Full model input: categorical columns first, then numeric ones.
features = [*cat_features, *num_features]

In [7]:
def process_data(dataframe):
    """Derive trip duration and clean one frame of trip records.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only trips lasting between 1 and 60 minutes inclusive, and casts the
    columns listed in the module-level ``cat_features`` to ``str``.
    The input frame is never modified.

    Args:
        dataframe: DataFrame with ``lpep_pickup_datetime`` and
            ``lpep_dropoff_datetime`` datetime columns, plus every column
            named in ``cat_features``.

    Returns:
        A new DataFrame containing only the retained rows, with the extra
        ``duration`` column and string-typed categorical columns.
    """
    elapsed = dataframe.lpep_dropoff_datetime - dataframe.lpep_pickup_datetime
    # Vectorized conversion to minutes instead of a per-row Python lambda.
    minutes = elapsed.dt.total_seconds() / 60

    keep = (minutes >= 1) & (minutes <= 60)

    # Copy only the surviving rows: this leaves the caller's frame untouched
    # and makes the column assignments below operate on an independent frame
    # rather than on a slice (which triggers SettingWithCopyWarning).
    dataframe = dataframe.loc[keep].copy()
    dataframe['duration'] = minutes.loc[keep]
    # Presumably cast to str so the IDs are treated as categories rather
    # than numbers when vectorized downstream; verify against the caller.
    dataframe[cat_features] = dataframe[cat_features].astype(str)
    return dataframe

In [8]:
# January becomes the training split and February the validation split;
# both go through the same cleaning function.
df_train = df_jan.pipe(process_data)
df_val = df_feb.pipe(process_data)

In [9]:
# Row counts of the cleaned training and validation frames.
len(df_train), len(df_val)

(73908, 61921)

In [26]:
def make_dataset(train_df, val_df):
    """Vectorize the feature columns and pull out targets for both splits.

    The columns named in the module-level ``features`` list are turned into
    per-row dicts and encoded with a ``DictVectorizer`` that is fitted on the
    training records only and then applied to both splits. Targets come from
    the module-level ``target`` column.

    Args:
        train_df: training DataFrame containing ``features`` and ``target``.
        val_df: validation DataFrame with the same columns.

    Returns:
        Tuple ``(X_train, y_train, X_val, y_val, dv)`` where ``dv`` is the
        fitted vectorizer.
    """
    # Detach from the caller's frames before extracting anything.
    train_df, val_df = train_df.copy(), val_df.copy()

    def as_records(frame):
        # One dict per row, restricted to the model's input columns.
        return frame[features].to_dict(orient='records')

    train_records = as_records(train_df)
    val_records = as_records(val_df)

    # Fit on training data only; the same fitted encoder transforms both splits.
    dv = DictVectorizer().fit(train_records)
    X_train = dv.transform(train_records)
    X_val = dv.transform(val_records)

    y_train, y_val = train_df[target].values, val_df[target].values
    return X_train, y_train, X_val, y_val, dv

In [27]:
# Vectorize both splits; `dv` is the DictVectorizer fitted on the training data.
X_train, y_train, X_val, y_val, dv = make_dataset(df_train, df_val)

In [20]:
# NOTE(review): disabled plotting cell comparing the target and prediction distributions.
# It reads `y_pred`, which is only assigned in a later cell (after the model is fitted),
# so it cannot be re-enabled at this position without moving it below the prediction cell.
# fig, ax = plt.subplots(1,1)
# # sns.kdeplot(y_pred, label='prediction')
# sns.histplot(y_train, label='data', kde=True, color='orange')
# sns.histplot(y_pred, label='prediction', kde=True)
# plt.legend()
# plt.show()

In [28]:
# Fit a Lasso regression (alpha=0.05) on the vectorized training split.
# NOTE: despite the `lr` name (and the lin_reg.bin file it is later saved to),
# this is a Lasso model, not LinearRegression.
lr = Lasso(alpha=0.05)
lr.fit(X_train, y_train)

In [29]:
# Predicted trip durations (minutes) for the validation split.
y_pred = lr.predict(X_val)

In [30]:
# Validation RMSE, in minutes.
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so the
# root is taken explicitly here; this form works on every scikit-learn version.
np.sqrt(mean_squared_error(y_val, y_pred))

11.439943687389436

In [32]:
# Persist the fitted vectorizer and the model together as one (dv, lr) tuple,
# so whoever loads the file gets the encoder that matches the model.
artifacts = (dv, lr)
with open('../../models/lin_reg.bin', mode='wb') as f:
    pickle.dump(artifacts, f)