# MLOps Course - Homework 01

#### Gustavo Flores

In [1]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

import pandas as pd

In [2]:
def calculate_duration(df, pickup_col='pickup_datetime', dropoff_col='dropOff_datetime'):
    """Return the length of every trip in minutes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the pickup and drop-off timestamp columns.
    pickup_col : str, default 'pickup_datetime'
        Name of the column holding the trip start timestamp.
    dropoff_col : str, default 'dropOff_datetime'
        Name of the column holding the trip end timestamp.

    Returns
    -------
    pandas.Series
        Drop-off minus pickup, expressed in (fractional) minutes and
        aligned to ``df``'s index.
    """
    duration = df[dropoff_col] - df[pickup_col]
    # Use the vectorised timedelta accessor rather than a per-row Python
    # lambda; the values are the same, but it avoids one Python call per trip.
    return duration.dt.total_seconds() / 60

def filter_duration(df, inf_lim, sup_lim):
    """Keep only the rows whose ``duration`` lies in ``[inf_lim, sup_lim]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``duration`` column.
    inf_lim, sup_lim : number
        Lower and upper bounds; both are inclusive.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (original index labels are
        preserved). Rows with a missing duration are dropped.
    """
    in_range = df['duration'].between(inf_lim, sup_lim)
    # Return an explicit copy: callers assign into the result, and writing to
    # a bare boolean-indexed selection is the chained-assignment pattern that
    # pandas warns about.
    return df[in_range].copy()

def fill_categorical(df, cat_vars):
    """Return the ``cat_vars`` columns of ``df`` with missing values set to -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    cat_vars : list of str
        Names of the columns to select and fill.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only ``cat_vars``, with NaN replaced by -1.
    """
    # Select by the argument, not by a notebook-level global, so the function
    # works for any column list and does not depend on cell execution order.
    return df[cat_vars].fillna(value=-1)

def matrix_transform(df, cat_vars, dv=None):
    """Vectorize the ``cat_vars`` columns of ``df`` with a DictVectorizer.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns.
    cat_vars : list of str
        Names of the columns to encode.
    dv : DictVectorizer, optional
        Vectorizer to use. If it has already been fitted (it exposes
        ``vocabulary_``) it is only applied, so the output has the same
        columns as the data it was fitted on. If it is unfitted it is fitted
        on ``df`` first. When omitted, a fresh vectorizer is created and
        fitted, which is the original behaviour.

    Returns
    -------
    The matrix returned by the vectorizer's ``fit_transform`` / ``transform``.

    Notes
    -----
    NOTE(review): DictVectorizer one-hot encodes string values only; numeric
    values are kept as a single numeric column per key. Cast the columns to
    ``str`` before calling if categorical encoding is intended.
    """
    records = df[cat_vars].to_dict(orient='records')

    if dv is None:
        dv = DictVectorizer()

    # A fitted vectorizer must not be re-fitted, otherwise the feature columns
    # could differ from the ones the model was trained on.
    if hasattr(dv, 'vocabulary_'):
        return dv.transform(records)
    return dv.fit_transform(records)

## Train Predictions

In [3]:
# Load the January 2021 FHV trip file used as the training data.
train_df = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')

In [4]:
# Preview the first rows to check the available columns.
train_df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [5]:
# (rows, columns) of the raw training frame, before any filtering.
train_df.shape

(1154112, 7)

In [6]:
# Add the trip length in minutes as a new `duration` column on train_df.
train_df['duration'] = calculate_duration(train_df)

# Mean trip duration over all rows, computed before the duration filter is applied.
train_df['duration'].mean()

19.167224093791006

In [7]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
train_df = filter_duration(train_df, 1, 60)

# Pickup / drop-off location ID columns used as the model features.
categorical = ['PUlocationID', 'DOlocationID']

# Replace missing location IDs with the sentinel value -1.
train_df.loc[:, categorical] = fill_categorical(train_df, categorical)

# Relative frequency of each pickup location ID; the -1 entry is the share of
# trips whose pickup location was missing.
train_df['PUlocationID'].value_counts(normalize=True)

-1.0      8.352733e-01
 221.0    7.505681e-03
 206.0    6.124383e-03
 129.0    4.846706e-03
 115.0    3.678054e-03
              ...     
 111.0    4.505211e-06
 27.0     3.604169e-06
 34.0     2.703126e-06
 2.0      1.802084e-06
 110.0    9.010421e-07
Name: PUlocationID, Length: 262, dtype: float64

In [8]:
# Encode the location ID columns into the training feature matrix.
X_train = matrix_transform(train_df, categorical)

In [9]:
# (rows, features) of the training matrix.
# NOTE(review): a feature count of 2 means the float location IDs were kept as
# two numeric columns rather than one-hot encoded - confirm this is intended.
X_train.shape

(1109826, 2)

In [10]:
# Target: trip duration in minutes.
target = 'duration'
y_train = train_df[target].values

# Fit an ordinary least-squares model on the training matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# Training RMSE. The square root is taken explicitly because the
# `squared=False` keyword was deprecated in scikit-learn 1.4 and removed in
# 1.6; this form works on every version.
mean_squared_error(y_train, y_pred) ** 0.5

11.415432830521663

## Test Predictions

In [11]:
# Load the February 2021 FHV trip file used as the held-out evaluation data.
test_df = pd.read_parquet('./data/fhv_tripdata_2021-02.parquet')

In [12]:
# Repeat the training preprocessing on the test month: compute durations,
# keep trips of 1 to 60 minutes, and fill missing location IDs with -1.
test_df['duration'] = calculate_duration(test_df)

test_df = filter_duration(test_df, 1, 60)

# Same feature columns as in training (re-declared here; already defined in an earlier cell).
categorical = ['PUlocationID', 'DOlocationID']

test_df.loc[:, categorical] = fill_categorical(test_df, categorical)

# NOTE(review): this call fits a brand-new DictVectorizer on the test frame
# instead of reusing the one fitted on the training data. X_test only lines up
# with X_train while both fits happen to produce the same feature set - confirm.
X_test = matrix_transform(test_df, categorical)

In [13]:
# Target: trip duration in minutes.
target = 'duration'
y_test = test_df[target].values

# Score the model fitted on the training month against the test month.
y_pred_test = lr.predict(X_test)

# Test RMSE. The square root is taken explicitly because the `squared=False`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6; this form
# works on every version.
mean_squared_error(y_test, y_pred_test) ** 0.5

11.85822362355935