In [1]:
import pandas as pd
import pickle
import numpy as np

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge


from sklearn.metrics import mean_squared_error

In [4]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds()/60)
    #df = df[df.trip_type == 2]

    #df = df[(df.duration >= 1) & (df.duration <=60)] #only to apply after assessing the variables

    #categorical = ['PUlocationID','DOlocationID']

    #df[categorical] = df[categorical].astype(str)
    
    return df

In [5]:
df_train = read_dataframe('./data/fhv_tripdata_2021-01.parquet')
df_val = read_dataframe('./data/fhv_tripdata_2021-02.parquet')

In [6]:
#df_jan = pd.read_parquet('./data/fhv_tripdata_2021-01.parquet')
#df_feb = pd.read_parquet('./data/fhv_tripdata_2021-02.parquet')
#len(df_jan), len(df_feb)

In [7]:
l_in_train = len(df_train)
l_in_val = len(df_val)
l_in_train , l_in_val

(1154112, 1037692)

In [8]:
df_train.dtypes
df_val.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                           object
Affiliated_base_number            object
duration                         float64
dtype: object

In [9]:
#df_jan.dtypes
#df_feb.dtypes

In [10]:
df_train['duration'].describe()

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

In [11]:
df_train['duration'].mean()

19.1672240937939

In [12]:
#records drop with duration conditions
df_train = df_train[(df_train.duration >= 1) & (df_train.duration <=60)]
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <=60)]
l_fil_train = len(df_train)
l_fil_val = len(df_val)

drop_fil_train = l_in_train - l_fil_train
drop_fil_val = l_in_val - l_fil_val

drop_fil_train, drop_fil_val

(44286, 47579)

In [13]:
def rep_nan(df):
    categorical = ['PUlocationID','DOlocationID']
    df[categorical] = df[categorical].astype(float).fillna(value=-1)
    df[categorical].astype(str)
    return df

In [14]:
df_train = rep_nan(df_train)
df_val = rep_nan(df_val)

In [15]:
#fraction of pickup location id replaced with -1
len(df_train[df_train['PUlocationID']== -1])/len(df_train)

0.8352732770722617

In [17]:
categorical = ['PUlocationID','DOlocationID']
df_train[categorical] = df_train[categorical].astype(str)
df_val[categorical] = df_val[categorical].astype(str)

In [18]:
dv = DictVectorizer()

train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [19]:
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [20]:
X_val

<990113x525 sparse matrix of type '<class 'numpy.float64'>'
	with 1980223 stored elements in Compressed Sparse Row format>

In [21]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [22]:
#on the train data
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred , squared = False)

10.5285191072072

In [23]:
#on the val data
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred , squared = False)

11.014283163400654