In [68]:
import pandas as pd
import os, requests
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression


In [9]:
data_link1 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet'
data_link2 = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet'

In [17]:
# read the dataset
data_dir = './data/'
data_jan = './data/yellow_tripdata_2022-01.parquet'
data_feb = './data/yellow_tripdata_2022-02.parquet'

df_jan = pd.read_parquet(data_jan)
df_feb = pd.read_parquet(data_feb)

In [21]:
num_cols_jan = len(df_jan.columns)
print('number of cols in jan data: ', num_cols_jan)

number of cols in jan data:  19


In [26]:
df_jan['duration'] = df_jan.tpep_dropoff_datetime - df_jan.tpep_pickup_datetime
df_jan.duration = df_jan.duration.apply(lambda td: td.total_seconds() / 60)
df_jan['duration'].describe()

count    2.463931e+06
mean     1.421220e+01
std      4.644531e+01
min     -3.442400e+03
25%      6.316667e+00
50%      1.018333e+01
75%      1.616667e+01
max      8.513183e+03
Name: duration, dtype: float64

In [28]:
df = df_jan[(df_jan.duration>=1) & (df_jan.duration<=60)]

In [31]:
len(df)/len(df_jan)

0.9827547930522406

In [35]:
def read_dataframe(filename):
    df = pd.read_parquet(filename)

    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [36]:
df_train = read_dataframe(data_jan)
df_val = read_dataframe(data_feb)


In [37]:
len(df_train), len(df_val)

(2421440, 2918187)

In [45]:
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

categorical = ['PU_DO'] #'PULocationID', 'DOLocationID']
numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)



In [64]:
train_dicts

[{'PU_DO': '142_236', 'trip_distance': 3.8},
 {'PU_DO': '236_42', 'trip_distance': 2.1},
 {'PU_DO': '166_166', 'trip_distance': 0.97},
 {'PU_DO': '114_68', 'trip_distance': 1.09},
 {'PU_DO': '68_163', 'trip_distance': 4.3},
 {'PU_DO': '138_161', 'trip_distance': 10.3},
 {'PU_DO': '233_87', 'trip_distance': 5.07},
 {'PU_DO': '238_152', 'trip_distance': 2.02},
 {'PU_DO': '166_236', 'trip_distance': 2.71},
 {'PU_DO': '236_141', 'trip_distance': 0.78},
 {'PU_DO': '141_229', 'trip_distance': 1.91},
 {'PU_DO': '114_90', 'trip_distance': 0.82},
 {'PU_DO': '234_113', 'trip_distance': 0.73},
 {'PU_DO': '246_79', 'trip_distance': 2.16},
 {'PU_DO': '43_140', 'trip_distance': 1.43},
 {'PU_DO': '239_151', 'trip_distance': 1.58},
 {'PU_DO': '148_141', 'trip_distance': 4.2},
 {'PU_DO': '237_107', 'trip_distance': 2.2},
 {'PU_DO': '7_7', 'trip_distance': 0.2},
 {'PU_DO': '107_263', 'trip_distance': 3.9},
 {'PU_DO': '263_107', 'trip_distance': 3.2},
 {'PU_DO': '161_161', 'trip_distance': 0.0},
 {'PU_DO

In [58]:
len(dv.feature_names_)

20667

In [70]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [71]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

4.79728496240072

In [72]:

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

5.530390602049272