In [33]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error
import warnings

warnings.filterwarnings('ignore')

In [34]:
# Parquet files for January and February 2023; January is used for fitting
# and February for validation further down.
jan_data_path, feb_data_path = (
    "../../../data/yellow_tripdata_2023-01.parquet",
    "../../../data/yellow_tripdata_2023-02.parquet",
)

In [35]:
# Load the January file into `df` for the exploratory cells that follow.
df = pd.read_parquet(jan_data_path)
# The February file is not loaded here; it is read later via read_dataframe(feb_data_path).

In [36]:
# Shape of the column index, i.e. how many columns the January frame has.
print(df.columns.shape) # Q1 answer: 19 columns

(19,)


In [37]:
# List the column labels so the pickup/drop-off fields used below are visible.
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [38]:
# Inspect one record (positional row 1000) to see what typical values look like.
df.iloc[1000]

VendorID                                   2
tpep_pickup_datetime     2023-01-01 00:51:44
tpep_dropoff_datetime    2023-01-01 00:54:54
passenger_count                          1.0
trip_distance                           0.52
RatecodeID                               1.0
store_and_fwd_flag                         N
PULocationID                             143
DOLocationID                             239
payment_type                               1
fare_amount                              5.1
extra                                    1.0
mta_tax                                  0.5
tip_amount                              3.03
tolls_amount                             0.0
improvement_surcharge                    1.0
total_amount                           13.13
congestion_surcharge                     2.5
airport_fee                              0.0
Name: 1000, dtype: object

In [39]:
# Trip duration in minutes. The `.dt` accessor converts the whole timedelta
# column at once instead of calling a Python lambda for every row.
df['duration'] = (df.tpep_dropoff_datetime - df.tpep_pickup_datetime).dt.total_seconds() / 60
print(df.duration.std())
# Row count before any filtering; used afterwards to compute the kept fraction.
old_len = df.shape[0]

42.59435124195458


In [40]:
# Keep trips lasting between 1 and 60 minutes (both bounds inclusive).
# `.copy()` detaches the result from the unfiltered frame so that assigning
# columns on it later does not trigger pandas' chained-assignment warning.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()
new_len = df.shape[0]
# Fraction of the original rows that survived the filter.
ratio = new_len / old_len
print(ratio)

0.9812202822125979


In [41]:
# Location IDs are converted to strings so that DictVectorizer treats them
# as categories rather than as numeric values.
categorical = ['PULocationID', 'DOLocationID']

for column in categorical:
    df[column] = df[column].astype(str)

In [42]:
# Baseline model: encode pickup and drop-off IDs with DictVectorizer and fit
# an ordinary linear regression on the January data.
train_dicts = df[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the same rows the model was fitted on.
y_pred = lr.predict(X_train)

# Training RMSE. The square root is taken explicitly because the
# `squared=False` keyword is deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on both old and new releases.
mean_squared_error(y_train, y_pred) ** 0.5

7.649261927686161

In [43]:
def read_dataframe(filename):
    """Load a trip parquet file and prepare it for modelling.

    Steps:
      1. read the parquet file at ``filename``;
      2. add a ``duration`` column (drop-off minus pickup, in minutes);
      3. keep only rows with ``1 <= duration <= 60``;
      4. cast ``PULocationID`` and ``DOLocationID`` to ``str``.

    Parameters
    ----------
    filename : str or path-like
        Location passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        An independent, filtered frame that keeps the original index labels
        of the surviving rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised conversion of the timedelta column to minutes; avoids a
    # Python-level function call per row.
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # `.copy()` so the column assignment below operates on a standalone frame
    # and does not raise pandas' chained-assignment warning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [44]:
# January becomes the training frame, February the validation frame; both go
# through the same read_dataframe preprocessing.
df_train, df_val = (
    read_dataframe(path) for path in (jan_data_path, feb_data_path)
)

In [45]:
# Row counts of the training and validation frames after filtering.
len(df_train), len(df_val)

(3009173, 2855951)

In [46]:
# Combine pickup and drop-off IDs into a single "<pickup>_<dropoff>" feature
# on both the training and the validation frame.
for frame in (df_train, df_val):
    pickup_id = frame['PULocationID']
    dropoff_id = frame['DOLocationID']
    frame['PU_DO'] = pickup_id + '_' + dropoff_id

In [47]:
# Only the combined pickup/drop-off feature is used from here on.
categorical = ['PU_DO']

# One dict per row, for each split.
train_dicts = df_train[categorical].to_dict(orient='records')
val_dicts = df_val[categorical].to_dict(orient='records')

# The vectorizer learns its vocabulary from the training split only and is
# then re-used unchanged on the validation split.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [48]:
# Regression target (trip duration in minutes) for each split.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [49]:
# Fit on January, evaluate on February.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# NOTE(review): this model uses the combined PU_DO feature, whereas the
# training RMSE computed earlier in the notebook used separate
# PULocationID / DOLocationID features, so the two scores do not describe
# the same model.

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` keyword is deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on both old and new releases.
mean_squared_error(y_val, y_pred) ** 0.5

16.33487387327494