In [1]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [2]:
# Q1. Load the January 2023 yellow-taxi trips (used as the training month)
df_train = pd.read_parquet('./data/yellow_tripdata_2023-01.parquet')

# number of columns in the raw file
df_train.shape[1] # 19

19

In [3]:
# Q2. computing duration
# Trip duration in minutes (drop-off minus pick-up). The `.dt` accessor
# converts the whole timedelta column at once instead of calling a Python
# lambda once per row.
df_train['duration'] = (
    df_train.tpep_dropoff_datetime - df_train.tpep_pickup_datetime
).dt.total_seconds() / 60



In [4]:
# standard deviation of the trip duration (minutes)
df_train.loc[:, ['duration']].std() # 42.59

duration    42.594351
dtype: float64

In [5]:
# Q3. dropping outliers
previous_records = df_train.duration.count()

# keep only trips that lasted between 1 and 60 minutes (both bounds inclusive)
df_train = df_train[df_train.duration.between(1, 60)]
current_records = df_train.duration.count()

# percentage of records removed by the filter
100.0 * (previous_records - current_records) / previous_records # 2%


1.8779717787402104

In [6]:
# Sanity check: after the Q3 filter no trip may be shorter than 1 minute OR
# longer than 60 minutes, so this selection must be empty. The two conditions
# are combined with `|`; with `&` no row could ever match and the check would
# pass even if the filter had failed.
df_train[(df_train.duration < 1) | (df_train.duration > 60)]

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,duration


In [7]:
# Q4. one-hot encoding
# Feature columns: the two location IDs are treated as categories,
# the trip distance is used as a plain number.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Cast the location IDs to strings before building the feature dicts, so the
# vectorizer receives them as string-valued (categorical) features.
df_train[categorical] = df_train[categorical].astype(str)

train_dicts = df_train[categorical + numerical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

X_train # 516 columns

<3009173x516 sparse matrix of type '<class 'numpy.float64'>'
	with 9027519 stored elements in Compressed Sparse Row format>

In [8]:
# Q5. training a model
target = 'duration'
y_train = df_train[target].values

# Fit a plain linear regression on the one-hot encoded training matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE on the training data. The square root is taken explicitly because the
# `squared=False` argument of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
mean_squared_error(y_train, y_pred) ** 0.5 # 7.65



7.658559076531841

In [9]:
# Q6. evaluating the model

# generic function to set up data
def read_dataframe(filename):
    """Load a yellow-taxi trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only the trips lasting between 1 and 60 minutes (both inclusive) and
    casts the pick-up / drop-off location IDs to strings.

    Parameters
    ----------
    filename : str or path-like
        Parquet file, passed unchanged to ``pd.read_parquet``. It must
        contain the columns ``tpep_pickup_datetime``,
        ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The filtered trips, including the new ``duration`` column.
    """
    df = pd.read_parquet(filename)

    # Vectorized timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    categorical = ['PULocationID', 'DOLocationID']

    # `astype` returns a new frame, so no columns are assigned back into the
    # boolean-filtered selection.
    df = df[df.duration.between(1, 60)].astype({column: str for column in categorical})

    return df

df_val = read_dataframe('./data/yellow_tripdata_2023-02.parquet')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
# Reuse the vectorizer that was fitted on the training month
# (`transform`, not `fit_transform`).
X_val = dv.transform(val_dicts)

y_val = df_val[target].values

# `lr` was already fitted on (X_train, y_train) in the Q5 cell. Fitting a
# second, identical LinearRegression on the same data would only repeat that
# work, so the fitted model is reused here.
y_pred = lr.predict(X_val)

# Validation RMSE. The square root is taken explicitly because the
# `squared=False` argument of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5 # 7.82



7.820057137415617