In [None]:
import os
import pickle

import numpy as np
import optuna
import pandas as pd
import requests
import xgboost
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import root_mean_squared_error

## Data Loading

In [None]:
# Create the download directory. `exist_ok=True` keeps the cell idempotent,
# so Restart & Run All does not emit a "File exists" error on later runs.
os.makedirs("data", exist_ok=True)

In [None]:
# Create the directory for serialized artefacts. `exist_ok=True` keeps the
# cell idempotent when the notebook is re-run.
os.makedirs("models", exist_ok=True)

In [None]:
# (file name, destination directory) pairs of the parquet files to download.
files = [('green_tripdata_2024-10.parquet', './data'), 
         ('green_tripdata_2024-11.parquet', './data')]

print("Downloading started:...")

for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"

    # Using the response as a context manager returns the connection to the
    # pool even if writing fails; the timeout prevents an indefinite hang.
    with requests.get(url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error body as ".parquet".
        response.raise_for_status()

        with open(save_path, "wb") as handle:
            # Stream in 1 MiB chunks so the whole file is never held in memory.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                handle.write(chunk)
    

In [None]:
# Read both downloaded files: the 2024-10 file becomes the training frame,
# the 2024-11 file the hold-out frame.
train_data, test_data = map(
    pd.read_parquet,
    ("data/green_tripdata_2024-10.parquet", "data/green_tripdata_2024-11.parquet"),
)

In [None]:
# Preview the first rows of the training frame as a quick check of the load.
train_data.head()

In [None]:
# Column dtypes and non-null counts; useful for spotting columns with missing
# values before `preprocessing` fills them with 0.
train_data.info()

In [None]:
# Columns selected as model inputs (concatenated as `num_features + cat_features`
# when building X_train / X_test).
# NOTE(review): `total_amount` is presumably only known once the trip has ended,
# which would leak information about the target (duration) -- confirm before
# using this model for real predictions.
num_features = ['total_amount', 'trip_distance', 'passenger_count']
# NOTE(review): these ID columns are passed to the model without any encoding
# step in this notebook, so they are treated as ordinary numeric values.
cat_features = ['PULocationID', 'DOLocationID']

In [None]:
def preprocessing(data):
    """Add a trip ``duration`` column (minutes) and keep plausible trips.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``lpep_pickup_datetime`` and
        ``lpep_dropoff_datetime`` columns whose difference is a timedelta
        series.

    Returns
    -------
    pandas.DataFrame
        A new frame with an extra float ``duration`` column (minutes),
        restricted to rows where ``3 <= duration <= 90`` and with every
        remaining missing value replaced by 0. The input frame is left
        untouched.
    """
    trip_time = data['lpep_dropoff_datetime'] - data['lpep_pickup_datetime']

    # `assign` returns a new frame, so the caller's DataFrame is not mutated.
    # The vectorised `.dt` accessor replaces a per-row Python lambda and also
    # behaves correctly on an empty frame.
    prepared = data.assign(duration=trip_time.dt.total_seconds() / 60)

    in_range = (prepared['duration'] >= 3.) & (prepared['duration'] <= 90.)

    # Non-inplace fillna avoids modifying a filtered slice in place
    # (the source of SettingWithCopyWarning in the previous version).
    return prepared[in_range].fillna(0)

In [None]:
# Persist the preprocessing step next to the model artefacts.
# NOTE(review): pickle stores a function by its qualified name, not by value,
# so this file contains no code. Loading it only works where a `preprocessing`
# function is defined under the same module name (here the notebook's
# `__main__`). Consider shipping the function in an importable .py module.
with open('models/preprocessing.bin', 'wb') as f_out:
    pickle.dump(preprocessing, f_out)

In [None]:
# Run the preprocessing once per dataset and reuse the result for both the
# feature matrix and the target, instead of repeating the whole transform.
feature_columns = num_features + cat_features

train_prepared = preprocessing(train_data)
X_train = train_prepared[feature_columns]
y_train = train_prepared['duration']

test_prepared = preprocessing(test_data)
X_test = test_prepared[feature_columns]
y_test = test_prepared['duration']

## Baseline model

In [None]:
# Baseline: ordinary least squares on the selected columns, with no scaling or
# encoding applied beforehand.
model = LinearRegression()
# Leaving `fit` as the last expression displays the fitted estimator.
model.fit(X_train, y_train)

In [None]:
# In-sample error of the baseline, in the target's unit (minutes).
train_pred = model.predict(X_train)
# The scikit-learn signature is (y_true, y_pred). RMSE is symmetric, so the
# value is unchanged, but the documented order avoids confusion and stays
# correct if an asymmetric metric is swapped in later.
root_mean_squared_error(y_train, train_pred)

In [None]:
# Hold-out error of the baseline on the following month's data (minutes).
test_pred = model.predict(X_test)
# Arguments follow the documented (y_true, y_pred) order; the RMSE value is
# identical to the swapped call because the metric is symmetric.
root_mean_squared_error(y_test, test_pred)

In [None]:
# Mean and standard deviation of the hold-out predictions, as a quick check
# that the model is not predicting a near-constant value.
test_pred.mean(), test_pred.std()