# Import dependencies

In [None]:
import pandas as pd
# import pickle
# %pip install pyarrow

# import seaborn as sns
# import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import root_mean_squared_error

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the 2023-01 trip file; this frame is used as the training set below.
df_train = pd.read_parquet("./data/yellow_tripdata_2023-01.parquet")

## Q1: Downloading the data
Read the data for January. How many columns are there?

In [None]:
# (n_rows, n_columns) of the January frame; the second entry answers Q1.
df_train.shape

## Q2. Computing duration
What's the standard deviation of the trip durations in January?

In [None]:
# Preview only the first rows instead of rendering the whole frame as cell output.
df_train.head()

In [None]:
categorical = ['PULocationID', 'DOLocationID']
numerical = []


def prepare_trips(trips: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``trips`` with parsed timestamps, a duration column and string ids.

    Parameters
    ----------
    trips : pd.DataFrame
        Frame containing ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``
        and the columns listed in ``categorical``.

    Returns
    -------
    pd.DataFrame
        New frame (the input is not modified) where
        - both timestamp columns have been passed through ``pd.to_datetime``,
        - ``duration`` holds drop-off minus pick-up, in minutes,
        - the ``categorical`` columns are cast to ``str``.
    """
    pickup = pd.to_datetime(trips["tpep_pickup_datetime"])
    dropoff = pd.to_datetime(trips["tpep_dropoff_datetime"])

    # .assign builds a new frame, so the column assignment below never writes
    # into a slice of another frame (no SettingWithCopyWarning).
    prepared = trips.assign(
        tpep_dropoff_datetime=dropoff,
        tpep_pickup_datetime=pickup,
        # Vectorised timedelta -> minutes instead of a per-row Python lambda.
        duration=(dropoff - pickup).dt.total_seconds() / 60,
    )
    prepared[categorical] = prepared[categorical].astype(str)
    return prepared


# df_train
# Not filtered here: the outlier filter for the training data is applied in a later cell.
df_train = prepare_trips(df_train)

# df_val
df_val = prepare_trips(pd.read_parquet("./data/yellow_tripdata_2023-02.parquet"))
# Keep only trips lasting between 1 and 60 minutes (inclusive on both ends).
df_val = df_val[df_val["duration"].between(1, 60)]

In [None]:
# Standard deviation of the trip duration (in minutes), rounded to 2 decimals.
# df_train has not been outlier-filtered at this point.
round(df_train.duration.std(), 2)

## Q3. Dropping outliers
There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive). What fraction of the records is left after you drop the outliers?

In [None]:
# Summary (row count, dtypes) of the training frame before the outlier filter in the next cell.
df_train.info()

In [None]:
# Drop outliers: keep trips whose duration is within [1, 60] minutes, bounds included.
is_valid_duration = df_train["duration"].between(1, 60)
df_train = df_train[is_valid_duration]
df_train.info()

In [None]:
# Share of records kept after dropping outliers, in percent.
# NOTE(review): the counts are presumably copied from the df_train.info() outputs
# before and after the filter — confirm they still match after a re-run.
n_before = 3066766
n_after = 3009173

# Fraction kept = after / before. Dividing the dropped count by n_after
# instead of n_before slightly understates the result.
pct_kept = n_after * 100 / n_before
pct_kept
# 98%

## Q4. One-hot encoding
Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
- Fit a dictionary vectorizer
- Get a feature matrix from it
- What's the dimensionality of this matrix (number of columns)?

In [None]:
# One record (dict) per trip, restricted to the feature columns.
feature_columns = categorical + numerical
train_dict = df_train[feature_columns].to_dict(orient="records")

# Fit the vectorizer on the training records and build the training feature matrix.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)

In [None]:
# Display the feature matrix returned by dv.fit_transform; its repr includes the shape.
X_train
# 515 columns

## Q5. Training a model
- Train a plain linear regression model with default parameters, where duration is the response variable
- Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [None]:
# Response variable: trip duration.
target = 'duration'
y_train = df_train[target].values

# Plain linear regression with default parameters; fit() returns the estimator itself.
lr = LinearRegression().fit(X_train, y_train)

# Predictions on the same data the model was fit on -> training RMSE.
y_pred = lr.predict(X_train)
print(root_mean_squared_error(y_train, y_pred))

## Q6. Evaluating the model
Apply this model to the validation dataset (February 2023).

What's the RMSE on validation?

In [None]:
# Encode the validation records with the vectorizer that was fitted on the
# training data. Re-fitting a new DictVectorizer here would produce a feature
# space that does not line up with the columns the model was trained on.
test_dict = df_val[categorical + numerical].to_dict(orient="records")
X_val = dv.transform(test_dict)
y_val = df_val[target].values

# Score the model's predictions for the validation set; the training
# predictions in y_pred belong to a different set of rows.
y_pred_val = lr.predict(X_val)
print(root_mean_squared_error(y_val, y_pred_val))