# Homework
The goal of this homework is to train a simple model for predicting the duration of a ride.

## Load required libraries

In [None]:
import pandas as pd
import numpy as np
import requests
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

# ml libraries
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import root_mean_squared_error

## Load data

In [None]:
# January 2023 trips: used to fit the model.
jan_df = pd.read_parquet(path='./data/yellow_tripdata_2023-01.parquet')
# February 2023 trips: used as the validation set.
feb_df = pd.read_parquet(path='./data/yellow_tripdata_2023-02.parquet')

### Q1. Downloading the data
- We'll use the same [NYC taxi dataset](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page), but instead of "Green Taxi Trip Records", we'll use "Yellow Taxi Trip Records".

- Download the data for January and February 2023.

- Read the data for January. How many columns are there?

In [None]:
# Q1: the second entry of .shape is the number of columns in the January frame.
column_count = jan_df.shape[1]
print(f'''
There are {column_count} columns.
''')

### Q2. Computing duration
- Now let's compute the duration variable. It should contain the duration of a ride in minutes.

- What's the standard deviation of the trip durations in January?

In [None]:
# Preview the first five January records to see the available columns.
jan_df.head(n=5)

In [None]:
# Build the duration column: dropoff minus pickup timestamp, converted from
# seconds to minutes.
trip_length = jan_df['tpep_dropoff_datetime'] - jan_df['tpep_pickup_datetime']
jan_df['duration(min)'] = trip_length.dt.total_seconds() / 60

# describe() returns the summary statistics; only the standard deviation is reported.
results = jan_df['duration(min)'].describe()
duration_std = results['std']
print(f'''
the std of the duration is {duration_std:.2f} minutes
''')

### Q3. Dropping outliers
- Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

- What fraction of the records is left after you drop the outliers?

In [None]:
# Keep only trips whose duration is between 1 and 60 minutes (both bounds
# inclusive); everything shorter or longer is treated as an outlier.
value_before = len(jan_df)
duration_in_range = jan_df['duration(min)'].between(1, 60)
# .copy() makes the filtered frame independent of the original, so later column
# assignments on jan_df do not trigger pandas' SettingWithCopyWarning.
jan_df = jan_df[duration_in_range].copy()
value_after = len(jan_df)
print(f'''
the numer of rows before removing outliers is {value_before}
The number of rows after removing outliers is {value_after}.
The percentage of rows removed is {100 * (value_before - value_after) / value_before:.2f}%
The percentage of rows kept is {100 * value_after / value_before:.2f}%
''')

### Q4. One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
- Fit a dictionary vectorizer
- Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

In [None]:
# One-hot encode the pickup and dropoff location IDs.
categorical = ['PULocationID', 'DOLocationID']
# Cast the IDs to strings so the vectorizer one-hot encodes them rather than
# treating them as numbers. astype() with a column mapping returns a new frame,
# which avoids assigning into the filtered slice (SettingWithCopyWarning).
jan_df = jan_df.astype({col: str for col in categorical})

# One dict per ride, e.g. {'PULocationID': '...', 'DOLocationID': '...'}
train_dic = jan_df[categorical].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dic)

# Number of features = number of columns in the resulting matrix
n_features = X_train.shape[1]
print(f'Number of features (columns) in the matrix: {n_features}')

### Q5. Training a model
Now let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters, where duration is the response variable
- Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [None]:
# Target: ride duration in minutes for the filtered January data.
y_train = jan_df['duration(min)'].values

# Fit a default linear regression on the one-hot features (fit returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

# Score the model on the same data it was trained on.
y_pred = lr.predict(X_train)
rmse = root_mean_squared_error(y_train, y_pred)
print(f'''
The RMSE of the model is {rmse:.2f} minutes
''')

### Q6. Evaluating the model
Now let's apply this model to the validation dataset (February 2023).

- What's the RMSE on validation?

In [None]:
# Calculate duration (in minutes) for the February data
feb_df['duration(min)'] = (feb_df['tpep_dropoff_datetime'] - feb_df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Filter outliers like we did with January data (1 to 60 minutes, inclusive).
# .copy() makes the filtered frame independent, so the column assignment below
# does not trigger pandas' SettingWithCopyWarning.
feb_df = feb_df[feb_df['duration(min)'].between(1, 60)].copy()

# Prepare features - convert the location IDs to string type
feb_df[categorical] = feb_df[categorical].astype(str)

# Create feature matrix using the DictVectorizer fitted on the January data
val_dict = feb_df[categorical].to_dict(orient='records')
X_val = dv.transform(val_dict)  # Note: using transform, not fit_transform

# Make predictions
y_val = feb_df['duration(min)'].values
y_pred = lr.predict(X_val)

# Calculate RMSE
val_rmse = root_mean_squared_error(y_val, y_pred)
print(f'RMSE on validation data: {val_rmse:.2f}')