In [29]:
# Print the version of the `python` executable found on the shell PATH.
# NOTE(review): this is presumably the same interpreter as the kernel — confirm.
!python -V

Python 3.10.9


# Homework week 1
The goal of this homework is to train a simple model for predicting the duration of a ride - similar to what we did in the module of the first week.

In [30]:
import pandas as pd
import statistics

In [31]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

## Q1. Downloading the data
Download the data of the "Yellow Taxi Trip Records" for January and February 2022.  
Read the data for January. How many columns are there?

* 16
* 17
* 18
* 19

In [32]:
# Load the January 2022 trip records from the local data folder.
january_trips_path = './data/yellow_tripdata_2022-01.parquet'
df = pd.read_parquet(january_trips_path)

In [33]:
# The second entry of `shape` is the number of columns.
n_columns = df.shape[1]
print(f"There are {n_columns} columns.")

There are 19 columns.


In [34]:
# Show the dtype of every column; as the cell's last expression it is rendered as the output.
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

## Q2. Computing duration
Now let's compute the duration variable. It should contain the duration of a ride in minutes.  
What's the standard deviation of the trip durations in January?

* 41.45
* 46.45
* 51.45
* 56.45

In [35]:
# Ride duration in minutes: drop-off minus pick-up gives a timedelta column, which is
# converted with the vectorised `.dt` accessor instead of a Python-level call per row.
ride_length = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
df['duration'] = ride_length.dt.total_seconds() / 60

In [36]:
# Sample standard deviation of the durations (pandas defaults to ddof=1, the same
# estimator as `statistics.stdev`), computed vectorised rather than by iterating the
# Series element by element in pure Python.
duration_std = df.duration.std()
print(f"The standard deviation ot the trips in January is {round(duration_std, 2)}.")

The standard deviation ot the trips in January is 46.45.


## Q3. Dropping outliers
Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).  
  
What fraction of the records is left after you drop the outliers?

* 90%
* 92%
* 95%
* 98%

In [37]:
# Row count before filtering, kept so the retained share can be reported afterwards.
# NOTE: `df` is overwritten below, so re-running this cell on its own would take the
# count from the already-filtered frame.
df_num_rows_outliers = len(df)

# Keep only rides lasting from 1 to 60 minutes, both bounds included.
long_enough = df.duration >= 1
short_enough = df.duration <= 60
df = df[long_enough & short_enough].copy()

In [38]:
# Share of the original rows that survived the filter, as a whole-number percentage.
retained_pct = round(df.shape[0] / df_num_rows_outliers * 100)
print(f"The fraction ot the records left after dropping the outliers is {retained_pct}%.")

The fraction ot the records left after dropping the outliers is 98%.


## Q4. One-hot encoding
Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.  
1. Turn the dataframe into a list of dictionaries
2. Fit a dictionary vectorizer
3. Get a feature matrix from it  
  
What's the dimensionality of this matrix (number of columns)?

* 2
* 155
* 345
* 515
* 715

In [39]:
# Step 1: build one dictionary per ride from the two location-ID columns.
# The IDs are cast to strings first so the vectorizer treats them as categories
# (one column per distinct value) rather than as numeric features.
categorical = ['PULocationID', 'DOLocationID']
df[categorical] = df[categorical].astype(str)

location_features = df[categorical]
train_dicts = location_features.to_dict(orient='records')

# Steps 2 and 3: fit the dictionary vectorizer on the training dictionaries and
# obtain the encoded feature matrix in the same call.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [40]:
# Number of columns of the encoded training matrix.
n_features = X_train.shape[1]
print(f"The dimensionality of the feature matrix (number of columns) is {n_features}.")

The dimensionality of the feature matrix (number of columns) is 515.


## Q5. Training a model
Now let's use the feature matrix from the previous step to train a model.  
  
1. Train a plain linear regression model with default parameters
2. Calculate the RMSE of the model on the training data
3. What's the RMSE on train?

* 6.99
* 11.99
* 16.99
* 21.99

In [41]:
# 1. Train a plain linear regression model with default parameters
target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the same data the model was fitted on (training error).
y_pred = lr.predict(X_train)

# 2. Calculate the RMSE of the model on the training data.
# The square root is taken explicitly instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6, while this form works on
# every version.
rmse = mean_squared_error(y_train, y_pred) ** 0.5

# 3. What's the RMSE on train?
print(f"The RMSE on the training data is {round(rmse, 2)}.")

The RMSE on the training data is 6.99.


## Q6. Evaluating the model
Now let's apply this model to the validation dataset (February 2022).  
  
What's the RMSE on validation?

* 7.79
* 12.79
* 17.79
* 22.79

In [42]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for the duration model.

    The frame is read with ``pd.read_parquet``, a ``duration`` column (ride length
    in minutes) is derived from the pick-up and drop-off timestamps, rides outside
    the 1-60 minute range (bounds included) are dropped, and the two location-ID
    columns are cast to strings.

    Parameters
    ----------
    filename
        Path (or any other source accepted by ``pd.read_parquet``) of the file.
        It must provide the columns ``tpep_pickup_datetime``,
        ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, including the added ``duration`` column.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no Python-level call per row).
    ride_length = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = ride_length.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: the column assignment below then
    # writes to an independent frame instead of a slice of the original, which
    # would trigger pandas' SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [43]:
# Load and prepare the February 2022 records, used as the validation set.
february_trips_path = './data/yellow_tripdata_2022-02.parquet'
df_val = read_dataframe(february_trips_path)

In [44]:
# 1. Turn the validation dataframe into a list of dictionaries
val_dicts = df_val[categorical].to_dict(orient='records')

# 2. Encode them with the vectorizer that was already fitted on the training data.
# Only `transform` is called here, so the feature columns stay those of `X_train`.
X_val = dv.transform(val_dicts)

# 3. Apply the linear regression model to the validation dataset
y_val = df_val[target].values
y_pred = lr.predict(X_val)

# 4. Calculate the RMSE of the model on the validation data.
# The square root is taken explicitly instead of passing `squared=False`: that keyword
# was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_val = mean_squared_error(y_val, y_pred) ** 0.5

In [45]:
# Report the validation error rounded to two decimals.
rmse_val_rounded = round(rmse_val, 2)
print(f"The RMSE on the validation data is {rmse_val_rounded}.")

The RMSE on the validation data is 7.79.
