# Homework

In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import LinearRegression

In [2]:
def clean(df, min_minutes=1, max_minutes=60):
  """Return a new frame with a ``duration`` column, restricted to in-range trips.

  The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime`` expressed
  in minutes. Its standard deviation, computed over all rows before any
  filtering, is printed as a side effect.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain the columns ``tpep_pickup_datetime`` and
    ``tpep_dropoff_datetime``. The frame is not modified.
  min_minutes, max_minutes : float, optional
    Inclusive bounds on the duration (in minutes) of the rows to keep.
    Defaults are 1 and 60.

  Returns
  -------
  pandas.DataFrame
    The rows of ``df`` whose duration lies within the bounds, with the
    original index labels preserved and a ``duration`` column added.
  """
  # Work on a standalone Series so the caller's frame is left untouched.
  duration = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']) / pd.Timedelta(minutes=1)

  # Reported over every row, i.e. before outliers are removed.
  print('Duration standard deviation', duration.std())

  # `between` is inclusive on both ends; rows with a missing duration are dropped.
  in_range = duration.between(min_minutes, max_minutes)

  # `assign` builds an independent frame, so later column assignments on the
  # result do not act on a slice of `df`. Values are attached positionally
  # (`to_numpy`) to stay correct even when the index has repeated labels.
  return df.loc[in_range].assign(duration=duration[in_range].to_numpy())


# Q1. Downloading the data

Read the data for January. How many columns are there?

In [3]:
# January 2023 trips: the data the model is trained on.
df_jan = pd.read_parquet('yellow_tripdata_2023-01.parquet')
print('Columns count:', df_jan.shape[1])

Columns count: 19


# Q2. Computing duration

Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the standard deviation of the trip durations in January?

In [4]:
# Row count before cleaning; used afterwards to work out how many records survive.
initial_count = df_jan.shape[0]

# clean() prints the standard deviation of the duration and returns only the
# trips lasting between 1 and 60 minutes. NOTE: `df_jan` is rebound to the
# filtered frame, so re-running this cell would record an already-filtered count.
df_jan = clean(df_jan)

Duration standard deviation 42.59435124195458


# Q3. Dropping outliers

Next, we need to check the distribution of the duration variable. There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive).

What fraction of the records is left after you drop the outliers?

In [5]:
# Share of the original records that remain after the outliers were removed.
# The value is scaled by 100, i.e. it is a percentage rather than a 0-1 fraction.
final_count = df_jan.shape[0]
fraction = final_count / initial_count * 100

print('Fraction after dropping outliers:', fraction)

Fraction after dropping outliers: 98.1220282212598


# Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

- Turn the dataframe into a list of dictionaries (remember to re-cast the ids to strings - otherwise it will label encode them)
- Fit a dictionary vectorizer
- Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

In [6]:
def dict_extract(df, categorical=('PULocationID', 'DOLocationID')):
  """Turn the categorical columns of ``df`` into a list of per-row dicts.

  Values are cast to ``str`` so that a ``DictVectorizer`` one-hot encodes
  them instead of treating them as numeric features.

  Parameters
  ----------
  df : pandas.DataFrame
    Must contain every column named in ``categorical``. The frame is not
    modified; the cast is applied to a selection of it.
  categorical : sequence of str, optional
    Columns to export. Defaults to the pickup and dropoff location IDs.

  Returns
  -------
  list of dict
    One ``{column: str(value)}`` mapping per row, in row order.
  """
  columns = list(categorical)

  # Cast on the selected columns only, never by assigning back into `df`:
  # the caller keeps its original dtypes and no SettingWithCopyWarning is
  # raised when `df` is itself a filtered slice.
  return df[columns].astype(str).to_dict(orient='records')

In [7]:
# One dict of stringified pickup/dropoff IDs per January trip.
dict_jan = dict_extract(df_jan)

# The vectorizer is fitted on the training month; `dv` is reused later to
# transform the validation month with the same feature columns.
dv = DictVectorizer()
X_train = dv.fit_transform(dict_jan)

# One matrix column per feature learned by the vectorizer.
print('Dimensionality of matrix', X_train.shape[1])

Dimensionality of matrix 515


# Q5. Training a model

Now let's use the feature matrix from the previous step to train a model.

- Train a plain linear regression model with default parameters, where duration is the response variable
- Calculate the RMSE of the model on the training data

What's the RMSE on train?

In [8]:
# Response variable: the cleaned January trip durations, in minutes.
y_train = df_jan['duration'].to_numpy()

# Plain linear regression with default parameters.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [9]:
# Predict on the same matrix the model was fitted on, so the value printed
# below is the training error, not an estimate of generalization error.
y_pred = lr.predict(X_train)
print('RMSE on train data:', root_mean_squared_error(y_train, y_pred))

RMSE on train data: 7.649261932106969


# Q6. Evaluating the model

Now let's apply this model to the validation dataset (February 2023).

What's the RMSE on validation?

In [10]:
# February 2023 trips: used as the validation set for the January model.
df_feb = pd.read_parquet('yellow_tripdata_2023-02.parquet')

In [11]:
# Same cleaning as for January; clean() also prints February's duration
# standard deviation.
df_feb = clean(df_feb)

# Validation target: the cleaned February durations.
y_val = df_feb['duration'].values

# Only transform here (no fit): the columns must match those learned from
# the January data.
dict_feb = dict_extract(df_feb)
X_val = dv.transform(dict_feb)

Duration standard deviation 42.84210176105113


In [12]:
# `y_pred` is reassigned here: from this point on it holds the validation
# predictions rather than the training ones.
y_pred = lr.predict(X_val)
print('RMSE on validation data:', root_mean_squared_error(y_val, y_pred))

RMSE on validation data: 7.811818743246608
