In [36]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole session.
pd.options.mode.chained_assignment = None

# Load the yellow_tripdata parquet files for 2023-01 and 2023-02 from the local Data/ folder.
df_Jan = pd.read_parquet("Data/yellow_tripdata_2023-01.parquet")
df_Feb = pd.read_parquet("Data/yellow_tripdata_2023-02.parquet")


### Question 1

In [37]:
# The number of column labels equals the second entry of df_Jan.shape.
n_columns = len(df_Jan.columns)
print(f"There are {n_columns} columns in this dataset.")

There are 19 columns in this dataset.


### Question 2

In [38]:
def get_duration(row : pd.Series) -> float:
    """Compute how long a single trip lasted, in minutes.

    Args:
        row (pd.Series): One record exposing the keys "tpep_dropoff_datetime"
            and "tpep_pickup_datetime"; their difference must provide
            ``total_seconds()``.

    Returns:
        float: Drop-off time minus pick-up time, converted from seconds to
            minutes (negative when the drop-off precedes the pick-up).
    """
    dropped_off = row["tpep_dropoff_datetime"]
    picked_up = row["tpep_pickup_datetime"]
    elapsed_seconds = (dropped_off - picked_up).total_seconds()
    return elapsed_seconds / 60.0

In [39]:
# Trip duration in minutes, computed column-wise: subtracting the two datetime
# columns yields a timedelta Series, so there is no Python-level call per row
# (DataFrame.apply(..., axis=1) is very slow on a frame of this size).
# Assumes both columns have a datetime64 dtype, as loaded from the parquet file.
df_Jan["duration"] = (
    df_Jan["tpep_dropoff_datetime"] - df_Jan["tpep_pickup_datetime"]
).dt.total_seconds() / 60.0

# For a single 1-D variable np.cov returns its variance; bias=False selects the
# (N - 1) denominator, i.e. the sample variance.
cov = np.cov(df_Jan.duration, bias = False)
print(f"The standard deviation of the trips in January is {np.sqrt(cov)}.")

The standard deviation of the trips in January is 42.59435124195483.


### Question 3

In [40]:
# Boolean mask of trips lasting between 1 and 60 minutes (both bounds inclusive).
rows = df_Jan["duration"].between(1, 60)
print(f"{rows.sum() / df_Jan.shape[0] * 100}% of the data is left")

# Take an explicit copy so that columns added to the subset afterwards are
# written to an independent frame rather than to a slice of df_Jan; this keeps
# pandas' chained-assignment check from firing on those assignments.
df_Jan_subset = df_Jan.loc[rows, :].copy()

98.1220282212598% of the data is left


### Question 4

In [41]:
from sklearn.feature_extraction import DictVectorizer

# DictVectorizer one-hot encodes string-valued features, so the location ids
# are cast to str before being turned into dictionaries.
df_Jan_subset['DOID'] = df_Jan_subset['DOLocationID'].astype(str)
df_Jan_subset['PUID'] = df_Jan_subset['PULocationID'].astype(str)

# One {'DOID': ..., 'PUID': ...} dictionary per trip.
X_dict = df_Jan_subset.loc[:, ['DOID', 'PUID']].to_dict(orient='records')

# Fit the vectorizer on the January records; it is kept in `transformer`
# so the same mapping can be applied to other data.
transformer = DictVectorizer()
X_train = transformer.fit_transform(X_dict)

n_features = X_train.shape[1]
print(f"The new matrix has {n_features} columns.")

The new matrix has 515 columns.


### Question 5

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

# Regression target: the trip duration column of the filtered January data.
y_train = df_Jan_subset['duration']

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# RMSE measured on the same data the model was fitted on (training error).
predictions = model.predict(X_train)
rmse = np.sqrt(mse(y_train, predictions))
print(f"The RMSE of our model is: {rmse}.")

The RMSE of our model is: 7.649261822035489.


### Question 6

In [44]:
# String-typed location ids, matching the feature names the vectorizer was fitted on.
df_Feb['DOID'] = df_Feb['DOLocationID'].astype(str)
df_Feb['PUID'] = df_Feb['PULocationID'].astype(str)

# Trip duration in minutes, computed column-wise instead of with a row-wise
# DataFrame.apply (one Python call per row is very slow on a frame of this size).
# Assumes both columns have a datetime64 dtype, as loaded from the parquet file.
df_Feb['duration'] = (
    df_Feb['tpep_dropoff_datetime'] - df_Feb['tpep_pickup_datetime']
).dt.total_seconds() / 60.0

# Same 1-60 minute window (inclusive) that was applied to the January data.
rows_test = (df_Feb.duration >= 1) & (df_Feb.duration <= 60)
df_Feb_subset = df_Feb.loc[rows_test, :]

# transform() (not fit_transform) reuses the already-fitted feature mapping.
X_test = transformer.transform(df_Feb_subset[['DOID', 'PUID']].to_dict('records'))
y_test = df_Feb_subset['duration']

test_predictions = model.predict(X_test)
test_rmse = np.sqrt(mse(y_test, test_predictions))
print(f"The test-RMSE is: {test_rmse}.")

The test-RMSE is: 7.811821332387183.
