In [1]:
!python3 -V

Python 3.12.3


In [2]:
from typing import Any, Callable, Optional

import numpy
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from typing_extensions import List

In [3]:
# Functions
def clean_data(
    df: pd.DataFrame,
    min_duration: float = 1,
    max_duration: float = 60,
) -> pd.DataFrame:
    """Return a copy of ``df`` keeping only rows whose duration is in range.

    Args:
        df: Frame with a numeric ``duration`` column.
        min_duration: Smallest duration to keep, inclusive. Defaults to 1.
        max_duration: Largest duration to keep, inclusive. Defaults to 60.
            Both bounds are in the unit of the ``duration`` column.

    Returns:
        A new DataFrame holding the rows with
        ``min_duration <= duration <= max_duration``. Rows with a missing
        duration are dropped. The result is a copy, so modifying it does not
        touch ``df``.
    """
    # between() is inclusive on both ends by default, matching `>=` / `<=`.
    in_range = df["duration"].between(min_duration, max_duration)
    return df[in_range].copy()

def prepare_dataframe(file: str, cleanup_function: Optional[Callable] = None) -> List[Any]:
    """Load a trip parquet file, add a ``duration`` column and optionally filter it.

    Args:
        file: Path or URL handed to ``pd.read_parquet``. The file must contain
            ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime`` columns.
        cleanup_function: Optional callable that receives the DataFrame (with
            ``duration`` already added) and returns the frame to keep. When
            omitted, no rows are removed.

    Returns:
        A five-element list, in this order:

        * the (optionally cleaned) DataFrame,
        * the number of columns in the file before ``duration`` was added,
        * the standard deviation of ``duration`` before cleaning,
        * the row count before cleaning,
        * the row count after cleaning.
    """
    df = pd.read_parquet(file)

    # Counted before 'duration' is added so the figure describes the raw file.
    df_number_columns = len(df.columns)

    # Vectorised timedelta -> minutes conversion; avoids one Python call per row.
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Both statistics describe the data *before* any outlier removal.
    trips_duration_std = df.duration.std()
    size_with_outliers = len(df)

    if cleanup_function is not None:
        df = cleanup_function(df)

    size_without_outliers = len(df)

    return [df, df_number_columns, trips_duration_std, size_with_outliers, size_without_outliers]

def vectorize(df: pd.DataFrame, dv: "DictVectorizer", fit: bool) -> List[Any]:
    """Build the feature matrix and target vector for a trips DataFrame.

    The features are the ``PULocationID`` and ``DOLocationID`` columns, cast to
    strings and passed to ``dv`` as one dict per row. The target is the
    ``duration`` column.

    Args:
        df: Frame containing ``PULocationID``, ``DOLocationID`` and
            ``duration``. It is not modified.
        dv: Vectorizer exposing ``fit_transform`` and ``transform``.
        fit: When True, ``dv`` is fitted on this data (``fit_transform``);
            when False, the already-fitted ``dv`` is only applied
            (``transform``).

    Returns:
        ``[x_data, y_data]``: the matrix produced by ``dv`` and the
        ``duration`` values as an array.
    """
    categorical = ['PULocationID', 'DOLocationID']

    # Cast on a derived frame so the caller's DataFrame keeps its original
    # dtypes; string values make the vectorizer treat the IDs as categories.
    data_dicts = df[categorical].astype(str).to_dict(orient='records')

    x_data = dv.fit_transform(data_dicts) if fit else dv.transform(data_dicts)

    target = 'duration'
    y_data = df[target].values

    return [x_data, y_data]

def train_model(x_data: Any, y_data: numpy.ndarray, model: Any, fit: bool) -> float:
    """Optionally fit ``model`` and return its RMSE on the given data.

    Args:
        x_data: Feature matrix accepted by ``model.fit`` / ``model.predict``
            (for example the matrix returned by ``vectorize``).
        y_data: Target values, one per row of ``x_data``.
        model: Estimator exposing ``fit(x, y)`` and ``predict(x)``.
        fit: When True, the model is fitted on ``x_data``/``y_data`` before
            predicting; when False, the model is used as-is, so the result is
            an evaluation score for an already-fitted model.

    Returns:
        The root mean squared error between ``y_data`` and the model's
        predictions for ``x_data``.
    """
    if fit:
        model.fit(x_data, y_data)

    y_pred = model.predict(x_data)

    return root_mean_squared_error(y_data, y_pred)

In [4]:
# January 2023 trips: read the file, derive durations, drop duration outliers
(
    df_jan,
    df_number_columns,
    trips_duration_std,
    size_with_outliers,
    size_without_outliers,
) = prepare_dataframe(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
    clean_data,
)

# Q1: column count of the file as loaded (before 'duration' is added)
print(f"Q1: The January 2023 file has {df_number_columns} columns.")

# Q2: standard deviation of the trip duration, measured before outlier removal
print(f"Q2: Standard deviation for the 'duration' column = {trips_duration_std}")

# Q3: share of rows remaining after the duration filter
print(f"Q3: The percentage left after dropping the outliers in 'duration' is {(size_without_outliers/size_with_outliers)*100:.0f}%")

# Model and vectorizer reused by the training and evaluation cells
lr = LinearRegression()
dv = DictVectorizer()


Q1: The January 2023 file has 19 columns.
Q2: Standard deviation for the 'duration' column = 42.59435124195458
Q3: The percentage left after dropping the outliers in 'duration' is 98%


In [5]:
# Training data: fit the vectorizer and the model on January, then score the
# fitted model on that same data
x_data, y_data = vectorize(df_jan, dv, fit=True)
rmse = train_model(x_data, y_data, lr, fit=True)

# Q4 - What's the dimensionality of this matrix (number of columns)?
matrix_dimension = x_data.shape[1]
print(f"Q4: The dimensionality of the matrix = {matrix_dimension}")

# Q5 - What's the RMSE on train?
print(f"Q5: What's the RMSE on train? (Jan) = {rmse}")

Q4: The dimensionality of the matric = 515
Q5: What's the RMSE on train? (Jan) = 7.649262236295703


In [6]:
# Load the file for February. Only the cleaned frame (the first element of the
# returned list) is needed here; indexing instead of unpacking into `_` leaves
# IPython's `_` (last output) variable untouched.
df_feb = prepare_dataframe(
    'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
    clean_data,
)[0]

In [7]:
# Evaluation data: apply the vectorizer and model from the training cell to
# February without refitting either (fit=False on both calls)
x_data, y_data = vectorize(df=df_feb, dv=dv, fit=False)
rmse = train_model(x_data=x_data, y_data=y_data, model=lr, fit=False)

# Q6 - RMSE on the February 2023 evaluation data
print(f"Q6: What's the RMSE on eval? (Feb) = {rmse}")


Q6: What's the RMSE on eval? (Feb) = 7.811812822882009
