In [2]:
# MLflow setup: select the experiment and enable scikit-learn autologging.

import mlflow

# Make "w2-nyc-taxi-experiment" the active experiment for the runs that follow.
mlflow.set_experiment("w2-nyc-taxi-experiment")

# Disabled: would point MLflow at a local SQLite tracking store.
# NOTE(review): if this is re-enabled it presumably has to run before
# set_experiment() so the experiment is resolved in that store -- confirm.
# mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")

# Automatically record scikit-learn fits performed after this point.
mlflow.sklearn.autolog()

In [3]:
import pandas as pd
import numpy as np

# Raw yellow-taxi trips read from a local parquet file; the training set is
# derived from this frame (file name suggests January 2023 -- TODO confirm).
yellow_taxi = pd.read_parquet("./data/yellow-taxi-2301.parquet")

def duration_in_minutes(df):
    """Return the length of every trip in ``df``, expressed in minutes.

    The length is the drop-off timestamp minus the pick-up timestamp, taken
    from the ``tpep_dropoff_datetime`` and ``tpep_pickup_datetime`` columns.
    """
    one_minute = pd.Timedelta(minutes=1)
    elapsed = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    # Dividing a timedelta by a one-minute timedelta yields a plain number of minutes.
    return elapsed / one_minute

def prepare_data(df, samp: bool=False, frac: float=0.05, random_state: int=42):
    """Add a ``duration`` column and keep trips lasting 1 to 60 minutes.

    The input frame is left untouched: the ``duration`` column is added only
    to the filtered copy that is returned, so calling this function twice on
    the same frame gives the same result and the caller's data is not altered.

    Parameters
    ----------
    df : DataFrame
        Trips with ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
        columns (the columns read by ``duration_in_minutes``).
    samp : bool, default False
        When True, return a random sample of the filtered trips instead of
        all of them.
    frac : float, default 0.05
        Fraction of the filtered rows to keep when ``samp`` is True.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` so the sample is reproducible.

    Returns
    -------
    DataFrame
        A new frame holding the selected trips plus their ``duration`` in minutes.
    """
    duration = duration_in_minutes(df)

    # Both bounds are inclusive; rows with a missing duration compare False and are dropped.
    keep = (duration >= 1) & (duration <= 60)

    # Write the new column into the filtered copy, never into the caller's frame.
    filtered_df = df[keep].copy()
    filtered_df['duration'] = duration[keep]

    if samp:
        return filtered_df.sample(frac=frac, random_state=random_state)
    return filtered_df
    

# Training frame: trips filtered by prepare_data(), then randomly sampled (samp=True).
df = prepare_data(yellow_taxi, samp = True)

# Feature columns grouped by kind.
# NOTE(review): prepare_dict() does not read these module-level lists; it
# carries its own column names, so the two can drift apart.
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Cast both location ID columns ('PULocationID', 'DOLocationID') to str; otherwise
# DictVectorizer treats them as numeric features and does not one-hot encode them.

def prepare_dict(df, categorical=('PULocationID', 'DOLocationID'), numerical=('trip_distance',)):
    """Convert the feature columns of ``df`` into a list of per-row dicts.

    Categorical columns are cast to ``str`` so that a ``DictVectorizer`` fed
    with the result one-hot encodes them instead of treating them as numbers.
    The cast is applied to a selected copy; the caller's frame keeps its
    original dtypes.

    Parameters
    ----------
    df : DataFrame
        Frame containing every column named in ``categorical`` and ``numerical``.
    categorical : sequence of str, default ('PULocationID', 'DOLocationID')
        Columns whose values are emitted as strings.
    numerical : sequence of str, default ('trip_distance',)
        Columns whose values are emitted unchanged.

    Returns
    -------
    list of dict
        One ``{column: value}`` record per row, categorical columns first.
    """
    categorical = list(categorical)
    numerical = list(numerical)

    # Column selection and astype() both return new objects, so ``df`` is not modified.
    features = df[categorical + numerical].astype({col: str for col in categorical})

    # turn to dict
    return features.to_dict(orient='records')

# One feature dict per training row.
train_dicts = prepare_dict(df)

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()

# Fit the vectorizer on the training dicts and build the training feature matrix.
X_train = vec.fit_transform(train_dicts)
# Regression target: the 'duration' column produced by prepare_data().
y_train = df.duration.values

# Validation set: a second trip file (the URL names 2023-02), fetched over HTTP.
# NOTE(review): the training file is read from ./data while this one is
# downloaded each time the cell runs -- consider caching it locally.
yellow_taxi_2 = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet')

# Same filtering and sampling as for the training frame.
df2 = prepare_data(yellow_taxi_2, samp = True)

# transform() only: reuse the feature mapping learned from the training data.
X_val = vec.transform(prepare_dict(df2))
y_val = df2.duration.values

In [4]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Baseline model: a linear regression on the vectorized features.
reg = linear_model.LinearRegression()

# With sklearn autologging enabled earlier in the notebook, this fit is
# recorded as an MLflow run (see the log line printed below the cell).
reg.fit(X_train, y_train)

# Predicted durations for the validation features.
y_pred = reg.predict(X_val)

# Mean squared error on the validation set; durations are in minutes, so this
# is in squared minutes (it is the MSE, not the RMSE).
# NOTE(review): the value is only stored here; it is not displayed or logged in this cell.
mse = mean_squared_error(y_val, y_pred)

2024/11/08 17:10:02 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '8395f95f33514e04ba58c3fe7a792fc1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
