In [None]:
# Print the version of the `python` executable that the shell resolves.
!python -V

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [None]:
import mlflow

# Point MLflow at a local SQLite-backed tracking store and select the
# experiment under which the runs of this notebook are recorded.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

## Load data

Data information :
- `lpep_pickup_datetime` : The date and time when the meter was engaged.
- `lpep_dropoff_datetime` : The date and time when the meter was disengaged.
- `PULocationID` : TLC Taxi Zone in which the taximeter was engaged. 
- `DOLocationID` : TLC Taxi Zone in which the taximeter was disengaged.
- `trip_distance` : The elapsed trip distance in miles reported by the taximeter.

In [None]:
%time df = pd.read_parquet('./data/green_tripdata_2021-01.parquet')
print(df.shape)
df.head(5)

## Data analysis & transformation

In [None]:
# Data analysis
# List the dtype of every column of the loaded frame.
df.dtypes

In [None]:
# Data transformation
# Trip duration as a timedelta: drop-off timestamp minus pick-up timestamp.
pickup_ts = df["lpep_pickup_datetime"]
dropoff_ts = df["lpep_dropoff_datetime"]
df["duration"] = dropoff_ts - pickup_ts
df["duration"]

In [None]:
# Pull out the first row's duration to inspect a single value.
td = df["duration"].iloc[0]
td

In [None]:
# Length of the sample duration `td` expressed in seconds.
td.total_seconds()

In [None]:
# Express the trip duration in minutes as a float.
# The value is recomputed from the two timestamp columns with the vectorized
# `.dt` accessor instead of a per-row lambda over the existing `duration`
# column, so re-running this cell yields the same result rather than failing
# on a column that has already been converted to floats.
df["duration"] = (
    df["lpep_dropoff_datetime"] - df["lpep_pickup_datetime"]
).dt.total_seconds() / 60
df["duration"]

In [None]:
# Summary statistics of the duration, with extra percentiles at both tails.
tail_percentiles = [0.01, 0.02, 0.05, 0.95, 0.98, 0.99]
df["duration"].describe(percentiles=tail_percentiles)
# 99% of trips have a duration < 67.16 minutes
# we can keep duration values >= 1 and <= 60

In [None]:
# apply filter
# Keep only rows whose duration lies between 1 and 60 minutes (both bounds
# inclusive). `.copy()` makes the result an independent frame so that later
# column assignments on `df` do not trigger pandas' SettingWithCopyWarning.
df = df[df["duration"].between(1, 60)].copy()
# check distribution
df["duration"].describe()

In [None]:
# Kernel density estimate of the `duration` column.
sns.kdeplot(data=df, x="duration")

In [None]:
# selection of variables
# Count the distinct pick-up and drop-off zone IDs present in the frame.
df[["PULocationID", "DOLocationID"]].nunique()

In [None]:
# Number of rows per pick-up zone ID.
df["PULocationID"].value_counts()

In [None]:
# Overlay the distributions of pick-up and drop-off zone IDs on one axes.
# Each histogram gets an explicit `label`: without one, the legend call has no
# labelled artists to list and the legend comes out empty.
fig, ax = plt.subplots()
sns.histplot(data=df, x="PULocationID", label="PULocationID", ax=ax)
sns.histplot(data=df, x="DOLocationID", label="DOLocationID", color="orange", ax=ax)
# Both series share the x-axis, so name it for what they have in common.
ax.set_xlabel("location ID")
ax.legend()
plt.show()

In [None]:
# convert trip_distance unit from miles to km
# 1 mile = 1.609344 km, so the conversion multiplies by the factor; dividing
# by it would convert km to miles instead.
# NOTE: this rescales the column in place, so run the cell only once per load.
KM_PER_MILE = 1.609344
df["trip_distance"] = df["trip_distance"] * KM_PER_MILE
sns.kdeplot(data=df, x="trip_distance")

In [None]:
# Summary statistics of trip_distance, with extra percentiles at both tails.
distance_percentiles = [0.01, 0.02, 0.05, 0.95, 0.98, 0.99]
df["trip_distance"].describe(percentiles=distance_percentiles)
# variable trip_distance also contains outliers

In [None]:
# apply filter on trip_distance
# Drop non-positive distances and everything above the 99th percentile.
# The cutoff is stored under its own name rather than `max`, which would
# shadow Python's built-in `max()` for the rest of the kernel session.
# `.copy()` detaches the filtered frame so later column assignments are safe.
print(df.shape)
distance_cutoff = np.percentile(df.trip_distance, q=99)
df = df[(df["trip_distance"] > 0) & (df["trip_distance"] <= distance_cutoff)].copy()
print(df.shape)

In [None]:
# Kernel density estimate of the `trip_distance` column.
sns.kdeplot(data=df, x="trip_distance")

## Target & feature selection

In [None]:
# feature selection
# categorical variables:
# taken separately, the pick-up and drop-off zones do not really give
# information, so they are linked into a single feature that represents their
# interaction (kind of like a graph dependency) instead of using
# cat_cols = ["PULocationID", "DOLocationID"]
location_cols = ["PULocationID", "DOLocationID"]
# convert int to str so the two IDs can be concatenated
df[location_cols] = df[location_cols].astype(str)
pickup_id = df["PULocationID"]
dropoff_id = df["DOLocationID"]
df["PU_DO"] = pickup_id + "_" + dropoff_id
cat_cols = ["PU_DO"]

# numerical variables
num_cols = ["trip_distance"]

In [None]:
# apply one-hot-encoding
from sklearn.preprocessing import OneHotEncoder

# Encoder configured with handle_unknown="ignore".
encoder = OneHotEncoder(handle_unknown="ignore")

# test on "PULocationID": fit and encode that single column
pickup_only = df[["PULocationID"]]
Xnew = encoder.fit_transform(pickup_only)
Xnew

In [None]:
# define target and features
feature_cols = cat_cols + num_cols
X = df[feature_cols]
y = df["duration"]
y_log = np.log(y)  # natural log of the target

print(X.shape, y.shape, y_log.shape)

In [None]:
# Density of the raw target, then of its log transform, one figure each.
for target in (y, y_log):
    sns.kdeplot(target)
    plt.show()

In [None]:
# split train/val datasets
from sklearn.model_selection import train_test_split

# 80% of the rows go to training; the fixed seed makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=42,
    train_size=0.8,
)
for features, target in ((X_train, y_train), (X_valid, y_valid)):
    print(features.shape, target.shape)

In [None]:
# define data transformation
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns; numeric columns pass through as-is.
cat_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
num_step = ("num", "passthrough", num_cols)
ct = ColumnTransformer([cat_step, num_step])

# apply data transformation
# the transformer is fitted on the training split only and then reused,
# unchanged, on the validation split
X_train = ct.fit_transform(X_train)
X_valid = ct.transform(X_valid)

print(X_train.shape, X_valid.shape)

## Model training & evaluation

In [None]:
# model design

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Baseline: linear regression fitted on the training split and evaluated on
# the validation split.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_valid)

# Compare the distribution of the predictions with that of the true values.
sns.kdeplot(y_valid, label="true_values")
sns.kdeplot(y_pred, label="predictions")
plt.legend()
plt.show()

# scores
## r2
# scatter(x, y) receives the true values as x and the predictions as y, so
# the axis labels follow that same order.
plt.scatter(y_valid, y_pred, alpha=0.5)
plt.xlabel("true_values")
plt.ylabel("predictions")
# add y=x as the reference line of a perfect prediction
sample_y = np.arange(y_valid.min(), y_valid.max())
plt.plot(sample_y, sample_y, color="k")
plt.show()

r2 = r2_score(y_valid, y_pred)

## rmse
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print("r2 : ", r2)
print("rmse: ", rmse)

In [None]:
rf = RandomForestRegressor(max_depth=10, n_estimators=500)
%time rf.fit(X_train, y_train)

y_pred = rf.predict(X_valid)

# plt.scatter(y_pred, y_valid)
sns.kdeplot(y_valid, label="true_values")
sns.kdeplot(y_pred, label="predictions")
plt.legend()
plt.show()

# scores
## r2
plt.scatter(y_valid, y_pred, alpha=0.5)
plt.ylabel("true_values")
plt.xlabel("predictions")
# add y=x
sample_y = np.arange(y_valid.min(), y_valid.max())
plt.plot(sample_y, sample_y, color="k")
plt.show()

r2 = r2_score(y_valid, y_pred)

## rmse
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print("r2 : ", r2)
print("rmse: ", rmse)

## Tracking experiments with MLFlow

In [None]:
## level 1 : tracking manual runs
# A single MLflow run that records the tag, parameters and validation metrics
# of one Ridge regression fit.

with mlflow.start_run():
    mlflow.set_tag("developer", "elizabeth")

    mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
    mlflow.log_param("train_size", 0.8)

    # Regularisation strength of the Ridge model, logged with the run.
    alpha = 0.01
    mlflow.log_param("alpha", alpha)

    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_valid)

    # Compare the distribution of the predictions with that of the true values.
    sns.kdeplot(y_valid, label="true_values")
    sns.kdeplot(y_pred, label="predictions")
    plt.legend()
    plt.show()

    # scores
    ## r2
    # scatter(x, y) receives the true values as x and the predictions as y,
    # so the axis labels follow that same order.
    plt.scatter(y_valid, y_pred, alpha=0.5)
    plt.xlabel("true_values")
    plt.ylabel("predictions")
    # add y=x as the reference line of a perfect prediction
    sample_y = np.arange(y_valid.min(), y_valid.max())
    plt.plot(sample_y, sample_y, color="k")
    plt.show()

    r2 = r2_score(y_valid, y_pred)
    mlflow.log_metric("r2_score", r2)

    ## rmse
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print("r2 : ", r2)
    print("rmse: ", rmse)
    mlflow.log_metric("rmse", rmse)

In [None]:
## level 2 : tracking hyperparametrization of models (ex: randomforest)

from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    ParameterGrid,
)

# define parameter grid
param_grid = {
    "n_estimators": [100, 300, 500],
    "max_depth": [10, 20, 30],
    #'min_samples_split': [2, 5, 10],
    #'min_samples_leaf': [1, 2, 4],
    #'max_features': ['sqrt', 'log2'],
    #    'bootstrap': [True, False]
}

# Best candidate seen so far. Starting from +inf guarantees that the first
# fitted model becomes the initial best; without this initialisation the
# comparison at the end of the loop reads an undefined name.
best_rmse = float("inf")
best_rf = None

for params in ParameterGrid(param_grid):
    # define the model
    rf = RandomForestRegressor(**params, random_state=42)
    rf.fit(X_train, y_train)  # train on your training data

    # Score the model that was just fitted (`rf`) on the validation data.
    y_pred = rf.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    r2 = r2_score(y_valid, y_pred)

    # Log metrics and parameters in MLflow for each set of hyperparameter
    with mlflow.start_run(run_name="RF_GridSearchCV", nested=True):
        mlflow.set_tag("developer", "elizabeth")
        mlflow.set_tag("model", "randomforest")
        mlflow.log_param("train-data-path", "./data/green_tripdata_2021-01.parquet")
        mlflow.log_param("train_size", 0.8)
        mlflow.log_params(params)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2_score", r2)

    # save the best model (lowest validation RMSE)
    if rmse < best_rmse:
        best_rmse = rmse
        best_rf = rf

## Model & transformers save parameters

In [None]:
import pickle

In [None]:
%%time 
# Save models
# here we only save the models that were changed thanks to the experiment tracking
with open("models/lr.bin", "wb") as f_out:
    pickle.dump((ct, lr), f_out)

with open("models/best_rf.bin", "wb") as f_out:
    pickle.dump((ct, best_rf), f_out)
