<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
!pip install mlflow -q
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn # Wrapper pour scikit-learn

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

sns.set(rc={'figure.figsize':(14,10)})

def mae(y_star, y_pred):
    return 1 / len(y_star) * np.sum(np.abs(y_star - y_pred))

In [7]:
data = None

print("Chargement des données ...")
for chunk in pd.read_csv(
    #"https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2019-03.csv",
    "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2018-01.csv",
    chunksize=1000000):
    data = chunk
    break
data['tpep_pickup_datetime'] = pd.to_datetime(data['tpep_pickup_datetime'])
data['tpep_dropoff_datetime'] = pd.to_datetime(data['tpep_dropoff_datetime'])

Chargement des données ...


In [8]:
# Création des colonnes liés au pickup
data['pickup_hour'] = data['tpep_pickup_datetime'].dt.hour
data['pickup_minute'] = data['tpep_pickup_datetime'].dt.minute
data['pickup_second'] = data['tpep_pickup_datetime'].dt.second
# Durée réelle du trajet en secondes
data['duration'] = (data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime']).dt.seconds
data_base = data[(data['duration'] < 3600) & (data['duration'] > 60)]

In [9]:
data_base = data_base.sample(frac=0.1, replace=True)

X = data_base[['trip_distance', 'PULocationID', 'DOLocationID', 'pickup_hour',
          'pickup_minute', 'pickup_second']]
y = data_base['duration']

# Construction de notre base d'apprentissage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
X_train.head()

Unnamed: 0,trip_distance,PULocationID,DOLocationID,pickup_hour,pickup_minute,pickup_second
456090,0.59,230,163,21,48,48
291909,1.13,170,107,9,40,11
925410,1.3,143,238,10,43,19
232651,3.28,230,137,23,13,55
63083,0.7,263,263,3,32,58


In [10]:
EXPERIMENT_ID = 1

n_estimators = 20
max_depth = 10

# Identification de l'interface MLflow
mlflow.set_tracking_uri("file://" + os.path.expanduser('~/mlruns'))

mlflow.set_experiment("data-engineer-duration")

with mlflow.start_run() as run:
    rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
    rf.fit(X_train, y_train)  # Processus d'optimisation de l'arbre
    
    mae_score = mae(y_test, rf.predict(X_test))
    
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    
    mlflow.log_metric("mae", mae_score)
    
    print(mlflow.get_artifact_uri())
    mlflow.sklearn.log_model(rf, "model")

INFO: 'data-engineer-duration' does not exist. Creating a new experiment
file://C:\Users\Megaport/mlruns/1/55743c6e7a5f4a9c884f1ea817971a80/artifacts


In [11]:
from mlflow.tracking import MlflowClient

client = MlflowClient(
    tracking_uri="file://" + os.path.expanduser('~/mlruns')
)

client.get_metric_history(run.info.run_id, key='mae')

[<Metric: key='mae', step=0, timestamp=1610901392665, value=176.21993538115157>]

In [12]:
def train_rf(n_estimators, max_depth):
    
    with mlflow.start_run() as run:
        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
        rf.fit(X_train, y_train)  # Processus d'optimisation de l'arbre

        mae_score = mae(y_test, rf.predict(X_test))

        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        mlflow.log_metric("mae", mae_score)
        mlflow.sklearn.log_model(rf, "model")
        
train_rf(10, 10)
train_rf(50, 15)
train_rf(100, 20)

In [13]:
!pip install google-cloud-storage -q

ERROR: google-api-core 1.25.0 has requirement google-auth<2.0dev,>=1.21.1, but you'll have google-auth 1.11.2 which is incompatible.


In [14]:
# Nouvel URI de l'interface MLflow
mlflow.set_tracking_uri("http://34.65.91.126:5000")

In [15]:
# Authentification à Google Cloud avec la clé correspondant au compte de service MLflow
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'storage-key.json'
from google.cloud import storage
client = storage.Client()

In [16]:
def plot_mae(X, y, model):
    """
    Il est aussi pertinent de logger les graphiques sous forme d'artifacts.
    """
    fig = plt.figure()
    plt.scatter(y, model.predict(X))
    plt.xlabel("Durée réelle du trajet")
    plt.ylabel("Durée estimée du trajet")
    
    image = fig
    fig.savefig("MAE.png")
    plt.close(fig)
    return image

def train_rf(n_estimators, max_depth):
    
    with mlflow.start_run(experiment_id=1):
        rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth)
        rf.fit(X_train, y_train)  # Processus d'optimisation de l'arbre

        mae_score = mae(y_test, rf.predict(X_test))
        plot_mae(X_test, y_test, rf)

        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        mlflow.log_metric("mae", mae_score)
        mlflow.log_artifact("MAE.png")
        mlflow.sklearn.log_model(rf, "model")
        
train_rf(10, 10)