In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load data from feature store

In [12]:
# Locations of the serialized X and y arrays inside the local feature store
_FEATURE_STORE = "../feature_store"
x_path = _FEATURE_STORE + "/X.npy"
y_path = _FEATURE_STORE + "/y.npy"

In [13]:
# %pip install s3fs

In [14]:
from s3fs.core import S3FileSystem
import numpy as np
def get_s3(s3_path):
    """Load a NumPy array from S3 or from the local simulated feature store.

    Parameters
    ----------
    s3_path : str
        Either an ``s3://bucket/key`` URI (read through ``s3fs``) or a path
        on the local filesystem.

    Returns
    -------
    The object returned by ``np.load`` for that file.

    Notes
    -----
    ``allow_pickle=True`` lets ``np.load`` unpickle object arrays, which can
    execute arbitrary code. Only use this on files from a trusted source.
    """
    if isinstance(s3_path, str) and s3_path.startswith("s3://"):
        # Local import keeps the local-path branch usable on its own.
        from s3fs.core import S3FileSystem

        # The handle is closed on exit, so this suits .npy files, which
        # np.load reads eagerly. NOTE(review): .npz archives are loaded
        # lazily and would need the handle kept open - confirm if needed.
        with S3FileSystem().open(s3_path, "rb") as fh:
            return np.load(fh, allow_pickle=True)
    # Simulated feature store in a local folder
    return np.load(s3_path, allow_pickle=True)

In [16]:
# Read both arrays through the same loader, then report their dimensions
X, y = (get_s3(path) for path in (x_path, y_path))
print(X.shape, y.shape)

(42100, 17) (42100,)


In [17]:
# Keep 80% of the rows for training and hold out the rest for evaluation.
# A fixed random_state makes the split, and every metric computed from it,
# repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.80, random_state=42
)

# Using MLflow to track model training experiments

In [21]:
# %pip install mlflow

First, start an MLflow server locally at the endpoint `http://127.0.0.1:5000`. Run this command in a terminal to start the MLflow UI.

```
$ mlflow ui
```

Next, track the model output, metrics, and parameters of a training experiment.

In [25]:
import mlflow

In [35]:
# Direct the MLflow client at the tracking server running on localhost
tracking_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(tracking_uri)

In [36]:
# Make "hotel_occupancy" the active experiment. Kept as the last expression
# so the returned Experiment object is displayed as the cell output.
experiment_name = "hotel_occupancy"
mlflow.set_experiment(experiment_name)

2024/05/02 11:34:26 INFO mlflow.tracking.fluent: Experiment with name 'hotel_occupancy' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/369295605872091803', creation_time=1714624466629, experiment_id='369295605872091803', last_update_time=1714624466629, lifecycle_stage='active', name='hotel_occupancy', tags={}>

In [37]:
# Hyperparameters for the random forest. random_state pins the forest's
# randomness so repeated runs on the same data give the same model.
params = {
    "n_estimators": 120,
    "max_depth": 20,
    "n_jobs": -1,
    "random_state": 42,
}
clf = RandomForestClassifier(**params)

In [38]:
from sklearn.metrics import roc_auc_score, confusion_matrix

MLflow can track parameters, metrics, and artifacts within a context manager created by `mlflow.start_run()`.

In [39]:
with mlflow.start_run():
    # Log the training parameters first so they are attached to the run
    # even if fitting or evaluation fails part-way through.
    mlflow.log_params(params)

    clf = clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    # Column 1 of predict_proba is used as the score for the AUC
    pred_proba = clf.predict_proba(X_test)[:, 1]

    cm = confusion_matrix(y_test, preds)
    auc = roc_auc_score(y_test, pred_proba)
    # Unpacking into four counts assumes a binary (2x2) confusion matrix
    t_n, f_p, f_n, t_p = cm.ravel()

    # Track the evaluation metrics computed on the test split
    mlflow.log_metrics({"tn": t_n, 'fp': f_p, 'fn': f_n, 'tp': t_p, "auc": auc})

    # Log the fitted model as the artifact "rf_model". The input example is
    # stored with the model, so pass a few rows rather than the whole
    # test split.
    mlflow.sklearn.log_model(clf, 'rf_model', input_example=X_test[:5])