In [1]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import classification_report

In [6]:
# Synthetic binary classification problem with a requested 80/20 class weighting.
# random_state pins the generated samples so that "Restart Kernel -> Run All"
# produces the same data (and therefore comparable metrics) on every run.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_classes=2,
    weights=[0.8, 0.2],
    n_informative=10,
    n_redundant=10,
    random_state=42,
)

In [7]:
# Per-class sample counts of the generated labels (shows the class imbalance).
np.unique(y, return_counts=True)

(array([0, 1]), array([800, 200], dtype=int64))

In [8]:
# Hold out 20% of the samples for evaluation. stratify=y keeps the class ratio
# the same in the train and test splits; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

In [9]:
# Baseline: logistic regression trained on the original training split.
# The hyperparameters are kept in a dict so the same mapping can be logged later.
lr_params = {
    "solver": "lbfgs",
    "max_iter": 1000,
    "multi_class": "auto",
    "random_state": 42,
}

# fit() returns the fitted estimator, so construction and training can be chained.
lr = LogisticRegression(**lr_params).fit(X_train, y_train)

pred = lr.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.95       160
           1       0.86      0.75      0.80        40

    accuracy                           0.93       200
   macro avg       0.90      0.86      0.88       200
weighted avg       0.92      0.93      0.92       200



In [15]:
# Same report as above, but as a nested dict so individual metrics can be
# looked up by key (e.g. report_dict['1']['recall']).
report_dict = classification_report(y_test, pred, output_dict=True)
report_dict

{'0': {'precision': 0.9393939393939394,
  'recall': 0.96875,
  'f1-score': 0.9538461538461539,
  'support': 160.0},
 '1': {'precision': 0.8571428571428571,
  'recall': 0.75,
  'f1-score': 0.7999999999999999,
  'support': 40.0},
 'accuracy': 0.925,
 'macro avg': {'precision': 0.8982683982683983,
  'recall': 0.859375,
  'f1-score': 0.8769230769230769,
  'support': 200.0},
 'weighted avg': {'precision': 0.922943722943723,
  'recall': 0.925,
  'f1-score': 0.923076923076923,
  'support': 200.0}}

## MLFlow

In [11]:
# Print the path of the Python interpreter the kernel is running on.
import sys
print(sys.executable)

C:\Users\User\AppData\Local\Programs\Python\Python311\python.exe


In [12]:
# import sys
# !{sys.executable} -m pip install mlflow
import mlflow

In [13]:
# Overall accuracy is stored under a top-level key of the report dict.
report_dict['accuracy']

0.925

In [14]:
# Per-class metrics are nested under the class label (as a string key).
report_dict['0']['recall']

0.96875

In [20]:
# Select the tracking server BEFORE the experiment: set_experiment() looks up
# (or creates) the experiment on whichever tracking URI is active when it is
# called. With the original order, a fresh kernel would resolve the experiment
# against the default store instead of the server at 127.0.0.1:5000.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("First Experiment")

# One run = hyperparameters + evaluation metrics + the fitted model artifact.
with mlflow.start_run():
    mlflow.log_params(lr_params)
    mlflow.log_metrics({
        "accuracy": report_dict['accuracy'],
        "precision_class_0": report_dict['0']['precision'],
        "precision_class_1": report_dict['1']['precision'],
        "recall_class_0": report_dict['0']['recall'],
        "recall_class_1": report_dict['1']['recall'],
    })
    mlflow.sklearn.log_model(lr, "Logistic Regression")

2024/08/15 01:50:54 INFO mlflow.tracking.fluent: Experiment with name 'First Experiment' does not exist. Creating a new experiment.
2024/08/15 01:51:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run legendary-hound-399 at: http://127.0.0.1:5000/#/experiments/271576795570987654/runs/6e4d77b18c76474282c89074594ef9cd.
2024/08/15 01:51:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/271576795570987654.


## Training more models to log into MLFlow

In [22]:
# Random forest trained on the original training split.
rf_params = {
    "n_estimators": 500,
    "criterion": "gini",
    "bootstrap": True,
}

# fit() returns the fitted estimator, so construction and training can be chained.
rf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

rf_pred = rf.predict(X_test)
print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           0       0.94      0.99      0.96       160
           1       0.94      0.75      0.83        40

    accuracy                           0.94       200
   macro avg       0.94      0.87      0.90       200
weighted avg       0.94      0.94      0.94       200



In [23]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier trained on the original training split.
knn_params = {"n_neighbors": 20}

# fit() returns the fitted estimator, so construction and training can be chained.
knn = KNeighborsClassifier(**knn_params).fit(X_train, y_train)

knn_pred = knn.predict(X_test)
print(classification_report(y_test, knn_pred))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95       160
           1       0.96      0.60      0.74        40

    accuracy                           0.92       200
   macro avg       0.93      0.80      0.84       200
weighted avg       0.92      0.92      0.91       200



In [24]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained on the original training split.
svm_params = {
    "kernel": "rbf",
    "C": 1.0,
}

# fit() returns the fitted estimator, so construction and training can be chained.
svm = SVC(**svm_params).fit(X_train, y_train)

svm_pred = svm.predict(X_test)
print(classification_report(y_test, svm_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       160
           1       0.97      0.78      0.86        40

    accuracy                           0.95       200
   macro avg       0.96      0.88      0.92       200
weighted avg       0.95      0.95      0.95       200



In [25]:
# Class counts of the full label vector, shown again before resampling.
np.unique(y, return_counts=True)

(array([0, 1]), array([800, 200], dtype=int64))

In [26]:
from imblearn.combine import SMOTETomek

# Resample the TRAINING split only (the test split is left untouched), then
# display the per-class counts of the resampled labels.
smt = SMOTETomek(random_state=42)
X_train_sampled, y_train_sampled = smt.fit_resample(X_train, y_train)
np.unique(y_train_sampled, return_counts=True)

(array([0, 1]), array([639, 639], dtype=int64))

In [39]:
!{sys.executable} -m pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
     -------------------------------------- 124.9/124.9 MB 1.2 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.1.1



[notice] A new release of pip available: 22.3.1 -> 24.2
[notice] To update, run: C:\Users\User\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


In [42]:
import xgboost as xgb

# XGBoost classifier trained on the SMOTETomek-resampled training data and
# evaluated on the untouched test split.
# fit() returns the fitted estimator, so construction and training can be chained.
xgb_classifier = xgb.XGBClassifier(eval_metric='logloss').fit(
    X_train_sampled, y_train_sampled
)

xgb_pred = xgb_classifier.predict(X_test)
print(classification_report(y_test, xgb_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       160
           1       0.90      0.88      0.89        40

    accuracy                           0.95       200
   macro avg       0.93      0.93      0.93       200
weighted avg       0.95      0.95      0.95       200



## Track Experiments Using MLFlow

In [45]:
# Candidate models to compare. Each entry is a 4-tuple:
#   (run name, unfitted estimator, (X_train, y_train), (X_test, y_test))
# Only the XGBoost entry trains on the SMOTETomek-resampled data.
models = [
    ("Logistic Regression",
     LogisticRegression(C=1, solver="liblinear", max_iter=500),
     (X_train, y_train), (X_test, y_test)),
    ("Random Forest",
     RandomForestClassifier(criterion="entropy", bootstrap=True, n_estimators=500),
     (X_train, y_train), (X_test, y_test)),
    ("KNN",
     KNeighborsClassifier(n_neighbors=20),
     (X_train, y_train), (X_test, y_test)),
    ("SVM",
     SVC(C=1.0, kernel="rbf"),
     (X_train, y_train), (X_test, y_test)),
    ("XGBoost with SMOTETomek",
     xgb.XGBClassifier(eval_metric='logloss'),
     (X_train_sampled, y_train_sampled), (X_test, y_test)),
]

In [46]:
# Fit every candidate and collect its classification report (as a dict), in the
# same order as `models`.
#
# The per-model data is unpacked into loop-local names (X_fit, y_fit, X_eval,
# y_eval). The original loop assigned to X_train / y_train / X_test / y_test,
# which overwrote the notebook-level splits: after the last entry (XGBoost),
# X_train and y_train silently pointed at the resampled data, so any later cell
# using them trained on the wrong set.
reports = []

for model_name, model, (X_fit, y_fit), (X_eval, y_eval) in models:
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    report = classification_report(y_eval, y_pred, output_dict=True)
    reports.append(report)

In [47]:
# Display the collected report dicts (one per entry in `models`).
reports

[{'0': {'precision': 0.9393939393939394,
   'recall': 0.96875,
   'f1-score': 0.9538461538461539,
   'support': 160.0},
  '1': {'precision': 0.8571428571428571,
   'recall': 0.75,
   'f1-score': 0.7999999999999999,
   'support': 40.0},
  'accuracy': 0.925,
  'macro avg': {'precision': 0.8982683982683983,
   'recall': 0.859375,
   'f1-score': 0.8769230769230769,
   'support': 200.0},
  'weighted avg': {'precision': 0.922943722943723,
   'recall': 0.925,
   'f1-score': 0.923076923076923,
   'support': 200.0}},
 {'0': {'precision': 0.9294117647058824,
   'recall': 0.9875,
   'f1-score': 0.9575757575757576,
   'support': 160.0},
  '1': {'precision': 0.9333333333333333,
   'recall': 0.7,
   'f1-score': 0.8,
   'support': 40.0},
  'accuracy': 0.93,
  'macro avg': {'precision': 0.9313725490196079,
   'recall': 0.84375,
   'f1-score': 0.8787878787878789,
   'support': 200.0},
  'weighted avg': {'precision': 0.9301960784313726,
   'recall': 0.93,
   'f1-score': 0.9260606060606061,
   'support':

In [57]:
# Select the tracking server BEFORE the experiment: set_experiment() resolves
# (or creates) the experiment on whichever tracking URI is active when called.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("Test Experiments")

# One MLflow run per model. `reports` was built in the same order as `models`,
# so the two are paired positionally.
for elements, report in zip(models, reports):
    model_name, model = elements[0], elements[1]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_name", model_name)
        # Metrics come from THIS model's report. The original read precision
        # and recall from `report_dict` (the first logistic-regression report),
        # so every run was logged with identical per-class values.
        mlflow.log_metrics({
            "accuracy": report['accuracy'],
            "precision_class_0": report['0']['precision'],
            "precision_class_1": report['1']['precision'],
            "recall_class_0": report['0']['recall'],
            "recall_class_1": report['1']['recall'],
        })

        # XGBoost models are logged with the xgboost flavor, everything else
        # with the sklearn flavor.
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")

2024/08/15 02:54:46 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/305430055493993902/runs/b142b1aff2d7472ead9dca9a4654f0bd.
2024/08/15 02:54:46 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/305430055493993902.
2024/08/15 02:54:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/305430055493993902/runs/ee7a313149804c6aadd5ef5768c6e3c7.
2024/08/15 02:54:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/305430055493993902.
2024/08/15 02:55:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNN at: http://127.0.0.1:5000/#/experiments/305430055493993902/runs/3ad787df500643a5ba21c7c126280624.
2024/08/15 02:55:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/305430055493993902.

## Now, add a parameter dictionary for each individual model

In [72]:
# Candidate models, now with hyperparameters kept separately so they can be
# logged. Each entry is a 5-tuple:
#   (run name, params dict, default-constructed estimator,
#    (X_train, y_train), (X_test, y_test))
# Only the XGBoost entry trains on the SMOTETomek-resampled data.
models = [
    ("Logistic Regression",
     {"C":1, "solver":"liblinear", "max_iter":500},
     LogisticRegression(),
     (X_train, y_train), (X_test, y_test)),
    ("Random Forest",
     {"criterion":"entropy", "bootstrap":True, "n_estimators":500},
     RandomForestClassifier(),
     (X_train, y_train), (X_test, y_test)),
    ("KNN",
     {"n_neighbors":20},
     KNeighborsClassifier(),
     (X_train, y_train), (X_test, y_test)),
    ("SVM",
     {"C":1.0, "kernel":"rbf"},
     SVC(),
     (X_train, y_train), (X_test, y_test)),
    ("XGBoost with SMOTETomek",
     {"eval_metric":'logloss'},
     xgb.XGBClassifier(),
     (X_train_sampled, y_train_sampled), (X_test, y_test)),
]

In [73]:
# Apply each model's hyperparameters, fit it, and collect its classification
# report (as a dict) in the same order as `models`.
#
# The per-model data is unpacked into loop-local names (X_fit, y_fit, X_eval,
# y_eval). The original loop assigned to X_train / y_train / X_test / y_test,
# which overwrote the notebook-level splits: after the last entry (XGBoost),
# X_train and y_train silently pointed at the resampled data.
reports = []

for model_name, params, model, (X_fit, y_fit), (X_eval, y_eval) in models:
    model.set_params(**params)
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_eval)
    report = classification_report(y_eval, y_pred, output_dict=True)
    reports.append(report)

In [75]:
# Select the tracking server BEFORE the experiment: set_experiment() resolves
# (or creates) the experiment on whichever tracking URI is active when called.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")
mlflow.set_experiment("New Experiments")

# One MLflow run per model. `reports` was built in the same order as `models`,
# so the two are paired positionally.
for elements, report in zip(models, reports):
    model_name, params, model = elements[0], elements[1], elements[2]

    with mlflow.start_run(run_name=model_name):
        mlflow.log_params(params)
        mlflow.log_metrics({
            "accuracy": report['accuracy'],
            "precision_class_0": report['0']['precision'],
            "precision_class_1": report['1']['precision'],
            "recall_class_0": report['0']['recall'],
            "recall_class_1": report['1']['recall'],
        })

        # XGBoost models are logged with the xgboost flavor, everything else
        # with the sklearn flavor.
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")

2024/08/15 18:50:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://127.0.0.1:5000/#/experiments/532443340321630467/runs/32be8c8fba46405191ecb4fbe34a0928.
2024/08/15 18:50:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/532443340321630467.
2024/08/15 18:50:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://127.0.0.1:5000/#/experiments/532443340321630467/runs/f1313c9c3dae4996b668e308049b7afa.
2024/08/15 18:50:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/532443340321630467.
2024/08/15 18:50:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNN at: http://127.0.0.1:5000/#/experiments/532443340321630467/runs/fe5a43f735bb417ea68d666f6e93aff9.
2024/08/15 18:50:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/532443340321630467.

## REGISTER THE MODEL TO MODEL REGISTRY

model uri should be in this format

`runs:/{run_id}/model`

You can choose any name for the registered model. Here the run ID is read from user input, but you can also pass it directly.

In [78]:
# Register the model artifact logged under "model" in the given run.
model_name = "XGBoost-SMOTETomek"

# The run ID is typed or pasted in; strip stray whitespace so it does not end
# up inside the model URI, and fail early with a clear message if it is empty.
model_run_id = input("Enter run id: ").strip()
if not model_run_id:
    raise ValueError("A non-empty MLflow run id is required to register the model.")
model_uri = f"runs:/{model_run_id}/model"

result = mlflow.register_model(
    model_uri=model_uri, name=model_name
)

Enter run id: 0e6dfbc042ca4666b0976a4f3e734de9


Successfully registered model 'XGBoost-SMOTETomek'.
2024/08/15 19:57:41 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost-SMOTETomek, version 1
Created version '1' of model 'XGBoost-SMOTETomek'.


## LOAD MLFLOW MODEL

Let's say we want to load our Challenger model. Since it is an XGBoost model, we load it with `mlflow.xgboost`; if it were a scikit-learn model, we would use `mlflow.sklearn`.

In [82]:
# Load a registered model by explicit version number, using a
# "models:/<name>/<version>" URI, then predict on the test split.
model_version = 1
model_name = "XGBoost-SMOTETomek"
model_uri = f"models:/{model_name}/{model_version}"

loaded_model = mlflow.xgboost.load_model(model_uri=model_uri)

pred = loaded_model.predict(X_test)
# Show only the first five predictions.
pred[:5]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([0, 0, 0, 0, 1])

In [83]:
# Load the registered model through the "challenger" alias
# ("models:/<name>@<alias>") instead of a version number.
# NOTE(review): the alias is not assigned anywhere in this notebook —
# presumably it was set in the MLflow UI; confirm before a fresh run.
# model_version is not used by the alias-based URI below.
model_version = 1
model_name = "XGBoost-SMOTETomek"
model_uri = f"models:/{model_name}@challenger"

loaded_model = mlflow.xgboost.load_model(model_uri=model_uri)

pred = loaded_model.predict(X_test)
# Show only the first five predictions.
pred[:5]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([0, 0, 0, 0, 1])

## Put the Model from Development Environment to Production Environment

In [85]:
# Copy the model version behind the "challenger" alias into a separate
# registered model named "XGBoost-production".
development_model_uri = f"models:/{model_name}@challenger"
production_model = "XGBoost-production"

client = mlflow.MlflowClient()
client.copy_model_version(src_model_uri=development_model_uri, dst_name=production_model)

Successfully registered model 'XGBoost-production'.
Copied version '1' of model 'XGBoost-SMOTETomek' to version '1' of model 'XGBoost-production'.


<ModelVersion: aliases=[], creation_timestamp=1723732892729, current_stage='None', description='', last_updated_timestamp=1723732892729, name='XGBoost-production', run_id='0e6dfbc042ca4666b0976a4f3e734de9', run_link='', source='models:/XGBoost-SMOTETomek/1', status='READY', status_message='', tags={}, user_id='', version='1'>

## Download the production model and test it

In [86]:
# Load the production model through its "champion" alias and predict on the
# test split.
# NOTE(review): the "champion" alias is not assigned anywhere in this notebook —
# presumably it was set in the MLflow UI; confirm before a fresh run.
model_uri = f"models:/{production_model}@champion"
loaded_model = mlflow.xgboost.load_model(model_uri=model_uri)

pred = loaded_model.predict(X_test)
# Show only the first five predictions.
pred[:5]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

array([0, 0, 0, 0, 1])

## Tracking but on Dagshub server
1. First, create a GitHub repo.
2. Upload the notebook in which you did the MLflow experiment tracking.
3. Go to DagsHub and create an account.
4. Create a new repo in DagsHub, or connect directly to the repo you made on GitHub.
5. In your DagsHub repo, click `Remote`, then `Experiments`, then copy the `Using MLflow tracking` code and paste it as shown below.
6. Set the credentials as environment variables.
7. Log all the details the same way as before, but use the DagsHub link as the tracking `uri`.