In [1]:
import mlflow
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier

In [2]:
data = fetch_covtype()
X, y = data.data, data.target

In [3]:
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y

In [4]:
df_sampled = df.sample(n=1000, random_state=42)

In [5]:
X_sampled = df_sampled.drop(columns=['target'])
y_sampled = df_sampled['target']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=42, stratify=y_sampled)

In [7]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, solver='saga'),
    "Decision Tree": DecisionTreeClassifier(max_depth=10),
    "CatBoost": CatBoostClassifier(verbose=0)
}

In [9]:
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"{name}: Accuracy = {acc:.4f}")

Logistic Regression: Accuracy = 0.7350
Decision Tree: Accuracy = 0.6600
CatBoost: Accuracy = 0.7800


In [10]:
# mlflow.set_tracking_uri("http://127.0.0.1:5000")
# mlflow.create_experiment("new_experiment")
# mlflow.set_experiment("new_experiment")

In [43]:
# Start MLflow experiment
mlflow.set_experiment("experiment_6")

<Experiment: artifact_location='file:///home/chen/Desktop/lectures/10x-ML-engineer/mlflow/mlruns/593170406082354387', creation_time=1738829665802, experiment_id='593170406082354387', last_update_time=1738829665802, lifecycle_stage='active', name='experiment_6', tags={}>

In [44]:
input_example = X_train[0]

In [45]:
input_example = input_example.reshape(1, -1)

In [None]:

results = {}
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        # Train model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)

        # Log model parameters
        if hasattr(model, "get_params"):
            params = model.get_params()
            mlflow.log_params(params)

        # Log additional metadata
        mlflow.log_param("model_name", name)
        mlflow.log_param("dataset_size", len(X_sampled))
        mlflow.log_param("test_size_ratio", 0.2)
        mlflow.log_param("feature_scaling", "StandardScaler")

        # Log metrics
        mlflow.log_metric("accuracy", acc)

        # Log model itself
        mlflow.sklearn.log_model(model, name, input_example=input_example)

        results[name] = acc
        print(f"{name}: Accuracy = {acc:.4f}")

print("\n✅ Experiment tracking complete. Run `mlflow ui` to view results.")


In [38]:
import mlflow
print(mlflow.get_tracking_uri())


file:///home/chen/Desktop/lectures/10x-ML-engineer/mlflow/mlruns


# mlflow Registry

In [19]:
mdl = LogisticRegression()
mdl.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [20]:
acc = accuracy_score(y_test, y_pred)
print(acc)

0.78


In [22]:
model = mdl

with mlflow.start_run():
    # Log model parameters (optional)
    mlflow.log_param("model_type", "logistic_regression")

    # Log the trained model
    model_uri = "logistic_regression"
    mlflow.sklearn.log_model(model, model_uri, input_example=input_example)

    # Register the model in the MLflow Model Registry
    result = mlflow.register_model(f"runs:/{mlflow.active_run().info.run_id}/{model_uri}", "logistic_regression")

print(f"Model registered as: {result.name}, version: {result.version}")


Model registered as: logistic_regression, version: 2


Registered model 'logistic_regression' already exists. Creating a new version of this model...
Created version '2' of model 'logistic_regression'.
