1. AutoML built on top of sklearn and optuna [Clean-up]
2. Abstract MLflow and integrate it into the AutoML above. MLflow should store its artifacts directly in AWS S3 [Done]
3. [Opt] Add memory profiling of the code
4. Model serving
5. Enhance eval module
6. Generate a class to handle all storage functionality per user

Testing

In [2]:
import mlflow
import mlflow.sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Point the MLflow client at the tracking server (local instance on port 5000)
mlflow.set_tracking_uri("http://localhost:5000")

# Iris features and class labels
iris = datasets.load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Select the experiment the run below is recorded under
mlflow.set_experiment("iris_test")

# One tracked run: fit, score on the hold-out rows, then record metric and model
with mlflow.start_run():
    # fit() returns the estimator itself, so construction and training can be chained
    model = LogisticRegression(max_iter=200).fit(X_train, y_train)

    # Accuracy on the hold-out rows
    predictions = model.predict(X_test)
    acc = accuracy_score(y_test, predictions)

    # Record the score and the fitted estimator on the active run
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(model, "logistic_regression_model")

    print(f"Logged to MLflow with accuracy: {acc}")




Logged to MLflow with accuracy: 1.0
🏃 View run big-cow-401 at: http://localhost:5000/#/experiments/1/runs/826b09da3dc64f38a6eddfe36e4b9f8c
🧪 View experiment at: http://localhost:5000/#/experiments/1


In [3]:
from pathlib import Path

# Report the directory the kernel is currently running from
current_path = Path.cwd()
print(f"Current working directory: {current_path}")

Current working directory: c:\Users\dusad\Documents\Projects\agnei_consulting\mlutils\Notebooks


### Attempt downloading with Kaggle

In [1]:
from mlutils.utils.kaggle import fetch_kaggle_dataset
from mlutils.utils.io import find_git_root
# Target folder: the "data" directory directly under the repository root
_dir = find_git_root() / "data"

# Fetch the Telco churn dataset into that folder; the call's return value is the cell output
fetch_kaggle_dataset(
    dataset_name="blastchar/telco-customer-churn",
    path=_dir,
)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/blastchar/telco-customer-churn?dataset_version_number=1...


100%|██████████| 172k/172k [00:00<00:00, 189kB/s]

Extracting files...





'c:/Users/dusad/Documents/Projects/agnei_consulting/mlutils/data\\datasets\\blastchar\\telco-customer-churn\\versions\\1'

In [4]:
# Inspect the `_dir` path set earlier in the notebook (bare expression -> cell output)
_dir

WindowsPath('c:/Users/dusad/Documents/Projects/agnei_consulting/mlutils/data')

In [1]:
from mlutils.cloud.azure import upload_file_to_blob

# Destination blob path and the local file to send.
# NOTE(review): no container is passed here; presumably the helper picks it up
# from its own configuration - confirm.
blob_name = "folder/uploaded_file.txt"
local_file_path = "a.txt"

upload_file_to_blob(
    blob_name=blob_name,
    local_file_path=local_file_path,
)

File 'a.txt' uploaded to blob 'folder/uploaded_file.txt' in container 'expt'.


File a.txt uploaded to folder/your_file.txt.


In [4]:
from mlutils.cloud.gcp import upload_blob

# Upload target (bucket + object path inside it) and the local source file
bucket_name = "expt-mandrakebio"
destination_blob_name = "folder/your_file.txt"
local_file_path = "a.txt"

# NOTE(review): arguments are passed positionally; confirm the order
# (bucket, destination, local file) against upload_blob's signature.
upload_blob(
    bucket_name,
    destination_blob_name,
    local_file_path,
)

File a.txt uploaded to folder/your_file.txt.


In [11]:
# Import the submodule explicitly. A bare `import sklearn` is not guaranteed to
# load `sklearn.linear_model` on every scikit-learn version, so the lookup below
# could otherwise succeed only because an earlier cell had already imported it.
# This statement still binds the top-level name `sklearn`.
import sklearn.linear_model

# Resolve an estimator class from its name given as a string
getattr(sklearn.linear_model, 'LogisticRegression')

sklearn.linear_model._logistic.LogisticRegression

In [1]:
import sys

from mlutils.utils.io import find_git_root

# Directory named "config" directly under the repository root
_dir = find_git_root() / "config"

# Put it on the import path once only: the membership check keeps the cell
# idempotent, so re-running it does not pile up duplicate sys.path entries.
if str(_dir) not in sys.path:
    sys.path.append(str(_dir))

# Presumably resolvable thanks to the path entry added above - TODO confirm
import config

In [2]:
from config.binary_classifier import models_param_grid

In [None]:
from config.binary_classifier import models_param_grid


  from .autonotebook import tqdm as notebook_tqdm


{'LogisticRegression': {'param_grid': {'model__C': FloatDistribution(high=10.0, log=True, low=0.01, step=None),
   'model__penalty': CategoricalDistribution(choices=('l2',)),
   'model__solver': CategoricalDistribution(choices=('lbfgs', 'liblinear')),
   'model__max_iter': IntDistribution(high=1000, log=False, low=10, step=1)},
  'model': LogisticRegression()},
 'GradientBoostingClassifier': {'param_grid': {'model__n_estimators': IntDistribution(high=200, log=False, low=100, step=1),
   'model__learning_rate': FloatDistribution(high=0.2, log=True, low=0.01, step=None),
   'model__max_depth': IntDistribution(high=5, log=False, low=3, step=1)},
  'model': GradientBoostingClassifier()}}

In [4]:
from mlutils.utils.io import split_train_test, read_local_data
from mlutils.automl.train import model_tune
from mlutils.utils.config import clean_model_params, param_grid_fix

# Run the imported model/param-grid config through param_grid_fix for the
# 'optuna_search' mode; the result is indexed per model name further down.
HYPERPARAMS_DICT = param_grid_fix(models_param_grid, "optuna_search")

In [8]:
# Imported here so this cell does not rely on an earlier cell having done it
from mlutils.utils.io import find_git_root

# Anchor the CSV at the repository root instead of a "../data" relative path,
# so the cell no longer depends on the kernel's working directory.
DATA_CSV = (
    find_git_root()
    / "data"
    / "datasets"
    / "blastchar"
    / "telco-customer-churn"
    / "versions"
    / "1"
    / "WA_Fn-UseC_-Telco-Customer-Churn.csv"
)

# read_local_data is given the target column name; its result is unpacked as (X, y).
# The path is passed as a plain string, as before.
X, y = read_local_data(path=str(DATA_CSV), target_col='Churn')

# Tune each configured model in turn; every call is given the same MLflow experiment name
for model_name, model_spec in HYPERPARAMS_DICT.items():
    model = model_spec['model']
    param_grid = model_spec['param_grid']

    model_tune(X, y, model_name, model, param_grid, search_algo='bayesian', mlflow_expt_name='optuna-search-imbalanced')



2025/06/08 13:14:06 INFO mlflow.tracking.fluent: Experiment with name 'optuna-search-imbalanced' does not exist. Creating a new experiment.


---LogisticRegression----


  searcher = OptunaSearchCV(pipeline, param_grid, cv=5, scoring="recall", n_trials=50, n_jobs=-1)
[I 2025-06-08 13:14:16,747] A new study created in memory with name: no-name-0e1545da-07d5-4db8-a6d7-bb4442ef20ac
[I 2025-06-08 13:14:27,800] Trial 10 finished with value: 0.7807580824972129 and parameters: {'model__C': 0.33715777675176695, 'model__penalty': 'l2', 'model__solver': 'liblinear', 'model__max_iter': 945}. Best is trial 10 with value: 0.7807580824972129.
[I 2025-06-08 13:14:27,832] Trial 8 finished with value: 0.7814269788182833 and parameters: {'model__C': 0.2348576321023204, 'model__penalty': 'l2', 'model__solver': 'liblinear', 'model__max_iter': 678}. Best is trial 8 with value: 0.7814269788182833.
[I 2025-06-08 13:14:28,074] Trial 9 finished with value: 0.7874448160535117 and parameters: {'model__C': 0.01638797082412542, 'model__penalty': 'l2', 'model__solver': 'liblinear', 'model__max_iter': 64}. Best is trial 9 with value: 0.7874448160535117.
[I 2025-06-08 13:14:28,120] T

🏃 View run LogisticRegression at: http://localhost:5000/#/experiments/2/runs/e483770756ce49db9f7da721e8ccdcd9
🧪 View experiment at: http://localhost:5000/#/experiments/2
---GradientBoostingClassifier----


  searcher = OptunaSearchCV(pipeline, param_grid, cv=5, scoring="recall", n_trials=50, n_jobs=-1)
[I 2025-06-08 13:15:34,748] A new study created in memory with name: no-name-ccad7d8c-e15d-4407-bccc-0ea19eff8d20
[I 2025-06-08 13:16:16,382] Trial 0 finished with value: 0.7694024526198439 and parameters: {'model__n_estimators': 111, 'model__learning_rate': 0.015504088275512027, 'model__max_depth': 3}. Best is trial 0 with value: 0.7694024526198439.
[I 2025-06-08 13:16:32,711] Trial 10 finished with value: 0.5828807134894092 and parameters: {'model__n_estimators': 122, 'model__learning_rate': 0.12184851078903709, 'model__max_depth': 4}. Best is trial 0 with value: 0.7694024526198439.
[I 2025-06-08 13:16:36,727] Trial 3 finished with value: 0.7199397993311036 and parameters: {'model__n_estimators': 132, 'model__learning_rate': 0.018739194009980333, 'model__max_depth': 4}. Best is trial 0 with value: 0.7694024526198439.
[I 2025-06-08 13:16:37,750] Trial 9 finished with value: 0.616985507246

🏃 View run GradientBoostingClassifier at: http://localhost:5000/#/experiments/2/runs/ff5853f0a0ae4b4ab15019652912316a
🧪 View experiment at: http://localhost:5000/#/experiments/2


In [1]:
from mlutils.utils.mlflow_utils import fetch_model

# Name of the registered model to pull (swap in your own)
model_name = "test"

# Retrieve it through the project's MLflow helper, then show it as the cell output
model = fetch_model(model_name)
model

  from .autonotebook import tqdm as notebook_tqdm
2025-06-08 14:05:42,574 - mlflow-utils - INFO - Latest version: 1
2025-06-08 14:05:42,577 - mlflow-utils - INFO - Status: None
2025-06-08 14:05:42,579 - mlflow-utils - INFO - Artifact URI: s3://expt/mlflow/2/ff5853f0a0ae4b4ab15019652912316a/artifacts/model
Downloading artifacts: 100%|██████████| 5/5 [00:02<00:00,  2.21it/s]
