In [1]:
# fixing directory to root of project
import git
import os
import sys

# Locate the enclosing git repository by searching upwards from the current
# directory, so the notebook can be launched from any sub-folder of the project.
repo = git.Repo(".", search_parent_directories=True)
# os.fspath normalises a path-like value to str; sys.path entries must be strings.
project_root = os.fspath(repo.working_tree_dir)

# Work from the repository root so relative paths resolve consistently.
os.chdir(project_root)
# Make the project's `src` package importable. The guard keeps this cell
# idempotent: re-running it no longer stacks duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

import pandas as pd
import numpy as np
from scipy.stats import uniform, loguniform, randint
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from src.utils.utils import int_loguniform
from src.modelling.pipeline.ml_pipeline import (
    preprocess_features,
    preprocess_target,
    FilterFeatures,
    model_pipeline,
    )


from sklearn.datasets import load_iris


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the iris dataset and assemble a single frame holding the feature
# columns followed by the class label in a "target" column.
iris = load_iris()

iris_columns = iris.feature_names + ['target']
# column_stack turns the 1-D target vector into a trailing column of the feature matrix.
iris_values = np.column_stack((iris.data, iris.target))
# Keep only the rows labelled 1 or 2, leaving a two-class problem.
iris_data = pd.DataFrame(iris_values, columns=iris_columns).loc[
    lambda frame: frame["target"].isin([2.0, 1.0])
]
iris_data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
50,7.0,3.2,4.7,1.4,1.0
51,6.4,3.2,4.5,1.5,1.0
52,6.9,3.1,4.9,1.5,1.0
53,5.5,2.3,4.0,1.3,1.0
54,6.5,2.8,4.6,1.5,1.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


In [3]:
# set the pipeline code running with a logistic regression and a decision tree classifier model
# see where it breaks, remove those bits of code temporarily and get the core pipeline working
# build some logic into the broken bits of the code so they run differently when a classification task is running, then get the code working again with all bits added back in
# code the classification logic

In [4]:
# Classification demo: configure the targets, models and optional controls,
# then run the project's model pipeline once per target variable.

# fixed seed for stochastic estimators so repeated runs of this cell are comparable
RANDOM_STATE = 42

# target variables
target_var_list = ["target"]
# drop any unnecessary variables from the model. Nothing extra is dropped in this demo.
drop_variables = []
# model dictionary and hyperparameter search space
# (an empty search space means only one candidate per model is evaluated)
model_param_dict = {
    LogisticRegression(): {},
    # random forests are randomised; seed the estimator for reproducible results
    RandomForestClassifier(random_state=RANDOM_STATE): {},
}
# optional controls:
# select features list - use to subset specific features of interest, if blank it will use all features.
# change feature_filter__filter_features hyperparam when using this
# NOTE(review): this list is not passed to model_pipeline below - confirm how it is meant to be consumed
select_features_list = []
# optional - user specified model for evaluation plots. e.g. user_model = "Lasso"
# if left blank, the best performing model will be used for the evaluation plots
user_model = ""
# shortened feature name label for evaluation plots
col_labels = {}

# run pipeline for all models
for target_var in target_var_list:
    # pre-processing
    # drop the target plus any user-specified columns; dict.fromkeys removes
    # duplicates while keeping a deterministic order (a set does not guarantee one)
    cols_to_drop = list(dict.fromkeys([target_var] + drop_variables))
    features = preprocess_features(df=iris_data, cols_to_drop=cols_to_drop)
    target_df = preprocess_target(df=iris_data, target_col=target_var)

    # run model pipeline
    model_pipeline(
        model_param_dict=model_param_dict,
        target_var=target_var,
        target_df=target_df,
        feature_df=features,
        # presumably an empty string means "no identifier column" - confirm against model_pipeline
        id_col="",
        original_df=iris_data,
        output_path="outputs",
        output_label="classification_demo",
        col_label_map=col_labels,
        user_evaluation_model=user_model,
    )


Number of cores available for parallel processing: 4
LogisticRegression




Fitting 5 folds for each of 1 candidates, totalling 5 fits


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1361.35it/s] 
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 925.08it/s] 
2025/03/31 16:19:02 INFO mlflow.sklearn.utils: Logging the 50 best runs, no runs will be omitted.


{'mean_fit_time': array([0.01045079]), 'std_fit_time': array([0.00703152]), 'mean_score_time': array([0.00650473]), 'std_score_time': array([0.00863728]), 'params': [{}], 'split0_test_f1_macro': array([1.]), 'split1_test_f1_macro': array([0.87301587]), 'split2_test_f1_macro': array([0.9372549]), 'split3_test_f1_macro': array([1.]), 'split4_test_f1_macro': array([0.9372549]), 'mean_test_f1_macro': array([0.94950514]), 'std_test_f1_macro': array([0.0474346]), 'rank_test_f1_macro': array([1]), 'split0_train_f1_macro': array([0.95311355]), 'split1_train_f1_macro': array([0.96871945]), 'split2_train_f1_macro': array([0.96875]), 'split3_train_f1_macro': array([0.98437118]), 'split4_train_f1_macro': array([0.98437118]), 'mean_train_f1_macro': array([0.97186507]), 'std_train_f1_macro': array([0.01169636]), 'split0_test_accuracy': array([1.]), 'split1_test_accuracy': array([0.875]), 'split2_test_accuracy': array([0.9375]), 'split3_test_accuracy': array([1.]), 'split4_test_accuracy': array([0.93

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1168.70it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1310.84it/s]
2025/03/31 16:19:18 INFO mlflow.sklearn.utils: Logging the 50 best runs, no runs will be omitted.


{'mean_fit_time': array([0.18897462]), 'std_fit_time': array([0.05190701]), 'mean_score_time': array([0.00941982]), 'std_score_time': array([0.00247217]), 'params': [{}], 'split0_test_f1_macro': array([1.]), 'split1_test_f1_macro': array([0.87301587]), 'split2_test_f1_macro': array([0.875]), 'split3_test_f1_macro': array([0.875]), 'split4_test_f1_macro': array([0.9372549]), 'mean_test_f1_macro': array([0.91205415]), 'std_test_f1_macro': array([0.05027834]), 'rank_test_f1_macro': array([1]), 'split0_train_f1_macro': array([1.]), 'split1_train_f1_macro': array([1.]), 'split2_train_f1_macro': array([1.]), 'split3_train_f1_macro': array([1.]), 'split4_train_f1_macro': array([1.]), 'mean_train_f1_macro': array([1.]), 'std_train_f1_macro': array([0.]), 'split0_test_accuracy': array([1.]), 'split1_test_accuracy': array([0.875]), 'split2_test_accuracy': array([0.875]), 'split3_test_accuracy': array([0.875]), 'split4_test_accuracy': array([0.9375]), 'mean_test_accuracy': array([0.9125]), 'std_t