In [15]:
# fixing directory to root of project
import git
import os
import sys

repo = git.Repo(".", search_parent_directories=True)
os.chdir(repo.working_tree_dir)
sys.path.append(repo.working_tree_dir)

import pandas as pd
import numpy as np
from scipy.stats import uniform, loguniform, randint
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from src.utils.utils import int_loguniform
from src.modelling.pipeline.ml_pipeline import (
    preprocess_features,
    preprocess_target,
    FilterFeatures,
    model_pipeline,
    )


from sklearn.datasets import load_iris


In [16]:
# Load datasets
def load_sipher_datasets():
    sipher = pd.read_csv("Q:/SDU/simulation-modelling/data/SIPHER/synth_pop_dataset/csv/sp_ind_wavek_census2011_est2020_8cons.csv")
    us_ind = pd.read_table("Q:/SDU/simulation-modelling/data/SIPHER/understanding_society/tab/ukhls/k_indresp.tab")
    return sipher, us_ind


def get_individuals(us_ind):
    ind_vars = ['pidp', 'k_dvage', 'k_health', 'k_helphours1','k_helphoursb1',
                'k_anypaya', 'k_payamta', 'k_allcosta',
                'k_hospc1', 'k_hospdc1',
                'k_brainnervtypn1', 'k_brainnervtypn2', 'k_brainnervtypn3',
                'k_brainnervtypn4', 'k_brainnervtypn5', 'k_brainnervtypn6']
    sipher_individuals = us_ind[ind_vars]
    return sipher_individuals


sipher, us_ind = load_sipher_datasets()
sipher_data = get_individuals(us_ind)


In [17]:
sipher_data["has_social_care"] = 0
sipher_data.loc[sipher_data["k_helphours1"] > 0, ["has_social_care"]] = 1

sipher_data["has_disability"] = 0
sipher_data.loc[sipher_data["k_health"] == 1, ["has_disability"]] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["has_social_care"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["has_disability"] = 0


In [18]:
# has been to hospital due to condition
sipher_data["hospital_condition"] = 0
sipher_data.loc[sipher_data["k_hospc1"] == 1, ["hospital_condition"]] = 1
neurological_conditions = ['k_brainnervtypn1', 'k_brainnervtypn2', 'k_brainnervtypn3','k_brainnervtypn4', 'k_brainnervtypn5', 'k_brainnervtypn6']
sipher_data["neuro_condition"] = 0
sipher_data.loc[(sipher_data[neurological_conditions] == 1).any(axis="columns"), "neuro_condition"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["hospital_condition"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["neuro_condition"] = 0


In [19]:
sipher_data = sipher_data[["pidp", "has_social_care", "has_disability", "k_dvage", "hospital_condition", "neuro_condition"]]

In [20]:
# target variables
target_var_list = ["has_social_care"]
# drop any unecessary variables from the model. In this case, we are dropping the geographical identifier.
drop_variables = []
# model dictionary and hyperparameter search space
model_param_dict = {
    LogisticRegression(): {},
    RandomForestClassifier(): {}
}
# optional controls:
# select features list - use to subset specific features of interest, if blank it will use all features.
# change feature_filter__filter_features hyperparam when using this
select_features_list = []
# optional - user specified model for evaluation plots. e.g. user_model = "Lasso"
# if left blank out the best performing model will be used for the evaluation plots
user_model = ""
# shortened feature name label for evaluation plots
col_labels = {}

# run pipeline for all models
for target_var in target_var_list:
    # pre-processing
    # drop cols, convert to set to drop unique cols only
    cols_to_drop = list(set([target_var] + drop_variables))
    features = preprocess_features(df=sipher_data, cols_to_drop=cols_to_drop)
    target_df = preprocess_target(df=sipher_data, target_col=target_var)

    # run model pipeline
    model_pipeline(
        model_param_dict=model_param_dict,
        target_var=target_var,
        target_df=target_df,
        feature_df=features,
        id_col="pidp",
        original_df=sipher_data,
        output_path="outputs",
        output_label="classification_sipher",
        col_label_map=col_labels,
        user_evaluation_model=user_model,
    )


Number of cores available for parallel processing: 4
LogisticRegression




Fitting 5 folds for each of 1 candidates, totalling 5 fits


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 1217.45it/s]
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 18.19it/s] 
2025/04/01 10:22:59 INFO mlflow.sklearn.utils: Logging the 50 best runs, no runs will be omitted.


{'mean_fit_time': array([0.02428055]), 'std_fit_time': array([0.00261278]), 'mean_score_time': array([0.00821505]), 'std_score_time': array([0.00372417]), 'params': [{}], 'split0_test_f1_macro': array([0.68359824]), 'split1_test_f1_macro': array([0.68422094]), 'split2_test_f1_macro': array([0.68489701]), 'split3_test_f1_macro': array([0.67098809]), 'split4_test_f1_macro': array([0.66299637]), 'mean_test_f1_macro': array([0.67734013]), 'std_test_f1_macro': array([0.00882845]), 'rank_test_f1_macro': array([1]), 'split0_train_f1_macro': array([0.67705224]), 'split1_train_f1_macro': array([0.68104212]), 'split2_train_f1_macro': array([0.67543983]), 'split3_train_f1_macro': array([0.6815715]), 'split4_train_f1_macro': array([0.68516297]), 'mean_train_f1_macro': array([0.68005373]), 'std_train_f1_macro': array([0.0034549]), 'split0_test_accuracy': array([0.93283873]), 'split1_test_accuracy': array([0.93263035]), 'split2_test_accuracy': array([0.93595001]), 'split3_test_accuracy': array([0.93



Fitting 5 folds for each of 1 candidates, totalling 5 fits


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 403.71it/s] 
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 247.64it/s] 
2025/04/01 10:23:22 INFO mlflow.sklearn.utils: Logging the 50 best runs, no runs will be omitted.


{'mean_fit_time': array([2.35883455]), 'std_fit_time': array([0.54690854]), 'mean_score_time': array([0.05944681]), 'std_score_time': array([0.00636612]), 'params': [{}], 'split0_test_f1_macro': array([0.67970882]), 'split1_test_f1_macro': array([0.67498704]), 'split2_test_f1_macro': array([0.6735299]), 'split3_test_f1_macro': array([0.67252472]), 'split4_test_f1_macro': array([0.65563131]), 'mean_test_f1_macro': array([0.67127636]), 'std_test_f1_macro': array([0.00820098]), 'rank_test_f1_macro': array([1]), 'split0_train_f1_macro': array([0.997443]), 'split1_train_f1_macro': array([0.99743659]), 'split2_train_f1_macro': array([0.99817243]), 'split3_train_f1_macro': array([0.99743659]), 'split4_train_f1_macro': array([0.99707039]), 'mean_train_f1_macro': array([0.9975118]), 'std_train_f1_macro': array([0.00035981]), 'split0_test_accuracy': array([0.91897696]), 'split1_test_accuracy': array([0.91525093]), 'split2_test_accuracy': array([0.91739895]), 'split3_test_accuracy': array([0.9168