In [28]:
# fixing directory to root of project
import git
import os
import sys

repo = git.Repo(".", search_parent_directories=True)
os.chdir(repo.working_tree_dir)
sys.path.append(repo.working_tree_dir)

import pandas as pd
import numpy as np
from scipy.stats import uniform, loguniform, randint
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from src.utils.utils import int_loguniform
from src.modelling.pipeline.ml_pipeline import (
    preprocess_features,
    preprocess_target,
    FilterFeatures,
    model_pipeline,
    )


from sklearn.datasets import load_iris


In [29]:
# Load datasets
def load_sipher_datasets():
    sipher = pd.read_csv("Q:/SDU/simulation-modelling/data/SIPHER/synth_pop_dataset/csv/sp_ind_wavek_census2011_est2020_8cons.csv")
    us_ind = pd.read_table("Q:/SDU/simulation-modelling/data/SIPHER/understanding_society/tab/ukhls/k_indresp.tab")
    return sipher, us_ind


def get_individuals(us_ind):
    ind_vars = ['pidp', 'k_dvage', 'k_health', 'k_vegeamt', 'k_fruitamt',
                'k_vwhrs', 'k_mwhrs', 'k_ethn_dv',
                'k_helphours1','k_helphoursb1',
                'k_anypaya', 'k_payamta', 'k_allcosta',
                'k_hospc1', 'k_hospdc1',
                'k_brainnervtypn1', 'k_brainnervtypn2', 'k_brainnervtypn3',
                'k_brainnervtypn4', 'k_brainnervtypn5', 'k_brainnervtypn6']
    sipher_individuals = us_ind[ind_vars]
    return sipher_individuals


sipher, us_ind = load_sipher_datasets()
sipher_data = get_individuals(us_ind)


In [30]:
sipher_data["has_social_care"] = 0
sipher_data.loc[sipher_data["k_helphours1"] > 0, ["has_social_care"]] = 1

sipher_data["has_disability"] = 0
sipher_data.loc[sipher_data["k_health"] == 1, ["has_disability"]] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["has_social_care"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["has_disability"] = 0


In [31]:
# has been to hospital due to condition
sipher_data["hospital_condition"] = 0
sipher_data.loc[sipher_data["k_hospc1"] == 1, ["hospital_condition"]] = 1
neurological_conditions = ['k_brainnervtypn1', 'k_brainnervtypn2', 'k_brainnervtypn3','k_brainnervtypn4', 'k_brainnervtypn5', 'k_brainnervtypn6']
sipher_data["neuro_condition"] = 0
sipher_data.loc[(sipher_data[neurological_conditions] == 1).any(axis="columns"), "neuro_condition"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["hospital_condition"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["neuro_condition"] = 0


In [32]:
sipher_data["healthy_eater"] = 0
sipher_data.loc[sipher_data["k_fruitamt"] + sipher_data["k_vegeamt"] > 5, ["healthy_eater"]] = 1

sipher_data["physically_active"] = 0
sipher_data.loc[sipher_data["k_vwhrs"] + sipher_data["k_mwhrs"] > 2.5, ["physically_active"]] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["healthy_eater"] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["physically_active"] = 0


In [33]:
sipher_data["white_british"] = 0
sipher_data.loc[sipher_data["k_ethn_dv"] == 1, ["white_british"]] = 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sipher_data["white_british"] = 0


In [34]:
sipher_data = sipher_data[["pidp", "healthy_eater", "has_social_care", "white_british", "has_disability", "k_dvage", "hospital_condition", "neuro_condition"]]

In [35]:
# target variables
target_var_list = ["has_social_care"]
# drop any unecessary variables from the model. In this case, we are dropping the geographical identifier.
drop_variables = []
# model dictionary and hyperparameter search space
model_param_dict = {
    LogisticRegression(): {},
    RandomForestClassifier(): {}
}
# optional controls:
# select features list - use to subset specific features of interest, if blank it will use all features.
# change feature_filter__filter_features hyperparam when using this
select_features_list = []
# optional - user specified model for evaluation plots. e.g. user_model = "Lasso"
# if left blank out the best performing model will be used for the evaluation plots
user_model = ""
# shortened feature name label for evaluation plots
col_labels = {}

# run pipeline for all models
for target_var in target_var_list:
    # pre-processing
    # drop cols, convert to set to drop unique cols only
    cols_to_drop = list(set([target_var] + drop_variables))
    features = preprocess_features(df=sipher_data, cols_to_drop=cols_to_drop)
    target_df = preprocess_target(df=sipher_data, target_col=target_var)

    # run model pipeline
    model_pipeline(
        model_param_dict=model_param_dict,
        target_var=target_var,
        target_df=target_df,
        feature_df=features,
        id_col="pidp",
        original_df=sipher_data,
        output_path="outputs",
        output_label="classification_sipher",
        col_label_map=col_labels,
        user_evaluation_model=user_model,
    )


Number of cores available for parallel processing: 4
LogisticRegression




Fitting 5 folds for each of 1 candidates, totalling 5 fits


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 663.04it/s] 
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 317.96it/s]
2025/04/01 10:39:00 INFO mlflow.sklearn.utils: Logging the 50 best runs, no runs will be omitted.


{'mean_fit_time': array([0.03178883]), 'std_fit_time': array([0.0089947]), 'mean_score_time': array([0.00699768]), 'std_score_time': array([0.00184636]), 'params': [{}], 'split0_test_f1_macro': array([0.6850322]), 'split1_test_f1_macro': array([0.68524281]), 'split2_test_f1_macro': array([0.68639546]), 'split3_test_f1_macro': array([0.67368504]), 'split4_test_f1_macro': array([0.66223539]), 'mean_test_f1_macro': array([0.67851818]), 'std_test_f1_macro': array([0.00936155]), 'rank_test_f1_macro': array([1]), 'split0_train_f1_macro': array([0.67759358]), 'split1_train_f1_macro': array([0.67927525]), 'split2_train_f1_macro': array([0.67601053]), 'split3_train_f1_macro': array([0.68244413]), 'split4_train_f1_macro': array([0.68691613]), 'mean_train_f1_macro': array([0.68044792]), 'std_train_f1_macro': array([0.0038739]), 'split0_test_accuracy': array([0.93303397]), 'split1_test_accuracy': array([0.93263035]), 'split2_test_accuracy': array([0.93614528]), 'split3_test_accuracy': array([0.933



Fitting 5 folds for each of 1 candidates, totalling 5 fits


Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 777.73it/s] 
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 239.06it/s]
2025/04/01 10:39:22 INFO mlflow.sklearn.utils: Logging the 50 best runs, no runs will be omitted.


{'mean_fit_time': array([1.55518637]), 'std_fit_time': array([0.20348543]), 'mean_score_time': array([0.06941562]), 'std_score_time': array([0.00595786]), 'params': [{}], 'split0_test_f1_macro': array([0.67236942]), 'split1_test_f1_macro': array([0.67428265]), 'split2_test_f1_macro': array([0.67954085]), 'split3_test_f1_macro': array([0.66454582]), 'split4_test_f1_macro': array([0.65308497]), 'mean_test_f1_macro': array([0.66876474]), 'std_test_f1_macro': array([0.00920022]), 'rank_test_f1_macro': array([1]), 'split0_train_f1_macro': array([0.99853885]), 'split1_train_f1_macro': array([0.99926989]), 'split2_train_f1_macro': array([0.9990865]), 'split3_train_f1_macro': array([0.99890277]), 'split4_train_f1_macro': array([0.99835467]), 'mean_train_f1_macro': array([0.99883054]), 'std_train_f1_macro': array([0.0003392]), 'split0_test_accuracy': array([0.91878173]), 'split1_test_accuracy': array([0.91935169]), 'split2_test_accuracy': array([0.92032806]), 'split3_test_accuracy': array([0.92