In [2]:
# reload magics
%load_ext autoreload
%autoreload 2

import json
import multiprocessing
import os
import pickle
from copy import deepcopy
from datetime import datetime

import colorcet as cc
import cupy as cp
import datasets
import matplotlib.colors as mc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import umap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
# clustering
from sklearn.preprocessing import MinMaxScaler
from sklearn.manifold import TSNE
from tqdm.notebook import tqdm

from project_modules.classifcation import classify_MP,getXY, boruta_fs
from project_modules.io import load_dataset_to_df
from project_modules.utils import MPutils

# from project_modules.utils import get_logger
# logger = get_logger("log-data-combine-split.log")
# # read the parameter file

# from project_modules.utils import read_parameters
# parms = read_parameters("/Users/david/projects/lc-project-data/project.yaml")

# Display every column of a DataFrame, but cap the number of displayed rows at 50.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 50)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Scorer names used for the binary classifiers.
# Currently disabled: "average_precision" (was listed after "precision") and
# "neg_mean_squared_error" (was listed last).
lScorersBinary = [
    "accuracy", "balanced_accuracy", "roc_auc", "f1", "recall",
    "sensitivity", "specificity", "precision", "NPV", "PPV",
]

# Columns kept from a results table: run identifiers and classifier parameters,
# followed by one "mean_test_<scorer>" column per entry of lScorersBinary, in
# the same order.
# Currently disabled: "param_clf__max_iter" (was listed after
# "param_clf__random_state").
lResCol = [
    "Title",
    "cv",
    "param_clf",
    "param_clf__max_depth",
    "param_clf__n_estimators",
    "param_clf__random_state",
    *(f"mean_test_{scorer}" for scorer in lScorersBinary),
]

In [4]:
# Earlier naming scheme, kept for reference: ['T81','T85','DT']

# Pooled dataset first, then one per-site subset for SITE1 .. SITE7.
# Status notes carried over from the original list:
#   pooled, SITE1-SITE4 -> Done;  SITE5-SITE7 -> LEFT
lDataNames = ["dfcmpltPreLC4"] + [
    f"dfcmpltPreLC4_SITE{siteNo}" for siteNo in range(1, 8)
]
# Path prefix for the dataset files; the main loop below builds
# f"{dataDir}<dataset>_Holdout.arrow" and f"{dataDir}<dataset>_Train.arrow" from it.
dataDir = "../Data/DataV4/TTS/"
# Output location for this notebook's classifier artefacts.
# NOTE(review): presumably get_saving_dir creates the directory if needed and
# returns its path — confirm against MPutils.
saveDir = MPutils.get_saving_dir('OUTPUT/MP/05-classifiers/')

In [5]:
# Site code (1-based) -> short site label.
site_name_dict = dict(
    enumerate(["LDN", "MTL", "SAN", "RIO", "LUS", "CA", "NA"], start=1)
)

# LC_STATUS code (0-based) -> status label.
status_name_dict = dict(enumerate(["LC_NEG", "LC_POS", "HC"]))

# Site code -> marker symbol, so each site is always drawn with the same style.
# Only sites 1-5 have a marker assigned.
site_style = dict(enumerate(['D', 'o', 'P', 'X', 's'], start=1))

In [6]:
# Base categorical palette taken from colorcet.
palette = cc.glasbey_hv

# Palette position -> hex colour string, one entry per palette colour.
palette_dict = dict(enumerate(map(mc.to_hex, palette)))

# Extra entry for the key -1, drawn in white.
palette_dict[-1] = '#ffffff'

# Global seaborn theme: whitegrid style, 8x8 figures, viridis palette.
sns.set_theme(style="whitegrid", palette='viridis', rc={'figure.figsize': (8, 8)})

# Feature-colour palettes stored as JSON next to the notebook.
with open('JSON/selectFeaturePalette.json') as f:
    paletteSel = json.load(f)

with open('JSON/selectFeaturePalette_NiceFeat.json') as f:
    paletteSel_NF = json.load(f)

with open('JSON/selectFeaturePalette_NiceFeatFlipped.json') as f:
    paletteSel_NFF = json.load(f)



In [7]:
def make_categorical_colours(df, palette=cc.glasbey_hv):
    """Assign one palette colour to every column of ``df``.

    Colours are handed out by column position. If ``df`` has more columns than
    ``palette`` has entries, the palette is reused from its start instead of
    failing, so wide frames still get a complete mapping (colours then repeat).

    Parameters
    ----------
    df : object with a ``columns`` attribute (e.g. a pandas DataFrame)
        Its column labels become the keys of the result.
    palette : sequence, default ``cc.glasbey_hv``
        Indexable sequence of colours.

    Returns
    -------
    dict
        ``{column label: colour}``. With duplicate column labels the colour of
        the last occurrence wins.

    Raises
    ------
    IndexError
        If ``palette`` is empty while ``df`` has at least one column.
    """
    n_colours = len(palette)
    if n_colours == 0 and len(df.columns) > 0:
        raise IndexError("palette is empty; cannot assign colours to columns")

    # Modulo wraps around once the palette is exhausted.
    colours_dict = {
        feature: palette[i % n_colours] for i, feature in enumerate(df.columns)
    }
    return colours_dict



# Process data and run clustering

In [8]:
# NOTE: keep these entries in sync with the 05.2 holdout classifiers and with
# the best-feature files exported by 05.1.
globalfeat = "FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3"

# One row per validation run: (dataset key, feature-list file stem, site label).
# Rows without the "_siteFeat" suffix use the global feature list; "_siteFeat"
# rows use the feature list selected on that site's own data.
# SITE5 (Lusaka), SITE6 (Canada) and SITE7 (NorthAmerica) are currently
# disabled in both groups.
_valSelSpec = [
    ("dfcmpltPreLC4", globalfeat, "Global"),
    ("dfcmpltPreLC4_SITE1", globalfeat, "London"),
    ("dfcmpltPreLC4_SITE2", globalfeat, "Montreal"),
    ("dfcmpltPreLC4_SITE3", globalfeat, "San Diego"),
    ("dfcmpltPreLC4_SITE4", globalfeat, "Rio"),
    (
        "dfcmpltPreLC4_SITE1_siteFeat",
        "FS_dfcmpltPreLC4_SITE1_Boruta_T1000_itrr500_th100_topR5_MD3",
        "London",
    ),
    (
        "dfcmpltPreLC4_SITE2_siteFeat",
        "FS_dfcmpltPreLC4_SITE2_Boruta_T1000_itrr500_th100_topR5_MD5",
        "Montreal",
    ),
    (
        "dfcmpltPreLC4_SITE3_siteFeat",
        "FS_dfcmpltPreLC4_SITE3_Boruta_T1000_itrr500_th100_topR5_MD3",
        "San Diego",
    ),
    (
        "dfcmpltPreLC4_SITE4_siteFeat",
        "FS_dfcmpltPreLC4_SITE4_Boruta_T1000_itrr500_th100_topR5_MD3",
        "Rio",
    ),
]

# Every entry gets its own dict, so later per-dataset additions stay separate.
valSelTests = {
    dataKey: {"featlist": featStem, "site": siteLabel}
    for dataKey, featStem, siteLabel in _valSelSpec
}

# Dataset keys that use the global feature list (i.e. no "_siteFeat" suffix).
valSelKeys = [dataKey for dataKey in valSelTests if not dataKey.endswith("_siteFeat")]

In [11]:
# Build one data dict per entry of valSelTests (holdout/train features, labels,
# scaled holdout features) and pickle it for later use.

# Directory receiving one pickle per dataset. Created up front so that the dump
# at the end of each iteration cannot fail with FileNotFoundError when the
# directory does not exist yet.
holdoutDictDir = "./OUTPUT/MP/05-classifiers/DataV4/generatedHoldoutDataDicts/"
os.makedirs(holdoutDictDir, exist_ok=True)

# Columns removed from the holdout frame when present.
lColDrop = ["__index_level_0__", "LC_STATUS_SITE",'SITE']

for selData, tmpDict in tqdm(valSelTests.items(), desc='DataSet Main Outer Loop'):

    # tmpDict is the entry stored in valSelTests; everything added below is
    # therefore also visible through valSelTests[selData].
    selFeatName = tmpDict['featlist']  # stem of the selected-feature file

    # "<name>_siteFeat" entries read the same arrow files as "<name>".
    baseName = selData.replace('_siteFeat','')

    # load holdout and train frames
    df_h = load_dataset_to_df(f"{dataDir}{baseName}_Holdout.arrow", verbose=True)
    df_Tr = load_dataset_to_df(f"{dataDir}{baseName}_Train.arrow", verbose=True)

    # Drop rows with LC_STATUS == 2 (HC) from the holdout frame.
    # NOTE(review): the train frame keeps its LC_STATUS == 2 rows and its
    # helper columns — confirm this asymmetry is intended.
    df_h = df_h.drop(index=df_h[df_h["LC_STATUS"] == 2].index)
    df_h = df_h.drop(columns=lColDrop, errors="ignore")

    tmpDict['data'] = df_h  # cleaned holdout frame

    # per-dataset output directory
    tmpDict['savedir'] = MPutils.get_saving_dir(
        f"OUTPUT/MP/05-classifiers/DataV4/{selData}/"
    )

    # The feature file lives in the directory of the dataset it was selected
    # on: strip the "FS_" prefix and everything from "_Boruta" onwards.
    tmpDict['featDir'] = selFeatName.replace('FS_','').split('_Boruta')[0]
    tmpDict['selCols'] = MPutils.getTrueFeatList(
        f"OUTPUT/MP/05-classifiers/DataV4/{tmpDict['featDir']}/{selFeatName}.csv"
    )

    # holdout features (restricted to the selected columns) and labels
    x_h, y_h = getXY(df_h)
    tmpDict['x_h'] = x_h[tmpDict['selCols']]
    tmpDict['y_h'] = y_h

    # train features (restricted to the selected columns) and labels
    x_Tr, y_Tr = getXY(df_Tr)
    tmpDict['x_Tr'] = x_Tr[tmpDict['selCols']]
    tmpDict['y_Tr'] = y_Tr

    # Min-max scale the holdout features, keeping a DataFrame as output.
    # NOTE(review): the scaler is fitted on the holdout features themselves,
    # not on x_Tr. If x_h_scaled is ever fed to a model trained on scaled
    # training data, fit on x_Tr and only transform x_h — confirm intent.
    scaler = MinMaxScaler().set_output(transform="pandas")
    x_h_scaled = scaler.fit_transform(tmpDict["x_h"])

    tmpDict["x_h_scaled"] = x_h_scaled

    # Persist the complete dict for this dataset.
    with open(f"{holdoutDictDir}{selData}.pickle", "wb") as handle:
        pickle.dump(tmpDict, handle, protocol=pickle.HIGHEST_PROTOCOL)

DataSet Main Outer Loop:   0%|          | 0/9 [00:00<?, ?it/s]