In [5]:
# reload magics
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datasets
from project_modules.io import load_dataset_to_df
from project_modules.classifcation import classify_MP,getXY, boruta_fs
from project_modules.utils import MPutils
from sklearn.ensemble import RandomForestClassifier
import multiprocessing
from sklearn.model_selection import cross_val_score
import cupy as cp
from datetime import datetime

from tqdm.notebook import tqdm

# from project_modules.utils import get_logger
# logger = get_logger("log-data-combine-split.log")
# # read the parameter file

# from project_modules.utils import read_parameters
# parms = read_parameters("/Users/david/projects/lc-project-data/project.yaml")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
from pathlib import Path

# Dataset locations used by this notebook.
# Earlier locations, kept for reference:
#   - input/output both at ../lc-project-data
#   - for MP, DataV3: ../Data/DataV3 -> OUTPUT/MP/05-classifiers/DataV3/
# Current: DataV4 - 2024-10-09 - Dropped patients with missing vax data
input_path = Path("..") / "Data" / "DataV4"
output_path = Path("OUTPUT") / "MP" / "05-classifiers" / "DataV4"

# String prefix (with trailing slash) of the TTS sub-folder of the input data;
# derived from input_path so the data version is only spelled once.
dataDir = f"{input_path.as_posix()}/TTS/"

In [7]:
# Metric names passed to the classification runs, in reporting order.
# Currently disabled: "average_precision", "neg_mean_squared_error".
lScorersBinary = [
    "accuracy", "balanced_accuracy", "roc_auc", "f1", "recall",
    "sensitivity", "specificity", "precision", "NPV", "PPV",
]

# Result-table columns: run identifiers and classifier parameters, followed by
# one "mean_test_<metric>" column per scorer (same order as lScorersBinary).
# Currently disabled: "param_clf__max_iter".
lResCol = [
    "Title", "cv",
    "param_clf", "param_clf__max_depth", "param_clf__n_estimators", "param_clf__random_state",
    *(f"mean_test_{scorer}" for scorer in lScorersBinary),
]

# Column order for the condensed ("pretty") summary table: identifiers and
# brt_* counts first, then the bare metric names, then "brt_md" last.
lPrettyCols = [
    "MainDataset", "RunType", "classifier", "brt_nTrue", "brt_nTop",
    *lScorersBinary,
    "brt_md",
]

In [10]:
# saveDir = MPutils.get_saving_dir(f"OUTPUT/MP/05-classifiers/DataV2/{dataName}/")


# Search grid handed to the classification runs.
# NOTE - XGBcpu is faster than XGBcuda for some reason?
lClassifiers = ["RF", "LogR", "XGBcpu"]  # different classifiers
lCV = [5]                                # different cross-validation splits
lTrees = [100, 1000]                     # num trees
lMD = [3, 6, 10, 15, None]               # max depth

# brt_* settings; they appear to correspond to the T/itrr/th/topR/MD tags in
# the "FS_*_Boruta_*" feature-set file names -- TODO confirm against 05.1.
lBrt_MD = [3, 5, 7]
brt_trees, brt_iter = 1000, 500
brt_thres, brt_toprank = 100, 5


# FIXME - BELOW CODE FOR TESTING PURPOSES
# dataName = 'dfcmplt'
# lClassifiers = [
#     "RF",
#     "LogR",
#     "XGBcpu",
# ]  # different classifiers #NOTE - XGBcpu is faster than XGBcuda for some reason?
# lCV = [5]  # different cross-validation splits
# lTrees = [10]  # num trees
# lMD = [3]  # max depth

# lBrt_MD = [4]
# brt_trees=12
# brt_iter = 10
# brt_thres=100
# brt_toprank=5

# Automated ML BRT and testing

In [11]:
import os


def updateExistingDF(dfTmp, fileName, save=False):
    """Merge new results with the results already stored in a CSV file.

    If ``fileName`` exists, its rows are appended to ``dfTmp`` and duplicate
    rows are dropped. When ``fileName`` does not exist, ``dfTmp`` is returned
    unchanged.

    Parameters
    ----------
    dfTmp : pandas.DataFrame
        Newly computed results.
    fileName : str or path-like
        CSV file holding previously saved results (may not exist yet).
    save : bool, default False
        When True, write the merged frame back to ``fileName`` (no index).

    Returns
    -------
    pandas.DataFrame
        The merged, de-duplicated frame; the index is not reset.
    """
    if os.path.exists(fileName):
        dfTmp = pd.concat([dfTmp, pd.read_csv(fileName)])
        if "params" in dfTmp.columns:
            # Stringify each row's own value so in-memory objects compare
            # equal to the text read back from CSV (and are hashable for
            # drop_duplicates). The previous code broadcast the FIRST row's
            # string to every row, overwriting all other parameter sets.
            dfTmp["params"] = dfTmp["params"].astype(str)

        # Rows are compared on everything except the estimator object, the
        # run date and the per-split / timing / rank / std columns.
        lIgnoredNames = ("param_clf", "date")
        lIgnoredTags = ("split", "_time", "rank_", "std")
        lDedupCols = [
            col
            for col in dfTmp.columns
            if col not in lIgnoredNames
            and not any(tag in col for tag in lIgnoredTags)
        ]
        dfTmp = dfTmp.drop_duplicates(subset=lDedupCols)

    if save:
        dfTmp.to_csv(fileName, index=False)

    return dfTmp

# Holdout set - validation testing - Boruta-tested best features + sites

In [12]:
# Display the data directory the holdout files are loaded from.
dataDir

'../Data/DataV4/TTS/'

In [14]:
# NOTE - MATCH with 05.2 holdout classifiers and the best features exported by 05.1

# Feature set selected on the full dataset; evaluated for every entry below.
globalfeat = "FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3"

# "MD" suffix of each site's own feature-set file.
dSiteMD = {
    "SITE1": 3,
    "SITE2": 5,
    "SITE3": 3,
    "SITE4": 3,
    # NOT FINISHED CALCULATING YET
    # "SITE5": 5,
    # "SITE6": 3,
    # "SITE7": 3,
}

# dataset name -> feature-set names to validate on that dataset's holdout set.
# The full dataset only uses the global feature set; each site uses the global
# set followed by its own site-specific set.
valSelTests = {"dfcmpltPreLC4": [globalfeat]}
valSelTests.update(
    (
        f"dfcmpltPreLC4_{site}",
        [
            globalfeat,
            f"FS_dfcmpltPreLC4_{site}_Boruta_T1000_itrr500_th100_topR5_MD{md}",
        ],
    )
    for site, md in dSiteMD.items()
)

In [15]:
# Holdout validation: for every dataset and each of its feature sets, score
# the classifier grid on the holdout data and export the results together
# with a random-forest feature ranking.

lRunDataFrames = []  # NOTE: not populated below; the append is currently disabled

# Columns removed from the holdout frame when present.
lColDrop = ["__index_level_0__", "LC_STATUS_SITE", "SITE"]

# Identifier columns moved to the front of every exported results table.
lColFirst = [
    "MainDataset",
    "Title",
    "FeatureSet",
    "date",
    "classifier",
    "RunType",
    "cv",
]

for selData, lFeatSets in tqdm(valSelTests.items(), desc="DataSet Main Outer Loop"):
    # The holdout frame and output directory depend only on the dataset, so
    # they are prepared once here instead of once per feature set.
    df_h = load_dataset_to_df(f"{dataDir}{selData}_Holdout.arrow", verbose=True)

    # Boolean mask instead of drop-by-index-label: dropping by label would
    # also remove unrelated rows if the index were not unique.
    df_h = df_h[df_h["LC_STATUS"] != 2]  # drop HC

    # df_h["LC_STATUS"] = df_h["LC_STATUS"].apply(
    #     lambda x: 1 if x == "LC_POS" else 0
    # )  # Convert to 0==LCNeg, 1==LCPos

    # Not every dataset carries all of these columns, hence errors="ignore".
    df_h = df_h.drop(columns=lColDrop, errors="ignore")

    saveDir = MPutils.get_saving_dir(
        f"OUTPUT/MP/05-classifiers/DataV4/{selData}/"
    )

    for selFeatName in lFeatSets:
        # "FS_<dataset>_Boruta_..." -> "<dataset>": folder holding the
        # feature-selection CSV this feature set was exported to.
        featDir = selFeatName.replace("FS_", "").split("_Boruta")[0]
        lColFS = MPutils.getTrueFeatList(
            f"OUTPUT/MP/05-classifiers/DataV4/{featDir}/{selFeatName}.csv"
        )

        ## HOLDOUT SET ONLY
        # Pass a copy so every feature set starts from the same cleaned frame
        # even if getXY modifies its argument (not verifiable from here).
        x_h, y_h = getXY(df_h.copy())
        x_h = x_h[lColFS]

        dfRHold = classify_MP(
            X=x_h,
            y=y_h,
            lScorers=lScorersBinary,
            lClassifiers=lClassifiers,
            lTrees=lTrees,
            lMD=lMD,
            lCV=lCV,
            n_jobs=-1,
        )
        dfRHold["Title"] = f"{selData}_HoldoutVal"
        dfRHold["RunType"] = "HoldoutVal"
        dfRHold["FeatureSet"] = selFeatName
        dfRHold["date"] = datetime.today()
        # Estimator repr up to the first "(" is used as the classifier name.
        dfRHold["classifier"] = dfRHold["param_clf"].apply(
            lambda clf: str(clf).split("(")[0]
        )
        dfRHold["MainDataset"] = selData

        dfRHold = MPutils.reorder_columns(dfRHold, lColFirst)

        ## Export
        dfRHold.to_csv(
            f"{saveDir}Val_{selData}_HoldoutVal_{selFeatName}.csv", index=False
        )

        ## Select Columns
        # Identifier columns plus mean test scores, parameters and brt_* info.
        lColsSel = lColFirst + [
            col
            for col in dfRHold
            if col.startswith(("mean_test", "param_", "brt_"))
            and col not in lColFirst
        ]

        dfRHold[lColsSel].to_csv(
            f"{saveDir}Val_{selData}_HoldoutVal_{selFeatName}_SelCol.csv",
            index=False,
        )

        # lRunDataFrames.append(dfRHold)

        ### Feature Importance
        clf = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=42)
        clf.fit(x_h, y_h)

        dfFeatureImp = (
            pd.Series(clf.feature_importances_, index=x_h.columns)
            .sort_values(ascending=False)
            .round(4)
            .rename_axis("Feature")
            .reset_index(name="Importance")
        )
        # Rank only the numeric column. method="min" gives tied importances
        # (common after rounding) the same integer rank, rather than an
        # averaged rank truncated by astype(int).
        dfFeatureImp["Rank"] = (
            dfFeatureImp["Importance"].rank(ascending=False, method="min").astype(int)
        )
        dfFeatureImp = dfFeatureImp[["Rank", "Feature", "Importance"]]

        dfFeatureImp.to_csv(
            f"{saveDir}FR_{selData}_HoldoutVal_{selFeatName}.csv", index=False
        )

DataSet Main Outer Loop:   0%|          | 0/5 [00:00<?, ?it/s]