In [1]:
# reload magics
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datasets
from project_modules.io import load_dataset_to_df
from project_modules.classifcation import classify_MP,getXY, boruta_fs
from project_modules.utils import MPutils
from sklearn.ensemble import RandomForestClassifier
import multiprocessing
from sklearn.model_selection import cross_val_score
import cupy as cp
from datetime import datetime

from tqdm.notebook import tqdm

# from project_modules.utils import get_logger
# logger = get_logger("log-data-combine-split.log")
# # read the parameter file

# from project_modules.utils import read_parameters
# parms = read_parameters("/Users/david/projects/lc-project-data/project.yaml")

In [2]:
# Scorer names passed to classify_MP for the binary LC_POS / LC_NEG task.
# "average_precision" and "neg_mean_squared_error" are deliberately left out.
lScorersBinary = [
    "accuracy", "balanced_accuracy", "roc_auc", "f1", "recall",
    "sensitivity", "specificity", "precision", "NPV", "PPV",
]

# Result columns of interest: run identifiers, the classifier parameters, then
# one "mean_test_<scorer>" column per scorer above (same order).
# "param_clf__max_iter" is deliberately left out.
lResCol = [
    "Title",
    "cv",
    "param_clf",
    "param_clf__max_depth",
    "param_clf__n_estimators",
    "param_clf__random_state",
    *(f"mean_test_{scorer}" for scorer in lScorersBinary),
]

# Import Data

In [3]:
# Dataset prefixes; each one has <name>_Train / _Test / _Holdout .arrow files under dataDir.
lDataNames = ["T81", "T85", "DT"]
dataDir = "../Data/DataV2/TTS/"

In [4]:
# saveDir = MPutils.get_saving_dir(f"OUTPUT/MP/05-classifiers/DataV2/{dataName}/")


# Classifier grid handed to classify_MP.
# NOTE - XGBcpu is faster than XGBcuda for some reason?
lClassifiers = ["RF", "LogR", "XGBcpu"]  # different classifiers
lCV = [5]  # different cross-validation splits
lTrees = [100, 1000]  # num trees
lMD = [3, 6, 10, 15, None]  # max depth

# Boruta feature-selection settings (passed to boruta_fs as
# max_depth candidates, trees, ittr, threshold and top_rank respectively).
lBrt_MD = [3, 5, 7]
brt_trees, brt_iter, brt_thres, brt_toprank = 1000, 500, 100, 5


# FIXME - BELOW CODE FOR TESTING PURPOSES
# dataName = 'T81'
# lClassifiers = [
#     "RF",
#     "LogR",
#     "XGBcpu",
# ]  # different classifiers #NOTE - XGBcpu is faster than XGBcuda for some reason?
# lCV = [5]  # different cross-validation splits
# lTrees = [10]  # num trees
# lMD = [3]  # max depth

# lBrt_MD = [3]
# brt_trees=10
# brt_iter = 10
# brt_thres=100
# brt_toprank=5

In [5]:
def _prepare_lc_frame(df_raw, drop_cols):
    """Return a cleaned copy of ``df_raw`` for LC_POS vs. LC_NEG classification.

    * Rows whose ``LC_STATUS`` is "HC" are removed with a boolean mask. A mask
      is positional, so it stays correct even when index labels repeat
      (dropping by label would also remove every other row sharing the label).
    * ``LC_STATUS`` is converted to 1 for "LC_POS" and 0 for anything else.
    * The columns listed in ``drop_cols`` are dropped.

    The input frame is not modified.
    """
    df_lc = df_raw.loc[df_raw["LC_STATUS"] != "HC"].copy()
    df_lc["LC_STATUS"] = (df_lc["LC_STATUS"] == "LC_POS").astype(int)
    return df_lc.drop(columns=drop_cols)


def _classify_titled(X, y, title):
    """Run classify_MP with the notebook-level grid and tag the result with ``title``."""
    dfR = classify_MP(
        X=X,
        y=y,
        lScorers=lScorersBinary,
        lClassifiers=lClassifiers,
        lTrees=lTrees,
        lMD=lMD,
        lCV=lCV,
    )
    dfR["Title"] = title
    return dfR


lRunDataFrames = []
for dataName in tqdm(lDataNames, desc='DataSet Main Outer Loop'):

    lColDrop = ["__index_level_0__", "LC_STATUS_SITE", 'SITE']

    # Load the three splits and clean each one independently.
    df_tr = _prepare_lc_frame(
        load_dataset_to_df(f"{dataDir}{dataName}_Train.arrow", verbose=True), lColDrop
    )
    df_ts = _prepare_lc_frame(
        load_dataset_to_df(f"{dataDir}{dataName}_Test.arrow", verbose=True), lColDrop
    )
    df_h = _prepare_lc_frame(
        load_dataset_to_df(f"{dataDir}{dataName}_Holdout.arrow", verbose=True), lColDrop
    )
    # The full set is assembled from the already-cleaned splits, so no
    # label-based drop is ever applied to a frame with repeated index labels.
    df_full = pd.concat([df_tr, df_ts, df_h])

    saveDir = MPutils.get_saving_dir(
        f"OUTPUT/MP/05-classifiers/DataV2/{dataName}/"
    )

    ## HOLDOUT SET ONLY
    x_h, y_h = getXY(df_h)
    dfRHold = _classify_titled(x_h, y_h, f"{dataName}_Holdout")
    dfRHold.to_csv(f"{saveDir}CA_{dataName}_Holdout.csv", index=False)

    ## Full SET ONLY
    x_F, y_F = getXY(df_full)
    dfRFull = _classify_titled(x_F, y_F, f"{dataName}_Full")
    dfRFull.to_csv(f"{saveDir}CA_{dataName}_Full.csv", index=False)

    ## BORUTA Feature Selection
    x_Tr, y_Tr = getXY(df_tr)
    # The test split does not change between Boruta depths, so extract it once.
    # (assumes getXY has no side effects — confirm against project_modules)
    x_TsMain, y_Ts = getXY(df_ts)

    # One result frame per (boruta depth, feature set); kept for ALL depths so
    # that no depth's results are overwritten by the next iteration.
    lBorutaFrames = []
    for boruta_md in tqdm(lBrt_MD, desc='Boruta MD Loop'):
        # run boruta = get true and top feat
        trueFeat, topFeat = boruta_fs(
            X=x_Tr.values,
            y=y_Tr,
            feat_list=x_Tr.columns,
            trees=brt_trees,
            ittr=brt_iter,
            threshold=brt_thres,
            top_rank=brt_toprank,
            verbose=0,
            model=RandomForestClassifier(
                n_jobs=-1, class_weight="balanced", max_depth=boruta_md, random_state=42
            ),
            fileName=f"{saveDir}FS_{dataName}_Boruta_T{brt_trees}_itrr{brt_iter}_th{brt_thres}_topR{brt_toprank}_MD{boruta_md}.csv",
        )

        ## Test Boruta features on the test split: "True" set first, then "Top" set.
        for featLabel, featCols in (("True", trueFeat), ("Top", topFeat)):
            dfR_Brt = _classify_titled(
                x_TsMain[featCols], y_Ts, f"{dataName}_Boruta_{featLabel}"
            )
            dfR_Brt["brt_md"] = boruta_md
            dfR_Brt["brt_nTrue"] = len(trueFeat)
            dfR_Brt["brt_nTop"] = len(topFeat)
            lBorutaFrames.append(dfR_Brt)

    dfCLFRun = pd.concat([dfRHold, dfRFull, *lBorutaFrames])

    dfCLFRun["MainDataset"] = dataName
    dfCLFRun["date"] = datetime.today()
    dfCLFRun["brt_params"] = (
        f"T{brt_trees}_itrr{brt_iter}_th{brt_thres}_topR{brt_toprank}"
    )

    dfCLFRun = MPutils.reorder_columns(dfCLFRun, ["MainDataset", "Title", "date", 'brt_nTrue', 'brt_nTop'])

    dfCLFRun.to_csv(f"{saveDir}CLFRun_{dataName}_Results.csv", index=False)
    lRunDataFrames.append(dfCLFRun)

DataSet Main Outer Loop:   0%|          | 0/3 [00:00<?, ?it/s]

Boruta MD Loop:   0%|          | 0/3 [00:00<?, ?it/s]

Boruta MD Loop:   0%|          | 0/3 [00:00<?, ?it/s]

Boruta MD Loop:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Stack the per-dataset result frames collected in lRunDataFrames into one table.
dfCombRes = pd.concat(objs=lRunDataFrames)

In [7]:
# Classifier name = text of param_clf's string form up to the first "(".
dfCombRes["classifier"] = dfCombRes["param_clf"].apply(
    lambda clf: str(clf).split("(")[0]
)

In [8]:
# Columns passed to MPutils.reorder_columns and used to lead the exported column selection.
lColFirst = ["MainDataset", "Title", "date", "classifier", "cv"]
lColFirst += ["brt_nTrue", "brt_nTop"]

In [9]:
# Reorder the combined results with the project helper, using lColFirst
# (presumably moves these columns to the front — confirm against MPutils.reorder_columns).
dfCombRes = MPutils.reorder_columns(dfCombRes, lColFirst)

In [10]:
# Persist the full combined results table (row index not written).
dfCombRes.to_csv(
    path_or_buf='OUTPUT/MP/05-classifiers/DataV2/CLFRunCombined.csv',
    index=False,
)

In [11]:
# Columns for the compact export: the leading identifier columns, followed by
# every metric ("mean_test*"), parameter ("param_*") and Boruta ("brt*") column.
# Columns already listed in lColFirst (e.g. brt_nTrue / brt_nTop) are skipped
# here so that they are not selected — and exported — twice.
lColsSel = lColFirst + [
    col
    for col in dfCombRes.columns
    if col.startswith(("mean_test", "param_", "brt")) and col not in lColFirst
]

In [12]:
# Persist only the selected columns of the combined results (row index not written).
dfCombRes.loc[:, lColsSel].to_csv(
    "OUTPUT/MP/05-classifiers/DataV2/CLFRunCombined_SelCol.csv",
    index=False,
)

# Holdout set - validation testing - Boruta-tested best features

In [None]:
# Scorers for the hold-out validation run (same set as used for the main run;
# "average_precision" and "neg_mean_squared_error" are deliberately left out).
lScorersBinary = [
    "accuracy", "balanced_accuracy", "roc_auc", "f1", "recall",
    "sensitivity", "specificity", "precision", "NPV", "PPV",
]

# Grid for the validation run — note this overrides the grid defined for the main run.
lClassifiers = ["RF", "ET", "LogR"]  # different classifiers
lCV = [3, 5, 10]  # different cross-validation splits
lTrees = [100, 1000, 10000]  # num trees
lMD = [None]  # max depth


In [None]:
# Name of the chosen Boruta run; the "_LCN_LCP" and "Test_" tokens are stripped
# to obtain the key used to look up its feature list.
boruta_run_name = "Test_T81_Boruta_T1000_itrr200_thres100_topR5_MD7_True_LCN_LCP"
selFeat = boruta_run_name.replace("_LCN_LCP", "").replace("Test_", "")

# NOTE(review): `dFeat` and `T81_h` are not defined anywhere in the visible
# notebook — this cell fails on a fresh kernel unless they are created elsewhere.
lSelFeat = dFeat[selFeat]

# Features / target from T81_h, restricted to the selected feature columns.
x, y = getXY(T81_h)
x = x[lSelFeat]

In [None]:
# Run classify_MP on the selected-feature data over the configured
# scorer / classifier / tree-count / max-depth / CV grid.
dfResults = classify_MP(
    X=x,
    y=y,
    lScorers=lScorersBinary,
    lClassifiers=lClassifiers,
    lTrees=lTrees,
    lMD=lMD,
    lCV=lCV,
)

In [None]:
# dfResults.columns

In [None]:
# Tag every result row with the run label built from the selected feature-set key.
dfResults["Title"] = f"R81_Holdout_{selFeat}"

In [None]:
# Columns to show: run identifiers plus every parameter and mean-test-score column.
lResCol = ["Title", "cv"]
lResCol += [
    col for col in dfResults.columns if col.startswith(("param_", "mean_test_"))
]

In [None]:
# all results, restricted to the columns of interest
dfResults.loc[:, lResCol]

In [None]:
# Rows with cv == 5 and 1000 estimators (any classifier), transposed for reading.
# A filter on param_clf == "RandomForestClassifier()" is intentionally not applied here.
dfResults.loc[
    (dfResults["cv"] == 5) & (dfResults["param_clf__n_estimators"] == 1000),
    lResCol,
].T

In [None]:
# Random-forest rows with cv == 5 and 1000 estimators.
# The classifier is matched on the string form of param_clf rather than by
# `== 'RandomForestClassifier()'`: an exact string comparison never matches
# when the column holds estimator objects (presumably the case straight after
# classify_MP — confirm), nor when the repr carries constructor arguments.
# The prefix match works for both object and string (CSV-reloaded) columns.
dfResults[
    (dfResults["cv"] == 5)
    & dfResults["param_clf"].astype(str).str.startswith("RandomForestClassifier")
    & (dfResults["param_clf__n_estimators"] == 1000)
][lResCol]

In [None]:
# Write the validation results twice: all columns, then only the columns of interest.
validation_csv_stem = f"OUTPUT/MP/05-classifiers/Validation/CA_{'R81_Holdout_'+selFeat}"
dfResults.to_csv(f"{validation_csv_stem}.csv", index=False)
dfResults[lResCol].to_csv(f"{validation_csv_stem}_selCol.csv", index=False)

In [None]:
# Reload the saved validation results from disk.
validation_csv_path = (
    f"OUTPUT/MP/05-classifiers/Validation/CA_{'R81_Holdout_'+selFeat}.csv"
)
dfResults = pd.read_csv(validation_csv_path)

### Feature Importance of Best Set


In [None]:
# Fit a 1000-tree random forest (unlimited depth, fixed seed) on the selected
# features and rank the features by their importance, highest first.
clf = RandomForestClassifier(n_estimators=1000, max_depth=None, random_state=42).fit(x, y)
feature_importances = pd.Series(
    clf.feature_importances_, index=x.columns
).sort_values(ascending=False)

In [None]:
# Display the feature importances (a Series indexed by feature name, sorted in descending order).
feature_importances

In [None]:
# Save the ranked importances. The feature names live in the Series index, so
# the index must be written (index=False would leave a file of bare numbers
# with no way to tell which feature each value belongs to). The index and the
# values are given explicit names so the CSV has a "feature,importance" header.
feature_importances.rename_axis("feature").rename("importance").to_csv(
    f"OUTPUT/MP/05-classifiers/Validation/FR_{'R81_Holdout_'+selFeat}.csv"
)

age                                     0.419838
dag                                     0.235372
SYMPT-fatigue___2                       0.084360
SYMPT-dizziness___2                     0.079019
SYMPT-muscle_aches_myalgia___2          0.074982
SYMPT-shortness_of_breath_dyspne___2    0.059716
SYMPT-joint_pain_arthralgia___2         0.046712

In [None]:
# Mean 5-fold cross-validated accuracy of a fresh 1000-tree random forest on the selected features.
clf = RandomForestClassifier(n_estimators=1000, max_depth=None, random_state=42)
cv_accuracy_scores = cross_val_score(clf, x, y, cv=5, scoring="accuracy")
cv_accuracy_scores.mean()