In [1]:
# reload magics
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datasets
from project_modules.io import load_dataset_to_df
from project_modules.classifcation import classify_MP,getXY, boruta_fs
from project_modules.utils import MPutils
from sklearn.ensemble import RandomForestClassifier
import multiprocessing
from sklearn.model_selection import cross_val_score
import cupy as cp
from datetime import datetime

from tqdm.notebook import tqdm
from copy import deepcopy

# from project_modules.utils import get_logger
# logger = get_logger("log-data-combine-split.log")
# # read the parameter file

# from project_modules.utils import read_parameters
# parms = read_parameters("/Users/david/projects/lc-project-data/project.yaml")

pd.options.display.max_columns = None
pd.options.display.max_rows = 50
pd.options.display.max_colwidth = None

In [2]:
lScorersBinary = [
    "accuracy",
    "balanced_accuracy",
    "roc_auc",
    "f1",
    "recall",
    "sensitivity",
    "specificity",
    "precision",
    # "average_precision",
    "NPV",
    "PPV",
    # "neg_mean_squared_error",
]
lResCol = [
    "Title",
    "cv",
    "param_clf",
    "param_clf__max_depth",
    "param_clf__n_estimators",
    "param_clf__random_state",
    # "param_clf__max_iter",
    "mean_test_accuracy",
    "mean_test_balanced_accuracy",
    "mean_test_roc_auc",
    "mean_test_f1",
    "mean_test_recall",
    "mean_test_sensitivity",
    "mean_test_specificity",
    "mean_test_precision",
    "mean_test_NPV",
    "mean_test_PPV",
]

In [3]:
site_name_dict = {1: "LDN", 2: "MTL", 3: "SAN", 4: "RIO", 5: "LUS", 6: "CA", 7: "NA"}
status_name_dict = {
    0: "LC_NEG",
    1: "LC_POS",
    2: "HC",
}

# Combine Results

In [4]:
import glob

path = "OUTPUT/MP/05-classifiers/DataV4"
files = glob.glob(path + '/*/CA*.csv')
# files = files+glob.glob(path + "/*/FS*.csv")
files = files + glob.glob(path + "/*/Val*.csv")
files = files+glob.glob(path+'/*/CLFRun*.csv')
files = [x for x in files if 'selcol' not in x.lower()]

print(files)
print(len(files))

['OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE4/CA_dfcmpltPreLC4_SITE4_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE4/CA_dfcmpltPreLC4_SITE4_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE6/CA_dfcmpltPreLC4_SITE6_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE6/CA_dfcmpltPreLC4_SITE6_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE1/CA_dfcmpltPreLC4_SITE1_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE1/CA_dfcmpltPreLC4_SITE1_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE3/CA_dfcmpltPreLC4_SITE3_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE3/CA_dfcmpltPreLC4_SITE3_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE5/CA_dfcmpltPreLC4_SITE5_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE5/CA_dfcmpltPreLC4_SITE5_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4/CA_dfcmpltPreLC4_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpl

In [5]:
ldf = []
for i in files:
    ldf.append(pd.read_csv(i))

In [6]:
dfRes = pd.concat(ldf)

In [7]:
def changeSite(x):
    if pd.isnull(x):
        return np.nan
    if 'SITE7' in x:
        return 'NorthAmerica'
    if 'SITE6' in x:
        return 'Canada'
    if 'SITE5' in x:
        return 'LUS'
    if 'SITE4' in x:
        return 'RIO'
    if 'SITE3' in x:
        return 'SAN'
    if 'SITE2' in x:
        return 'MTL'
    if 'SITE1' in x:
        return 'LDN'
    return "Global"

In [8]:
dfRes["classifier"] = dfRes["param_clf"].apply(lambda x: x.__str__().split("(")[0])
dfRes['site'] = dfRes['MainDataset'].apply(changeSite)
lColFirst = [
    "MainDataset",
    "Title",
    'site',
    "date",
    "classifier",
    'RunType',
    "cv",
    # 'FeatureSet',
    "brt_nTrue",
    "brt_nTop",
    'brt_md',
]
dfRes = MPutils.reorder_columns(dfRes, lColFirst)
dfRes.to_csv("OUTPUT/MP/05-classifiers/DataV4/CLFRunCombined.csv", index=False)
dfResOrg = deepcopy(dfRes)
lColsSel = lColFirst + [
    x
    for x in dfRes
    if (x.startswith("mean_test") or x.startswith("param_") or x.startswith("brt_"))
    and x not in lColFirst
]
dfRes[lColsSel].to_csv("OUTPUT/MP/05-classifiers/DataV4/CLFRunCombined_SelCol.csv")

# Go through Data and Simplify for inspection

In [9]:
dfRes = dfRes[lColsSel]

In [10]:
lSelMainData = [
"dfcmpltPreLC4",
"dfcmpltPreLC4_SITE1",
"dfcmpltPreLC4_SITE2",
"dfcmpltPreLC4_SITE3",
"dfcmpltPreLC4_SITE4",
"dfcmpltPreLC4_SITE5",
"dfcmpltPreLC4_SITE6",
"dfcmpltPreLC4_SITE7",
]

lPrettyCols =     [
        "MainDataset",
        "RunType",
        "classifier",
        "brt_nTrue",
        "brt_nTop",
        "accuracy",
        "balanced_accuracy",
        "roc_auc",
        "f1",
        "recall",
        "sensitivity",
        "specificity",
        "precision",
        "NPV",
        "PPV",
        "brt_md",
        'brt_params'
    ]
lPrettyColsSite = [
    "MainDataset",
    "RunType",
    'site',
    "classifier",
    "accuracy",
    "balanced_accuracy",
    "roc_auc",
    "f1",
    "recall",
    "sensitivity",
    "specificity",
    "precision",
    "NPV",
    "PPV",
]
s_brt = 'and (RunType=="Boruta_True" or RunType=="Boruta_Top")'
s_RF10 = 'classifier=="RandomForestClassifier" and param_clf__n_estimators==1000 and param_clf__max_depth==10'
s_RFXGB10 = 'param_clf__n_estimators==1000 and param_clf__max_depth==10'
s_CMPLT = 'MainDataset=="dfcmpltPreLC4"'
s_val = 'RunType=="HoldoutVal"'

In [11]:
dfRes = dfRes[dfRes['MainDataset'].isin(lSelMainData)]
dfRes.columns = [x.replace('mean_test_','') for x in dfRes.columns]

In [12]:
# FS_{tmpdataset}_Boruta_T1000_itrr500_th100_topR5_MD{md}.csv'
dfRes['brt_file'] = dfRes[['MainDataset','brt_md']].apply(lambda x: f"FS_{x['MainDataset']}_Boruta_T1000_itrr500_th100_topR5_MD{x['brt_md']:.0f}.csv",axis=1)

# Per site - Best Results, use this for validation

In [13]:
# SECTION - Determine the best Boruta and Top Boruta for each dataset
# dfTmp = dfRes.query(f'{s_RF10} and RunType=="Boruta_True"').sort_values('balanced_accuracy',ascending=False)
dfTmp = dfRes.query(f'RunType=="Boruta_True" and {s_RFXGB10}').sort_values('balanced_accuracy',ascending=False)

lTopBrt = []
for i in lSelMainData:
    if i in dfTmp['MainDataset'].values:
        dfTmpTop = dfTmp.query(f'MainDataset=="{i}"').sort_values('balanced_accuracy',ascending=False)
        lTopBrt.append(dfTmpTop.iloc[0,:])

In [14]:
print('Pretty Columns data first')
pd.DataFrame(lTopBrt)[lPrettyColsSite+['brt_md','brt_nTrue','Title','brt_file']].sort_values('balanced_accuracy',ascending=False)

Pretty Columns data first


Unnamed: 0,MainDataset,RunType,site,classifier,accuracy,balanced_accuracy,roc_auc,f1,recall,sensitivity,specificity,precision,NPV,PPV,brt_md,brt_nTrue,Title,brt_file
87,dfcmpltPreLC4_SITE4,Boruta_True,RIO,RandomForestClassifier,0.9875,0.966667,0.998718,0.992593,1.0,1.0,0.933333,0.985714,1.0,0.985714,3.0,14.0,dfcmpltPreLC4_SITE4_Boruta_True,FS_dfcmpltPreLC4_SITE4_Boruta_T1000_itrr500_th100_topR5_MD3.csv
87,dfcmpltPreLC4_SITE1,Boruta_True,LDN,RandomForestClassifier,0.86,0.875,0.85,0.892857,0.85,0.85,0.9,0.95,0.733333,0.95,3.0,4.0,dfcmpltPreLC4_SITE1_Boruta_True,FS_dfcmpltPreLC4_SITE1_Boruta_T1000_itrr500_th100_topR5_MD3.csv
87,dfcmpltPreLC4_SITE3,Boruta_True,SAN,RandomForestClassifier,0.817582,0.750303,0.892121,0.884236,0.867273,0.867273,0.633333,0.909231,0.583333,0.909231,3.0,16.0,dfcmpltPreLC4_SITE3_Boruta_True,FS_dfcmpltPreLC4_SITE3_Boruta_T1000_itrr500_th100_topR5_MD3.csv
91,dfcmpltPreLC4,Boruta_True,Global,RandomForestClassifier,0.723456,0.715487,0.789034,0.674343,0.63962,0.63962,0.791353,0.717731,0.730688,0.717731,3.0,21.0,dfcmpltPreLC4_Boruta_True,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3.csv
87,dfcmpltPreLC4_SITE7,Boruta_True,NorthAmerica,RandomForestClassifier,0.703759,0.656892,0.735614,0.542289,0.468,0.468,0.845783,0.649179,0.725406,0.649179,3.0,19.0,dfcmpltPreLC4_SITE7_Boruta_True,FS_dfcmpltPreLC4_SITE7_Boruta_T1000_itrr500_th100_topR5_MD3.csv
251,dfcmpltPreLC4_SITE6,Boruta_True,Canada,RandomForestClassifier,0.669398,0.603602,0.621667,0.448491,0.411154,0.411154,0.796049,0.49624,0.734384,0.49624,7.0,2.0,dfcmpltPreLC4_SITE6_Boruta_True,FS_dfcmpltPreLC4_SITE6_Boruta_T1000_itrr500_th100_topR5_MD7.csv
169,dfcmpltPreLC4_SITE2,Boruta_True,MTL,RandomForestClassifier,0.664912,0.573712,0.601268,0.380357,0.333175,0.333175,0.814249,0.446023,0.730814,0.446023,5.0,2.0,dfcmpltPreLC4_SITE2_Boruta_True,FS_dfcmpltPreLC4_SITE2_Boruta_T1000_itrr500_th100_topR5_MD5.csv
108,dfcmpltPreLC4_SITE5,Boruta_True,LUS,XGBClassifier,0.630769,0.473333,0.413333,0.146667,0.133333,0.133333,0.813333,0.166667,0.724242,0.166667,3.0,2.0,dfcmpltPreLC4_SITE5_Boruta_True,FS_dfcmpltPreLC4_SITE5_Boruta_T1000_itrr500_th100_topR5_MD3.csv


In [15]:
print('Pretty Columns data last')
pd.DataFrame(lTopBrt)[['brt_md','brt_nTrue','Title','brt_file']+lPrettyColsSite].sort_values('MainDataset',ascending=True)

Pretty Columns data last


Unnamed: 0,brt_md,brt_nTrue,Title,brt_file,MainDataset,RunType,site,classifier,accuracy,balanced_accuracy,roc_auc,f1,recall,sensitivity,specificity,precision,NPV,PPV
91,3.0,21.0,dfcmpltPreLC4_Boruta_True,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3.csv,dfcmpltPreLC4,Boruta_True,Global,RandomForestClassifier,0.723456,0.715487,0.789034,0.674343,0.63962,0.63962,0.791353,0.717731,0.730688,0.717731
87,3.0,4.0,dfcmpltPreLC4_SITE1_Boruta_True,FS_dfcmpltPreLC4_SITE1_Boruta_T1000_itrr500_th100_topR5_MD3.csv,dfcmpltPreLC4_SITE1,Boruta_True,LDN,RandomForestClassifier,0.86,0.875,0.85,0.892857,0.85,0.85,0.9,0.95,0.733333,0.95
169,5.0,2.0,dfcmpltPreLC4_SITE2_Boruta_True,FS_dfcmpltPreLC4_SITE2_Boruta_T1000_itrr500_th100_topR5_MD5.csv,dfcmpltPreLC4_SITE2,Boruta_True,MTL,RandomForestClassifier,0.664912,0.573712,0.601268,0.380357,0.333175,0.333175,0.814249,0.446023,0.730814,0.446023
87,3.0,16.0,dfcmpltPreLC4_SITE3_Boruta_True,FS_dfcmpltPreLC4_SITE3_Boruta_T1000_itrr500_th100_topR5_MD3.csv,dfcmpltPreLC4_SITE3,Boruta_True,SAN,RandomForestClassifier,0.817582,0.750303,0.892121,0.884236,0.867273,0.867273,0.633333,0.909231,0.583333,0.909231
87,3.0,14.0,dfcmpltPreLC4_SITE4_Boruta_True,FS_dfcmpltPreLC4_SITE4_Boruta_T1000_itrr500_th100_topR5_MD3.csv,dfcmpltPreLC4_SITE4,Boruta_True,RIO,RandomForestClassifier,0.9875,0.966667,0.998718,0.992593,1.0,1.0,0.933333,0.985714,1.0,0.985714
108,3.0,2.0,dfcmpltPreLC4_SITE5_Boruta_True,FS_dfcmpltPreLC4_SITE5_Boruta_T1000_itrr500_th100_topR5_MD3.csv,dfcmpltPreLC4_SITE5,Boruta_True,LUS,XGBClassifier,0.630769,0.473333,0.413333,0.146667,0.133333,0.133333,0.813333,0.166667,0.724242,0.166667
251,7.0,2.0,dfcmpltPreLC4_SITE6_Boruta_True,FS_dfcmpltPreLC4_SITE6_Boruta_T1000_itrr500_th100_topR5_MD7.csv,dfcmpltPreLC4_SITE6,Boruta_True,Canada,RandomForestClassifier,0.669398,0.603602,0.621667,0.448491,0.411154,0.411154,0.796049,0.49624,0.734384,0.49624
87,3.0,19.0,dfcmpltPreLC4_SITE7_Boruta_True,FS_dfcmpltPreLC4_SITE7_Boruta_T1000_itrr500_th100_topR5_MD3.csv,dfcmpltPreLC4_SITE7,Boruta_True,NorthAmerica,RandomForestClassifier,0.703759,0.656892,0.735614,0.542289,0.468,0.468,0.845783,0.649179,0.725406,0.649179


In [16]:
s_query = f'{s_CMPLT} and param_clf__n_estimators==1000 and param_clf__max_depth==10 {s_brt}'
dfTmp = deepcopy(dfRes.query(s_query)).sort_values('balanced_accuracy',ascending=False)
print(s_query)
print(f"Max balanced accuracy {dfTmp['balanced_accuracy'].max()}")
# print(f"Max balanced accuracy {dfTmp['balanced_accuracy'].max()}")

# dfTmp[lPrettyCols]

MainDataset=="dfcmpltPreLC4" and param_clf__n_estimators==1000 and param_clf__max_depth==10 and (RunType=="Boruta_True" or RunType=="Boruta_Top")
Max balanced accuracy 0.7154865414781337


## Export -- save to file Preliminary Results

In [17]:
# # Create the ExcelWriter object
writer = pd.ExcelWriter('../OUTPUTS/DataV4/DataV4_prelim_preLongCOVID4_Brt_columns.xlsx', engine='xlsxwriter')
pd.DataFrame(lTopBrt)[lPrettyColsSite+['brt_md','brt_nTrue','Title','brt_file']].sort_values('balanced_accuracy',ascending=False).to_excel(writer,sheet_name='Results')

for i,d in pd.DataFrame(lTopBrt).iterrows():
    tmpdataset = d['MainDataset']
    md = int(d['brt_md'])
    f = f'OUTPUT/MP/05-classifiers/DataV4/{tmpdataset}/FS_{tmpdataset}_Boruta_T1000_itrr500_th100_topR5_MD{md}.csv'
    dfeat = pd.read_csv(f)
    MPutils.getTrueFeatList(f,asDF=True).to_excel(writer, sheet_name=d['site'])
    



#     # Save the Excel file
writer.close()


## EXTRA - Curious -- Difference between Boruta Selected Datasets

In [57]:
# lbrt_7 = MPutils.getTrueFeatList(
#     "OUTPUT/MP/05-classifiers/DataV4/dfcmplt/FS_dfcmplt_Boruta_T1000_itrr500_th100_topR5_MD7.csv"
# )

# lbrt_5 = MPutils.getTrueFeatList(
#     "OUTPUT/MP/05-classifiers/DataV4/dfcmplt/FS_dfcmplt_Boruta_T1000_itrr500_th100_topR5_MD5.csv"
# )

# lbrt_3 = MPutils.getTrueFeatList(
#     "OUTPUT/MP/05-classifiers/DataV4/dfcmplt/FS_dfcmplt_Boruta_T1000_itrr500_th100_topR5_MD3.csv"
# )

# len(lbrt_7), len(lbrt_5), len(lbrt_3)

In [58]:
# set(lbrt_7).difference(set(lbrt_5))

In [59]:
# set(lbrt_5).difference(set(lbrt_7))

In [60]:
# set(lbrt_7).difference(set(lbrt_3))

In [61]:
# set(lbrt_3).difference(set(lbrt_5))

In [62]:
# set(lbrt_3).difference(set(lbrt_5))

In [63]:
# set(lbrt_5).difference(set(lbrt_3))

In [64]:
# set(lbrt_5).difference(set(lbrt_7))