In [1]:
# reload magics
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datasets
from project_modules.io import load_dataset_to_df
from project_modules.classifcation import classify_MP,getXY, boruta_fs
from project_modules.utils import MPutils
from sklearn.ensemble import RandomForestClassifier
import multiprocessing
from sklearn.model_selection import cross_val_score
import cupy as cp
from datetime import datetime

from tqdm.notebook import tqdm
from copy import deepcopy

# from project_modules.utils import get_logger
# logger = get_logger("log-data-combine-split.log")
# # read the parameter file

# from project_modules.utils import read_parameters
# parms = read_parameters("/Users/david/projects/lc-project-data/project.yaml")

pd.options.display.max_columns = None
pd.options.display.max_rows = 50
pd.options.display.max_colwidth = None

In [2]:
lScorersBinary = [
    "accuracy",
    "balanced_accuracy",
    "roc_auc",
    "f1",
    "recall",
    "sensitivity",
    "specificity",
    "precision",
    # "average_precision",
    "NPV",
    "PPV",
    # "neg_mean_squared_error",
]
lResCol = [
    "Title",
    "cv",
    "param_clf",
    "param_clf__max_depth",
    "param_clf__n_estimators",
    "param_clf__random_state",
    # "param_clf__max_iter",
    "mean_test_accuracy",
    "mean_test_balanced_accuracy",
    "mean_test_roc_auc",
    "mean_test_f1",
    "mean_test_recall",
    "mean_test_sensitivity",
    "mean_test_specificity",
    "mean_test_precision",
    "mean_test_NPV",
    "mean_test_PPV",
]

In [3]:
site_name_dict = {1: "LDN", 2: "MTL", 3: "SAN", 4: "RIO", 5: "LUS", 6: "CA", 7: "NA"}
status_name_dict = {
    0: "LC_NEG",
    1: "LC_POS",
    2: "HC",
}

# Combine Results

In [None]:
import glob

path = "OUTPUT/MP/05-classifiers/DataV4"
files = glob.glob(path + '/*/CA*.csv')
# files = files+glob.glob(path + "/*/FS*.csv")
files = files + glob.glob(path + "/*/Val*.csv")
files = files+glob.glob(path+'/*/CLFRun*.csv')
files = [x for x in files if 'selcol' not in x.lower()]

print(files)
print(len(files))

['OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE4/CA_dfcmpltPreLC4_SITE4_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE4/CA_dfcmpltPreLC4_SITE4_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE6/CA_dfcmpltPreLC4_SITE6_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE6/CA_dfcmpltPreLC4_SITE6_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE1/CA_dfcmpltPreLC4_SITE1_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE1/CA_dfcmpltPreLC4_SITE1_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE3/CA_dfcmpltPreLC4_SITE3_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE3/CA_dfcmpltPreLC4_SITE3_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE5/CA_dfcmpltPreLC4_SITE5_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE5/CA_dfcmpltPreLC4_SITE5_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4/CA_dfcmpltPreLC4_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpl

In [5]:
ldf = []
for i in files:
    ldf.append(pd.read_csv(i))

In [6]:
dfRes = pd.concat(ldf)

In [7]:
def changeSite(x):
    if pd.isnull(x):
        return np.nan
    if 'SITE7' in x:
        return 'NorthAmerica'
    if 'SITE6' in x:
        return 'Canada'
    if 'SITE5' in x:
        return 'LUS'
    if 'SITE4' in x:
        return 'RIO'
    if 'SITE3' in x:
        return 'SAN'
    if 'SITE2' in x:
        return 'MTL'
    if 'SITE1' in x:
        return 'LDN'
    return "Global"

In [8]:
dfRes["classifier"] = dfRes["param_clf"].apply(lambda x: x.__str__().split("(")[0])
dfRes['site'] = dfRes['MainDataset'].apply(changeSite)
lColFirst = [
    "MainDataset",
    "Title",
    'site',
    "date",
    "classifier",
    'RunType',
    "cv",
    'FeatureSet',
    "brt_nTrue",
    "brt_nTop",
    'brt_md',
]
dfRes = MPutils.reorder_columns(dfRes, lColFirst)
dfRes.to_csv("OUTPUT/MP/05-classifiers/DataV4/CLFRunCombined.csv", index=False)
dfResOrg = deepcopy(dfRes)
lColsSel = lColFirst + [
    x
    for x in dfRes
    if (x.startswith("mean_test") or x.startswith("param_") or x.startswith("brt_"))
    and x not in lColFirst
]
dfRes[lColsSel].to_csv("OUTPUT/MP/05-classifiers/DataV4/CLFRunCombined_SelCol.csv")

# Go through Data and Simplify for inspection

In [9]:
dfRes = dfRes[lColsSel]

In [10]:
lSelMainData = [
"dfcmpltPreLC4",
"dfcmpltPreLC4_SITE1",
"dfcmpltPreLC4_SITE2",
"dfcmpltPreLC4_SITE3",
"dfcmpltPreLC4_SITE4",
"dfcmpltPreLC4_SITE5",
"dfcmpltPreLC4_SITE6",
"dfcmpltPreLC4_SITE7",
]

lPrettyCols =     [
        "MainDataset",
        "RunType",
        "classifier",
        "brt_nTrue",
        "brt_nTop",
        "accuracy",
        "balanced_accuracy",
        "roc_auc",
        "f1",
        "recall",
        "sensitivity",
        "specificity",
        "precision",
        "NPV",
        "PPV",
        "brt_md",
        'brt_params'
    ]
lPrettyColsSite = [
    "MainDataset",
    "RunType",
    'site',
    "classifier",
    "accuracy",
    "balanced_accuracy",
    "roc_auc",
    "f1",
    "recall",
    "sensitivity",
    "specificity",
    "precision",
    "NPV",
    "PPV",
]
s_brt = 'and (RunType=="Boruta_True" or RunType=="Boruta_Top")'
s_RF10 = 'classifier=="RandomForestClassifier" and param_clf__n_estimators==1000 and param_clf__max_depth==10'
s_RFXGB10 = 'param_clf__n_estimators==1000 and param_clf__max_depth==10'
s_CMPLT = 'MainDataset=="dfcmpltPreLC4"'
s_val = 'RunType=="HoldoutVal"'

In [11]:
dfRes = dfRes[dfRes['MainDataset'].isin(lSelMainData)]
dfRes.columns = [x.replace('mean_test_','') for x in dfRes.columns]

In [12]:
# FS_{tmpdataset}_Boruta_T1000_itrr500_th100_topR5_MD{md}.csv'
dfRes['brt_file'] = dfRes[['MainDataset','brt_md']].apply(lambda x: f"FS_{x['MainDataset']}_Boruta_T1000_itrr500_th100_topR5_MD{x['brt_md']:.0f}.csv",axis=1)

# Inspect Original Holdout Results
- May have duplicates
- DO NOT USE FOR RESULTS -- INTERNAL VALIDATION ONLY and COMPARISON ONLY

In [13]:
dfRes.query(f'{s_RF10} and RunType=="Holdout"').sort_values("balanced_accuracy", ascending=False)[lPrettyColsSite].T

Unnamed: 0,5,5.1,5.2,5.3,5.4,5.5,5.6,5.7,5.8,5.9,5.10,5.11,5.12,5.13,5.14,5.15
MainDataset,dfcmpltPreLC4_SITE4,dfcmpltPreLC4_SITE4,dfcmpltPreLC4_SITE3,dfcmpltPreLC4_SITE3,dfcmpltPreLC4,dfcmpltPreLC4,dfcmpltPreLC4_SITE7,dfcmpltPreLC4_SITE7,dfcmpltPreLC4_SITE6,dfcmpltPreLC4_SITE6,dfcmpltPreLC4_SITE5,dfcmpltPreLC4_SITE5,dfcmpltPreLC4_SITE2,dfcmpltPreLC4_SITE2,dfcmpltPreLC4_SITE1,dfcmpltPreLC4_SITE1
RunType,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout
site,RIO,RIO,SAN,SAN,Global,Global,NorthAmerica,NorthAmerica,Canada,Canada,LUS,LUS,MTL,MTL,LDN,LDN
classifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier
accuracy,0.980645,0.980645,0.832967,0.832967,0.697829,0.697829,0.698204,0.698204,0.698333,0.698333,0.738462,0.738462,0.703951,0.703951,0.613333,0.613333
balanced_accuracy,0.94,0.94,0.720303,0.720303,0.681052,0.681052,0.619294,0.619294,0.564422,0.564422,0.546667,0.546667,0.542991,0.542991,0.525,0.525
roc_auc,0.975385,0.975385,0.895758,0.895758,0.776731,0.776731,0.75805,0.75805,0.732845,0.732845,0.64,0.64,0.710012,0.710012,0.6,0.6
f1,0.988819,0.988819,0.897749,0.897749,0.604697,0.604697,0.420637,0.420637,0.265447,0.265447,0.166667,0.166667,0.187445,0.187445,0.719841,0.719841
recall,1.0,1.0,0.907273,0.907273,0.513734,0.513734,0.294039,0.294039,0.166282,0.166282,0.133333,0.133333,0.111429,0.111429,0.75,0.75
sensitivity,1.0,1.0,0.907273,0.907273,0.513734,0.513734,0.294039,0.294039,0.166282,0.166282,0.133333,0.133333,0.111429,0.111429,0.75,0.75


# Validation Results

In [14]:
dfVal = dfRes.query(f'{s_RF10} and {s_val}').sort_values('balanced_accuracy',ascending=False)
dfVal

Unnamed: 0,MainDataset,Title,site,date,classifier,RunType,cv,FeatureSet,brt_nTrue,brt_nTop,brt_md,param_clf,param_clf__max_depth,param_clf__n_estimators,param_clf__random_state,param_clf__max_iter,param_clf__learning_rate,accuracy,balanced_accuracy,roc_auc,f1,recall,sensitivity,specificity,precision,NPV,PPV,brt_params,brt_file
5,dfcmpltPreLC4_SITE4,dfcmpltPreLC4_SITE4_HoldoutVal,RIO,2024-10-09 03:07:37.508013,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_SITE4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.987097,0.96,0.979231,0.992453,1.0,1.0,0.92,0.985185,1.0,0.985185,,FS_dfcmpltPreLC4_SITE4_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE4,dfcmpltPreLC4_SITE4_HoldoutVal,RIO,2024-10-09 03:07:34.529991,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.967944,0.919487,0.951538,0.981127,0.992308,0.992308,0.846667,0.970615,0.96,0.970615,,FS_dfcmpltPreLC4_SITE4_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE3,dfcmpltPreLC4_SITE3_HoldoutVal,SAN,2024-10-09 03:07:28.483268,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.893407,0.829394,0.914545,0.934891,0.925455,0.925455,0.733333,0.94697,0.683333,0.94697,,FS_dfcmpltPreLC4_SITE3_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE3,dfcmpltPreLC4_SITE3_HoldoutVal,SAN,2024-10-09 03:07:31.562717,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_SITE3_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.846154,0.712727,0.911818,0.907273,0.925455,0.925455,0.5,0.892121,0.633333,0.892121,,FS_dfcmpltPreLC4_SITE3_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE5,dfcmpltPreLC4_SITE5_HoldoutVal,LUS,2024-10-09 13:33:17.422784,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_SITE5_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.769231,0.693333,0.742778,0.527619,0.533333,0.533333,0.853333,0.633333,0.842843,0.633333,,FS_dfcmpltPreLC4_SITE5_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4,dfcmpltPreLC4_HoldoutVal,Global,2024-10-09 03:07:04.012799,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.680905,0.667723,0.759669,0.601572,0.536203,0.536203,0.799243,0.688738,0.67845,0.688738,,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE7,dfcmpltPreLC4_SITE7_HoldoutVal,NorthAmerica,2024-10-09 13:33:26.547977,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.689238,0.633835,0.719619,0.495824,0.405255,0.405255,0.862416,0.649101,0.704415,0.649101,,FS_dfcmpltPreLC4_SITE7_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE7,dfcmpltPreLC4_SITE7_HoldoutVal,NorthAmerica,2024-10-09 13:33:29.724069,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_SITE7_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.683189,0.629787,0.737157,0.492059,0.409176,0.409176,0.850397,0.63491,0.703276,0.63491,,FS_dfcmpltPreLC4_SITE7_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE1,dfcmpltPreLC4_SITE1_HoldoutVal,LDN,2024-10-09 03:07:07.374856,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.68,0.6,0.625,0.775397,0.8,0.8,0.4,0.763333,0.5,0.763333,,FS_dfcmpltPreLC4_SITE1_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE5,dfcmpltPreLC4_SITE5_HoldoutVal,LUS,2024-10-09 13:33:15.181428,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.723077,0.587778,0.765556,0.313333,0.3,0.3,0.875556,0.38,0.783392,0.38,,FS_dfcmpltPreLC4_SITE5_Boruta_T1000_itrr500_th100_topR5_MDnan.csv


In [15]:
dfVal = dfVal.sort_values(['FeatureSet','MainDataset'])
dfVal

Unnamed: 0,MainDataset,Title,site,date,classifier,RunType,cv,FeatureSet,brt_nTrue,brt_nTop,brt_md,param_clf,param_clf__max_depth,param_clf__n_estimators,param_clf__random_state,param_clf__max_iter,param_clf__learning_rate,accuracy,balanced_accuracy,roc_auc,f1,recall,sensitivity,specificity,precision,NPV,PPV,brt_params,brt_file
5,dfcmpltPreLC4,dfcmpltPreLC4_HoldoutVal,Global,2024-10-09 03:07:04.012799,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.680905,0.667723,0.759669,0.601572,0.536203,0.536203,0.799243,0.688738,0.67845,0.688738,,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE1,dfcmpltPreLC4_SITE1_HoldoutVal,LDN,2024-10-09 03:07:07.374856,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.68,0.6,0.625,0.775397,0.8,0.8,0.4,0.763333,0.5,0.763333,,FS_dfcmpltPreLC4_SITE1_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE2,dfcmpltPreLC4_SITE2_HoldoutVal,MTL,2024-10-09 03:07:22.535741,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.668986,0.555495,0.637449,0.322345,0.25127,0.25127,0.859721,0.457594,0.715374,0.457594,,FS_dfcmpltPreLC4_SITE2_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE3,dfcmpltPreLC4_SITE3_HoldoutVal,SAN,2024-10-09 03:07:28.483268,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.893407,0.829394,0.914545,0.934891,0.925455,0.925455,0.733333,0.94697,0.683333,0.94697,,FS_dfcmpltPreLC4_SITE3_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE4,dfcmpltPreLC4_SITE4_HoldoutVal,RIO,2024-10-09 03:07:34.529991,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.967944,0.919487,0.951538,0.981127,0.992308,0.992308,0.846667,0.970615,0.96,0.970615,,FS_dfcmpltPreLC4_SITE4_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE5,dfcmpltPreLC4_SITE5_HoldoutVal,LUS,2024-10-09 13:33:15.181428,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.723077,0.587778,0.765556,0.313333,0.3,0.3,0.875556,0.38,0.783392,0.38,,FS_dfcmpltPreLC4_SITE5_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE6,dfcmpltPreLC4_SITE6_HoldoutVal,Canada,2024-10-09 13:33:20.375586,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.683333,0.58695,0.676549,0.38588,0.301154,0.301154,0.872747,0.552446,0.71565,0.552446,,FS_dfcmpltPreLC4_SITE6_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE7,dfcmpltPreLC4_SITE7_HoldoutVal,NorthAmerica,2024-10-09 13:33:26.547977,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.689238,0.633835,0.719619,0.495824,0.405255,0.405255,0.862416,0.649101,0.704415,0.649101,,FS_dfcmpltPreLC4_SITE7_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE1,dfcmpltPreLC4_SITE1_HoldoutVal,LDN,2024-10-09 03:07:09.885581,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_SITE1_Boruta_T1000_itrr500_th100_topR5_MD3,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.62,0.575,0.675,0.727778,0.75,0.75,0.4,0.75,0.366667,0.75,,FS_dfcmpltPreLC4_SITE1_Boruta_T1000_itrr500_th100_topR5_MDnan.csv
5,dfcmpltPreLC4_SITE2,dfcmpltPreLC4_SITE2_HoldoutVal,MTL,2024-10-09 03:07:25.602645,RandomForestClassifier,HoldoutVal,5,FS_dfcmpltPreLC4_SITE2_Boruta_T1000_itrr500_th100_topR5_MD5,,,,RandomForestClassifier(),10.0,1000.0,42.0,,,0.635759,0.540607,0.585818,0.326076,0.285238,0.285238,0.795975,0.400564,0.709337,0.400564,,FS_dfcmpltPreLC4_SITE2_Boruta_T1000_itrr500_th100_topR5_MDnan.csv


In [16]:
sGlobal = 'FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3'

dfVal['FeatureSetSpecific'] = dfVal['FeatureSet'].apply(lambda x:  'Global' if x == sGlobal else 'Site')

In [17]:
# # Create the ExcelWriter object
writer = pd.ExcelWriter('../OUTPUTS/DataV4/DataV4_prelim_preLongCOVID4_Val.xlsx', engine='xlsxwriter')
dfVal[lPrettyColsSite+['brt_md','brt_nTrue','Title','FeatureSet','FeatureSetSpecific','brt_file']].sort_values('balanced_accuracy',ascending=False).to_excel(writer,sheet_name='Results')


for i,d in pd.DataFrame(dfVal.sort_values([])).iterrows():
    tmpFS = d['FeatureSet']
    sheetName = f"{d['FeatureSetSpecific']} Feat - {d['site']}"
        
    tmpMainData = d['MainDataset']
    
    dFeat = pd.read_csv(f'OUTPUT/MP/05-classifiers/DataV4/{tmpMainData}/FR_{tmpMainData}_HoldoutVal_{tmpFS}.csv')
    # f = f'OUTPUT/MP/05-classifiers/DataV4/{tmpdataset}/FS_{tmpdataset}_Boruta_T1000_itrr500_th100_topR5_MD{md}.csv'
    # dfeat = pd.read_csv(f)
    
    # MPutils.getTrueFeatList(f,asDF=True).to_excel(writer, sheet_name=d['site'])
    dFeat.to_excel(writer,sheet_name=sheetName)


    # Write each dataframe to a different sheet
    # df1.to_excel(writer, sheet_name=d['site'])

#     # Save the Excel file
writer.close()


In [18]:
dfVal[dfVal['FeatureSetSpecific']=='Global'][
    [
        "RunType",
        "FeatureSetSpecific",
        "site",
        "classifier",
        "accuracy",
        "balanced_accuracy",
        "roc_auc",
        "f1",
        "recall",
        "sensitivity",
        "specificity",
        "precision",
        "NPV",
        "PPV",
    ]
].sort_values("FeatureSetSpecific", ascending=True).round(2)

Unnamed: 0,RunType,FeatureSetSpecific,site,classifier,accuracy,balanced_accuracy,roc_auc,f1,recall,sensitivity,specificity,precision,NPV,PPV
5,HoldoutVal,Global,Global,RandomForestClassifier,0.68,0.67,0.76,0.6,0.54,0.54,0.8,0.69,0.68,0.69
5,HoldoutVal,Global,LDN,RandomForestClassifier,0.68,0.6,0.62,0.78,0.8,0.8,0.4,0.76,0.5,0.76
5,HoldoutVal,Global,MTL,RandomForestClassifier,0.67,0.56,0.64,0.32,0.25,0.25,0.86,0.46,0.72,0.46
5,HoldoutVal,Global,SAN,RandomForestClassifier,0.89,0.83,0.91,0.93,0.93,0.93,0.73,0.95,0.68,0.95
5,HoldoutVal,Global,RIO,RandomForestClassifier,0.97,0.92,0.95,0.98,0.99,0.99,0.85,0.97,0.96,0.97
5,HoldoutVal,Global,LUS,RandomForestClassifier,0.72,0.59,0.77,0.31,0.3,0.3,0.88,0.38,0.78,0.38
5,HoldoutVal,Global,Canada,RandomForestClassifier,0.68,0.59,0.68,0.39,0.3,0.3,0.87,0.55,0.72,0.55
5,HoldoutVal,Global,NorthAmerica,RandomForestClassifier,0.69,0.63,0.72,0.5,0.41,0.41,0.86,0.65,0.7,0.65


In [19]:
dfVal[dfVal['FeatureSetSpecific']=='Site'][
    [
        "RunType",
        "FeatureSetSpecific",
        "site",
        "classifier",
        "accuracy",
        "balanced_accuracy",
        "roc_auc",
        "f1",
        "recall",
        "sensitivity",
        "specificity",
        "precision",
        "NPV",
        "PPV",
    ]
].sort_values("FeatureSetSpecific", ascending=True).round(2)

Unnamed: 0,RunType,FeatureSetSpecific,site,classifier,accuracy,balanced_accuracy,roc_auc,f1,recall,sensitivity,specificity,precision,NPV,PPV
5,HoldoutVal,Site,LDN,RandomForestClassifier,0.62,0.57,0.68,0.73,0.75,0.75,0.4,0.75,0.37,0.75
5,HoldoutVal,Site,MTL,RandomForestClassifier,0.64,0.54,0.59,0.33,0.29,0.29,0.8,0.4,0.71,0.4
5,HoldoutVal,Site,SAN,RandomForestClassifier,0.85,0.71,0.91,0.91,0.93,0.93,0.5,0.89,0.63,0.89
5,HoldoutVal,Site,RIO,RandomForestClassifier,0.99,0.96,0.98,0.99,1.0,1.0,0.92,0.99,1.0,0.99
5,HoldoutVal,Site,LUS,RandomForestClassifier,0.77,0.69,0.74,0.53,0.53,0.53,0.85,0.63,0.84,0.63
5,HoldoutVal,Site,Canada,RandomForestClassifier,0.62,0.55,0.59,0.37,0.34,0.34,0.76,0.41,0.7,0.41
5,HoldoutVal,Site,NorthAmerica,RandomForestClassifier,0.68,0.63,0.74,0.49,0.41,0.41,0.85,0.63,0.7,0.63
