In [1]:
# reload magics
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datasets
from project_modules.io import load_dataset_to_df
from project_modules.classifcation import classify_MP,getXY, boruta_fs
from project_modules.utils import MPutils
from sklearn.ensemble import RandomForestClassifier
import multiprocessing
from sklearn.model_selection import cross_val_score
import cupy as cp
from datetime import datetime

from tqdm.notebook import tqdm
from copy import deepcopy

# from project_modules.utils import get_logger
# logger = get_logger("log-data-combine-split.log")
# # read the parameter file

# from project_modules.utils import read_parameters
# parms = read_parameters("/Users/david/projects/lc-project-data/project.yaml")

pd.options.display.max_columns = None
pd.options.display.max_rows = 50
pd.options.display.max_colwidth = None

In [2]:
lScorersBinary = [
    "accuracy",
    "balanced_accuracy",
    "roc_auc",
    "f1",
    "recall",
    "sensitivity",
    "specificity",
    "precision",
    # "average_precision",
    "NPV",
    "PPV",
    # "neg_mean_squared_error",
]
lResCol = [
    "Title",
    "cv",
    "param_clf",
    "param_clf__max_depth",
    "param_clf__n_estimators",
    "param_clf__random_state",
    # "param_clf__max_iter",
    "mean_test_accuracy",
    "mean_test_balanced_accuracy",
    "mean_test_roc_auc",
    "mean_test_f1",
    "mean_test_recall",
    "mean_test_sensitivity",
    "mean_test_specificity",
    "mean_test_precision",
    "mean_test_NPV",
    "mean_test_PPV",
]

In [3]:
site_name_dict = {1: "LDN", 2: "MTL", 3: "SAN", 4: "RIO", 5: "LUS", 6: "CA", 7: "NA"}
status_name_dict = {
    0: "LC_NEG",
    1: "LC_POS",
    2: "HC",
}

# Combine Results

In [4]:
import glob

path = "OUTPUT/MP/05-classifiers/DataV4"
files = glob.glob(path + '/*/CA*.csv')
# files = files+glob.glob(path + "/*/FS*.csv")
files = files + glob.glob(path + "/*/Val*.csv")
files = files+glob.glob(path+'/*/CLFRun*.csv')
files = [x for x in files if 'selcol' not in x.lower()]

print(files)
print(len(files))

['OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE4/CA_dfcmpltPreLC4_SITE4_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE4/CA_dfcmpltPreLC4_SITE4_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE1/CA_dfcmpltPreLC4_SITE1_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE1/CA_dfcmpltPreLC4_SITE1_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE3/CA_dfcmpltPreLC4_SITE3_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE3/CA_dfcmpltPreLC4_SITE3_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4/CA_dfcmpltPreLC4_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4/CA_dfcmpltPreLC4_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE2/CA_dfcmpltPreLC4_SITE2_Holdout.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE2/CA_dfcmpltPreLC4_SITE2_Full.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreLC4_SITE1/CLFRun_dfcmpltPreLC4_SITE1_Results.csv', 'OUTPUT/MP/05-classifiers/DataV4/dfcmpltPreL

In [5]:
ldf = []
for i in files:
    ldf.append(pd.read_csv(i))

In [6]:
dfRes = pd.concat(ldf)

In [7]:
def changeSite(x):
    if pd.isnull(x):
        return np.nan
    if 'SITE7' in x:
        return 'NorthAmerica'
    if 'SITE6' in x:
        return 'Canada'
    if 'SITE5' in x:
        return 'LUS'
    if 'SITE4' in x:
        return 'RIO'
    if 'SITE3' in x:
        return 'SAN'
    if 'SITE2' in x:
        return 'MTL'
    if 'SITE1' in x:
        return 'LDN'
    return "Global"

In [9]:
dfRes["classifier"] = dfRes["param_clf"].apply(lambda x: x.__str__().split("(")[0])
dfRes['site'] = dfRes['MainDataset'].apply(changeSite)
lColFirst = [
    "MainDataset",
    "Title",
    'site',
    "date",
    "classifier",
    'RunType',
    "cv",
    'FeatureSet',
    "brt_nTrue",
    "brt_nTop",
    'brt_md',
]
dfRes = MPutils.reorder_columns(dfRes, lColFirst)
dfRes.to_csv("OUTPUT/MP/05-classifiers/DataV4/CLFRunCombined.csv", index=False)
dfResOrg = deepcopy(dfRes)
lColsSel = lColFirst + [
    x
    for x in dfRes
    if (x.startswith("mean_test") or x.startswith("param_") or x.startswith("brt_"))
    and x not in lColFirst
]
dfRes[lColsSel].to_csv("OUTPUT/MP/05-classifiers/DataV4/CLFRunCombined_SelCol.csv")

# Go through Data and Simplify for inspection

In [154]:
dfRes = dfRes[lColsSel]

In [156]:
lSelMainData = [
"dfcmpltPreLC4",
"dfcmpltPreLC4_SITE1",
"dfcmpltPreLC4_SITE2",
"dfcmpltPreLC4_SITE3",
"dfcmpltPreLC4_SITE4",
"dfcmpltPreLC4_SITE5",
"dfcmpltPreLC4_SITE6",
"dfcmpltPreLC4_SITE7",
]

lPrettyCols =     [
        "MainDataset",
        "RunType",
        "classifier",
        "brt_nTrue",
        "brt_nTop",
        "accuracy",
        "balanced_accuracy",
        "roc_auc",
        "f1",
        "recall",
        "sensitivity",
        "specificity",
        "precision",
        "NPV",
        "PPV",
        "brt_md",
        'brt_params'
    ]
lPrettyColsSite = [
    "MainDataset",
    "RunType",
    'site',
    "classifier",
    "accuracy",
    "balanced_accuracy",
    "roc_auc",
    "f1",
    "recall",
    "sensitivity",
    "specificity",
    "precision",
    "NPV",
    "PPV",
]
s_brt = 'and (RunType=="Boruta_True" or RunType=="Boruta_Top")'
s_RF10 = 'classifier=="RandomForestClassifier" and param_clf__n_estimators==1000 and param_clf__max_depth==10'
s_RFXGB10 = 'param_clf__n_estimators==1000 and param_clf__max_depth==10'
s_CMPLT = 'MainDataset=="dfcmpltPreLC4"'
s_val = 'RunType=="HoldoutVal"'

In [157]:
dfRes = dfRes[dfRes['MainDataset'].isin(lSelMainData)]
dfRes.columns = [x.replace('mean_test_','') for x in dfRes.columns]

In [158]:
# FS_{tmpdataset}_Boruta_T1000_itrr500_th100_topR5_MD{md}.csv'
dfRes['brt_file'] = dfRes[['MainDataset','brt_md']].apply(lambda x: f"FS_{x['MainDataset']}_Boruta_T1000_itrr500_th100_topR5_MD{x['brt_md']:.0f}.csv",axis=1)

# Inspect Original Holdout Results
- May have duplicates
- DO NOT USE FOR RESULTS -- INTERNAL VALIDATION ONLY and COMPARISON ONLY

In [159]:
dfRes.query(f'{s_RF10} and RunType=="Holdout"').sort_values("balanced_accuracy", ascending=False)[lPrettyColsSite].T

Unnamed: 0,5,5.1,5.2,5.3,5.4,5.5,5.6,5.7,5.8
MainDataset,dfcmpltPreLC4_SITE4,dfcmpltPreLC4_SITE3,dfcmpltPreLC4_SITE3,dfcmpltPreLC4,dfcmpltPreLC4,dfcmpltPreLC4_SITE2,dfcmpltPreLC4_SITE2,dfcmpltPreLC4_SITE1,dfcmpltPreLC4_SITE1
RunType,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout,Holdout
site,RIO,SAN,SAN,Global,Global,MTL,MTL,LDN,LDN
classifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier
accuracy,0.980645,0.832967,0.832967,0.697829,0.697829,0.703951,0.703951,0.613333,0.613333
balanced_accuracy,0.94,0.720303,0.720303,0.681052,0.681052,0.542991,0.542991,0.525,0.525
roc_auc,0.975385,0.895758,0.895758,0.776731,0.776731,0.710012,0.710012,0.6,0.6
f1,0.988819,0.897749,0.897749,0.604697,0.604697,0.187445,0.187445,0.719841,0.719841
recall,1.0,0.907273,0.907273,0.513734,0.513734,0.111429,0.111429,0.75,0.75
sensitivity,1.0,0.907273,0.907273,0.513734,0.513734,0.111429,0.111429,0.75,0.75


# Validation Results

In [65]:
dfVal = dfRes.query(f'{s_RF10} and {s_val}').sort_values('balanced_accuracy',ascending=False)
dfVal

Unnamed: 0,MainDataset,Title,site,date,classifier,RunType,cv,brt_nTrue,brt_nTop,brt_md,param_clf,param_clf__max_depth,param_clf__n_estimators,param_clf__random_state,param_clf__max_iter,param_clf__learning_rate,accuracy,balanced_accuracy,roc_auc,f1,recall,sensitivity,specificity,precision,NPV,PPV,brt_params


In [66]:
dfVal = dfVal.sort_values(['FeatureSet','MainDataset'])
dfVal

KeyError: 'FeatureSet'

In [67]:
sGlobal = 'FS_dfcmpltPreLC4_Boruta_T1000_itrr500_th100_topR5_MD3'

dfVal['FeatureSetSpecific'] = dfVal['FeatureSet'].apply(lambda x:  'Global' if x == sGlobal else 'Site')

KeyError: 'FeatureSet'

In [75]:
# # Create the ExcelWriter object
writer = pd.ExcelWriter('../OUTPUTS/prelim_preLongCOVID4_Val.xlsx', engine='xlsxwriter')
dfVal[lPrettyColsSite+['brt_md','brt_nTrue','Title','FeatureSet','FeatureSetSpecific']].sort_values('balanced_accuracy',ascending=False).to_excel(writer,sheet_name='Results')


for i,d in pd.DataFrame(dfVal.sort_values([])).iterrows():
    tmpFS = d['FeatureSet']
    sheetName = f"{d['FeatureSetSpecific']} Feat - {d['site']}"
        
    tmpMainData = d['MainDataset']
    
    dFeat = pd.read_csv(f'OUTPUT/MP/05-classifiers/DataV4/{tmpMainData}/FR_{tmpMainData}_HoldoutVal_{tmpFS}.csv')
    # f = f'OUTPUT/MP/05-classifiers/DataV4/{tmpdataset}/FS_{tmpdataset}_Boruta_T1000_itrr500_th100_topR5_MD{md}.csv'
    # dfeat = pd.read_csv(f)
    
    # MPutils.getTrueFeatList(f,asDF=True).to_excel(writer, sheet_name=d['site'])
    dFeat.to_excel(writer,sheet_name=sheetName)


    # Write each dataframe to a different sheet
    # df1.to_excel(writer, sheet_name=d['site'])

#     # Save the Excel file
writer.close()
