# Model Training with Simulated Datasets

## 1. Load Required Libraries and Scripts

In [2]:
# Import required libraries

#Datawrangling
import pandas as pd
import os

#Models and Hyperparamters (provided as a .py file)
from ParameterOptimization import search_space, models

#Hyperparameter Optiization Algorithm
from skopt import BayesSearchCV

#Train test split
from sklearn.model_selection import train_test_split

#Metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

#Model Export
import joblib

#Track for loops
from fastprogress.fastprogress import progress_bar

## 2. Import Data

In [9]:
# Simulated Datasets
dir_path = '/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/Projects/Explainability Method Comparison/Data-ML-XAI-Eval/Synthetic Data/Data Files'
# list to store files name
res = []
for (dir_path, dir_names, file_names) in os.walk(dir_path):
    res.extend(file_names)
    
for file in res:
    if file not in ['GroundTruth.csv', '.Rhistory', '.DS_Store']:
        data_path = f'/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/Projects/Explainability Method Comparison/Data-ML-XAI-Eval/Synthetic Data/Data Files/{file}'
        df = pd.read_csv(data_path)
        #Class Imbalance
        print(f"Positive percentage in {file}:", df.Target.mean())

res_ = []
for file in res:
     if file not in ['GroundTruth.csv', '.Rhistory', '.DS_Store']:
        res_.append(file)


Positive percentage in 0_vars_corr_0HC_n25000.csv: 0.50592
Positive percentage in 0_vars_corr_0HC_n10000_skew_0.5194.csv: 0.5194
Positive percentage in 0_vars_corr_0HC_n100.csv: 0.43
Positive percentage in 0_vars_corr_0HC_n10000_skew_0.7354.csv: 0.7354
Positive percentage in 3_vars_corr_1HC_n10000.csv: 0.4927
Positive percentage in 0_vars_corr_0HC_n10000_skew_0.8612.csv: 0.8612
Positive percentage in 0_vars_corr_0HC_n15000.csv: 0.49533333333333335
Positive percentage in 2_vars_corr_1HC_n10000.csv: 0.5007
Positive percentage in 0_vars_corr_0HC_n10000_skew_0.9694.csv: 0.9694
Positive percentage in 0_vars_corr_0HC_n10000_skew_0.9699.csv: 0.9699
Positive percentage in 0_vars_corr_0HC_n10000_skew_0.7298.csv: 0.7298
Positive percentage in 0_vars_corr_0HC_n10000_skew_0.6145.csv: 0.6145
Positive percentage in 2_vars_corr_2HC_n10000.csv: 0.4963
Positive percentage in 0_vars_corr_0HC_n10000.csv: 0.5098
Positive percentage in 0_vars_corr_0HC_n10000_skew_0.8608.csv: 0.8608
Positive percentage in 0

In [10]:
#Train Test Split
data = {}
for file in res_:
        data_path = f'/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/Projects/Explainability Method Comparison/Data-ML-XAI-Eval/Synthetic Data/Data Files/{file}'
        df = pd.read_csv(data_path)
        train, test = train_test_split(df, test_size=0.25, random_state=42)
        data[file] = {
            'X_train': train.drop('Target', axis=1),
            'y_train': train.Target,
            'X_test': test.drop('Target', axis=1),
            'y_test': test.Target
        }
        ##Class Imbalance
        # print(f"Positive percentage in {file} Train:", data[file]['y_train'].mean())
        # print(f"Positive percentage in {file} Test:",  data[file]['y_test'].mean())

## 3. Model Training

### 3.1 Optimization Parameters

In [11]:
def accu(estimator, X, y):
    y_pred = estimator.predict(X)
    return accuracy_score(y, y_pred)


def Optimizer(model=None, X_train=None, y_train=None, 
    search_space=None, threshold = None, n_iter = 50, n_jobs = 20, cv = 10):
        """
        Optimize a model using a bayesian search
        """
        search = BayesSearchCV(model, search_space, n_iter=n_iter, n_jobs=n_jobs, cv=cv, scoring = accu, random_state=42, return_train_score = True)

        return search.fit(X_train, y_train)

### 3.2 Best Model and Hyperparameter Search

In [12]:
def training_function(file):
    keys = progress_bar(models.keys())
    trained_models = {}
    for key in keys:
        model = Optimizer(models[key], data[file]['X_train'], data[file]['y_train'], search_space[key], n_iter=10, cv = 10)
        trained_models[key] = model
        filepath = f'./Models/{file}_{key}.joblib'
        joblib.dump(model.best_estimator_, filepath) 
    return trained_models

In [14]:
from multiprocess import Pool
#Best models with the imbalanced dataset
pool = Pool(3)
results = pool.map(training_function, res_)



## 4. Model Evaluation

### 4.1 Evaluation of Models Trained for each Dataset

In [4]:
data.keys()

dict_keys(['0_vars_corr_0HC_n25000.csv', '0_vars_corr_0HC_n100.csv', '0_vars_corr_0HC_n10000_skew_0.7354.csv', '3_vars_corr_1HC_n10000.csv', '0_vars_corr_0HC_n10000_skew_0.8612.csv', '0_vars_corr_0HC_n15000.csv', '2_vars_corr_1HC_n10000.csv', '0_vars_corr_0HC_n10000_skew_0.9694.csv', '2_vars_corr_2HC_n10000.csv', '0_vars_corr_0HC_n10000.csv', '0_vars_corr_0HC_n5000.csv', '0_vars_corr_0HC_n10000_skew_0.508.csv', '0_vars_corr_0HC_n10000_skew_0.6222.csv', '3_vars_corr_2HC_n10000B.csv', '0_vars_corr_0HC_n1000.csv', '3_vars_corr_2HC_n10000.csv', '0_vars_corr_0HC_n20000.csv'])

In [12]:
for i in range(len(res_)):
    for model in models:
        pd.DataFrame(imb_results[i][model].cv_results_).to_csv(f'/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/XAI method performacne when Explainaing the PORT Dataset/Results/Models/Model Evals/Simulated/{res_[i]}_{model}_optim.csv')

NameError: name 'imb_results' is not defined

In [None]:
for i in range(len(res_)):
    evals = pd.DataFrame(columns=['Accu','Precision','Recall','F1','TNR', 'TPR','TP','FP','TN','FN'],
                    index=models.keys())
    for key in models.keys():
        y_pred = imb_results[i][key].best_estimator_.predict(data[res_[i]]['X_test'])
        accu = accuracy_score(data[res_[i]]['y_test'], y_pred)
        (prec, rec, f, _) = precision_recall_fscore_support(data[res_[i]]['y_test'], y_pred, average='binary')
        (tn, fp, fn, tp) = confusion_matrix(data[res_[i]]['y_test'], y_pred).ravel()
        tpr = tp/(tp+fn)
        tnr = tn/(tn+fp)
        
        evals.loc[key] = accu, prec, rec, f, tnr, tpr, tp, fp, tn, fn
    evals.to_csv(f'/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/XAI method performacne when Explainaing the PORT Dataset/Results/Models/Model Evals/{res_[i]}_model_evals_optim.csv')
    print(evals)


for i in range(len(res_)):
    evals = pd.DataFrame(columns=['Accu','Precision','Recall','F1','TNR', 'TPR','TP','FP','TN','FN'],
                    index=models.keys())
    for key in models.keys():
        y_pred = imb_results_non_optim[i][key].predict(data[res_[i]]['X_test'])
        accu = accuracy_score(data[res_[i]]['y_test'], y_pred)
        (prec, rec, f, _) = precision_recall_fscore_support(data[res_[i]]['y_test'], y_pred, average='binary')
        (tn, fp, fn, tp) = confusion_matrix(data[res_[i]]['y_test'], y_pred).ravel()
        tpr = tp/(tp+fn)
        tnr = tn/(tn+fp)
        
        evals.loc[key] = accu, prec, rec, f, tnr, tpr, tp, fp, tn, fn
    evals.to_csv(f'/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/XAI method performacne when Explainaing the PORT Dataset/Results/Models/Model Evals/{res_[i]}_model_evals.csv')
    print(evals)




In [81]:
from imp import reload
import itertools
from unittest import result
import shap
import ROAR
reload(ROAR)


from multiprocess.pool import Pool 


# def reps(i,k):
#     return ROAR.single_roar(data[res_[1]]['X_train'], data[res_[1]]['y_train'], data[res_[1]]['X_test'], data[res_[1]]['y_test'], imb_results_non_optim[1]['RandomForest'], explainer = shap.explainers.Tree, t = 1, shap_values = None)



def reps(i,k):
    return f'{i} : {k}'

results = pool.starmap(reps, )



models = ['PassiveAgressive','SGDClassifier','RandomForest','Perceptron','RidgeClassifier','LogisticRegression','DecisionTree','XGBoost','MultinomialNB', 'GaussianNB']

  
results


['0_vars_corr_0HC_n25000.csv : PassiveAgressive',
 '0_vars_corr_0HC_n25000.csv : SGDClassifier',
 '0_vars_corr_0HC_n25000.csv : RandomForest',
 '0_vars_corr_0HC_n25000.csv : Perceptron',
 '0_vars_corr_0HC_n25000.csv : RidgeClassifier',
 '0_vars_corr_0HC_n25000.csv : LogisticRegression',
 '0_vars_corr_0HC_n25000.csv : DecisionTree',
 '0_vars_corr_0HC_n25000.csv : XGBoost',
 '0_vars_corr_0HC_n25000.csv : MultinomialNB',
 '0_vars_corr_0HC_n25000.csv : GaussianNB',
 '0_vars_corr_0HC_n100.csv : PassiveAgressive',
 '0_vars_corr_0HC_n100.csv : SGDClassifier',
 '0_vars_corr_0HC_n100.csv : RandomForest',
 '0_vars_corr_0HC_n100.csv : Perceptron',
 '0_vars_corr_0HC_n100.csv : RidgeClassifier',
 '0_vars_corr_0HC_n100.csv : LogisticRegression',
 '0_vars_corr_0HC_n100.csv : DecisionTree',
 '0_vars_corr_0HC_n100.csv : XGBoost',
 '0_vars_corr_0HC_n100.csv : MultinomialNB',
 '0_vars_corr_0HC_n100.csv : GaussianNB',
 '0_vars_corr_0HC_n10000_skew_0.7354.csv : PassiveAgressive',
 '0_vars_corr_0HC_n10000_s