# Model Training with Simulated Datasets

## 1. Load Required Libraries and Scripts

In [2]:
# Import required libraries

#Datawrangling
import pandas as pd
import os
from imblearn.over_sampling import SMOTE

#Models and Hyperparamters (provided as a .py file)
from ParameterOptimization import search_space, models

#Hyperparameter Optiization Algorithm
from skopt import BayesSearchCV

#Train test split
from sklearn.model_selection import train_test_split

#Metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

#Model Export
import joblib

#Track for loops
from fastprogress.fastprogress import progress_bar

## 2. Import Data

In [None]:
# Simulated Datasets
dir_path = '/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/Projects/Explainability Method Comparison/Data-ML-XAI-Eval/Synthetic Data/Data files with summed target'
# list to store files name
res = []
for (dir_path, dir_names, file_names) in os.walk(dir_path):
    res.extend(file_names)
    
for file in res:
    if file not in ['GroundTruth.csv', '.Rhistory', '.DS_Store']:
        data_path = f'/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/Projects/Explainability Method Comparison/Data-ML-XAI-Eval/Synthetic Data/Data files with summed target/{file}'
        df = pd.read_csv(data_path)
        #Class Imbalance
        print(f"Positive percentage in {file}:", df.Target.mean())

res_ = []
for file in res:
     if file not in ['GroundTruth.csv', '.Rhistory', '.DS_Store']:
        res_.append(file)


In [18]:
#Train Test Split
data = {}
for file in res_:
        data_path = f'/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/Projects/Explainability Method Comparison/Data-ML-XAI-Eval/Synthetic Data/Data files with summed target/{file}'
        df = pd.read_csv(data_path)
        # df, target = SMOTE().fit_resample(df.drop('Target', axis=1), df.Target)
        # df['Target'] = target
        # df.to_csv(data_path, index=False)
        train, test = train_test_split(df, test_size=0.25, random_state=42)
      
        data[file] = {
            'X_train': train.drop('Target', axis=1),
            'y_train': train.Target,
            'X_test': test.drop('Target', axis=1),
            'y_test': test.Target
        }
        ##Class Imbalance
        # print(f"Positive percentage in {file} Train:", data[file]['y_train'].mean())
        # print(f"Positive percentage in {file} Test:",  data[file]['y_test'].mean())

## 3. Model Training

### 3.1 Optimization Parameters

In [19]:
def accu(estimator, X, y):
    y_pred = estimator.predict(X)
    return accuracy_score(y, y_pred)


def Optimizer(model=None, X_train=None, y_train=None, 
    search_space=None, threshold = None, n_iter = 50, n_jobs = 20, cv = 10):
        """
        Optimize a model using a bayesian search
        """
        search = BayesSearchCV(model, search_space, n_iter=n_iter, n_jobs=n_jobs, cv=cv, scoring = accu, random_state=42, return_train_score = True)

        return search.fit(X_train, y_train)

### 3.1 Model Training function

In [20]:
# Description: This file contains the functions used to train the models on the simulated data
# Training Function without Optimization
def training_function(file):
    keys = progress_bar(models.keys())
    trained_models = {}
    for key in keys:
        model = models[key].fit(data[file]['X_train'].to_numpy(), data[file]['y_train'].to_numpy())
        trained_models[key] = model
        filepath = f'/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/Projects/Explainability Method Comparison/Data-ML-XAI-Eval/Model Training/Models Sum/{file}_{key}.joblib'
        joblib.dump(model, filepath) 
    return trained_models

In [21]:
trained_models = []
files = progress_bar(res_)
for file in files:
    trained_models.append(training_function(file))

## 4. Model Evaluation

### 4.1 Evaluation of Models Trained for each Dataset

In [22]:
non_perfect_models = []

for i in range(len(res_)):
    evals = pd.DataFrame(columns=['Accu','Precision','Recall','F1','TNR', 'TPR','TP','FP','TN','FN'],
                    index=models.keys())
    for key in models.keys():
        y_pred = trained_models[i][key].predict(data[res_[i]]['X_test'].to_numpy())
        accu = accuracy_score(data[res_[i]]['y_test'], y_pred)
        if accu < 1:
            non_perfect_models.append(key)
        (prec, rec, f, _) = precision_recall_fscore_support(data[res_[i]]['y_test'], y_pred, average='binary')
        (tn, fp, fn, tp) = confusion_matrix(data[res_[i]]['y_test'], y_pred).ravel()
        tpr = tp/(tp+fn)
        tnr = tn/(tn+fp)
        
        evals.loc[key] = accu, prec, rec, f, tnr, tpr, tp, fp, tn, fn
    path = f'/Users/eddie/Library/CloudStorage/OneDrive-UniversityofPittsburgh/Research/Projects/Explainability Method Comparison/Data-ML-XAI-Eval/Model Training/Model Evals/{res_[i]}_model_evals.csv'
    evals.to_csv(path)
    print(res_[i])
    print(evals)
    print('\n')

2_vars_corr_2HC_n700.csv
                        Accu Precision    Recall        F1       TNR  \
PassiveAgressive         1.0       1.0       1.0       1.0       1.0   
SGDClassifier            1.0       1.0       1.0       1.0       1.0   
RandomForest             1.0       1.0       1.0       1.0       1.0   
Perceptron               1.0       1.0       1.0       1.0       1.0   
RidgeClassifier          1.0       1.0       1.0       1.0       1.0   
LogisticRegression       1.0       1.0       1.0       1.0       1.0   
DecisionTree             1.0       1.0       1.0       1.0       1.0   
XGBoost                  1.0       1.0       1.0       1.0       1.0   
MultinomialNB       0.741758  0.779221  0.666667  0.718563  0.815217   
GaussianNB          0.917582  0.894737  0.944444  0.918919  0.891304   
SVC                      1.0       1.0       1.0       1.0       1.0   

                         TPR  TP  FP  TN  FN  
PassiveAgressive         1.0  90   0  92   0  
SGDClassifier   

In [26]:
pd.Series(non_perfect_models).unique()

array(['MultinomialNB', 'GaussianNB'], dtype=object)