<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/05_Model_Pipeline_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, Download, & Variable Statements

In [None]:
# Import & download statements
# General Statements
#!git clone https://github.com/d-atallah/implicit_gender_bias.git
#! pip install joblib
#! pip install shap
import pandas as pd
import string
import re
import joblib
#from implicit_gender_bias import config as cf
import os
import numpy as np
import time
#from sklearn.externals import joblib

import shap
import matplotlib.pyplot as plt

# Feature selection & Model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer, average_precision_score

# Model options
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch every NLTK resource this notebook needs; packages that are already
# present are simply reported as up-to-date.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Shared text-preprocessing helpers: English stop-word set, lemmatizer, stemmer.
stop_words = set(stopwords.words('english'))
lemmatizer, porter = WordNetLemmatizer(), PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gibsonce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gibsonce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gibsonce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read Inputs

In [None]:
# Variables
# Root folder holding the pickled inputs and the saved model artifacts.
# Defaults to the original cluster location; set the IGB_DATA_DIR environment
# variable to run the notebook elsewhere (e.g. on Colab) without editing code.
# os.path.join(..., '') guarantees the trailing separator, because the rest of
# the notebook builds file paths by plain string concatenation on folder_path.
folder_path = os.path.join(
    os.environ.get('IGB_DATA_DIR') or '/home/gibsonce/datallah-jaymefis-gibsonce/',
    ''
)

# Load the train/test label objects from pkl files.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
y_train = pd.read_pickle(folder_path + 'y_train.pkl')
y_test = pd.read_pickle(folder_path + 'y_test.pkl')

## Define Functions


In [None]:
# Evaluate a model
def model_eval(pipeline, X_test, y_test):
    """
    Evaluates a fitted binary classifier with F1-score and AUC-PR (average precision),
    both on the supplied split and with 5-fold stratified cross-validation, and builds
    a confusion matrix for the single-split predictions.

    Parameters:
    - pipeline (object): Fitted pipeline exposing predict_proba.
    - X_test (list or array): Test set features.
    - y_test (list or array): True labels; the positive class is assumed to be 1.

    Returns:
    - combined_metrics_df (pd.DataFrame): One row per metric with the columns
      'Metric', 'Single Split Score', 'CV_Mean' and 'CV_Std Dev'.
    - confusion_df (pd.DataFrame): 2x2 confusion matrix; rows are actual classes and
      columns are predicted classes, positive class first.
    """

    start_time = time.time()

    # Positive-class probability (column 1 -- assumes pipeline.classes_ is [0, 1]).
    # The continuous score feeds the ranking metric; the 0.5 threshold yields hard labels.
    y_score = pipeline.predict_proba(X_test)[:, 1]
    y_pred = (y_score >= 0.5).astype(int)
    print('preds created')

    # Calculate single split metrics.
    # Average precision must be computed from scores, not from thresholded labels:
    # with 0/1 predictions the precision-recall curve collapses to a single point.
    f1 = f1_score(y_test, y_pred)
    auc_pr = average_precision_score(y_test, y_score)

    # Single split evaluation
    metrics_df = pd.DataFrame({
        'Metric': ['F1-Score', 'AUC-PR'],
        'Single Split Score': [f1, auc_pr]
    })

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Single split evaluation completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Cross-validation
    # Built-in scorer names are used so that 'average_precision' is fed the model's
    # continuous scores (decision_function / predict_proba) rather than predict() labels.
    scoring = {
        'f1': 'f1',
        'pr_auc': 'average_precision',
    }
    print('cv scoring created')

    # NOTE(review): cross_validate clones and refits the pipeline on folds of the data
    # passed in, so these CV figures describe models retrained on test-set folds, not
    # the already-fitted pipeline scored above. Confirm this is the intended protocol.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_validate(pipeline, X_test, y_test, cv=cv, scoring=scoring)

    print('cv pipeline created')
    # Create DataFrame to store cross-validation results
    cv_metrics_df = pd.DataFrame({
        'Metric': ['F1-Score', 'AUC-PR'],
        'CV_Mean': [np.mean(cv_results['test_f1']),
                    np.mean(cv_results['test_pr_auc'])],
        'CV_Std Dev': [np.std(cv_results['test_f1']),
                       np.std(cv_results['test_pr_auc'])]
    })

    print('DF created')
    combined_metrics_df = pd.merge(metrics_df, cv_metrics_df, on='Metric')

    # Elapsed time is measured from the start of the function, so it is cumulative.
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Cross validation completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Print evaluation results
    print("\nEvaluation results:")
    print(combined_metrics_df)

    # Confusion matrix
    # labels=[1, 0] puts the positive class first so the matrix matches the
    # Positive/Negative row and column labels below (the default ordering is
    # ascending, i.e. negative first), and it always yields a 2x2 matrix.
    cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
    confusion_df = pd.DataFrame(
        cm,
        columns=['Predicted Positive', 'Predicted Negative'],
        index=['Actual Positive', 'Actual Negative']
    )

    print("\nConfusion Matrix:")
    print(confusion_df)

    return combined_metrics_df, confusion_df

In [None]:
def model_rank(model_list, model_str, metric, score_col=None):
    """
    Ranks models by their score on a specified metric.

    Parameters:
    - model_list (list): List of dictionaries, each holding a metrics DataFrame under
      the 'metrics' key. Every DataFrame needs a 'Metric' column and a score column.
    - model_str (list): List of model names, paired positionally with model_list.
    - metric (str): Value of the 'Metric' column to rank by (e.g., 'F1-Score', 'AUC-PR').
    - score_col (str, optional): Column to sort by (e.g., 'CV_Mean'). When omitted,
      'Score' is used if present, otherwise 'Single Split Score', which is the
      column name written by model_eval.

    Returns:
    - all_models (pd.DataFrame): Every metrics row from every model, with a 'Model' column added.
    - models_by_metric (pd.DataFrame): Rows for the specified metric, sorted by score in descending order.

    Raises:
    - KeyError: If no usable score column exists in the combined metrics.
    """
    # Tag each model's metrics with its name, then stack everything into one frame.
    tagged_metrics = [
        model_dict['metrics'].assign(Model=model_name)
        for model_dict, model_name in zip(model_list, model_str)
    ]
    all_models = pd.concat(tagged_metrics, ignore_index=True)

    # Resolve the score column: keep the historical 'Score' name working, and fall
    # back to the column model_eval actually emits so its output can be ranked too.
    if score_col is None:
        score_col = next(
            (candidate for candidate in ('Score', 'Single Split Score') if candidate in all_models.columns),
            None
        )
    if score_col is None or score_col not in all_models.columns:
        raise KeyError(
            f"No score column to rank by; available columns: {list(all_models.columns)}"
        )

    # Keep only the requested metric and sort best-first.
    models_by_metric = all_models[all_models['Metric'] == metric].sort_values(by=score_col, ascending=False)

    return all_models, models_by_metric

## XGBoost

### XGB Final Model:
*   Vectorization: TF-IDF
*   Feature Representation: Unigram

In [None]:
# Evaluate the saved XGBoost artifacts.
# `model` is the file-name prefix shared by every artifact read or written below.
model = 'xgb'
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary code;
# only load artifacts from a trusted location.
pipeline = joblib.load(f'{folder_path}{model}_pipeline.pkl')
# Test features are stored per model (presumably because each model was trained
# on its own feature representation -- confirm against the training notebook).
X_test = joblib.load(f'{folder_path}{model}_X_test.pkl')
# Prints progress and results, and returns the metrics table and confusion matrix.
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Save to CSV (written next to the input artifacts; the DataFrame index is included)
metrics_df.to_csv(f'{folder_path}{model}_metrics.csv')
confusion_df.to_csv(f'{folder_path}{model}_confusion_matrix.csv')

preds created
Single split evaluation completed. Time elapsed: 0.24 minutes.
cv scoring created
cv pipeline created
DF created
Cross validation completed. Time elapsed: 17.68 minutes.

Evaluation results:
     Metric  Single Split Score   CV_Mean  CV_Std Dev
0  F1-Score            0.613309  0.769248    0.001193
1    AUC-PR            0.677126  0.644443    0.001148

Confusion Matrix:
<function confusion_matrix at 0x1539905a4550>


## K-Nearest Neighbors

### K-Nearest Neighbors Final Model:
*   Vectorization: TF-IDF
*   Feature Representation: Bigram

In [None]:
# Evaluate the saved K-Nearest Neighbors artifacts.
# `model` is the file-name prefix shared by every artifact read or written below.
model = 'knn'
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary code;
# only load artifacts from a trusted location.
pipeline = joblib.load(f'{folder_path}{model}_pipeline.pkl')
# Test features are stored per model (presumably because each model was trained
# on its own feature representation -- confirm against the training notebook).
X_test = joblib.load(f'{folder_path}{model}_X_test.pkl')
# Prints progress and results, and returns the metrics table and confusion matrix.
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Save to CSV (written next to the input artifacts; the DataFrame index is included)
metrics_df.to_csv(f'{folder_path}{model}_metrics.csv')
confusion_df.to_csv(f'{folder_path}{model}_confusion_matrix.csv')

preds created
Single split evaluation completed. Time elapsed: 0.05 minutes.
cv scoring created
cv pipeline created
DF created
Cross validation completed. Time elapsed: 0.16 minutes.

Evaluation results:
     Metric  Single Split Score   CV_Mean  CV_Std Dev
0  F1-Score            0.657191  0.644615    0.004530
1    AUC-PR            0.615378  0.616392    0.002254

Confusion Matrix:
<function confusion_matrix at 0x1483674011f0>


## Naive Bayes

In [None]:
# Evaluate the saved Naive Bayes artifacts.
# `model` is the file-name prefix shared by every artifact read or written below.
model = 'nb'
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary code;
# only load artifacts from a trusted location.
pipeline = joblib.load(f'{folder_path}{model}_pipeline.pkl')
# Test features are stored per model (presumably because each model was trained
# on its own feature representation -- confirm against the training notebook).
X_test = joblib.load(f'{folder_path}{model}_X_test.pkl')
# Prints progress and results, and returns the metrics table and confusion matrix.
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Save to CSV (written next to the input artifacts; the DataFrame index is included)
metrics_df.to_csv(f'{folder_path}{model}_metrics.csv')
confusion_df.to_csv(f'{folder_path}{model}_confusion_matrix.csv')

preds created
Single split evaluation completed. Time elapsed: 0.00 minutes.
cv scoring created
cv pipeline created
DF created
Cross validation completed. Time elapsed: 0.01 minutes.

Evaluation results:
     Metric  Single Split Score   CV_Mean  CV_Std Dev
0  F1-Score             0.78081  0.774895    0.000535
1    AUC-PR             0.66361  0.647433    0.000777

Confusion Matrix:
<function confusion_matrix at 0x15521c438dc0>
