<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/Supervised_Pipeline_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, Download, & Variable Statements

In [None]:
# Import & download statements
# General Statements
#!git clone https://github.com/d-atallah/implicit_gender_bias.git
#! pip install joblib
#! pip install shap
import pandas as pd
import string
import re
import joblib
#from implicit_gender_bias import config as cf
import os
import numpy as np
import time
#from sklearn.externals import joblib

import shap
import matplotlib.pyplot as plt

# Feature selection & Model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer, average_precision_score

# Model options
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the NLTK resources this notebook relies on, in a fixed order
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Shared text-preprocessing helpers
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gibsonce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gibsonce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gibsonce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read Inputs

In [None]:
# Variables
# Directory that holds the pickled labels, feature matrices and fitted pipelines.
# The original absolute path is kept as the default; set IGB_DATA_DIR to run the
# notebook on another machine without editing the code.
# os.path.join(..., '') guarantees a trailing separator, which the string
# concatenations further down (folder_path + 'name.pkl') rely on.
folder_path = os.path.join(
    os.environ.get('IGB_DATA_DIR', '/home/gibsonce/datallah-jaymefis-gibsonce/'),
    ''
)

# Load label Series/DataFrames from pkl files.
# NOTE: read_pickle executes arbitrary code stored in the file; only load
# pickles produced by this project.
y_train = pd.read_pickle(folder_path + 'y_train.pkl')
y_test = pd.read_pickle(folder_path + 'y_test.pkl')

## Define Functions


In [None]:
# Evaluate a model
def model_eval(pipeline, X_test, y_test):
    """
    Evaluates a fitted binary classifier on a held-out set and with 5-fold cross-validation.

    Single-split metrics: accuracy, precision, recall and F1 are computed from hard
    predictions (positive-class probability >= 0.5); AUC-ROC, AUC-PR and log-loss are
    computed from the predicted probabilities, not from the thresholded labels.

    Cross-validation: stratified 5-fold `cross_validate` on (X_test, y_test). scikit-learn
    clones and refits the estimator on each fold, so the fitted `pipeline` passed in is not
    modified. The CV mean / std-dev are printed next to the single-split scores but are not
    part of the return value.

    Parameters:
    - pipeline (object): Fitted pipeline exposing `predict_proba`.
    - X_test (list or array): Test set features.
    - y_test (list or array): True labels. Assumed to be binary with 1 as the positive
      class (the precision/recall/F1 defaults used here require that).

    Returns:
    - metrics_df (pd.DataFrame): Columns 'Metric' and 'Single Split Score'.
    - confusion_df (pd.DataFrame): 2x2 confusion matrix, rows 'Actual Positive' /
      'Actual Negative', columns 'Predicted Positive' / 'Predicted Negative'.
    """

    start_time = time.time()

    # Column 1 of predict_proba is the probability of the positive class (label 1)
    y_pred_proba = pipeline.predict_proba(X_test)
    y_score = y_pred_proba[:, 1]
    y_pred = (y_score >= 0.5).astype(int)

    # Threshold-dependent metrics use the hard labels
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Ranking / probabilistic metrics need the continuous scores: feeding them 0/1
    # labels collapses the ROC/PR curves to one point and makes log-loss meaningless.
    auc_roc = roc_auc_score(y_test, y_score)
    auc_pr = average_precision_score(y_test, y_score)
    logloss = log_loss(y_test, y_pred_proba)

    #Single split evaluation
    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC', 'AUC-PR', 'Log-Loss'],
        'Single Split Score': [accuracy, precision, recall, f1, auc_roc, auc_pr, logloss]
    })

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Single split evaluation completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Cross-validation
    # Built-in scorer names are used so that roc_auc / average_precision / log-loss are fed
    # probabilities (or decision scores) rather than hard predictions, and so the code does
    # not depend on make_scorer's deprecated `needs_proba` argument.
    scoring = {
        'accuracy': 'accuracy',
        'f1': 'f1',
        'roc_auc': 'roc_auc',
        'pr_auc': 'average_precision',
        'log_loss': 'neg_log_loss',
        'precision': 'precision',
        'recall': 'recall'
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_validate(pipeline, X_test, y_test, cv=cv, scoring=scoring)

    # 'neg_log_loss' is reported negated (higher is better); flip it back to a plain log-loss
    cv_log_loss = -cv_results['test_log_loss']

    # Per-fold scores in the same order as the 'Metric' labels below
    fold_scores = [
        cv_results['test_accuracy'],
        cv_results['test_precision'],
        cv_results['test_recall'],
        cv_results['test_f1'],
        cv_results['test_roc_auc'],
        cv_results['test_pr_auc'],
        cv_log_loss
    ]

    # Create DataFrame to store cross-validation results
    cv_metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC', 'AUC-PR', 'Log-Loss'],
        'CV_Mean': [np.mean(scores) for scores in fold_scores],
        'CV_Std Dev': [np.std(scores) for scores in fold_scores]
    })

    combined_metrics_df = pd.merge(metrics_df, cv_metrics_df, on='Metric')

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Cross validation completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Print cross-validation results
    print("\nEvaluation results:")
    print(combined_metrics_df)

    # Confusion matrix
    # labels=[1, 0] puts the positive class first so the matrix layout matches the
    # 'Positive' / 'Negative' row and column names (sklearn's default order is [0, 1]).
    cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
    confusion_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])

    print("\nConfusion Matrix:")
    print(confusion_df)

    return metrics_df, confusion_df

In [None]:
def model_rank(model_list, model_str, metric, score_col=None):
    """
    Stacks the metric tables of several models and ranks them by one metric.

    Parameters:
    - model_list (list): List of dictionaries, each holding a model's details; the
      'metrics' entry must be a DataFrame with a 'Metric' column and a score column.
    - model_str (list): List of model names, in the same order as model_list.
    - metric (str): Value of the 'Metric' column to rank by (e.g., 'Accuracy', 'F1-Score').
    - score_col (str, optional): Name of the column holding the scores. When omitted,
      'Score' is used if present, otherwise 'Single Split Score'.

    Returns:
    - all_models (pd.DataFrame): All metric rows of all models, with a 'Model' column added.
    - models_by_metric (pd.DataFrame): Rows of all_models for the requested metric, sorted
      by score in descending order (so for lower-is-better metrics such as log-loss the
      best model is the last row).

    Raises:
    - ValueError: If model_list and model_str have different lengths.
    - KeyError: If no score column can be found.
    """
    # zip() would silently drop the unmatched tail and mislabel nothing, but hide models
    if len(model_list) != len(model_str):
        raise ValueError(
            f"model_list has {len(model_list)} entries but model_str has {len(model_str)}"
        )

    # Tag every metrics table with its model name, then stack them
    tagged_metrics = [
        model_dict['metrics'].assign(Model=model_name)
        for model_dict, model_name in zip(model_list, model_str)
    ]
    all_models = pd.concat(tagged_metrics, ignore_index=True)

    # Accept both the older 'Score' layout and the 'Single Split Score' layout
    if score_col is None:
        for candidate in ('Score', 'Single Split Score'):
            if candidate in all_models.columns:
                score_col = candidate
                break
        else:
            raise KeyError(
                "No score column found; expected 'Score' or 'Single Split Score', "
                "or pass score_col explicitly"
            )

    # Keep only the requested metric and sort by score, highest first
    models_by_metric = all_models[all_models['Metric'] == metric].sort_values(by=score_col, ascending=False)

    return all_models, models_by_metric

## XGBoost

### XGB Final Model:
*   Vectorization: TF-IDF
*   Feature Representation: Unigram

In [None]:
# Load the fitted XGBoost pipeline and its matching test features, then evaluate.
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by this project.
model = 'xgb'
pipeline = joblib.load(f'{folder_path}{model}_pipeline.pkl')
X_test = joblib.load(f'{folder_path}{model}_X_test.pkl')
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Bundle everything needed for the model comparison at the end of the notebook.
# 'X_test' previously referenced an undefined name `test` (NameError).
xgb = {
    'pipeline': pipeline,
    'X_test': X_test,
    'metrics': metrics_df,
    'confusion matrix': confusion_df
}

## Logistic Regression

### Logistic Regression Model Method:
*   Vectorization: Count
*   Feature Representation: Unigram

In [None]:
# Define variables
# Load the fitted logistic-regression pipeline and its matching test features, then evaluate.
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by this project.
model = 'log'
pipeline = joblib.load(f'{folder_path}{model}_pipeline.pkl')
X_test = joblib.load(f'{folder_path}{model}_X_test.pkl')
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Bundle everything needed for the model comparison at the end of the notebook.
# 'X_test' previously referenced an undefined name `test` (NameError).
log = {
    'pipeline': pipeline,
    'X_test': X_test,
    'metrics': metrics_df,
    'confusion matrix': confusion_df
}

Hyperparameters: {'logisticregression__solver': 'saga', 'logisticregression__penalty': 'l1', 'logisticregression__C': 0.1}
Metrics:
      Metric     Score
0   Accuracy  0.525630
1  Precision  0.521610
2     Recall  0.890585
3   F1-Score  0.657895
4        AUC  0.516530
5   Log-Loss  0.692039

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 160                 963
Actual Negative                 129                1050


## Support Vector Machine

### Support Vector Machine Model Method:
*   Vectorization: Count
*   Feature Representation: Unigram

In [None]:
# Define variables
# Load the fitted SVM pipeline and its matching test features, then evaluate.
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by this project.
model = 'svm'
pipeline = joblib.load(f'{folder_path}{model}_pipeline.pkl')
X_test = joblib.load(f'{folder_path}{model}_X_test.pkl')
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Bundle everything needed for the model comparison at the end of the notebook.
# 'X_test' previously referenced an undefined name `test` (NameError).
svm = {
    'pipeline': pipeline,
    'X_test': X_test,
    'metrics': metrics_df,
    'confusion matrix': confusion_df
}

Hyperparameters: {'svc__kernel': 'rbf', 'svc__gamma': 'scale', 'svc__C': 10}
Metrics:
      Metric     Score
0   Accuracy  0.526499
1  Precision  0.524930
2     Recall  0.794741
3   F1-Score  0.632254
4        AUC  0.519811
5   Log-Loss  0.692200

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 275                 848
Actual Negative                 242                 937


## K-Nearest Neighbors

### K-Nearest Neighbors Model Method:
*   Vectorization: TF-IDF
*   Feature Representation: Bigram

In [None]:
# Define variables
# Load the fitted k-nearest-neighbors pipeline and its matching test features, then evaluate.
# NOTE: joblib.load unpickles arbitrary objects; only load files produced by this project.
model = 'knn'
pipeline = joblib.load(f'{folder_path}{model}_pipeline.pkl')
X_test = joblib.load(f'{folder_path}{model}_X_test.pkl')
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Bundle everything needed for the model comparison at the end of the notebook.
# 'X_test' previously referenced an undefined name `test` (NameError).
knn = {
    'pipeline': pipeline,
    'X_test': X_test,
    'metrics': metrics_df,
    'confusion matrix': confusion_df
}

Hyperparameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 1, 'kneighborsclassifier__n_neighbors': 3}
Metrics:
      Metric     Score
0   Accuracy  0.551694
1  Precision  0.550034
2     Recall  0.685327
3   F1-Score  0.610272
4        AUC  0.548362
5   Log-Loss  7.951702

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 462                 661
Actual Negative                 371                 808


# Model Ranking Comparison

In [None]:
# Model Names (Need string values for dataframe column)
# Only the four result dictionaries built above exist in this notebook; the previous
# hand-written list referenced 16 names that are never defined here (and listed
# xgb_count_1 twice). Building both lists from one mapping keeps names and models aligned.
evaluated_models = {'xgb': xgb, 'log': log, 'svm': svm, 'knn': knn}
model_str = list(evaluated_models.keys())
model_list = list(evaluated_models.values())

# Specify the metric to rank the models by
# The label must match the 'Metric' values produced by model_eval ('AUC-ROC', not 'AUC').
all_models, models_by_metric = model_rank(model_list, model_str, 'AUC-ROC')
models_by_metric

Unnamed: 0,Metric,Score,Model
10,AUC,0.571551,xgb_tfidf_1
4,AUC,0.570937,xgb_count_1
16,AUC,0.570937,xgb_count_1
22,AUC,0.549021,xgb_count_2
94,AUC,0.548362,knn_tfidf_2
76,AUC,0.545513,knn_count_1
82,AUC,0.541344,knn_count_2
52,AUC,0.519811,svm_count_1
88,AUC,0.519561,knn_tfidf_1
28,AUC,0.51653,log_count_1


In [None]:
# Optional prefix for the exported file names (e.g. 'run1_'). It was referenced without
# ever being assigned in this notebook, which raised NameError on a fresh kernel; an
# empty string writes plain 'all_models.csv' / 'models_by_metric.csv'.
test_num = ''

# Persist the stacked metrics and the ranking for the chosen metric
all_models.to_csv(folder_path+test_num+'all_models.csv', index=False)
models_by_metric.to_csv(folder_path+test_num+'models_by_metric.csv', index=False)

# Save the vectorizer and associated data
# NOTE(review): vectorizer_tfidf_bi, X_train_vtfidf_bi, X_validation_vtfidf_bi and
# X_test_vtfidf_bi are not assigned anywhere in the visible notebook, so on a fresh
# kernel (Restart & Run All) these calls raise NameError. Presumably they are left over
# from the training/vectorization notebook -- confirm, then either define them here or
# move these dumps back to the notebook that creates the objects.
joblib.dump(vectorizer_tfidf_bi,folder_path+'tfidf_vectorizer_bi.pkl')
joblib.dump(X_train_vtfidf_bi, folder_path+'X_train_vtfidf_bi.pkl')
joblib.dump(X_validation_vtfidf_bi, folder_path+'X_validation_vtfidf_bi.pkl')
joblib.dump(X_test_vtfidf_bi, folder_path+'X_test_vtfidf_bi.pkl')