<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/05_Supervised_Model_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, Download, & Variable Statements

In [None]:
# Import & download statements
# General Statements
#!git clone https://github.com/d-atallah/implicit_gender_bias.git
#! pip install joblib
#! pip install shap
import pandas as pd
import string
import re
import joblib
#from implicit_gender_bias import config as cf
import os
import numpy as np
import time
#from sklearn.externals import joblib

import shap
import matplotlib.pyplot as plt

# Feature selection & Model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer, average_precision_score

# Model options
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Make sure every NLTK resource used by this notebook is available locally
for _resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_resource)

# Shared text-processing helpers: stemmer, lemmatizer and English stop-word set
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gibsonce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gibsonce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gibsonce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read Inputs

In [None]:
# Directory that holds the pickled inputs and receives the exported results
folder_path = '/home/gibsonce/datallah-jaymefis-gibsonce/'

# Read the train and test target pickles (in that order) into DataFrames
y_train, y_test = (
    pd.read_pickle(folder_path + file_name)
    for file_name in ('y_train.pkl', 'y_test.pkl')
)

## Define Functions


In [None]:
def model_eval(pipeline, X_test, y_test):
    """
    Evaluates a specified model using cross-validated F-1 score, AUC-PR and log loss,
    and a confusion matrix computed at a 0.5 probability threshold.

    Notes:
    - ``cross_validate`` clones ``pipeline`` and refits the clone on folds of the data
      passed in, so the cross-validated scores describe the pipeline configuration
      re-trained on ``X_test``/``y_test`` folds, not the already-fitted object.
    - The confusion matrix, in contrast, uses the fitted ``pipeline`` exactly as given.

    Parameters:
    - pipeline (object): Fitted binary classification pipeline exposing ``predict_proba``.
    - X_test (list or array): Test set features.
    - y_test (list or array): True labels; assumed to be encoded as 0 (negative) and
      1 (positive).

    Returns:
    - metrics_df (pd.DataFrame): Mean and standard deviation of each cross-validated metric.
    - confusion_df (pd.DataFrame): Confusion matrix with the positive class in the first
      row (actual) and first column (predicted).
    """

    start_time = time.time()

    # Cross-validation.
    # Built-in scorer names are used instead of make_scorer so that each metric receives
    # the right kind of model output: 'average_precision' is computed from continuous
    # scores (wrapping average_precision_score in a plain make_scorer feeds it hard 0/1
    # predictions), and 'neg_log_loss' is computed from predict_proba without the
    # deprecated ``needs_proba`` argument.
    scoring = {
        'f1': 'f1',
        'pr_auc': 'average_precision',
        'log_loss': 'neg_log_loss',
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_validate(pipeline, X_test, y_test, cv=cv, scoring=scoring)

    f1_scores = cv_results['test_f1']
    pr_auc_scores = cv_results['test_pr_auc']
    # The scorer reports negated log loss (higher is better); flip the sign back.
    log_loss_scores = -cv_results['test_log_loss']

    # Create DataFrame to store cross-validation results
    cv_metrics_df = pd.DataFrame({
        'Metric': ['F1-Score', 'AUC-PR', 'Log Loss'],
        'CV_Mean': [np.mean(f1_scores),
                    np.mean(pr_auc_scores),
                    np.mean(log_loss_scores)],
        'CV_Std Dev': [np.std(f1_scores),
                       np.std(pr_auc_scores),
                       np.std(log_loss_scores)],
    })

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Cross-validation completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Print cross-validation results
    print("\nEvaluation results:")
    print(cv_metrics_df)

    # Create binary predictions from the positive-class probability (column 1)
    y_pred_proba = pipeline.predict_proba(X_test)
    y_pred = (y_pred_proba[:, 1] >= 0.5).astype(int)

    # Confusion matrix.
    # labels=[1, 0] puts the positive class first on both axes so the cell values line
    # up with the 'Positive' / 'Negative' row and column names below; the default
    # ordering (0, 1) would place true negatives under 'Actual/Predicted Positive'.
    cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
    confusion_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'],
                                index=['Actual Positive', 'Actual Negative'])

    print("\nConfusion Matrix:")
    print(confusion_df)

    return cv_metrics_df, confusion_df

## Naive Bayes

In [None]:
# Naive Bayes: load the persisted pipeline and its test features, then evaluate
model = 'nb'
pipeline, X_test = (
    joblib.load(artifact_path)
    for artifact_path in (
        f'{folder_path}{model}_pipeline.pkl',
        f'{folder_path}{model}_X_test.pkl',
    )
)
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Export the metric table and the confusion matrix as CSV files
metrics_df.to_csv(f'{folder_path}{model}_metrics.csv')
confusion_df.to_csv(f'{folder_path}{model}_confusion_matrix.csv')

The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.


Cross-validation completed. Time elapsed: 0.01 minutes.

Evaluation results:
     Metric   CV_Mean  CV_Std Dev
0  F1-Score  0.791308    0.001115
1    AUC-PR  0.675922    0.001170
2  Log Loss  0.643449    0.003693
preds created

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive               39106               30222
Actual Negative               30645               90374


## K-Nearest Neighbors

In [None]:
# K-Nearest Neighbors: load the persisted pipeline and its test features, then evaluate
model = 'knn'
pipeline, X_test = (
    joblib.load(artifact_path)
    for artifact_path in (
        f'{folder_path}{model}_pipeline.pkl',
        f'{folder_path}{model}_X_test.pkl',
    )
)
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Export the metric table and the confusion matrix as CSV files
metrics_df.to_csv(f'{folder_path}{model}_metrics.csv')
confusion_df.to_csv(f'{folder_path}{model}_confusion_matrix.csv')

The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.


Cross-validation completed. Time elapsed: 0.13 minutes.

Evaluation results:
     Metric   CV_Mean  CV_Std Dev
0  F1-Score  0.682386    0.004496
1    AUC-PR  0.657732    0.001818
2  Log Loss  5.127464    0.150495
preds created

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive               33644               35684
Actual Negative               46430               74589


## XGBoost

#### Hyperparameter Tuned Model

In [None]:
# XGBoost (hyperparameter tuned): load the persisted pipeline and its test features,
# then evaluate
model = 'xgb'
pipeline, X_test = (
    joblib.load(artifact_path)
    for artifact_path in (
        f'{folder_path}{model}_pipeline.pkl',
        f'{folder_path}{model}_X_test.pkl',
    )
)
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Export the metric table and the confusion matrix as CSV files
metrics_df.to_csv(f'{folder_path}{model}_metrics.csv')
confusion_df.to_csv(f'{folder_path}{model}_confusion_matrix.csv')

#### Hyperparameter Tuned & POS Tagged Model

In [None]:
# Evaluate the XGBoost pipeline and export the results under '_pos'-suffixed file names.
# NOTE(review): this cell loads the same 'xgb_pipeline.pkl' and 'xgb_X_test.pkl'
# artifacts as the "Hyperparameter Tuned Model" cell; only the output CSV names differ.
# Presumably dedicated POS-tagged artifacts were intended here (or the pkl files were
# regenerated between runs) — TODO confirm before trusting the '_pos' results.
model = 'xgb'
pipeline = joblib.load(f'{folder_path}{model}_pipeline.pkl')
X_test = joblib.load(f'{folder_path}{model}_X_test.pkl')
metrics_df, confusion_df = model_eval(pipeline, X_test, y_test)

# Save to CSV (file names carry the '_pos' suffix to keep them apart from the tuned-model outputs)
metrics_df.to_csv(f'{folder_path}{model}_pos_metrics.csv')
confusion_df.to_csv(f'{folder_path}{model}_pos_confusion_matrix.csv')

The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.


Cross-validation completed. Time elapsed: 15.08 minutes.

Evaluation results:
     Metric   CV_Mean  CV_Std Dev
0  F1-Score  0.794441    0.000448
1    AUC-PR  0.675404    0.000788
2  Log Loss  0.593747    0.001084
preds created

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive               43104               26224
Actual Negative               42636               78383
