<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/06_Supervised_Ablation_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, Download, & Variable Statements

In [None]:
!pip install pydantic==1.8.2 typing-extensions==4.0.1

Defaulting to user installation because normal site-packages is not writeable


In [None]:
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 8.3 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
# Import & download statements
# General Statements
#!git clone https://github.com/d-atallah/implicit_gender_bias.git
#! pip install joblib
#! pip install shap
import pandas as pd
import string
import re
import joblib
#from implicit_gender_bias import config as cf
import os
import numpy as np
import time
import matplotlib.pyplot as plt
import spacy
import shap
import scipy


# Feature selection & Model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer, average_precision_score

# Model options
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the NLTK data packages this notebook relies on
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# English stop-word set plus lemmatizer and stemmer instances
stop_words = set(stopwords.words('english'))
lemmatizer, porter = WordNetLemmatizer(), PorterStemmer()

# Small English spaCy pipeline (installed by the download cell above)
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gibsonce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gibsonce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gibsonce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read Inputs

In [None]:
# Variables
# Directory holding the preprocessed pickles. Defaults to the original
# location; set the IGB_DATA_DIR environment variable to run on another
# machine. os.path.join(..., '') guarantees the trailing separator that the
# string concatenations using folder_path rely on.
folder_path = os.path.join(
    os.environ.get('IGB_DATA_DIR', '/home/gibsonce/datallah-jaymefis-gibsonce/'),
    '',
)

# Load the preprocessed train/test splits from pkl files
# NOTE: unpickling can execute arbitrary code - only point folder_path at
# files produced by this project.
X_train = pd.read_pickle(folder_path + 'X_train_preprocessed.pkl')
X_test = pd.read_pickle(folder_path + 'X_test_preprocessed.pkl')
y_train = pd.read_pickle(folder_path + 'y_train.pkl')
y_test = pd.read_pickle(folder_path + 'y_test.pkl')

## Define Functions


In [None]:
def tokenize_and_categorize_batch(texts):
    """
    Runs a batch of texts through the spaCy pipeline `nlp` and returns, for each
    text, its tokens and its part-of-speech tags as space-joined strings.

    Parameters:
    - texts (iterable of str): Texts to process.

    Returns:
    - word_features (list of str): Space-joined token texts, one entry per input text.
    - pos_tags (list of str): Space-joined `token.pos_` values, one entry per input text.
    """
    word_features = []
    pos_tags = []

    for parsed in nlp.pipe(texts):
        token_texts = []
        token_tags = []
        for tok in parsed:
            token_texts.append(tok.text)
            token_tags.append(tok.pos_)
        word_features.append(' '.join(token_texts))
        pos_tags.append(' '.join(token_tags))

    return word_features, pos_tags

In [None]:
def model_eval(pipeline, X_test, y_test):
    """
    Cross-validates a model on the supplied data and summarises F1-score, AUC-PR and log loss.

    The estimator is handed to `cross_validate` with 5 shuffled, stratified
    folds of (X_test, y_test). NOTE: cross_validate fits the estimator on each
    fold's training part, so a fit performed on `pipeline` before this call is
    not what gets scored.

    Parameters:
    - pipeline (object): scikit-learn compatible classifier or pipeline that supports `predict_proba`.
    - X_test (array-like or sparse matrix): Features to cross-validate on.
    - y_test (list or array): True binary labels.

    Returns:
    - cv_metrics_df (pd.DataFrame): One row per metric ('F1-Score', 'AUC-PR', 'Log Loss')
      with the columns 'Metric', 'CV_Mean' and 'CV_Std Dev'.
    """

    start_time = time.time()

    # Cross-validation
    # Built-in scorer names are used instead of make_scorer(...):
    # - 'average_precision' ranks by predicted scores/probabilities, whereas
    #   make_scorer(average_precision_score) scored hard 0/1 predictions;
    # - 'neg_log_loss' replaces the deprecated `needs_proba=True` argument.
    scoring = {
        'f1': 'f1',
        'pr_auc': 'average_precision',
        'log_loss': 'neg_log_loss',
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_validate(pipeline, X_test, y_test, cv=cv, scoring=scoring)

    f1_scores = cv_results['test_f1']
    pr_auc_scores = cv_results['test_pr_auc']
    neg_log_losses = cv_results['test_log_loss']

    # Create DataFrame to store cross-validation results
    cv_metrics_df = pd.DataFrame({
        'Metric': ['F1-Score', 'AUC-PR', 'Log Loss'],
        'CV_Mean': [np.mean(f1_scores),
                    np.mean(pr_auc_scores),
                    -np.mean(neg_log_losses)],  # scorer returns negated log loss; flip the sign back
        'CV_Std Dev': [np.std(f1_scores),
                       np.std(pr_auc_scores),
                       np.std(neg_log_losses)]  # std dev is unaffected by the sign
    })

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Cross-validation completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Print cross-validation results
    print("\nEvaluation results:")
    print(cv_metrics_df)

    return cv_metrics_df

In [None]:
def _tokenize_in_batches(texts, batch_size):
    """Tokenizes `texts` in positional slices of `batch_size` rows; returns (word_features, pos_tags) lists."""
    word_features, pos_tags = [], []
    for start in range(0, len(texts), batch_size):
        batch_words, batch_tags = tokenize_and_categorize_batch(texts.iloc[start:start + batch_size])
        # Extending plain lists avoids re-copying the accumulated data on every batch
        word_features.extend(batch_words)
        pos_tags.extend(batch_tags)
    return word_features, pos_tags


def _drop_docs_with_pos_tag(docs, pos_tag_strings, labels, pos_tag):
    """Removes every document whose tag sequence contains `pos_tag`; returns (docs, tag strings, labels)."""
    # Compare against whole tags: a plain substring test would make 'X' match
    # 'AUX' and 'CONJ' match 'CCONJ'/'SCONJ'.
    keep = [pos_tag not in tags.split() for tags in pos_tag_strings]
    docs_kept = [doc for doc, flag in zip(docs, keep) if flag]
    tags_kept = [tags for tags, flag in zip(pos_tag_strings, keep) if flag]
    labels_kept = [label for label, flag in zip(labels, keep) if flag]
    return docs_kept, tags_kept, labels_kept


def model_testing(X_train, y_train, X_test, y_test, params):
    """
    Runs part-of-speech ablation testing with an XGBoost classifier.

    For each part-of-speech tag, every document containing that tag is removed
    from both splits, TF-IDF features are built from the remaining token
    strings and tag strings, an XGBClassifier is fitted on the training matrix
    and `model_eval` is called on the test matrix.

    Parameters:
    - X_train (pd.Series): Training set texts, preprocessed (sliced with `.iloc`).
    - y_train (array-like): Training set labels.
    - X_test (pd.Series): Test set texts, preprocessed (sliced with `.iloc`).
    - y_test (array-like): Test set labels.
    - params (dict): Hyperparameters; the optional 'xgbclassifier' entry is passed
      as keyword arguments to XGBClassifier.

    Returns:
    - ablation_results (dict): Maps each part-of-speech tag to the metrics DataFrame
      returned by `model_eval` for the data with that tag's documents removed.

    Side effects:
    - Dumps `ablation_results` with joblib to f'{folder_path}_ablation_results.pkl',
      where `folder_path` is read from module scope.
    """
    start_time = time.time()

    train_batch_size = 10000
    test_batch_size = 10000

    print('Train batch start')
    train_docs, train_pos_tags = _tokenize_in_batches(X_train, train_batch_size)
    elapsed_time = time.time() - start_time
    print(f"Train batch completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    print('Test batch start')
    test_docs, test_pos_tags = _tokenize_in_batches(X_test, test_batch_size)
    elapsed_time = time.time() - start_time
    print(f"Test batch completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    ablation_results = {}
    # Iterate over unique part of speech categories
    unique_pos_tags = [
        'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
        'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X',
    ]
    for pos_tag in unique_pos_tags:
        print(pos_tag)
        # Filter out documents containing the current part of speech category
        X_train_filtered, X_train_pos_tags_filtered, y_train_filtered = _drop_docs_with_pos_tag(
            train_docs, train_pos_tags, y_train, pos_tag)
        X_test_filtered, X_test_pos_tags_filtered, y_test_filtered = _drop_docs_with_pos_tag(
            test_docs, test_pos_tags, y_test, pos_tag)

        # Vectorize the word features
        word_features_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
        X_train_word_features_ = word_features_vectorizer.fit_transform(X_train_filtered)
        X_test_word_features_ = word_features_vectorizer.transform(X_test_filtered)

        # Vectorize the parts of speech tags
        pos_tags_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
        X_train_pos_tags_ = pos_tags_vectorizer.fit_transform(X_train_pos_tags_filtered)
        X_test_pos_tags_ = pos_tags_vectorizer.transform(X_test_pos_tags_filtered)

        # Combine the vectorized word features and parts of speech tags
        X_train_combined = scipy.sparse.hstack([X_train_word_features_, X_train_pos_tags_])
        X_test_combined = scipy.sparse.hstack([X_test_word_features_, X_test_pos_tags_])

        # NOTE(review): model_eval passes the estimator to cross_validate on the
        # test matrix - confirm whether this fit on the training matrix is
        # meant to feed into the reported scores.
        model = XGBClassifier(random_state=42, **params.get('xgbclassifier', {}))
        model.fit(X_train_combined, y_train_filtered)

        elapsed_time = time.time() - start_time
        print(f"{pos_tag} removed model fit completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

        metrics_df = model_eval(model, X_test_combined, y_test_filtered)
        ablation_results[pos_tag] = metrics_df

    joblib.dump(ablation_results, f'{folder_path}_ablation_results.pkl')

    return ablation_results


## XGBoost

### Remove nulls from preprocessing

In [None]:
# Boolean masks marking the rows whose preprocessed text is not NaN
non_nan_indices_train = X_train.notna()
non_nan_indices_test = X_test.notna()

# Apply each mask to the labels and the features of the same split
y_train, X_train = y_train[non_nan_indices_train], X_train[non_nan_indices_train]
y_test, X_test = y_test[non_nan_indices_test], X_test[non_nan_indices_test]

### Ablation Testing

In [None]:
# XGBoost hyperparameters, stored under the key that model_testing looks up
params = {
    'xgbclassifier': dict(
        subsample=0.8,
        n_estimators=150,
        max_depth=9,
        learning_rate=0.05,
        colsample_bytree=0.5,
    ),
}

# Run the ablation test over every part-of-speech tag
ablation_results = model_testing(X_train, y_train, X_test, y_test, params)

Train batch start
Train batch completed. Time elapsed: 11.67 minutes.
Test batch start
Test batch completed. Time elapsed: 17.59 minutes.
PUNCT
PUNCT removed model fit completed. Time elapsed: 24.49 minutes.


The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.


Cross-validation completed. Time elapsed: 19.97 minutes.

Evaluation results:
     Metric   CV_Mean  CV_Std Dev
0  F1-Score  0.794586    0.001437
1    AUC-PR  0.675257    0.000775
2  Log Loss  0.594037    0.001727
SCONJ
SCONJ removed model fit completed. Time elapsed: 49.91 minutes.


The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.


Cross-validation completed. Time elapsed: 15.23 minutes.

Evaluation results:
     Metric   CV_Mean  CV_Std Dev
0  F1-Score  0.789836    0.000862
1    AUC-PR  0.669149    0.001147
2  Log Loss  0.598953    0.001029
SYM
SYM removed model fit completed. Time elapsed: 71.42 minutes.


The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.


Cross-validation completed. Time elapsed: 18.26 minutes.

Evaluation results:
     Metric   CV_Mean  CV_Std Dev
0  F1-Score  0.794564    0.000643
1    AUC-PR  0.675739    0.000843
2  Log Loss  0.593839    0.001212
VERB
VERB removed model fit completed. Time elapsed: 90.04 minutes.


The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.


Cross-validation completed. Time elapsed: 0.91 minutes.

Evaluation results:
     Metric   CV_Mean  CV_Std Dev
0  F1-Score  0.728845    0.003082
1    AUC-PR  0.597389    0.003013
2  Log Loss  0.636080    0.002550
X
X removed model fit completed. Time elapsed: 93.40 minutes.


The `needs_threshold` and `needs_proba` parameter are deprecated in version 1.4 and will be removed in 1.6. You can either let `response_method` be `None` or set it to `predict` to preserve the same behaviour.


Cross-validation completed. Time elapsed: 6.92 minutes.

Evaluation results:
     Metric   CV_Mean  CV_Std Dev
0  F1-Score  0.774063    0.001053
1    AUC-PR  0.650621    0.001096
2  Log Loss  0.612550    0.001118


In [None]:
# Provided data
# Hand-entered ablation scores, one tuple per (POS tag removed, metric):
#   (POS, Metric, CV_Mean, CV_Std Dev)
# The PUNCT, SCONJ, SYM, VERB and X rows match the "Evaluation results"
# printed by the ablation run above; the remaining rows presumably come from
# earlier runs whose output is not shown here - TODO confirm.
data = [
    ("ADJ", "F1-Score", 0.760941, 0.001806),
    ("ADJ", "AUC-PR", 0.630041, 0.001573),
    ("ADJ", "Log Loss", 0.624516, 0.001486),
    ("ADP", "F1-Score", 0.787126, 0.001073),
    ("ADP", "AUC-PR", 0.666226, 0.001381),
    ("ADP", "Log Loss", 0.602603, 0.001933),
    ("ADV", "F1-Score", 0.763784, 0.000713),
    ("ADV", "AUC-PR", 0.637339, 0.000374),
    ("ADV", "Log Loss", 0.620317, 0.000713),
    ("AUX", "F1-Score", 0.774778, 0.001708),
    ("AUX", "AUC-PR", 0.652002, 0.001772),
    ("AUX", "Log Loss", 0.611575, 0.001864),
    ("CONJ", "F1-Score", 0.789011, 0.000541),
    ("CONJ", "AUC-PR", 0.668109, 0.000436),
    ("CONJ", "Log Loss", 0.600078, 0.000797),
    ("CCONJ", "F1-Score", 0.793561, 0.001265),
    ("CCONJ", "AUC-PR", 0.674641, 0.001247),
    ("CCONJ", "Log Loss", 0.594794, 0.001532),
    ("DET", "F1-Score", 0.791592, 0.000687),
    ("DET", "AUC-PR", 0.671683, 0.000878),
    ("DET", "Log Loss", 0.597343, 0.001275),
    ("INTJ", "F1-Score", 0.797550, 0.000983),
    ("INTJ", "AUC-PR", 0.677920, 0.001160),
    ("INTJ", "Log Loss", 0.593957, 0.000977),
    ("NOUN", "F1-Score", 0.725884, 0.003907),
    ("NOUN", "AUC-PR", 0.594031, 0.002901),
    ("NOUN", "Log Loss", 0.635091, 0.002177),
    ("NUM", "F1-Score", 0.787400, 0.001387),
    ("NUM", "AUC-PR", 0.665819, 0.000999),
    ("NUM", "Log Loss", 0.601654, 0.001036),
    ("PART", "F1-Score", 0.784691, 0.000786),
    ("PART", "AUC-PR", 0.662827, 0.000909),
    ("PART", "Log Loss", 0.602660, 0.000983),
    ("PRON", "F1-Score", 0.775020, 0.001730),
    ("PRON", "AUC-PR", 0.647677, 0.001525),
    ("PRON", "Log Loss", 0.616708, 0.001684),
    ("PROPN", "F1-Score", 0.774665, 0.001458),
    ("PROPN", "AUC-PR", 0.650410, 0.001261),
    ("PROPN", "Log Loss", 0.621508, 0.001641),
    ("PUNCT", "F1-Score", 0.794586, 0.001437),
    ("PUNCT", "AUC-PR", 0.675257, 0.000775),
    ("PUNCT", "Log Loss", 0.594037, 0.001727),
    ("SCONJ", "F1-Score", 0.789836, 0.000862),
    ("SCONJ", "AUC-PR", 0.669149, 0.001147),
    ("SCONJ", "Log Loss", 0.598953, 0.001029),
    ("SYM", "F1-Score", 0.794564, 0.000643),
    ("SYM", "AUC-PR", 0.675739, 0.000843),
    ("SYM", "Log Loss", 0.593839, 0.001212),
    ("VERB", "F1-Score", 0.728845, 0.003082),
    ("VERB", "AUC-PR", 0.597389, 0.003013),
    ("VERB", "Log Loss", 0.636080, 0.002550),
    ("X", "F1-Score", 0.774063, 0.001053),
    ("X", "AUC-PR", 0.650621, 0.001096),
    ("X", "Log Loss", 0.612550, 0.001118),
]

# Create a long-format DataFrame (one row per POS tag and metric) from the provided data
ablation_testing = pd.DataFrame(data, columns=["POS", "Metric", "CV_Mean", "CV_Std Dev"])
#joblib.dump(ablation_results, f'{folder_path}ablation_results.pkl')

In [None]:
# Keep only the F1-Score rows of the ablation table and display them
is_f1_row = ablation_testing['Metric'].eq('F1-Score')
filtered_df = ablation_testing.loc[is_f1_row]
filtered_df

Unnamed: 0,POS,Metric,CV_Mean,CV_Std Dev
0,ADJ,F1-Score,0.760941,0.001806
3,ADP,F1-Score,0.787126,0.001073
6,ADV,F1-Score,0.763784,0.000713
9,AUX,F1-Score,0.774778,0.001708
12,CONJ,F1-Score,0.789011,0.000541
15,CCONJ,F1-Score,0.793561,0.001265
18,DET,F1-Score,0.791592,0.000687
21,INTJ,F1-Score,0.79755,0.000983
24,NOUN,F1-Score,0.725884,0.003907
27,NUM,F1-Score,0.7874,0.001387
