<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/04_XGB_POS_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, Download, & Variable Statements

In [31]:
# Import & download statements
# General Statements
#!git clone https://github.com/d-atallah/implicit_gender_bias.git
#! pip install joblib
#! pip install shap
import pandas as pd
import string
import re
import joblib
#from implicit_gender_bias import config as cf
import os
import numpy as np
import time
import spacy
import scipy
from sklearn.feature_selection import SelectFromModel

#import shap
import matplotlib.pyplot as plt

# Feature selection & Model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer, average_precision_score

# Model options
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Read Inputs

In [4]:
# Variables
folder_path = '/content/drive/MyDrive/Supervised Learning Notebooks/'#'/home/gibsonce/datallah-jaymefis-gibsonce/'

# Load DataFrames from pkl files
X_train = pd.read_pickle(folder_path + 'X_train_preprocessed.pkl')
X_test = pd.read_pickle(folder_path + 'X_test_preprocessed.pkl')
y_train = pd.read_pickle(folder_path + 'y_train.pkl')
y_test = pd.read_pickle(folder_path + 'y_test.pkl')

In [5]:
non_nan_indices_train = ~X_train.isnull()
non_nan_indices_test = ~X_test.isnull()

# Filter y_train and y_test using the non-NaN indices
y_train = y_train[non_nan_indices_train]
y_test = y_test[non_nan_indices_test]

# Filter X_train and X_test to remove NaN records
X_train = X_train[non_nan_indices_train]
X_test = X_test[non_nan_indices_test]

## Define Functions


In [32]:
def tokenize_and_categorize(text):
    # Tokenize and process the text using spaCy
    doc = nlp(text)
    word_features = ' '.join([token.text for token in doc])
    pos_tags = ' '.join([token.pos_ for token in doc])

    return word_features, pos_tags

In [None]:

def model_testing(X_train, y_train, X_test, y_test, params, model_type = 'XGB'):
    """
    Runs a specified model and dimensionality reduction method with tuned hyperparameters

    Parameters:
    - X_train (array-like): Training set features, preprocessed.
    - y_train (array-like): Training set labels.
    - X_test (array-like): Test set features, preprocessed.
    - y_test (array-like): Test set labels.
    - params (dict): Hyperparameter grid for the specified model and dimensionality reduction method.

    Returns:
    - Pipeline: Trained and fit pipeline with the best hyperparameters.
    - X_train_combined (array-like): Preprocessed  and vectorized training set features with POS tagging.
    - X_test_combined (array-like): Preprocessed  and vectorized test set features with POS tagging.
    """

    start_time = time.time()

    X_train_word_features, X_train_pos_tags = zip(*map(tokenize_and_categorize, X_train))
    X_test_word_features, X_test_pos_tags = zip(*map(tokenize_and_categorize, X_test))

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"POS tagging completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Vectorize the word features
    word_features_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    X_train_word_features_ = word_features_vectorizer.fit_transform(X_train_word_features)
    X_test_word_features_ = word_features_vectorizer.transform(X_test_word_features)

    # Vectorize the parts of speech tags
    pos_tags_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    X_train_pos_tags_ = pos_tags_vectorizer.fit_transform(X_train_pos_tags)
    X_test_pos_tags_ = pos_tags_vectorizer.transform(X_test_pos_tags)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Vectorization completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Combine the vectorized word features and parts of speech tags
    X_train_combined = scipy.sparse.hstack([X_train_word_features_, X_train_pos_tags_])
    X_test_combined = scipy.sparse.hstack([X_test_word_features_, X_test_pos_tags_])

    model = XGBClassifier(random_state=42, **params.get('xgbclassifier', {}))
    model.fit(X_train_combined, y_train)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Pipeline fitting completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    explainer = None#shap.Explainer(model)
    feature_importances = model.feature_importances_
    feature_names = word_features_vectorizer.get_feature_names_out()

    # Clone the original vectorizer and fit it to the misclassified samples
    misclassified_indices = np.where(y_test != model.predict(X_test_combined))[0]
    misclassified_samples = X_test_combined[misclassified_indices]
    misclassified_features = word_features_vectorizer.inverse_transform(misclassified_samples)

    # Combine misclassified feature names into a bag of words
    misclassified_bow = [' '.join(features) for features in misclassified_features]

    # Create feature importances
    feature_importance_dict = dict(zip(feature_names, feature_importances))

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Feature analysis completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Save the trained pipeline
    joblib.dump(model, f'{folder_path}{model_type}_pipeline.pkl')
    joblib.dump(X_train_combined, f'{folder_path}{model_type}_X_train.pkl')
    joblib.dump(X_test_combined, f'{folder_path}{model_type}_X_test.pkl')
    joblib.dump(feature_importance_dict, f'{folder_path}{model_type}_features.pkl')
    joblib.dump(misclassified_bow, f'{folder_path}{model_type}_misclassified_bow.pkl')


    if explainer:
      joblib.dump(explainer, f'{folder_path}{model_type}_shap.pkl')

    print('Write to pkl file completed.')

    return model, X_train_combined, X_test_combined, explainer, feature_importance_dict, misclassified_bow


In [50]:

def model_testing(X_train, y_train, X_test, y_test, params, model_type = 'XGB'):
    """
    Runs a specified model and dimensionality reduction method with tuned hyperparameters

    Parameters:
    - X_train (array-like): Training set features, preprocessed.
    - y_train (array-like): Training set labels.
    - X_test (array-like): Test set features, preprocessed.
    - y_test (array-like): Test set labels.
    - params (dict): Hyperparameter grid for the specified model and dimensionality reduction method.

    Returns:
    - Pipeline: Trained and fit pipeline with the best hyperparameters.
    - X_train_combined (array-like): Preprocessed  and vectorized training set features with POS tagging.
    - X_test_combined (array-like): Preprocessed  and vectorized test set features with POS tagging.
    """

    start_time = time.time()

    X_train_word_features, X_train_pos_tags = zip(*map(tokenize_and_categorize, X_train))
    X_test_word_features, X_test_pos_tags = zip(*map(tokenize_and_categorize, X_test))

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"POS tagging completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Vectorize the word features
    word_features_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    X_train_word_features_ = word_features_vectorizer.fit_transform(X_train_word_features)
    X_test_word_features_ = word_features_vectorizer.transform(X_test_word_features)

    # Vectorize the parts of speech tags
    pos_tags_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    X_train_pos_tags_ = pos_tags_vectorizer.fit_transform(X_train_pos_tags)
    X_test_pos_tags_ = pos_tags_vectorizer.transform(X_test_pos_tags)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Vectorization completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Combine the vectorized word features and parts of speech tags
    X_train_combined = scipy.sparse.hstack([X_train_word_features_, X_train_pos_tags_])
    X_test_combined = scipy.sparse.hstack([X_test_word_features_, X_test_pos_tags_])

    model = XGBClassifier(random_state=42, **params.get('xgbclassifier', {}))
    model.fit(X_train_combined, y_train)

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Pipeline fitting completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    explainer = None#shap.Explainer(model)

    # Create feature importances
    feature_importances = model.feature_importances_
    word_features = (word_features_vectorizer.get_feature_names_out())
    word_importance_dict = dict(zip(word_features, feature_importances))


    # Clone the original vectorizer and fit it to the misclassified samples
    misclassified_indices = np.where(y_test != model.predict(X_test_combined))[0]
    misclassified_samples = X_test_combined[misclassified_indices]
    misclassified_features = (
        word_features_vectorizer.inverse_transform(misclassified_samples[:, :X_test_word_features_.shape[1]])
        + pos_tags_vectorizer.inverse_transform(misclassified_samples[:, X_test_word_features_.shape[1]:])
    )

    # Combine misclassified feature names into a bag of words
    misclassified_bow = [' '.join(features) for features in misclassified_features]

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Feature analysis completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Save the trained pipeline
    joblib.dump(model, f'{folder_path}{model_type}_pipeline.pkl')
    joblib.dump(X_train_combined, f'{folder_path}{model_type}_X_train.pkl')
    joblib.dump(X_test_combined, f'{folder_path}{model_type}_X_test.pkl')
    #joblib.dump(feature_importance_dict, f'{folder_path}{model_type}_features.pkl')
    joblib.dump(misclassified_bow, f'{folder_path}{model_type}_misclassified_bow.pkl')


    if explainer:
      joblib.dump(explainer, f'{folder_path}{model_type}_shap.pkl')

    print('Write to pkl file completed.')

    return model, X_train_combined, X_test_combined, explainer, word_importance_dict, misclassified_bow


## XGBoost

In [51]:
# Define variables
model_type = 'xgb'
params = {
    'xgbclassifier': {'subsample': 0.8, 'n_estimators': 150, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.5},
}

# Run model
model, train, test, explainer, word_importance_dict, misclassified_bow = model_testing(X_train, y_train, X_test, y_test, params , model_type)

POS tagging completed. Time elapsed: 1.91 minutes.
Vectorization completed. Time elapsed: 1.92 minutes.
Pipeline fitting completed. Time elapsed: 2.14 minutes.
Feature analysis completed. Time elapsed: 2.15 minutes.
Write to pkl file completed.


In [52]:
word_importance_dict

{'aaaaaaaa': 0.0,
 'aaaaaaaaaaaaa': 0.0,
 'aaaaw': 0.0,
 'aaah': 0.0,
 'aand': 0.0,
 'aaron': 0.0,
 'ab': 0.0,
 'abad': 0.0,
 'abalicious': 0.0,
 'abalone': 0.0,
 'abandon': 0.0,
 'abandoned': 0.0,
 'abaya': 0.0,
 'abbia': 0.0,
 'abc': 0.0,
 'abdul': 0.0,
 'abe': 0.0,
 'aberdeen': 0.0,
 'abide': 0.0,
 'ability': 0.0,
 'abillity': 0.0,
 'abject': 0.0,
 'ablas': 0.0,
 'ablation': 0.0,
 'able': 0.0019955356,
 'ably': 0.0,
 'abnormal': 0.0,
 'aboard': 0.0,
 'abolish': 0.0,
 'abolishing': 0.0,
 'abominable': 0.0,
 'abortion': 0.0,
 'abounds': 0.0,
 'abouti': 0.0,
 'aboutno': 0.0,
 'aboutshouldnt': 0.0,
 'aboutwhich': 0.0,
 'abraham': 0.0,
 'abras': 0.0,
 'abrazo': 0.0,
 'abrazos': 0.0,
 'abroad': 0.0,
 'absentee': 0.0,
 'absolute': 0.0,
 'absolutely': 0.0025113393,
 'absolutly': 0.0,
 'absolved': 0.0,
 'absorb': 0.0,
 'abstain': 0.0,
 'abstention': 0.0,
 'absurd': 0.0,
 'abt': 0.0,
 'abundance': 0.0,
 'abuse': 0.0,
 'abused': 0.0,
 'abuser': 0.0,
 'abusing': 0.0,
 'abvious': 0.0,
 'abyss': 

In [42]:
feature_importances = model.feature_importances_
len(feature_importances)

17213

In [None]:
sorted_feature_importance = dict(sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True))

# Get the top 100 features and their importance values
top_features = list(sorted_feature_importance.keys())[:1000]
top_features_importance = list(sorted_feature_importance.values())[:1000]
sorted_feature_importance