<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/04_XGB_POS_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, Download, & Variable Statements

In [None]:
! pip install shap

Defaulting to user installation because normal site-packages is not writeable


In [None]:
# Import & download statements
# General Statements
#!git clone https://github.com/d-atallah/implicit_gender_bias.git
#! pip install joblib
#! pip install shap
import pandas as pd
import string
import re
import joblib
#from implicit_gender_bias import config as cf
import os
import numpy as np
import time
import spacy
import scipy
from sklearn.feature_selection import SelectFromModel

import shap
import matplotlib.pyplot as plt

# Feature selection & Model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer, average_precision_score

# Model options
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the NLTK data packages this notebook relies on
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Shared NLP helpers: spaCy pipeline, stemmer, lemmatizer and English stop-word set
nlp = spacy.load("en_core_web_sm")
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gibsonce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gibsonce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gibsonce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read Inputs

In [None]:
# Directory holding the pickled train/test splits
folder_path = '/home/gibsonce/datallah-jaymefis-gibsonce/'

# Load the four pickled splits in a fixed order: train/test features, then train/test labels
X_train, X_test, y_train, y_test = [
    pd.read_pickle(folder_path + pkl_name)
    for pkl_name in (
        'X_train_preprocessed.pkl',
        'X_test_preprocessed.pkl',
        'y_train.pkl',
        'y_test.pkl',
    )
]

In [None]:
# Boolean masks that are True wherever the feature data is present (not NaN)
non_nan_indices_train = X_train.notna()
non_nan_indices_test = X_test.notna()

# Apply the same mask to features and labels so both keep exactly the same records.
# The right-hand sides are evaluated before either name is rebound.
X_train, y_train = X_train[non_nan_indices_train], y_train[non_nan_indices_train]
X_test, y_test = X_test[non_nan_indices_test], y_test[non_nan_indices_test]

## Define Functions


In [None]:
def tokenize_and_categorize_batch(texts):
    """
    Parse a batch of texts with the module-level spaCy pipeline `nlp`.

    Parameters:
    - texts (iterable of str): Documents to parse; passed straight to `nlp.pipe`.

    Returns:
    - word_features (list of str): One string per document, its token texts joined by single spaces.
    - pos_tags (list of str): One string per document, its tokens' `pos_` tags joined by single spaces.
    """
    word_features = []
    pos_tags = []

    # One pass over the parsed documents fills both outputs in matching order
    for doc in nlp.pipe(texts):
        word_features.append(' '.join(token.text for token in doc))
        pos_tags.append(' '.join(token.pos_ for token in doc))

    return word_features, pos_tags

In [None]:
def _tokenize_in_batches(texts, batch_size):
    """
    Run `tokenize_and_categorize_batch` over `texts` in consecutive slices of `batch_size` rows.

    Parameters:
    - texts (pandas Series): Documents to tokenize; sliced positionally with `.iloc`.
    - batch_size (int): Number of rows handed to the tokenizer per call.

    Returns:
    - words (pandas Series): Space-joined token texts, one entry per input row, indexed 0..n-1.
    - pos (pandas Series): Space-joined POS tags, one entry per input row, indexed 0..n-1.
    """
    word_rows = []
    pos_rows = []
    for start in range(0, len(texts), batch_size):
        batch_words, batch_pos = tokenize_and_categorize_batch(texts.iloc[start:start + batch_size])
        # Accumulate in plain lists; each Series is built once at the end instead of
        # re-copying everything gathered so far with pd.concat on every batch.
        word_rows.extend(batch_words)
        pos_rows.extend(batch_pos)

    return pd.Series(word_rows, dtype='object'), pd.Series(pos_rows, dtype='object')


def model_testing(X_train, y_train, X_test, y_test, params, model_type = 'XGB'):
    """
    Tokenizes and POS-tags the text, vectorizes both views with TF-IDF, fits an
    XGBoost classifier on the combined features, and analyses the fitted model.

    Parameters:
    - X_train (pandas Series): Training set text, preprocessed (sliced with `.iloc`).
    - y_train (array-like): Training set labels.
    - X_test (pandas Series): Test set text, preprocessed (sliced with `.iloc`).
    - y_test (array-like): Test set labels.
    - params (dict): Keyword arguments for XGBClassifier are read from params['xgbclassifier']
      (an empty dict is used when the key is missing).
    - model_type (str): Label used only in the (currently disabled) output file names.

    Returns:
    - model (XGBClassifier): Classifier fit on the combined training features.
    - X_train_combined (sparse matrix): TF-IDF word features followed by TF-IDF POS features, training set.
    - X_test_combined (sparse matrix): TF-IDF word features followed by TF-IDF POS features, test set.
    - explainer (shap.Explainer): SHAP explainer built from the fitted model.
    - word_importance_dict (dict): Word feature name -> model feature importance (POS features are not included).
    - misclassified_bow (list of str): Space-joined feature names recovered from the misclassified test rows.
    """

    start_time = time.time()

    batch_size = 10000

    # Tokenize / POS-tag the training data
    print('Train batch start')
    train_words, train_pos = _tokenize_in_batches(X_train, batch_size)
    print('Train batch end')

    # Tokenize / POS-tag the testing data
    print('Test batch start')
    test_words, test_pos = _tokenize_in_batches(X_test, batch_size)
    print('Test batch end')

    # Vectorize the word features
    word_features_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    X_train_word_features_ = word_features_vectorizer.fit_transform(train_words)
    X_test_word_features_ = word_features_vectorizer.transform(test_words)

    # Vectorize the parts of speech tags
    pos_tags_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    X_train_pos_tags_ = pos_tags_vectorizer.fit_transform(train_pos)
    X_test_pos_tags_ = pos_tags_vectorizer.transform(test_pos)

    elapsed_time = time.time() - start_time
    print(f"Vectorization completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Combine the vectorized word features and parts of speech tags (word columns first)
    X_train_combined = scipy.sparse.hstack([X_train_word_features_, X_train_pos_tags_])
    X_test_combined = scipy.sparse.hstack([X_test_word_features_, X_test_pos_tags_])

    model = XGBClassifier(random_state=42, **params.get('xgbclassifier', {}))
    model.fit(X_train_combined, y_train)

    elapsed_time = time.time() - start_time
    print(f"Pipeline fitting completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    explainer = shap.Explainer(model)

    # Word-level feature importances. The importance vector covers every combined
    # column (words first, then POS tags), so only the leading word columns are kept.
    word_features = word_features_vectorizer.get_feature_names_out()
    n_word_columns = len(word_features)
    word_importance_dict = dict(zip(word_features, model.feature_importances_[:n_word_columns]))

    # Select the test rows whose prediction differs from the true label
    misclassified_indices = np.where(y_test != model.predict(X_test_combined))[0]
    # hstack's default output cannot be indexed directly: CSR for the row selection,
    # then CSC for the column slices below.
    misclassified_samples = X_test_combined.tocsr()[misclassified_indices].tocsc()

    # Map the non-zero columns back to feature names, separately for each vectorizer.
    # NOTE(review): `+` concatenates the two per-row lists, so the result holds the
    # word-term rows followed by the POS-term rows (twice the number of misclassified
    # rows). Kept as in the original; confirm this is what downstream code expects.
    misclassified_features = (
        word_features_vectorizer.inverse_transform(misclassified_samples[:, :n_word_columns])
        + pos_tags_vectorizer.inverse_transform(misclassified_samples[:, n_word_columns:])
    )

    # Combine misclassified feature names into a bag of words
    misclassified_bow = [' '.join(features) for features in misclassified_features]

    elapsed_time = time.time() - start_time
    print(f"Feature analysis completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Persisting the artifacts is currently disabled; the dump calls are kept for reference.
    #joblib.dump(model, f'{folder_path}{model_type}_pipeline.pkl')
    #joblib.dump(X_train_combined, f'{folder_path}{model_type}_X_train.pkl')
    #joblib.dump(X_test_combined, f'{folder_path}{model_type}_X_test.pkl')
    #joblib.dump(word_importance_dict, f'{folder_path}{model_type}_features.pkl')
    #joblib.dump(misclassified_bow, f'{folder_path}{model_type}_misclassified_bow.pkl')
    #joblib.dump(explainer, f'{folder_path}{model_type}_shap.pkl')

    # Message retained from the original so existing logs read the same,
    # even though nothing is written while the dumps above are disabled.
    print('Write to pkl file completed.')

    return model, X_train_combined, X_test_combined, explainer, word_importance_dict, misclassified_bow


## XGBoost

In [None]:
# Define variables
model_type = 'xgb'
params = {
    'xgbclassifier': {'subsample': 0.8, 'n_estimators': 150, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.5},
}


# Run model on the preprocessed text loaded above. model_testing does its own
# spaCy tokenization, so it is given X_train / X_test rather than the X_train_ /
# X_test_ Series, which only exist after the scratch cells further down have run.
model, train, test, explainer, word_importance_dict, misclassified_bow = model_testing(
    X_train, y_train, X_test, y_test, params, model_type
)

## Function Testing
### Do not use

In [None]:
train_batch_size = 10000

# Per-batch results are gathered in plain lists and each Series is built once at the
# end; calling pd.concat inside the loop re-copies everything accumulated so far.
train_word_rows = []
train_pos_rows = []

# Walk the training data in consecutive slices of train_batch_size rows
for batch_start in range(0, len(X_train), train_batch_size):
    train_batch = X_train.iloc[batch_start:batch_start + train_batch_size]
    X_train_word_features, X_train_pos_tags = tokenize_and_categorize_batch(train_batch)

    train_word_rows.extend(X_train_word_features)
    train_pos_rows.extend(X_train_pos_tags)

# Object-dtype Series with a fresh 0..n-1 index (same result as concat + reset_index)
X_train_ = pd.Series(train_word_rows, dtype='object')
X_train_pos = pd.Series(train_pos_rows, dtype='object')

In [None]:
test_batch_size = 10000

# Per-batch results are gathered in plain lists and each Series is built once at the
# end; calling pd.concat inside the loop re-copies everything accumulated so far.
test_word_rows = []
test_pos_rows = []

# Walk the testing data in consecutive slices of test_batch_size rows
for batch_start in range(0, len(X_test), test_batch_size):
    test_batch = X_test.iloc[batch_start:batch_start + test_batch_size]
    X_test_word_features, X_test_pos_tags = tokenize_and_categorize_batch(test_batch)

    test_word_rows.extend(X_test_word_features)
    test_pos_rows.extend(X_test_pos_tags)

# Object-dtype Series with a fresh 0..n-1 index (same result as concat + reset_index)
X_test_ = pd.Series(test_word_rows, dtype='object')
X_test_pos = pd.Series(test_pos_rows, dtype='object')

In [None]:
# Vectorize the word features.
# Fit on the spaCy-tokenized text (X_train_ / X_test_) so this step matches what
# model_testing does; the raw X_train / X_test were used here before, which left the
# tokenized Series computed in the previous cells unused.
word_features_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train_word_features_ = word_features_vectorizer.fit_transform(X_train_)
X_test_word_features_ = word_features_vectorizer.transform(X_test_)

# Vectorize the parts of speech tags
pos_tags_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train_pos_tags_ = pos_tags_vectorizer.fit_transform(X_train_pos)
X_test_pos_tags_ = pos_tags_vectorizer.transform(X_test_pos)

In [None]:
# Place the word-feature columns and the POS-tag columns side by side,
# word columns first, for the training and the test matrices alike
X_train_combined, X_test_combined = [
    scipy.sparse.hstack(column_blocks)
    for column_blocks in (
        [X_train_word_features_, X_train_pos_tags_],
        [X_test_word_features_, X_test_pos_tags_],
    )
]

In [None]:
# Hyperparameters come from params['xgbclassifier']; an empty dict is used when the key is absent
xgb_kwargs = params.get('xgbclassifier', {})
model = XGBClassifier(random_state=42, **xgb_kwargs)
model.fit(X_train_combined, y_train)

In [None]:
# Pair each word feature name with the model's importance for that column.
# zip stops at the shorter input, so any importances beyond the word columns
# (the POS-tag columns stacked after them) are left out of the dict.
word_features = word_features_vectorizer.get_feature_names_out()
feature_importances = model.feature_importances_
feature_importance_dict = {
    feature_name: importance
    for feature_name, importance in zip(word_features, feature_importances)
}

# SHAP explainer built from the fitted model
explainer = shap.Explainer(model)

In [None]:
# Positions of the test rows whose predicted label differs from the true label
test_predictions = model.predict(X_test_combined)
misclassified_indices = np.where(y_test != test_predictions)[0]
misclassified_samples = X_test_combined.tocsc()[misclassified_indices].tocsc()

# The first n_word_columns columns are word features; the remaining ones are POS tags
n_word_columns = X_test_word_features_.shape[1]
misclassified_word_terms = word_features_vectorizer.inverse_transform(
    misclassified_samples[:, :n_word_columns]
)
misclassified_pos_terms = pos_tags_vectorizer.inverse_transform(
    misclassified_samples[:, n_word_columns:]
)

# `+` joins the two results end to end (word-term rows, then POS-term rows)
misclassified_features = misclassified_word_terms + misclassified_pos_terms

# One space-joined string of feature names per entry
misclassified_bow = [' '.join(features) for features in misclassified_features]

In [None]:
# Persist the fitted model and every analysis artifact, in this order
artifacts_to_save = [
    (model, f'{folder_path}{model_type}_pipeline.pkl'),
    (X_train_combined, f'{folder_path}{model_type}_X_train.pkl'),
    (X_test_combined, f'{folder_path}{model_type}_X_test.pkl'),
    (feature_importance_dict, f'{folder_path}{model_type}_features.pkl'),
    (misclassified_bow, f'{folder_path}{model_type}_misclassified_bow.pkl'),
    (explainer, f'{folder_path}{model_type}_shap.pkl'),
]
for artifact, artifact_path in artifacts_to_save:
    joblib.dump(artifact, artifact_path)

['/home/gibsonce/datallah-jaymefis-gibsonce/xgb_shap.pkl']