<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/03_Supervised_Model_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, Download, & Variable Statements

In [None]:
# Import & download statements
# General statements
import pandas as pd
import string
import re
import joblib
import os
import numpy as np
import time

import shap
import matplotlib.pyplot as plt

# Feature selection & model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer, average_precision_score

# Model options
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the NLTK resources in the same order as before: stopwords, wordnet, punkt
for _nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_nltk_resource)

# Instantiate the stemmer, the lemmatizer, and the English stop-word set
porter = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gibsonce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gibsonce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gibsonce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Read Inputs

In [None]:
# Directory that holds the pickled inputs (and later the saved model artifacts)
folder_path = '/home/gibsonce/datallah-jaymefis-gibsonce/'

# Read the four pickled objects in a fixed order and unpack them into
# the train/test feature and label variables
X_train, X_test, y_train, y_test = (
    pd.read_pickle(folder_path + pkl_name)
    for pkl_name in (
        'X_train_preprocessed.pkl',
        'X_test_preprocessed.pkl',
        'y_train.pkl',
        'y_test.pkl',
    )
)

In [None]:
# Boolean masks marking the records whose features are present (not NaN)
non_nan_indices_train = X_train.notna()
non_nan_indices_test = X_test.notna()

# Apply each mask to the features and to the matching labels so both stay in step
X_train, y_train = X_train[non_nan_indices_train], y_train[non_nan_indices_train]
X_test, y_test = X_test[non_nan_indices_test], y_test[non_nan_indices_test]

## Define Functions


In [None]:
def model_testing(X_train, y_train, X_test, y_test, model_type, vectorizer, ngram, params):
    """
    Vectorizes the text features, fits the requested model with the supplied
    hyperparameters, and saves the fitted artifacts as pkl files.

    Parameters:
    - X_train (array-like): Training set features, preprocessed.
    - y_train (array-like): Training set labels.
    - X_test (array-like): Test set features, preprocessed.
    - y_test (array-like): Test set labels. Not used inside this function; kept so existing calls keep working.
    - model_type (str): Type of model to fit. Choose from 'xgb' (XGBoost), 'knn' (TruncatedSVD followed by k-Nearest Neighbors), 'nb' (Multinomial Naive Bayes).
    - vectorizer (str): Type of vectorizer to use. Choose from 'count' (Count Vectorizer) or 'tfidf' (TF-IDF Vectorizer).
    - ngram (int): Feature representation to use. Choose 1 for unigrams, 2 for bigrams, and so on (passed as both bounds of ngram_range).
    - params (dict): Hyperparameters keyed by step name: 'xgbclassifier', 'kneighborsclassifier', 'truncatedsvd', 'multinomialnb'. Missing keys (or None) fall back to the estimator defaults.

    Returns:
    - model: Fitted estimator (a pipeline of TruncatedSVD + KNN for 'knn').
    - X_train_ (array-like): Vectorized training set features.
    - X_test_ (array-like): Vectorized test set features.
    - explainer: SHAP explainer for 'xgb', otherwise None.
    - feature_importance_dict (Dict): Feature name -> importance for 'xgb', otherwise None.

    Raises:
    - ValueError: If vectorizer or model_type is not one of the supported options.

    Side effects:
    - Writes '<model_type>_pipeline.pkl', '<model_type>_X_train.pkl' and '<model_type>_X_test.pkl'
      (plus '<model_type>_shap.pkl' and '<model_type>_features.pkl' when an explainer exists)
      under the module-level folder_path.
    """
    # Validate both options before doing any work, so a typo fails immediately
    # instead of after a full vectorization pass.
    if vectorizer not in ('count', 'tfidf'):
        raise ValueError("Invalid vector type. Use 'count' or 'tfidf'.")
    if model_type not in ('xgb', 'knn', 'nb'):
        raise ValueError("Invalid model type. Use 'xgb', 'knn', or 'nb'.")
    if params is None:
        params = {}

    start_time = time.time()
    vectorizer_cls = CountVectorizer if vectorizer == 'count' else TfidfVectorizer
    vect = vectorizer_cls(ngram_range=(ngram, ngram))
    # Fit the vocabulary on the training data only, then reuse it for the test data
    X_train_ = vect.fit_transform(X_train)
    X_test_ = vect.transform(X_test)

    elapsed_time = time.time() - start_time
    print(f"Vectorization completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Only the XGBoost branch produces these; the other models return None for both
    explainer = None
    feature_importance_dict = None

    if model_type == 'xgb':
        model = XGBClassifier(random_state=42, **params.get('xgbclassifier', {}))
        model.fit(X_train_, y_train)
        explainer = shap.Explainer(model)
        feature_importance_dict = dict(zip(vect.get_feature_names_out(), model.feature_importances_))
    elif model_type == 'knn':
        # KNN is fitted on the TruncatedSVD-reduced features rather than the raw vectors
        model = make_pipeline(
            TruncatedSVD(**params.get('truncatedsvd', {}), random_state=42),
            KNeighborsClassifier(**params.get('kneighborsclassifier', {})),
        )
        model.fit(X_train_, y_train)
    else:  # 'nb' -- guaranteed by the validation above
        model = MultinomialNB(**params.get('multinomialnb', {}))
        model.fit(X_train_, y_train)

    # Measured from the original start time, so this figure includes vectorization
    elapsed_time = time.time() - start_time
    print(f"Pipeline fitting completed. Time elapsed: {elapsed_time/60:.2f} minutes.")

    # Save the trained model and the vectorized feature matrices
    joblib.dump(model, f'{folder_path}{model_type}_pipeline.pkl')
    joblib.dump(X_train_, f'{folder_path}{model_type}_X_train.pkl')
    joblib.dump(X_test_, f'{folder_path}{model_type}_X_test.pkl')

    # Explicit None check: do not rely on the truthiness of the explainer object
    if explainer is not None:
        joblib.dump(explainer, f'{folder_path}{model_type}_shap.pkl')
        joblib.dump(feature_importance_dict, f'{folder_path}{model_type}_features.pkl')
    print('Write to pkl file completed.')

    return model, X_train_, X_test_, explainer, feature_importance_dict


## Naive Bayes

In [None]:
# Naive Bayes configuration: model key, vectorizer, n-gram size, hyperparameters
model, vectorization, ngram = 'nb', 'tfidf', 1
params = {
    'multinomialnb': {'alpha': 1},
}

# Fit and save the model; `model` is rebound from the type key to the fitted estimator
model, train, test, explainer, feature_importance_dict = model_testing(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type=model,
    vectorizer=vectorization,
    ngram=ngram,
    params=params,
)

Vectorization completed. Time elapsed: 0.23 minutes.
Pipeline fitting completed. Time elapsed: 0.23 minutes.
Write to pkl file completed.


## K-Nearest Neighbors

In [None]:
# K-Nearest Neighbors configuration: model key, vectorizer, n-gram size, hyperparameters
model, vectorization, ngram = 'knn', 'tfidf', 1
knn_settings = {'weights': 'distance', 'p': 1, 'n_neighbors': 3}
params = {'kneighborsclassifier': knn_settings}

# Fit and save the model; `model` is rebound from the type key to the fitted pipeline
model, train, test, explainer, feature_importance_dict = model_testing(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type=model,
    vectorizer=vectorization,
    ngram=ngram,
    params=params,
)

Vectorization completed. Time elapsed: 0.23 minutes.
Pipeline fitting completed. Time elapsed: 0.32 minutes.
Write to pkl file completed.


## XGBoost

In [None]:
# XGBoost configuration: model key, vectorizer, n-gram size, hyperparameters
model, vectorization, ngram = 'xgb', 'tfidf', 1
xgb_settings = {
    'subsample': 0.8,
    'n_estimators': 150,
    'max_depth': 9,
    'learning_rate': 0.05,
    'colsample_bytree': 0.5,
}
params = {'xgbclassifier': xgb_settings}

# Fit and save the model; `model` is rebound from the type key to the fitted estimator
model, train, test, explainer, feature_importance_dict = model_testing(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_type=model,
    vectorizer=vectorization,
    ngram=ngram,
    params=params,
)

Vectorization completed. Time elapsed: 0.23 minutes.
Pipeline fitting completed. Time elapsed: 8.60 minutes.
Write to pkl file completed.
