<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/Supervised_Responses_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, Download, & Variable Statements

In [None]:
# Import & download statements
# General Statements
!git clone https://github.com/d-atallah/implicit_gender_bias.git
#! pip install joblib
#! pip install shap
import pandas as pd
import string
import re
import joblib
from implicit_gender_bias import config as cf
import os
import numpy as np
import time

import shap
import matplotlib.pyplot as plt

# Feature selection & Model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer, average_precision_score

# Model options
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the NLTK resources the preprocessing code relies on
for resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(resource)

# Shared NLP helpers: English stop-word list, WordNet lemmatizer, Porter stemmer
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()
stop_words = set(stopwords.words('english'))

fatal: destination path 'implicit_gender_bias' already exists and is not an empty directory.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gibsonce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gibsonce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gibsonce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Variables
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
folder_path = '/home/gibsonce/datallah-jaymefis-gibsonce/'

# Inputs
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids the "Columns (...) have mixed types"
# DtypeWarning this load otherwise emits.
responses_combined = pd.read_csv(folder_path+'responses_combined.csv', low_memory=False)

Columns (1,4,6,7,10,11,12) have mixed types.Specify dtype option on import or set low_memory=False.


## Define Functions


In [None]:
# Custom stop-word set. This assignment replaces the NLTK-derived `stop_words`
# created in the import cell. The third-person gendered pronouns
# (he, her, hers, herself, him, himself, his, she, she's) are commented out,
# so they are kept in the text instead of being filtered as stop words.
stop_words = {'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 #'he',
 #'her',
 'here',
 #'hers',
 #'herself',
 #'him',
 #'himself',
 #'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 "shan't",
 #'she',
 #"she's",
 'should',
 "should've",
 'shouldn',
 "shouldn't",
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 'these',
 'they',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 've',
 'very',
 'was',
 'wasn',
 "wasn't",
 'we',
 'were',
 'weren',
 "weren't",
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'won',
 "won't",
 'wouldn',
 "wouldn't",
 'y',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves'}

# preprocess_text deletes every non-letter character (apostrophes included)
# before it filters stop words, so "don't" reaches the filter as "dont" and the
# quoted contractions above can never match. Add the apostrophe-free spelling
# of every entry so those contractions are actually removed. Commented-out
# pronouns (e.g. "she's") are not in the set, so no variant is added for them.
stop_words |= {word.replace("'", "") for word in stop_words}

In [None]:
def preprocess_text(text):
    """
    Clean a single piece of text for downstream vectorization.

    Steps, in order:
    - Delete every character that is not an ASCII letter or whitespace
    - Convert to lowercase
    - Tokenize with NLTK and drop tokens found in ``stop_words``
    - Lemmatize the remaining tokens with WordNet (stemming is not applied)

    Parameters:
    - text (str): Input text to be preprocessed. Missing or non-string values
      (None, NaN, numbers) are treated as empty text instead of raising.

    Returns:
    - processed_text (str): Space-joined tokens; '' when the input is not a
      string or nothing survives the filters.
    """
    # re.sub raises TypeError on None/NaN/numeric cells, which would abort a
    # whole batch; treat such values as "no text".
    if not isinstance(text, str):
        return ''

    # Remove special characters and digits. Characters are deleted, not
    # replaced by a space, so "state-of-the-art" becomes "stateoftheart".
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenization and removing stopwords
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatization (Porter stemming is intentionally left out)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Rejoin tokens into a processed text
    processed_text = ' '.join(tokens)

    return processed_text

In [None]:
# Preprocess every text in one batch
def preprocess_batch(data, batch_index):
    """
    Apply preprocess_text to each item of a batch, preserving order.

    Parameters:
    - data (iterable): Texts to preprocess.
    - batch_index (int): Position of the batch; accepted but not used here.

    Returns:
    - list: One preprocessed string per input item.
    """
    return [preprocess_text(raw_text) for raw_text in data]

In [None]:
def model_rank(model_list, model_str, metric):
    """
    Combine per-model metric tables and rank the models on one metric.

    Parameters:
    - model_list (list): Dictionaries, each holding a 'metrics' DataFrame with
      'Metric' and 'Score' columns.
    - model_str (list): Model names, paired positionally with model_list.
    - metric (str): Value of the 'Metric' column to rank by (e.g., 'Accuracy', 'F1-Score').

    Returns:
    - all_models (pd.DataFrame): Every metric row from every model, with a 'Model' column added.
    - models_by_metric (pd.DataFrame): Rows of all_models for the requested metric,
      sorted by 'Score' from highest to lowest.
    """
    # Tag each model's metrics table with the model's name
    labelled_frames = []
    for details, label in zip(model_list, model_str):
        labelled_frames.append(details['metrics'].assign(Model=label))

    # Stack the tagged tables into one frame with a fresh 0..n-1 index
    all_models = pd.concat(labelled_frames, ignore_index=True)

    # Keep only the requested metric, best score first
    is_requested_metric = all_models['Metric'] == metric
    models_by_metric = all_models.loc[is_requested_metric].sort_values(by='Score', ascending=False)

    return all_models, models_by_metric

# Train, Validate, Test Split

In [None]:
# All responses combined: raw response text is the feature, op_gender_binary the label
X, y = responses_combined['response_text'], responses_combined['op_gender_binary']

# 80/20 split with a fixed seed; stratification is on the 'source' column
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=responses_combined['source'],
    random_state=42,
    test_size=0.2,
)

## Preprocess Data

In [None]:
# Specify the batch size
batch_size = 500000
# Create batches (.iloc makes it explicit that the slices are positional)
X_train_batches = [X_train.iloc[i:i + batch_size] for i in range(0, len(X_train), batch_size)]
# Preprocess data storage; results are appended in the same order as X_train
X_train_preprocessed = []
# Initialize total time elapsed
total_start_time = time.time()

for batch_index, batch in enumerate(X_train_batches):
    print(f"Processing batch {batch_index + 1}...")
    # Start time for each batch
    start_time = time.time()

    # Run preprocessing
    X_train_preprocessed.extend(preprocess_batch(batch, batch_index))

    # End time for each batch (also used for the running total)
    end_time = time.time()

    # Calculate elapsed time for the batch and since the loop started
    elapsed_time = end_time - start_time
    total_elapsed_time = end_time - total_start_time
    # Both values are divided by 60, so both are reported in minutes
    print(f"Batch {batch_index + 1} processed in {elapsed_time/60:.2f} minutes.")
    print(f"Total time elapsed: {total_elapsed_time/60:.2f} minutes.")

Processing batch 1...
Batch 1 processed in 1.04 seconds.
Total time elapsed: 1.04 minutes.
Processing batch 2...
Batch 2 processed in 1.03 seconds.
Total time elapsed: 2.06 minutes.
Processing batch 3...
Batch 3 processed in 1.02 seconds.
Total time elapsed: 3.09 minutes.
Processing batch 4...
Batch 4 processed in 1.03 seconds.
Total time elapsed: 4.11 minutes.
Processing batch 5...
Batch 5 processed in 1.02 seconds.
Total time elapsed: 5.13 minutes.
Processing batch 6...
Batch 6 processed in 1.03 seconds.
Total time elapsed: 6.16 minutes.
Processing batch 7...
Batch 7 processed in 1.02 seconds.
Total time elapsed: 7.18 minutes.
Processing batch 8...
Batch 8 processed in 1.03 seconds.
Total time elapsed: 8.21 minutes.
Processing batch 9...
Batch 9 processed in 1.02 seconds.
Total time elapsed: 9.23 minutes.
Processing batch 10...
Batch 10 processed in 1.02 seconds.
Total time elapsed: 10.24 minutes.
Processing batch 11...
Batch 11 processed in 1.02 seconds.
Total time elapsed: 11.27 mi

In [None]:
# Specify the batch size
batch_size = 500000
# Create batches (.iloc makes it explicit that the slices are positional)
X_test_batches = [X_test.iloc[i:i + batch_size] for i in range(0, len(X_test), batch_size)]
# Preprocess data storage; results are appended in the same order as X_test
X_test_preprocessed = []
# Initialize total time elapsed
total_start_time = time.time()

for batch_index, batch in enumerate(X_test_batches):
    print(f"Processing batch {batch_index + 1}...")

    # Start time for each batch
    start_time = time.time()

    # Run preprocessing
    X_test_preprocessed.extend(preprocess_batch(batch, batch_index))

    # End time for each batch (also used for the running total)
    end_time = time.time()

    # Calculate elapsed time for the batch and since the loop started
    elapsed_time = end_time - start_time
    total_elapsed_time = end_time - total_start_time
    # Both values are divided by 60, so both are reported in minutes
    print(f"Batch {batch_index + 1} processed in {elapsed_time/60:.2f} minutes.")
    print(f"Total time elapsed: {total_elapsed_time/60:.2f} minutes.")

Processing batch 1...
Batch 1 processed in 1.02 seconds.
Total time elapsed: 1.02 minutes.
Processing batch 2...
Batch 2 processed in 1.02 seconds.
Total time elapsed: 2.04 minutes.
Processing batch 3...
Batch 3 processed in 1.02 seconds.
Total time elapsed: 3.06 minutes.
Processing batch 4...
Batch 4 processed in 1.02 seconds.
Total time elapsed: 4.09 minutes.
Processing batch 5...
Batch 5 processed in 1.02 seconds.
Total time elapsed: 5.11 minutes.
Processing batch 6...
Batch 6 processed in 1.03 seconds.
Total time elapsed: 6.14 minutes.
Processing batch 7...
Batch 7 processed in 1.02 seconds.
Total time elapsed: 7.16 minutes.
Processing batch 8...
Batch 8 processed in 1.02 seconds.
Total time elapsed: 8.18 minutes.
Processing batch 9...
Batch 9 processed in 1.03 seconds.
Total time elapsed: 9.21 minutes.
Processing batch 10...
Batch 10 processed in 1.02 seconds.
Total time elapsed: 10.23 minutes.
Processing batch 11...
Batch 11 processed in 0.61 seconds.
Total time elapsed: 10.84 mi

In [None]:
# Convert series to DataFrames
# The preprocessed lists are in the same order as X_train / X_test, so text and
# labels are paired by position (the text frames have a fresh 0..n-1 index,
# while the label frames keep the index produced by train_test_split).
df_X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=['text'])
df_X_test_preprocessed = pd.DataFrame(X_test_preprocessed, columns=['text'])
df_y_train = pd.DataFrame({'op_gender_binary': y_train})
df_y_test = pd.DataFrame({'op_gender_binary': y_test})

# Remove Nulls
# Build positional boolean arrays (True = keep the row). A row is dropped when
# its text is null or blank: preprocess_text returns '' rather than NaN, and an
# empty string written to CSV is read back as NaN by default.
# Plain arrays are used on purpose: indexing with a boolean DataFrame aligns on
# index and column labels, which do not match between the text and label
# frames, and would blank out the labels instead of filtering rows.
non_nan_indices_train = (
    df_X_train_preprocessed['text'].fillna('').astype(str).str.strip().ne('').to_numpy()
)
non_nan_indices_test = (
    df_X_test_preprocessed['text'].fillna('').astype(str).str.strip().ne('').to_numpy()
)

# Filter y_train and y_test using the same row masks as the text
y_train_filtered = df_y_train[non_nan_indices_train]
y_test_filtered = df_y_test[non_nan_indices_test]

# Filter X_train and X_test to remove null/blank records
X_train_filtered = df_X_train_preprocessed[non_nan_indices_train]
X_test_filtered = df_X_test_preprocessed[non_nan_indices_test]

# Save DataFrames to CSV files (row i of each X file matches row i of its y file)
X_train_filtered.to_csv(folder_path + 'X_train_preprocessed.csv', index=False)
X_test_filtered.to_csv(folder_path + 'X_test_preprocessed.csv', index=False)
y_train_filtered.to_csv(folder_path + 'y_train.csv', index=False)
y_test_filtered.to_csv(folder_path + 'y_test.csv', index=False)