<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/Supervised_Learning_Prod.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, Download, & Variable Statements

In [None]:
# Import & download statements
# General Statements
!git clone https://github.com/d-atallah/implicit_gender_bias.git
import pandas as pd
import string
import re
import joblib
from implicit_gender_bias import config as cf
import os
import numpy as np
import time

# Feature selection & Model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer, average_precision_score

# Model options
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the NLTK resources needed for stop words, lemmatization and tokenization
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Shared text-normalization helpers used by the preprocessing functions
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()

fatal: destination path 'implicit_gender_bias' already exists and is not an empty directory.


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gibsonce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gibsonce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gibsonce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Variables
# Data directory. Set the GENDER_BIAS_DATA_DIR environment variable to run the
# notebook on another machine; the default keeps the original location.
# os.path.join(..., '') guarantees the trailing separator that the
# `folder_path + 'file.csv'` concatenations throughout the notebook rely on.
folder_path = os.path.join(
    os.environ.get('GENDER_BIAS_DATA_DIR', '/home/gibsonce/datallah-jaymefis-gibsonce/'),
    ''
)

# Inputs
responses_combined = pd.read_csv(folder_path+'responses_combined.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


### Read Processed Data

In [None]:
# Load the previously processed responses from disk
all_data = pd.read_csv(folder_path+'all_data.csv')

# Columns describing each response (text, binary gender label, data source)
responses_combined = all_data[['response_text', 'op_gender_binary', 'source']]

# Pull the stored split columns out into their own variables
X_train, y_train, X_test, y_test = (
    all_data[column] for column in ('X_train', 'y_train', 'X_test', 'y_test')
)

## Define Functions


In [None]:
# Evaluate a model
def model_eval(model, X_test, y_test, y_pred):
    """
    Evaluates a fitted binary classifier using accuracy, precision, recall, F-1 score, AUC, log-loss, and a confusion matrix.

    Parameters:
    - model: The trained model to be evaluated. Must implement predict_proba.
    - X_test (list or array): Test set features.
    - y_test (list or array): True labels.
    - y_pred (list or array): Predicted labels.

    Returns:
    - metrics_df (pd.DataFrame): DataFrame with columns 'Metric' and 'Score', one row per metric
      (Accuracy, Precision, Recall, F1-Score, AUC, Log-Loss).
    - confusion_df (pd.DataFrame): Confusion matrix with actual classes as rows and predicted
      classes as columns, negative class first.
    """
    # Class probabilities are needed for both AUC and log-loss, so compute them once
    y_proba = model.predict_proba(X_test)

    # AUC must be computed from a continuous score (probability of the positive class).
    # Feeding it the hard 0/1 predictions collapses the ROC curve to a single point.
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'AUC': roc_auc_score(y_test, y_proba[:, 1]),
        'Log-Loss': log_loss(y_test, y_proba),
    }
    metrics_df = pd.DataFrame({'Metric': list(scores.keys()), 'Score': list(scores.values())})

    # Confusion matrix: sklearn puts true labels on the rows and predicted labels on the
    # columns, with labels in sorted order, so the negative class (lower label) comes first.
    cm = confusion_matrix(y_test, y_pred)
    confusion_df = pd.DataFrame(cm, columns=['Predicted Negative', 'Predicted Positive'], index=['Actual Negative', 'Actual Positive'])

    # Print dataframes
    print("Metrics:")
    print(metrics_df)

    print("\nConfusion Matrix:")
    print(confusion_df)

    return metrics_df, confusion_df

In [None]:
# Custom stop-word set. Re-binding stop_words here replaces the set built from
# stopwords.words('english') in the imports cell. The commented-out entries
# (he, her, hers, herself, him, himself, his, she, she's) are deliberately left
# out of the set, so those gendered pronouns are NOT removed by preprocess_text.
stop_words = {'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 #'he',
 #'her',
 'here',
 #'hers',
 #'herself',
 #'him',
 #'himself',
 #'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's',
 'same',
 'shan',
 "shan't",
 #'she',
 #"she's",
 'should',
 "should've",
 'shouldn',
 "shouldn't",
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 "that'll",
 'the',
 'their',
 'theirs',
 'them',
 'themselves',
 'then',
 'there',
 'these',
 'they',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 've',
 'very',
 'was',
 'wasn',
 "wasn't",
 'we',
 'were',
 'weren',
 "weren't",
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with',
 'won',
 "won't",
 'wouldn',
 "wouldn't",
 'y',
 'you',
 "you'd",
 "you'll",
 "you're",
 "you've",
 'your',
 'yours',
 'yourself',
 'yourselves'}

In [None]:
def _stop_words_for_stripped_text():
    """
    Returns the global stop_words plus a copy of every entry with its apostrophes removed.

    The result is cached on the function and rebuilt only when the global stop_words
    object is re-bound or changes size, so the per-document cost stays negligible.
    """
    source = stop_words
    cache = _stop_words_for_stripped_text.__dict__
    if cache.get('source') is not source or cache.get('size') != len(source):
        cache['source'] = source  # holding the reference also keeps the identity check reliable
        cache['size'] = len(source)
        cache['expanded'] = frozenset(source) | {word.replace("'", '') for word in source}
    return cache['expanded']


def preprocess_text(text):
    """
    Applies text preprocessing to a given text, including:
    - Removing special characters and digits
    - Converting to lowercase
    - Tokenization and removing stopwords
    - Lemmatization (stemming is currently disabled)

    Parameters:
    - text (str): Input text to be preprocessed. Non-string values (e.g. None or the
      NaN that pandas uses for missing text) are treated as empty text.

    Returns:
    - processed_text (str): Preprocessed text after applying the specified steps.
    """
    # Missing values carry no text; return an empty document instead of raising in re.sub
    if not isinstance(text, str):
        return ''

    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenization and removing stopwords.
    # The regex above has already stripped apostrophes, so "don't" arrives here as "dont";
    # compare against apostrophe-free variants too, otherwise contraction stop words
    # can never match and always leak into the output.
    removable = _stop_words_for_stripped_text()
    tokens = [word for word in nltk.word_tokenize(text) if word not in removable]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Rejoin tokens into a processed text
    processed_text = ' '.join(tokens)

    return processed_text

In [None]:
def model_rank(model_list, model_str, metric, ascending=False):
    """
    Ranks models by their score on a specified metric.

    Parameters:
    - model_list (list): List of dictionaries, each holding a model's details. Each dictionary
      must have a 'metrics' entry: a DataFrame with 'Metric' and 'Score' columns.
    - model_str (list): List of model names corresponding to model_list (same length, same order).
    - metric (str): Metric to rank the models by (e.g., 'Accuracy', 'F1-Score').
    - ascending (bool): Sort order for the ranking. Defaults to False (highest score first);
      pass True for metrics where lower is better, such as 'Log-Loss'.

    Returns:
    - all_models (pd.DataFrame): DataFrame with metric scores and model names.
    - models_by_metric (pd.DataFrame): DataFrame filtered by the specified metric and sorted by score.

    Raises:
    - ValueError: If model_list and model_str differ in length, or if they are empty
      (raised by pd.concat).
    """
    model_list = list(model_list)
    model_str = list(model_str)

    # zip() would silently drop the unmatched tail and mislabel nothing but also hide the mistake
    if len(model_list) != len(model_str):
        raise ValueError(
            f"model_list has {len(model_list)} entries but model_str has {len(model_str)}."
        )

    # Tag each model's metrics table with its name, then stack them into one frame
    labelled_metrics = [
        model_dict['metrics'].assign(Model=model_name)
        for model_dict, model_name in zip(model_list, model_str)
    ]
    all_models = pd.concat(labelled_metrics, ignore_index=True)

    # Keep only the requested metric and order the models by score
    models_by_metric = all_models[all_models['Metric'] == metric].sort_values(by='Score', ascending=ascending)

    return all_models, models_by_metric

In [None]:
# Function to preprocess
def preprocess_batch(data, batch_index):
    """
    Runs preprocess_text over every item of a batch.

    Parameters:
    - data (iterable): Texts to preprocess.
    - batch_index (int): Position of the batch; accepted for the caller's bookkeeping but not used here.

    Returns:
    - list: Preprocessed texts, in the same order as the input.
    """
    return [preprocess_text(item) for item in data]

# Train-Test Split

In [None]:
# All responses combined: features are the response text, target is the binary gender label
X, y = responses_combined['response_text'], responses_combined['op_gender_binary']

# 80/20 split with a fixed seed; stratification is on the data source column,
# so each source keeps the same share in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=responses_combined['source'],
    test_size=0.2,
    random_state=42,
)

## Preprocess Data

In [None]:
# Specify the batch size
batch_size = 500000
# Create batches
X_train_batches = [X_train[i:i + batch_size] for i in range(0, len(X_train), batch_size)]
# Preprocess data storage
X_train_preprocessed = []
# Initialize total time elapsed
total_start_time = time.time()

for batch_index, batch in enumerate(X_train_batches):
    print(f"Processing batch {batch_index + 1}...")
    # Start time for each batch
    start_time = time.time()

    # Run preprocessing
    X_train_preprocessed.extend(preprocess_batch(batch, batch_index))

    # End time for each batch (also the running total's end point)
    end_time = time.time()
    total_end_time = end_time

    # Calculate elapsed time for the batch
    elapsed_time = end_time - start_time
    total_elapsed_time = total_end_time - total_start_time
    # Per-batch time is reported in seconds (no /60); only the running total is in minutes
    print(f"Batch {batch_index + 1} processed in {elapsed_time:.2f} seconds.")
    print(f"Total time elapsed: {total_elapsed_time/60:.2f} minutes.")

Processing batch 1...
Batch 1 processed in 71.67 seconds.
Total time elapsed: 1.19 minutes.
Processing batch 2...
Batch 2 processed in 71.56 seconds.
Total time elapsed: 2.39 minutes.
Processing batch 3...
Batch 3 processed in 70.82 seconds.
Total time elapsed: 3.57 minutes.
Processing batch 4...
Batch 4 processed in 71.27 seconds.
Total time elapsed: 4.76 minutes.
Processing batch 5...
Batch 5 processed in 70.83 seconds.
Total time elapsed: 5.94 minutes.
Processing batch 6...
Batch 6 processed in 71.52 seconds.
Total time elapsed: 7.13 minutes.
Processing batch 7...
Batch 7 processed in 71.07 seconds.
Total time elapsed: 8.31 minutes.
Processing batch 8...
Batch 8 processed in 71.50 seconds.
Total time elapsed: 9.50 minutes.
Processing batch 9...
Batch 9 processed in 70.80 seconds.
Total time elapsed: 10.68 minutes.
Processing batch 10...
Batch 10 processed in 70.82 seconds.
Total time elapsed: 11.86 minutes.
Processing batch 11...
Batch 11 processed in 71.35 seconds.
Total time elaps

In [None]:
# Specify the batch size
batch_size = 500000
# Create batches
X_test_batches = [X_test[i:i + batch_size] for i in range(0, len(X_test), batch_size)]
# Preprocess data storage
X_test_preprocessed = []
# Initialize total time elapsed
total_start_time = time.time()

for batch_index, batch in enumerate(X_test_batches):
    print(f"Processing batch {batch_index + 1}...")

    # Start time for each batch
    start_time = time.time()

    # Run preprocessing
    X_test_preprocessed.extend(preprocess_batch(batch, batch_index))

    # End time for each batch (also the running total's end point)
    end_time = time.time()
    total_end_time = end_time

    # Calculate elapsed time for the batch
    elapsed_time = end_time - start_time
    total_elapsed_time = total_end_time - total_start_time
    # Per-batch time is reported in seconds (no /60); only the running total is in minutes
    print(f"Batch {batch_index + 1} processed in {elapsed_time:.2f} seconds.")
    print(f"Total time elapsed: {total_elapsed_time/60:.2f} minutes.")

Processing batch 1...
Batch 1 processed in 71.34 seconds.
Total time elapsed: 1.19 minutes.
Processing batch 2...
Batch 2 processed in 71.37 seconds.
Total time elapsed: 2.38 minutes.
Processing batch 3...
Batch 3 processed in 71.38 seconds.
Total time elapsed: 3.57 minutes.
Processing batch 4...
Batch 4 processed in 71.93 seconds.
Total time elapsed: 4.77 minutes.
Processing batch 5...
Batch 5 processed in 71.30 seconds.
Total time elapsed: 5.96 minutes.
Processing batch 6...
Batch 6 processed in 72.16 seconds.
Total time elapsed: 7.16 minutes.
Processing batch 7...
Batch 7 processed in 71.12 seconds.
Total time elapsed: 8.34 minutes.
Processing batch 8...
Batch 8 processed in 71.08 seconds.
Total time elapsed: 9.53 minutes.
Processing batch 9...
Batch 9 processed in 71.76 seconds.
Total time elapsed: 10.72 minutes.
Processing batch 10...
Batch 10 processed in 71.20 seconds.
Total time elapsed: 11.91 minutes.
Processing batch 11...
Batch 11 processed in 42.38 seconds.
Total time elaps

In [None]:
# Wrap the preprocessed texts and the labels in single-column DataFrames
df_X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=['text'])
df_y_train = pd.DataFrame({'op_gender_binary': y_train})
df_X_test_preprocessed = pd.DataFrame(X_test_preprocessed, columns=['text'])
df_y_test = pd.DataFrame({'op_gender_binary': y_test})

# Write each frame to its own CSV in the data folder, without the index column
for output_frame, output_name in (
    (df_X_train_preprocessed, 'X_train_preprocessed.csv'),
    (df_X_test_preprocessed, 'X_test_preprocessed.csv'),
    (df_y_train, 'y_train.csv'),
    (df_y_test, 'y_test.csv'),
):
    output_frame.to_csv(folder_path + output_name, index=False)

Check for class imbalance:

In [None]:
# Count how many training labels fall into each class; the bare expression on
# the last line renders the counts as the cell output.
class_distribution = pd.Series(y_train).value_counts()
class_distribution

1.0    12321744
0.0     8868747
Name: op_gender_binary, dtype: int64

In [None]:
# Number of preprocessed training documents
len(X_train_preprocessed)

21190491

## XGBoost

In [None]:
def model_testing(X_train, y_train, X_test, y_test, model_type, vectorizer, ngram, params):
    """
    Vectorizes the text, then fits and evaluates a TruncatedSVD + classifier pipeline with the given hyperparameters.

    Parameters:
    - X_train (array-like): Training set features, preprocessed.
    - y_train (array-like): Training set labels.
    - X_test (array-like): Test set features, preprocessed.
    - y_test (array-like): Test set labels.
    - model_type (str): Type of model to test. Choose from 'log' (Logistic Regression), 'xgb' (XGBoost), 'knn' (k-Nearest Neighbors), 'svm' (Support Vector Machine).
    - vectorizer (str): Type of vectorizer to test. Choose from 'count' (Count Vectorizer) or 'tfidf' (TF-IDF Vectorizer).
    - ngram (int): Feature representation to test. Choose 1 for unigrams, 2 for bigrams, and so on.
    - params (dict): Hyperparameters. Keys in pipeline form ('<step>__<param>', e.g.
      'xgbclassifier__max_depth' or 'truncatedsvd__n_components') are applied to the matching
      pipeline step; keys without '__' are passed straight to the classifier constructor.

    Returns:
    - pipeline: Fitted pipeline (TruncatedSVD followed by the classifier).
    - X_train_ (sparse matrix): Vectorized training set features.
    - X_test_ (sparse matrix): Vectorized test set features.
    - metrics_val_df (pd.DataFrame): Test-set metrics produced by model_eval.

    Raises:
    - ValueError: If vectorizer or model_type is not one of the supported options.
    """
    # Validate both choices before the (expensive) vectorization starts
    vectorizer_classes = {'count': CountVectorizer, 'tfidf': TfidfVectorizer}
    if vectorizer not in vectorizer_classes:
        raise ValueError("Invalid vector type. Use 'count' or 'tfidf'.")
    if model_type not in ('log', 'xgb', 'knn', 'svm'):
        raise ValueError("Invalid model type. Use 'xgb', 'svm', 'knn', or 'log'.")

    # Fit the vocabulary on the training text only, then reuse it for the test text
    vect = vectorizer_classes[vectorizer](ngram_range=(ngram, ngram))
    X_train_ = vect.fit_transform(X_train)
    X_test_ = vect.transform(X_test)

    # Pipeline-style keys ('step__param') cannot be passed to an estimator constructor:
    # they must go through Pipeline.set_params so that each step (including TruncatedSVD)
    # actually receives its tuned value.
    params = dict(params or {})
    step_params = {key: value for key, value in params.items() if '__' in key}
    model_kwargs = {key: value for key, value in params.items() if '__' not in key}

    if model_type == 'log':
        model = LogisticRegression(max_iter=1000, random_state=42, **model_kwargs)
    elif model_type == 'xgb':
        model = XGBClassifier(random_state=42, **model_kwargs)
    elif model_type == 'knn':
        model = KNeighborsClassifier(**model_kwargs)
    else:  # 'svm'; probability=True is required because model_eval calls predict_proba
        model = SVC(probability=True, **model_kwargs)

    # Pipeline with dimensionality reduction method and model to test.
    # make_pipeline names each step after its lowercased class name
    # ('truncatedsvd', 'xgbclassifier', ...), which is what the prefixed keys refer to.
    pipeline = make_pipeline(
        TruncatedSVD(random_state=42),
        model
    )
    if step_params:
        pipeline.set_params(**step_params)

    # Train the model
    pipeline.fit(X_train_, y_train)

    # Evaluate the model on the test set
    y_test_pred = pipeline.predict(X_test_)
    metrics_val_df, _confusion_df = model_eval(pipeline, X_test_, y_test, y_test_pred)

    return pipeline, X_train_, X_test_, metrics_val_df


### XGB Final Model:
*   Vectorization: TF-IDF
*   Feature Representation: Unigram

In [None]:
# Configuration of the final XGBoost run: TF-IDF features on unigrams
model, vectorization, ngram = 'xgb', 'tfidf', 1
params = {
    'xgbclassifier__subsample': 0.8,
    'xgbclassifier__n_estimators': 150,
    'xgbclassifier__max_depth': 9,
    'xgbclassifier__learning_rate': 0.05,
    'xgbclassifier__colsample_bytree': 0.5,
    'truncatedsvd__n_components': 150,
}

# Fit and evaluate; `model` is re-bound from the type string to the fitted pipeline
model, train, test, metrics = model_testing(
    X_train_preprocessed, y_train, X_test_preprocessed, y_test,
    model, vectorization, ngram, params,
)

# Keep everything about this run together for the ranking step
xgb_tfidf_1 = dict(model=model, params=params, X_train=train, X_test=test, metrics=metrics)

## Logistic Regression

### Logistic Regression Model Method:
*   Vectorization: Count
*   Feature Representation: Unigram

In [None]:
# Configuration of the logistic regression run: count features on unigrams
model, vectorization, ngram = 'log', 'count', 1
params = {
    'logisticregression__solver': 'saga',
    'logisticregression__penalty': 'l1',
    'logisticregression__C': 0.1,
}

# Fit and evaluate; `model` is re-bound from the type string to the fitted pipeline
model, train, test, metrics = model_testing(
    X_train_preprocessed, y_train, X_test_preprocessed, y_test,
    model, vectorization, ngram, params,
)

# Keep everything about this run together for the ranking step
log_count_1 = dict(model=model, params=params, X_train=train, X_test=test, metrics=metrics)

Hyperparameters: {'logisticregression__solver': 'saga', 'logisticregression__penalty': 'l1', 'logisticregression__C': 0.1}
Metrics:
      Metric     Score
0   Accuracy  0.525630
1  Precision  0.521610
2     Recall  0.890585
3   F1-Score  0.657895
4        AUC  0.516530
5   Log-Loss  0.692039

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 160                 963
Actual Negative                 129                1050


## Support Vector Machine

### Support Vector Machine Model Method:
*   Vectorization: Count
*   Feature Representation: Unigram

In [None]:
# Configuration of the support vector machine run: count features on unigrams
model, vectorization, ngram = 'svm', 'count', 1
params = {
    'svc__kernel': 'rbf',
    'svc__gamma': 'scale',
    'svc__C': 10,
}

# Fit and evaluate; `model` is re-bound from the type string to the fitted pipeline
model, train, test, metrics = model_testing(
    X_train_preprocessed, y_train, X_test_preprocessed, y_test,
    model, vectorization, ngram, params,
)

# Keep everything about this run together for the ranking step
svm_count_1 = dict(model=model, params=params, X_train=train, X_test=test, metrics=metrics)

Hyperparameters: {'svc__kernel': 'rbf', 'svc__gamma': 'scale', 'svc__C': 10}
Metrics:
      Metric     Score
0   Accuracy  0.526499
1  Precision  0.524930
2     Recall  0.794741
3   F1-Score  0.632254
4        AUC  0.519811
5   Log-Loss  0.692200

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 275                 848
Actual Negative                 242                 937


## K-Nearest Neighbors

### K-Nearest Neighbors Model Method:
*   Vectorization: TF-IDF
*   Feature Representation: Bigram

In [None]:
# Configuration of the k-nearest neighbors run: TF-IDF features on bigrams
model, vectorization, ngram = 'knn', 'tfidf', 2
params = {
    'kneighborsclassifier__weights': 'distance',
    'kneighborsclassifier__p': 1,
    'kneighborsclassifier__n_neighbors': 3,
}

# Fit and evaluate; `model` is re-bound from the type string to the fitted pipeline
model, train, test, metrics = model_testing(
    X_train_preprocessed, y_train, X_test_preprocessed, y_test,
    model, vectorization, ngram, params,
)

# Keep everything about this run together for the ranking step
knn_tfidf_2 = dict(model=model, params=params, X_train=train, X_test=test, metrics=metrics)

Hyperparameters: {'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__p': 1, 'kneighborsclassifier__n_neighbors': 3}
Metrics:
      Metric     Score
0   Accuracy  0.551694
1  Precision  0.550034
2     Recall  0.685327
3   F1-Score  0.610272
4        AUC  0.548362
5   Log-Loss  7.951702

Confusion Matrix:
                 Predicted Positive  Predicted Negative
Actual Positive                 462                 661
Actual Negative                 371                 808


# Model Ranking Comparison

In [None]:
# Models to compare, keyed by the name shown in the ranking table.
# Only the result dictionaries built in this notebook are listed: referencing a
# configuration that was never trained here raises NameError on a fresh kernel.
# Building both lists from one mapping keeps names and results aligned and
# prevents a model from being listed twice.
trained_models = {
    'xgb_tfidf_1': xgb_tfidf_1,
    'log_count_1': log_count_1,
    'svm_count_1': svm_count_1,
    'knn_tfidf_2': knn_tfidf_2,
}
model_str = list(trained_models.keys())
model_list = list(trained_models.values())

# Specify the metric to rank the models by
all_models, models_by_metric = model_rank(model_list, model_str, 'AUC')
models_by_metric

Unnamed: 0,Metric,Score,Model
10,AUC,0.571551,xgb_tfidf_1
4,AUC,0.570937,xgb_count_1
16,AUC,0.570937,xgb_count_1
22,AUC,0.549021,xgb_count_2
94,AUC,0.548362,knn_tfidf_2
76,AUC,0.545513,knn_count_1
82,AUC,0.541344,knn_count_2
52,AUC,0.519811,svm_count_1
88,AUC,0.519561,knn_tfidf_1
28,AUC,0.51653,log_count_1


In [None]:
# Write the combined metrics table and the per-metric ranking to CSV.
# NOTE(review): test_num is not assigned in any visible cell of this notebook;
# as written this raises NameError on a fresh kernel. It looks like a filename
# prefix carried over from another notebook; confirm and define it, or drop it.
all_models.to_csv(folder_path+test_num+'all_models.csv', index=False)
models_by_metric.to_csv(folder_path+test_num+'models_by_metric.csv', index=False)

# Save the vectorizer and associated data
# NOTE(review): vectorizer_tfidf_bi, X_train_vtfidf_bi, X_validation_vtfidf_bi and
# X_test_vtfidf_bi are not defined in any visible cell. model_testing does not
# return its fitted vectorizer, and no validation split is created here, so these
# four lines cannot run as-is. TODO: confirm which artifacts should be persisted.
joblib.dump(vectorizer_tfidf_bi,folder_path+'tfidf_vectorizer_bi.pkl')
joblib.dump(X_train_vtfidf_bi, folder_path+'X_train_vtfidf_bi.pkl')
joblib.dump(X_validation_vtfidf_bi, folder_path+'X_validation_vtfidf_bi.pkl')
joblib.dump(X_test_vtfidf_bi, folder_path+'X_test_vtfidf_bi.pkl')