<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/main/03_Supervised_Responses_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import, Download, & Variable Statements

In [None]:
# Import & download statements
# General Statements
import pandas as pd
import string
import re
import joblib
from implicit_gender_bias import config as cf
import os
import numpy as np
import time

# Feature selection & Model tuning
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, StratifiedKFold, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import TruncatedSVD,PCA, NMF
from sklearn.metrics import confusion_matrix,precision_score, recall_score, f1_score, accuracy_score, roc_curve, roc_auc_score, log_loss, make_scorer, average_precision_score
from sklearn.utils import resample

# NLTK resources
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Fetch the NLTK resources this notebook relies on: the stop-word corpus,
# WordNet (for the lemmatizer) and the punkt tokenizer models.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Shared text-normalisation objects.
# NOTE: stop_words is reassigned further down by a hand-edited literal set.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gibsonce/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gibsonce/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/gibsonce/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Variables
# Directory holding this project's input and output files. The trailing slash
# matters: output paths are built by plain string concatenation.
folder_path = '/home/gibsonce/datallah-jaymefis-gibsonce/'

# Inputs
# Combined responses table; later cells read its 'source', 'response_text'
# and 'op_gender_binary' columns.
responses_csv_path = os.path.join(folder_path, 'responses_combined.csv')
responses_combined = pd.read_csv(responses_csv_path)

  exec(code_obj, self.user_global_ns, self.user_ns)


## Define Functions


In [None]:
# Hand-maintained English stop-word set used by preprocess_text.
#
# The gendered pronouns 'he', 'her', 'hers', 'herself', 'him', 'himself',
# 'his', 'she' and "she's" are deliberately NOT listed, so they survive
# stop-word removal (presumably because they carry signal for the gender
# analysis -- confirm with the authors).
stop_words = {
    # a
    'a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am',
    'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at',
    # b
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both',
    'but', 'by',
    # c
    'can', 'couldn', "couldn't",
    # d
    'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing',
    'don', "don't", 'down', 'during',
    # e
    'each',
    # f
    'few', 'for', 'from', 'further',
    # h
    'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven',
    "haven't", 'having', 'here', 'how',
    # i
    'i', 'if', 'in', 'into', 'is', 'isn', "isn't", 'it', "it's", 'its',
    'itself',
    # j, l
    'just', 'll',
    # m
    'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn',
    "mustn't", 'my', 'myself',
    # n
    'needn', "needn't", 'no', 'nor', 'not', 'now',
    # o
    'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours',
    'ourselves', 'out', 'over', 'own',
    # r
    're',
    # s
    's', 'same', 'shan', "shan't", 'should', "should've", 'shouldn',
    "shouldn't", 'so', 'some', 'such',
    # t
    't', 'than', 'that', "that'll", 'the', 'their', 'theirs', 'them',
    'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
    'through', 'to', 'too',
    # u, v
    'under', 'until', 'up', 've', 'very',
    # w
    'was', 'wasn', "wasn't", 'we', 'were', 'weren', "weren't", 'what',
    'when', 'where', 'which', 'while', 'who', 'whom', 'why', 'will', 'with',
    'won', "won't", 'wouldn', "wouldn't",
    # y
    'y', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours',
    'yourself', 'yourselves',
}

In [None]:
def preprocess_text(text):
    """
    Normalise one piece of text for downstream vectorisation.

    Steps, in order:
    - Remove every character that is not an ASCII letter or whitespace
      (this drops digits, punctuation and apostrophes)
    - Convert to lowercase
    - Tokenize with NLTK and drop tokens found in the module-level ``stop_words``
    - Lemmatize the remaining tokens with the module-level ``lemmatizer``

    Missing values (``None`` or a float ``NaN``, as pandas produces for empty
    CSV cells) return an empty string instead of raising, so the function can
    be passed straight to ``Series.apply``. Any other non-string value is
    converted with ``str()`` before processing.

    Parameters:
    - text (str): Input text to be preprocessed.

    Returns:
    - processed_text (str): Space-joined, lemmatized tokens; ``''`` when the
      input is missing or contains no letters.
    """
    # Missing value guard: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove special characters and digits, then lowercase.
    # NOTE(review): apostrophes are stripped here, so the apostrophised
    # entries in stop_words (e.g. "don't") can never match a token. Confirm
    # whether contractions should be handled before this step.
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    # Nothing left to tokenize: same result as the full pipeline, but skips
    # the tokenizer call.
    if not text.strip():
        return ''

    # Tokenize, drop stopwords, and lemmatize what remains.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    ]

    # Rejoin tokens into a processed text
    processed_text = ' '.join(tokens)

    return processed_text

# Train/Test Split (no separate validation set is created in this notebook)

In [None]:
# Balance the data so that every data source contributes the same number of
# rows: each source is downsampled to the size of the smallest one.
source_counts = responses_combined['source'].value_counts()

# Identify the data source with the minimum number of rows
min_source = source_counts.idxmin()

# Determine the minimum number of rows for any data source
min_rows = source_counts.min()

# Draw min_rows rows from each source WITHOUT replacement. Every source has at
# least min_rows rows, so this always succeeds; for the smallest source it
# simply keeps all of its rows. Sampling with replacement would duplicate rows
# before the train/test split and let copies of one response land on both
# sides. A fixed random_state makes the selection reproducible.
balanced_parts = [
    responses_combined[responses_combined['source'] == source].sample(
        n=min_rows, replace=False, random_state=42
    )
    for source in source_counts.index
]

# Combine once with pd.concat (DataFrame.append was removed in pandas 2.0).
balanced_data = pd.concat(balanced_parts)

# Shuffle the balanced_data DataFrame so rows are not grouped by source
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Sample data
# Set train-test split variables: response text is the feature, the binary
# gender label is the target.
X = balanced_data['response_text']
y = balanced_data['op_gender_binary']

# Hold out 20% of the rows as the test set, stratified by data source so each
# source is represented proportionally in both parts.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=balanced_data['source']
)

# Keep a 40% subsample of the training portion, stratified by label
sample_size = int(0.4 * len(X_train_full))
X_train, _, y_train, _ = train_test_split(
    X_train_full, y_train_full, train_size=sample_size, random_state=42, stratify=y_train_full
)

# Separate majority and minority classes
# NOTE(review): which label is the majority is hard-coded here rather than
# derived from y_train -- confirm it still holds if the input data changes.
majority_class = 1.0
minority_class = 0.0

majority_data = X_train[y_train == majority_class]
minority_data = X_train[y_train == minority_class]

# Upsample the minority class (with replacement) to the majority class size
minority_upsampled = resample(minority_data, replace=True, n_samples=len(majority_data), random_state=42)

# Combine the upsampled minority class with the majority class; the labels
# are rebuilt in the same order as the concatenation.
X_train_balanced = pd.concat([majority_data, minority_upsampled])
y_train_balanced = np.concatenate([np.full(len(majority_data), majority_class), np.full(len(minority_upsampled), minority_class)])

# Convert y_train_balanced to a pandas Series sharing X_train_balanced's index
y_train_balanced_series = pd.Series(y_train_balanced, index=X_train_balanced.index)

# Shuffle the balanced data. Positional indices are used because upsampling
# leaves duplicate index labels. The permutation is seeded like every other
# random step in this cell, so the written outputs are reproducible.
shuffle_indices = np.random.RandomState(42).permutation(len(X_train_balanced))

X_train = X_train_balanced.iloc[shuffle_indices]
y_train = y_train_balanced_series.iloc[shuffle_indices]

In [None]:
# Sanity check: per-class label counts and total number of training rows
train_label_counts = y_train.value_counts()
print(train_label_counts, len(X_train))

0.0    193356
1.0    193356
dtype: int64 386712


## Preprocess Data

In [None]:
# Run preprocess_text over every response in the training and test sets
X_train_preprocessed, X_test_preprocessed = (
    text_split.apply(preprocess_text) for text_split in (X_train, X_test)
)

## Write Outputs

In [None]:
# Persist the preprocessed features and their labels as pickles in folder_path
for output_obj, output_file in (
    (X_train_preprocessed, 'X_train_preprocessed.pkl'),
    (X_test_preprocessed, 'X_test_preprocessed.pkl'),
    (y_train, 'y_train.pkl'),
    (y_test, 'y_test.pkl'),
):
    output_obj.to_pickle(folder_path + output_file)

## Unused code

# Alternative split kept for reference: upsamples the whole training portion
# rather than a 40% subsample of it.
# NOTE: running this rebinds X_train / X_test / y_train / y_test.
X = balanced_data['response_text']
y = balanced_data['op_gender_binary']

# Set train-test split variables
# The stratify labels must come from the same frame as X and y. Using
# responses_combined['source'] here has a different length from X and makes
# train_test_split raise a ValueError.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=balanced_data['source']
)

# Separate majority and minority classes (if needed)
majority_class = 1.0
minority_class = 0.0

majority_data = X_train[y_train == majority_class]
minority_data = X_train[y_train == minority_class]

# Upsample the minority class (if needed)
minority_upsampled = resample(minority_data, replace=True, n_samples=len(majority_data), random_state=42)

# Combine the upsampled minority class with the majority class (if needed)
X_train_balanced = pd.concat([majority_data, minority_upsampled])
y_train_balanced = np.concatenate([np.full(len(majority_data), majority_class), np.full(len(minority_upsampled), minority_class)])

# Convert y_train_balanced to a pandas Series (if needed)
y_train_balanced_series = pd.Series(y_train_balanced, index=X_train_balanced.index)

# Shuffle the balanced data (if needed); positional indices are used because
# upsampling leaves duplicate index labels
shuffle_indices = np.arange(len(X_train_balanced))
np.random.shuffle(shuffle_indices)

X_train = X_train_balanced.iloc[shuffle_indices]
y_train = y_train_balanced_series.iloc[shuffle_indices]