<a href="https://colab.research.google.com/github/d-atallah/implicit_gender_bias/blob/cluster_da/01_unsupervised_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing
## Dependencies

In [1]:
import pandas as pd
import numpy as np
import joblib
import re
import os
from collections import Counter

### text preprocessing dependencies
import nltk
from nltk.tokenize.casual import TweetTokenizer
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

### sklearn dependencies
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

### spaCy dependencies
import spacy
# !python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package wordnet to /home/datallah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# root directory that holds the response CSVs and every derived file
filepath = '/home/datallah/datallah-jaymefis-gibsonce/'
# default seed handed to the sampling and splitting helpers
random_state = 42
# stop-word set; it contains no he/she/him/her/his/hers/himself/herself,
# so those pronouns survive stop-word removal
stop = {'a', 'about', 'above', 'after', 'again', 'against', 'ain',
        'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
        'as', 'at', 'be', 'because', 'been', 'before', 'being',
        'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
        "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does',
        'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during',
        'each', 'few', 'for', 'from', 'further', 'had', 'hadn',
        "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't",
        'having', 'here', 'how', 'i', 'if', 'in', 'into', 'is', 'isn',
        "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm',
        'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn',
        "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor',
        'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or',
        'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
        're', 's', 'same', 'shan', "shan't", 'should', "should've",
        'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than',
        'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves',
        'then', 'there', 'these', 'they', 'this', 'those', 'through',
        'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn',
        "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where',
        'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
        "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll",
        "you're", "you've", 'your', 'yours', 'yourself', 'yourselves'}
wnl = WordNetLemmatizer()
# lemmatize the stop words as well, since LemmaTokenizer lemmatizes every
# token before the vectorizer compares it against this list
# (built from a set, so the list order is arbitrary)
lemma_stop_words = [wnl.lemmatize(wrd) for wrd in stop]

## Import Response CSVs

In [3]:
# create list of filenames (extension stripped) for every response CSV;
# match on the suffix rather than a substring so that the extension strip
# below can never cut into a name that merely contains '_responses.csv'
filenames = [filename[:-len('.csv')] for filename in os.listdir(filepath)
             if filename.endswith('_responses.csv')]
# create sampling/bootstrapping function
def bootstrap_sample(df, sample_size, random_state = random_state):
    """
    Accepts a dataframe and sample size to either
    reduce the size of a dataset or bootstrap it.

    Rows are drawn with replacement, half from the rows where
    `op_gender == 1` and half from all remaining rows, so the result
    is balanced between the two groups.

    Parameters:
    - df: DataFrame with an `op_gender` column.
    - sample_size: total number of rows wanted; each group contributes
      int(sample_size / 2) rows, so an odd size yields one row fewer.
    - random_state: seed passed to both draws.

    Returns:
    - A DataFrame with a fresh 0..n-1 index: the `op_gender == 1` draw
      followed by the draw from the remaining rows.
    """
    per_group = int(sample_size / 2)
    # honour the caller's seed (it was previously ignored in favour of 42)
    a = df[df.op_gender == 1].sample(n = per_group,
                                     replace = True,
                                     axis = 0,
                                     random_state = random_state)
    b = df[df.op_gender != 1].sample(n = per_group,
                                     replace = True,
                                     axis = 0,
                                     random_state = random_state)
    return pd.concat([a, b], ignore_index = True, copy = False)

# iterate through files and create sampled df
def concat_dfs(filepath = filepath, filenames = filenames, sample_size = 500000):
    """
    Takes a list of filenames, samples each file according to sample_size,
    and concatenates all sampled df's together.

    Parameters:
    - filepath: directory prefix (including trailing separator) of the CSVs.
    - filenames: CSV names without the '.csv' extension.
    - sample_size: number of rows to sample from each file.

    Returns:
    - One DataFrame with columns `response_text`, `op_gender`
      (1 where the raw value is 'W', otherwise 0) and `source`
      (the filename the row came from).
    """
    cols = ['response_text', 'op_gender']
    df_lst = []
    for filename in filenames:
        # usecols skips parsing columns we would discard anyway;
        # the [cols] selection afterwards pins the column order
        temp_df = pd.read_csv(filepath + filename + '.csv', usecols = cols)[cols]
        temp_df = temp_df.dropna(subset = cols)
        temp_df['op_gender'] = np.where(temp_df.op_gender == 'W', 1, 0)
        temp_df = bootstrap_sample(temp_df, sample_size)
        temp_df['source'] = filename
        df_lst.append(temp_df)
    # concat df's into one df
    concat_df = pd.concat(df_lst, ignore_index = True, copy = False)
    return concat_df

## Create Train, Validation, Test Splits

In [4]:
# create splitting function
def split_df(df, filepath = filepath, random_state = random_state,
            train_size = 0.6, val_size = 0.2, test_size = 0.2):
    """
    Split a DataFrame into train, validation and test sets, write each
    piece to a shared location, and return all of the pieces.

    Parameters:
    - df: a DataFrame with an `op_gender` target column and a `source` column.
    - filepath: path to where split data files will reside.
    - random_state: a seed integer.
    - train_size: size of train set proportional to overall df.
    - val_size: size of validation set proportional to overall df.
    - test_size: size of test set proportional to overall df.

    Returns:
    - X_train, X_val, X_test, y_train, y_val, y_test

    Raises:
    - ValueError: if the three sizes do not sum to 1.
    """
    # compare with a tolerance: exact equality rejects valid inputs such as
    # 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999
    if not np.isclose(train_size + val_size + test_size, 1.0):
        raise ValueError("Train, test, and validation splits must sum to 1.")
    X = df.loc[:, df.columns != 'op_gender']
    y = df['op_gender']
    # carve off the test set first, keeping the per-source proportions
    X_train_temp, X_test, y_train_temp, y_test = train_test_split(
        X, y, test_size = test_size, random_state = random_state,
        stratify = X['source'])
    # rescale val_size so it is relative to what is left after the test split
    # NOTE(review): unlike the first split, this one is not stratified on
    # 'source' -- confirm that is intended
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_temp, y_train_temp,
        test_size = val_size/(train_size + val_size),
        random_state = random_state)
    # files are written as CSV text but without a '.csv' extension
    X_train.to_csv(filepath + 'X_train')
    X_test.to_csv(filepath + 'X_test')
    X_val.to_csv(filepath + 'X_val')
    y_train.to_csv(filepath + 'y_train')
    y_test.to_csv(filepath + 'y_test')
    y_val.to_csv(filepath + 'y_val')
    return X_train, X_val, X_test, y_train, y_val, y_test

## Apply Text Preprocessing
- Remove stopwords
- Lowercase
- Remove punctuation
- Tokenize (using TweetTokenizer)
- Remove links
- Lemmatize
- For POS tagging, we also remove emojis and user tags

In [32]:
# so as not to confuse the POS tagger, emojis and tags are stripped first
# the function and regex below were posted by toshi456 on stackoverflow
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U00002702-\U000027B0"  # (same range repeated in the posted snippet)
    # NOTE(review): the next range runs from U+24C2 to U+1F251, which also
    # covers CJK ideographs and other non-emoji scripts
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every code point above the BMP
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    flags = re.UNICODE,
)


def deEmojify(text):
    """Return `text` with every run of characters in the emoji class removed."""
    return _EMOJI_PATTERN.sub(r'', text)

# user tags only make sense within a specific response,
#  so they are stripped out too
def rm_userid(text):
    """Return `text` with every '@' plus the non-whitespace run after it removed."""
    handle_re = re.compile(r'@[^\s]+')
    return handle_re.sub('', text)

# links only make sense within a specific response,
#  so they are stripped out too
def rm_link(text):
    """Return `text` with every http:// or https:// URL (up to the next whitespace) removed."""
    url_re = re.compile(r'https?://\S+')
    return url_re.sub('', text)

# callable tokenizer that lemmatizes tweet tokens;
# it is handed to the vectorizer when the term matrix is built
class LemmaTokenizer:
    """
    Tokenize a document with TweetTokenizer and lemmatize each token.

    The tweet tokenizer is configured to lowercase, shorten repeated
    characters, strip @handles and skip phone-number matching.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.tt = TweetTokenizer(
            preserve_case=False,
            reduce_len=True,
            strip_handles=True,
            match_phone_numbers=False,
        )

    def __call__(self, docs):
        # links are dropped before tokenizing
        tokens = self.tt.tokenize(rm_link(docs))
        lemmatize = self.wnl.lemmatize
        return [lemmatize(tok) for tok in tokens]
    
# fits a vectorizer (the term-matrix builder) on the supplied text
def train_vectorizer(text_data, vectorizer=CountVectorizer, tokenizer=LemmaTokenizer(),
                     ngram_range_lower = 1, ngram_range_upper = 1, min_df = None, max_df = 1.0):
    """
    Fits a vectorizer on the provided text data and returns the fitted instance.

    Parameters:
    - text_data: Iterable of text documents to fit on.
    - vectorizer: Vectorizer class to be used for text vectorization. Defaults to CountVectorizer.
    - tokenizer: Callable used to tokenize each document. Defaults to a LemmaTokenizer instance.
    - ngram_range_lower: What's the minimum length of n-grams we want.
    - ngram_range_upper: What's the maximum length of n-grams we want.
    - min_df: Minimum document frequency (int = absolute count, float = proportion).
      None means "no lower cut-off" and is passed on as 1.
    - max_df: Maximum document frequency (int = absolute count, float = proportion).
      Defaults to 1.0, i.e. no upper cut-off. An int 1 would instead drop every
      term that occurs in more than one document.

    Returns:
    - The fitted vectorizer instance. Use its `transform` method to build the
      document-term matrix and `get_feature_names_out` for the feature names.
    """
    # the vectorizer does not accept None for min_df; fall back to "keep all"
    if min_df is None:
        min_df = 1

    # Initialize the vectorizer with specified configurations
    instance = vectorizer(
        strip_accents=None,  # Do not strip accents
        lowercase=True,  # Lowercase the text before it reaches the tokenizer
        tokenizer=tokenizer,  # Custom callable that splits a document into tokens
        token_pattern=None,  # Since a tokenizer is provided, token_pattern is not used
        stop_words=list(lemma_stop_words),  # Lemmatized stop words; he/she/him/her/his are not in the list
        ngram_range=(ngram_range_lower, ngram_range_upper),  # Inclusive n-gram length bounds
        min_df=min_df,  # Minimum document frequency for filtering terms
        max_df=max_df,  # Maximum document frequency for filtering terms
        max_features=None  # No limit on the number of features
    )

    # Fit only; no matrix is produced here. fit() returns the instance itself.
    vector = instance.fit(text_data)

    return vector

In [33]:
# initialize tweet tokenizer
tt = TweetTokenizer(preserve_case = False, reduce_len = True,
                    strip_handles = True, match_phone_numbers = False)

# create POS/dep matrices
def pos_matrix(responses):
    """
    Accepts a response series to turn into a POS matrix and a dependency matrix.

    Each response has user tags, links and emojis removed, is tokenized with
    the tweet tokenizer and re-joined with single spaces, then parsed by the
    spaCy pipeline.

    Parameters:
    - responses: iterable of response strings.

    Returns:
    - df_pos_cnts: one row per response, one column per POS tag seen, holding
      the tag counts (0 where a tag does not occur in that response).
    - df_dep_cnts: the same layout for dependency labels.
      Both frames carry a fresh 0..n-1 index in input order.
    """
    # generator: cleaned texts are produced lazily as the pipeline consumes them
    cleaned = (' '.join(tt.tokenize(deEmojify(rm_link(rm_userid(doc)))))
               for doc in responses)
    pos_cnt_lst = []
    dep_cnt_lst = []
    # stream the parsed docs so only the small Counters are retained,
    # not every spaCy Doc at once
    for doc in nlp.pipe(cleaned):
        pos_cnt_lst.append(Counter(tok.pos_ for tok in doc))
        dep_cnt_lst.append(Counter(tok.dep_ for tok in doc))
    df_pos_cnts = pd.DataFrame(pos_cnt_lst).fillna(0)
    df_dep_cnts = pd.DataFrame(dep_cnt_lst).fillna(0)
    return df_pos_cnts, df_dep_cnts

## Vectorize

In [34]:
# read every sample file into a dict keyed by its name without '.csv'
df_dict = {}
for sample in os.listdir(filepath + 'samples/'):
    name = sample.replace('.csv', '')
    temp_df = pd.read_csv(filepath + 'samples/' + sample)
    # the stored headers carry a leading space; select them, then normalize
    temp_df = temp_df[['source', ' response_text', ' op_gender']]
    temp_df = temp_df.rename(columns = {' response_text': 'response_text',
                                        ' op_gender': 'op_gender'})
    temp_df = temp_df.dropna()
    df_dict[name] = temp_df

In [35]:
# parameter grids for the vectorizer sweep
vectorizers = [CountVectorizer, TfidfVectorizer]
ngram_lower_lst = np.arange(1, 3)   # lower n-gram bounds: 1, 2
ngram_upper_lst = np.arange(1, 4)   # upper n-gram bounds: 1, 2, 3
min_df_lst = list(range(1, 6))      # minimum document frequencies: 1..5
# location and names of the sample files
sample_path = filepath + 'samples/'
sample_nms = os.listdir(sample_path)

In [None]:
# iterate over params and fit one vectorizer per valid combination,
# keyed as '<sample>_<vectorizer>_nl<lower>_nu<upper>_f<min_df>'
v_dict = {}
for d in df_dict:
    # only training samples are used to fit vectorizers
    if 'train' not in d:
        continue
    for v in vectorizers:
        if v is CountVectorizer: v_n = 'CntV'
        elif v is TfidfVectorizer: v_n = 'Tfidf'
        else: v_n = v.__name__  # never reuse a stale label for an unknown class
        for nl in ngram_lower_lst:
            for nu in ngram_upper_lst:
                # an n-gram range whose lower bound exceeds its upper bound
                # is rejected by the vectorizer, so skip those pairs
                if nl > nu:
                    continue
                for f in min_df_lst:
                    name = f'{d}_{v_n}_nl{nl}_nu{nu}_f{f}'
                    v_dict[name] = train_vectorizer(text_data = df_dict[d]['response_text'],
                                                    vectorizer = v,
                                                    tokenizer = LemmaTokenizer(),
                                                    ngram_range_lower = nl,
                                                    ngram_range_upper = nu,
                                                    min_df = f,
                                                    # float 1.0 = no upper cut-off; an int 1
                                                    # would clash with min_df >= 2
                                                    max_df = 1.0)

In [37]:
# train vectorizers on train set
# Each line fits a default-configured vectorizer on one frame's response_text.
# NOTE(review): these use the library defaults, not LemmaTokenizer or the
# custom stop-word list used by train_vectorizer.
# NOTE(review): train_1m / train_3m / train_5m are not defined in the visible
# cells -- presumably loaded elsewhere; confirm before running.
tfid_1m = TfidfVectorizer().fit(train_1m.response_text)
tfid_3m = TfidfVectorizer().fit(train_3m.response_text)
tfid_5m = TfidfVectorizer().fit(train_5m.response_text)
cnt_1m  = CountVectorizer().fit(train_1m.response_text)
cnt_3m  = CountVectorizer().fit(train_3m.response_text)
cnt_5m  = CountVectorizer().fit(train_5m.response_text)

In [38]:
# create storage location (no error if it already exists, and no
# window between an existence check and the creation)
os.makedirs(filepath + 'matrices/', exist_ok = True)

In [None]:
# create iterator function
def v_param_iter(name, text_data, vectorizers, ngram_lower_lst,
                 ngram_upper_lst, min_df_lst):
    """
    Lazily fit one vectorizer per parameter combination.

    NOTE(review): the original body stopped at a dangling assignment and did
    not parse. This version mirrors the key scheme of the sweep cell that
    fills `v_dict` -- confirm it matches the intended use.

    Parameters:
    - name: prefix for every generated key (e.g. the sample name).
    - text_data: documents each vectorizer is fitted on.
    - vectorizers: vectorizer classes to try.
    - ngram_lower_lst: candidate lower n-gram bounds.
    - ngram_upper_lst: candidate upper n-gram bounds.
    - min_df_lst: candidate minimum document frequencies.

    Yields:
    - (key, fitted_vectorizer) pairs, where key is
      '<name>_<vectorizer>_nl<lower>_nu<upper>_f<min_df>'.
      `dict(v_param_iter(...))` collects them all.
    """
    for v in vectorizers:
        if v is CountVectorizer: v_n = 'CntV'
        elif v is TfidfVectorizer: v_n = 'Tfidf'
        else: v_n = v.__name__
        for nl in ngram_lower_lst:
            for nu in ngram_upper_lst:
                # skip ranges whose lower bound exceeds the upper bound
                if nl > nu:
                    continue
                for f in min_df_lst:
                    key = f'{name}_{v_n}_nl{nl}_nu{nu}_f{f}'
                    yield key, train_vectorizer(text_data = text_data,
                                                vectorizer = v,
                                                tokenizer = LemmaTokenizer(),
                                                ngram_range_lower = nl,
                                                ngram_range_upper = nu,
                                                min_df = f,
                                                # float 1.0 = no upper cut-off
                                                max_df = 1.0)

In [None]:
# NOTE(review): unfinished scratch cell -- it does not parse as written
# (dangling assignment and empty `elif` bodies below).
# NOTE(review): `sample_path` is a str, so this loop walks its characters;
# presumably `sample_nms` was intended -- confirm.
for sample_nm in sample_path:
    # load df, rename columns, create filename
    # NOTE(review): df_dict is re-created on every iteration, so it only
    # ever holds the current sample
    df_dict = {}
    temp_df = pd.read_csv(sample_path + sample_nm)[['source', ' response_text', ' op_gender']]
    # copies the space-prefixed columns under clean names (the originals remain)
    temp_df['response_text'] = temp_df[' response_text']
    temp_df['op_gender'] = temp_df[' op_gender']
    name = sample_nm.replace('.csv', '')
    df_dict[name] = temp_df
    # fit vectorizer if train set
    tfidf_dict = {}
    cnt_dict = {}
    if 'one' in name and 'train' in name:
        for nl in ngram_lower_lst:
            for nu in ngram_upper_lst:
                for f in min_df_lst:
                    # TODO: right-hand side missing; `tfidf_1m` is also not
                    # defined in the visible cells
                    tfidf_dict[tfidf_1m] = 
    
    # loop over iterables and create files
    for nl in ngram_lower_lst:
        for nu in ngram_upper_lst:
            for f in min_df_lst:
                if 'one' in name:
                    # NOTE(review): train_vectorizer requires text_data and
                    # returns a single fitted vectorizer, not three values
                    instance, matrix, features = train_vectorizer()
                elif 'three' in name:
                    
                elif 'five' in name:
                    