# Create Clustering Matrices
## Dependencies

In [1]:
# !pip install -r requirements.txt

In [1]:
import pandas as pd
import numpy as np
import joblib
import os
import re
from collections import Counter

### text preprocessing dependencies
import nltk
from nltk.tokenize.casual import TweetTokenizer
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

### sklearn dependencies
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy import sparse

### gensim dependencies
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

[nltk_data] Downloading package wordnet to /home/datallah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Project root directory. The default is the original location; set the
# CLUSTERING_PROJECT_DIR environment variable to run the notebook on another
# machine without editing this cell.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# other cells build paths by plain string concatenation (filepath + 'samples/').
filepath = os.path.join(
    os.environ.get('CLUSTERING_PROJECT_DIR', '/home/datallah/datallah-jaymefis-gibsonce/'),
    '')
# Substring used to pick sample files by size ('one', 'three', or 'five')
sample_size = 'one'
# Seed value; not referenced by any other visible cell
random_state = 42
# Stop words removed during tokenization, written as whitespace-separated text
# and split into a set so membership tests stay constant-time.
# NOTE(review): gendered pronouns (he, she, his, her, ...) are absent from this
# list -- presumably deliberate because op_gender is the label; confirm.
stop = set("""
    a about above after again against ain
    all am an and any are aren aren't
    as at be because been before being
    below between both but by can couldn
    couldn't d did didn didn't do does
    doesn doesn't doing don don't down during
    each few for from further had hadn
    hadn't has hasn hasn't have haven haven't
    having here how i if in into is isn
    isn't it it's its itself just ll m
    ma me mightn mightn't more most mustn
    mustn't my myself needn needn't no nor
    not now o of off on once only or
    other our ours ourselves out over own
    re s same shan shan't should should've
    shouldn shouldn't so some such t than
    that that'll the their theirs them themselves
    then there these they this those through
    to too under until up ve very was wasn
    wasn't we were weren weren't what when where
    which while who whom why will with won
    won't wouldn wouldn't y you you'd you'll
    you're you've your yours yourself yourselves
""".split())

In [3]:
# Directory that receives the saved matrices
v_path = filepath + 'matrices/'
# makedirs(exist_ok=True) also creates missing parent directories and avoids
# the check-then-create race of os.path.exists() followed by os.mkdir()
os.makedirs(v_path, exist_ok=True)

## Load & Preprocess

In [4]:
# load each sample's text and label Series into one dict
def load_df_dict(size = sample_size, typ = None):
    """
    Load the response text and gender label of every matching sample CSV.

    Scans ``filepath + 'samples/'`` and reads each ``.csv`` file whose name
    contains ``size`` (and ``typ``, when given). Rows with a missing value in
    any of the selected columns are dropped.

    Parameters:
    - size: Substring identifying the sample size: 'one', 'three', or 'five'.
    - typ: Optional substring identifying the split ('train', 'validate', or
      'test'). When None, every split of the requested size is loaded.

    Returns:
    - dict mapping 'X_<file name without extension>' to the response_text
      Series and 'y_<file name without extension>' to the op_gender Series
      of each loaded file.
    """
    samples_dir = os.path.join(filepath, 'samples')
    df_dict = {}
    # sorted() makes the key order reproducible; os.listdir order is arbitrary
    for sample in sorted(os.listdir(samples_dir)):
        # skip anything that is not a CSV (backups, other formats, sub-directories)
        if not sample.endswith('.csv'):
            continue
        if size not in sample or (typ is not None and typ not in sample):
            continue
        # drop only the extension rather than every '.csv' occurrence in the name
        name = os.path.splitext(sample)[0]
        # the column names are selected with a leading space and renamed below;
        # 'source' is selected too, so rows missing it are removed by dropna()
        temp_df = pd.read_csv(os.path.join(samples_dir, sample))[['source', ' response_text', ' op_gender']]
        temp_df = temp_df.rename(columns = {' response_text': 'response_text', ' op_gender': 'op_gender'}).dropna()
        df_dict['X_' + name] = temp_df.response_text
        df_dict['y_' + name] = temp_df.op_gender
    return df_dict

In [5]:
# Load every split available for the configured sample size, then list what was found
df_dict = load_df_dict(sample_size)
print(df_dict.keys())

dict_keys(['X_validate_one_million', 'y_validate_one_million', 'X_test_one_million', 'y_test_one_million', 'X_train_one_million', 'y_train_one_million'])


In [6]:
# strip URLs: a link only applies to the specific response that contains it
def rm_link(text):
    """Return `text` with every http:// or https:// URL (up to the next whitespace) deleted."""
    url_pattern = re.compile(r'https?://\S+')
    return url_pattern.sub('', text)

# strip punctuation
def rm_punct(text):
    """Return `text` keeping only word characters (letters, digits, underscore) and whitespace."""
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

# create class that lemmatizes tweet tokens
# this will be used when creating the term matrix
class LemmaTokenizer(object):
    """
    Callable tokenizer: removes links and punctuation from a document,
    tokenizes it with TweetTokenizer, drops stop words and lemmatizes the
    remaining tokens with WordNetLemmatizer.
    """
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # NOTE(review): __call__ strips punctuation before tokenizing, so '@' is
        # already gone when the tokenizer runs and strip_handles probably never
        # fires -- confirm whether handles were meant to be removed
        self.tt = TweetTokenizer(preserve_case=False, reduce_len=True,
                                 strip_handles=True, match_phone_numbers=False)
    def __call__(self, docs):
        """Return the list of lemmatized, non-stop-word tokens of one document string."""
        # Links must be removed BEFORE punctuation: rm_punct deletes the '://'
        # that rm_link's pattern needs, so the reverse order never matched a URL
        # and left each link behind as one long junk token.
        cleaned = rm_punct(rm_link(docs))
        # NOTE(review): apostrophes are stripped above, so stop entries such as
        # "don't" can never match ("dont" is not in `stop`) -- confirm intent
        return [self.wnl.lemmatize(t) for t in self.tt.tokenize(cleaned) if t not in stop]

# fits a vectorizer on a corpus
def train_vectorizer(text_data, vectorizer=CountVectorizer, tokenizer=LemmaTokenizer(),
                     ngram_range_lower = 1, ngram_range_upper = 1, min_df = 1, max_df = 1.0):
    """
    Fits a vectorizer on the provided text data and returns the fitted instance.

    No document-term matrix is built here; call `.transform(...)` on the
    returned vectorizer to obtain one.

    Parameters:
    - text_data: Iterable of text documents to fit the vocabulary on.
    - vectorizer: Vectorizer class to instantiate. Defaults to CountVectorizer.
    - tokenizer: Callable that turns one document into a list of tokens.
      Defaults to a LemmaTokenizer instance (created once, when this function
      is defined).
    - ngram_range_lower: Minimum n-gram length.
    - ngram_range_upper: Maximum n-gram length.
    - min_df: Minimum document frequency, forwarded to the vectorizer.
    - max_df: Maximum document frequency, forwarded to the vectorizer. The
      default of 1.0 is scikit-learn's own default, so omitting it leaves the
      previous behaviour unchanged.

    Returns:
    - The fitted vectorizer instance.
    """
    # Initialize the vectorizer with specified configurations
    instance = vectorizer(
        strip_accents=None,  # Do not strip accents
        lowercase=True,  # Lowercase the text before it reaches the tokenizer
        tokenizer=tokenizer,  # Custom callable used instead of the built-in tokenization
        token_pattern=None,  # Since a tokenizer is provided, token_pattern is not used
        ngram_range=(ngram_range_lower, ngram_range_upper),  # Range of n-gram lengths to extract
        min_df=min_df,  # Minimum document frequency for filtering terms
        max_df=max_df,  # Maximum document frequency for filtering terms
        max_features=None  # No limit on the number of features
    )

    # fit() learns the vocabulary and returns the vectorizer itself
    return instance.fit(text_data)

In [7]:
# Pull the three text splits out of df_dict: for each tag, take the first key containing it
X_train, X_val, X_test = (
    df_dict[[key for key in df_dict.keys() if split_tag in key][0]]
    for split_tag in ('X_train', 'X_val', 'X_test')
)
X_train.head()

0                       oure welcome. Hope youre fine!
1                                            Thank you
2    As someone that worked in the world of for-pro...
3                                                 Neat
4    Seriously. My bathroom and bedroom are a freak...
Name: response_text, dtype: object

## Create & Store Count Vector & Matrix

In [9]:
# Fit the count vectorizer on the training text only (1- to 3-grams, min_df of 5)
cnt_v = train_vectorizer(X_train, CountVectorizer,
                         ngram_range_lower=1, ngram_range_upper=3, min_df=5)

In [10]:
# Project every split onto the vocabulary learned from X_train, announcing each step
_cnt_outputs = []
for _msg, _texts in (("Transforming X_train", X_train),
                     ("Transforming X_val", X_val),
                     ("Transforming X_test", X_test)):
    print(_msg)
    _cnt_outputs.append(cnt_v.transform(_texts))
cnt_m, cnt_m_val, cnt_m_test = _cnt_outputs

Transforming X_train
Transforming X_val
Transforming X_test


In [13]:
# save transformed matrices, one .npz per split, named after the split's df_dict key
for _tag, _matrix in (('X_train', cnt_m), ('X_val', cnt_m_val), ('X_test', cnt_m_test)):
    _split_key = [key for key in df_dict.keys() if _tag in key][0]
    sparse.save_npz(v_path + 'cnt_m_' + _split_key + '.npz', _matrix)

## Create & Store TF-IDF Vector & Matrix

In [14]:
# Fit the TF-IDF vectorizer on the training text only (1- to 3-grams, min_df of 5)
tfidf_v = train_vectorizer(X_train, TfidfVectorizer,
                           ngram_range_lower=1, ngram_range_upper=3, min_df=5)

In [15]:
# Project every split onto the TF-IDF vocabulary learned from X_train, announcing each step
_tfidf_outputs = []
for _msg, _texts in (("Transforming X_train", X_train),
                     ("Transforming X_val", X_val),
                     ("Transforming X_test", X_test)):
    print(_msg)
    _tfidf_outputs.append(tfidf_v.transform(_texts))
tfidf_m, tfidf_m_val, tfidf_m_test = _tfidf_outputs

Transforming X_train
Transforming X_val
Transforming X_test


In [16]:
# save transformed matrices, one .npz per split, named after the split's df_dict key
for _tag, _matrix in (('X_train', tfidf_m), ('X_val', tfidf_m_val), ('X_test', tfidf_m_test)):
    _split_key = [key for key in df_dict.keys() if _tag in key][0]
    sparse.save_npz(v_path + 'tfidf_m_' + _split_key + '.npz', _matrix)

## Create & Store Word2Vec Matrix

In [8]:
# The Word2Vec path does not go through the vectorizer function, so the text
# is lowercased and tokenized by hand; one tokenizer instance serves all splits
_lemma_tokenizer = LemmaTokenizer()

print("Tokenizing X_train")
_train_lowered = X_train.str.lower()
tokenized_ser = _train_lowered.apply(_lemma_tokenizer)

print("Tokenizing X_val")
_val_lowered = X_val.str.lower()
tokenized_ser_val = _val_lowered.apply(_lemma_tokenizer)

print("Tokenizing X_test")
_test_lowered = X_test.str.lower()
tokenized_ser_test = _test_lowered.apply(_lemma_tokenizer)

Tokenizing X_train
Tokenizing X_val
Tokenizing X_test


In [9]:
# implement minimum word frequency threshold
# term_cnts = Counter(term for doc in tokenized_ser for term in doc)
# tok_lst = [[term for term in doc if term_cnts[term] >= 5] for doc in tokenized_ser]

In [10]:
# use pretrained word2vec model to create term matrix...load model
# Load straight from v_path: gensim.test.utils.datapath() resolves names
# against gensim's bundled test-data directory and only returned the right
# file here because v_path is an absolute path.
w2v_model = KeyedVectors.load(v_path + 'word2vec_1m.wordvectors')

In [19]:
# iterate over documents and average the vectors of each document's tokens
# this will create a document vector that we can cluster on
def create_w2v_matrix(tok_lst, w2v_model = w2v_model):
    """
    Build one averaged word vector per document.

    Parameters:
    - tok_lst: Iterable of token lists, one list per document.
    - w2v_model: Object supporting `tok in model`, `model[tok]` and
      `.vector_size`. Defaults to the model loaded in the notebook (bound
      when this function is defined).

    Returns:
    - 2-D numpy array with one row per document and `vector_size` columns.
      A document with no in-vocabulary token gets an all-zero row; an empty
      `tok_lst` yields an array of shape (0, vector_size).
    """
    doc_vectors = []
    for doc in tok_lst:
        # tokens missing from the model's vocabulary are ignored
        known_vectors = [w2v_model[tok] for tok in doc if tok in w2v_model]
        if known_vectors:
            doc_vectors.append(np.mean(known_vectors, axis = 0))
        else:
            # NOTE(review): np.zeros defaults to float64; if the model's vectors
            # are float32, a single zero row upcasts the whole stacked matrix
            # -- confirm the desired dtype
            doc_vectors.append(np.zeros(w2v_model.vector_size))
    if not doc_vectors:
        # np.vstack raises ValueError on an empty sequence
        return np.empty((0, w2v_model.vector_size))
    return np.vstack(doc_vectors)

In [20]:
# Turn every tokenized split into a matrix of averaged word vectors, announcing each step
_w2v_outputs = []
for _msg, _tokens in (("Transforming X_train", tokenized_ser),
                      ("Transforming X_val", tokenized_ser_val),
                      ("Transforming X_test", tokenized_ser_test)):
    print(_msg)
    _w2v_outputs.append(create_w2v_matrix(_tokens))
w2v_m, w2v_m_val, w2v_m_test = _w2v_outputs

Transforming X_train
Transforming X_val
Transforming X_test


In [23]:
# save transformed matrices, one .npy per split, named after the split's df_dict key
for _tag, _matrix in (('X_train', w2v_m), ('X_val', w2v_m_val), ('X_test', w2v_m_test)):
    _split_key = [key for key in df_dict.keys() if _tag in key][0]
    np.save(v_path + 'w2v_m_' + _split_key + '.npy', _matrix)