# BGMM
## Dependencies

In [1]:
# !pip install -r requirements.txt

In [26]:
import pandas as pd
import numpy as np
import joblib
import os
import json
from collections import Counter
import re
import tqdm
from datetime import datetime
import multiprocessing

### sklearn dependencies
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import BayesianGaussianMixture
from sklearn.model_selection import GridSearchCV
from scipy import sparse

### text preprocessing dependencies
import nltk
from nltk.tokenize.casual import TweetTokenizer
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

### gensim dependencies
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

[nltk_data] Downloading package wordnet to /home/datallah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
filepath = '/home/datallah/datallah-jaymefis-gibsonce/'
random_state = 42
stop = {'a', 'about', 'above', 'after', 'again', 'against', 'ain',
        'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't",
        'as', 'at', 'be', 'because', 'been', 'before', 'being',
        'below', 'between', 'both', 'but', 'by', 'can', 'couldn',
        "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does',
        'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during',
        'each', 'few', 'for', 'from', 'further', 'had', 'hadn',
        "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't",
        'having', 'here', 'how', 'i', 'if', 'in', 'into', 'is', 'isn',
        "isn't", 'it', "it's", 'its', 'itself', 'just', 'll', 'm',
        'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn',
        "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor',
        'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or',
        'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
        're', 's', 'same', 'shan', "shan't", 'should', "should've",
        'shouldn', "shouldn't", 'so', 'some', 'such', 't', 'than',
        'that', "that'll", 'the', 'their', 'theirs', 'them', 'themselves',
        'then', 'there', 'these', 'they', 'this', 'those', 'through',
        'to', 'too', 'under', 'until', 'up', 've', 'very', 'was', 'wasn',
        "wasn't", 'we', 'were', 'weren', "weren't", 'what', 'when', 'where',
        'which', 'while', 'who', 'whom', 'why', 'will', 'with', 'won',
        "won't", 'wouldn', "wouldn't", 'y', 'you', "you'd", "you'll",
        "you're", "you've", 'your', 'yours', 'yourself', 'yourselves'}

In [16]:
size = 'one'
train = pd.read_csv(filepath + f'samples/train_{size}_million.csv').rename(
    columns = {' response_text': 'response_text', ' op_gender': 'op_gender'}).dropna()
val   = pd.read_csv(filepath + f'samples/validate_{size}_million.csv').rename(
    columns = {' response_text': 'response_text', ' op_gender': 'op_gender'}).dropna()
test  = pd.read_csv(filepath + f'samples/test_{size}_million.csv').rename(
    columns = {' response_text': 'response_text', ' op_gender': 'op_gender'}).dropna()

In [17]:
X_train = train[train.source == 'TED'].response_text
y_train = train[train.source == 'TED'].op_gender
X_val = val[val.source == 'TED'].response_text
y_val = val[val.source == 'TED'].op_gender
X_test  = test[test.source == 'TED'].response_text
y_test  = test[test.source == 'TED'].op_gender

In [18]:
del train, val, test

In [19]:
X_train.shape

(139982,)

## Load TF-IDF

In [20]:
# tokenize functions
# remove links as these will only apply to specific responses
def rm_link(text):
    return re.sub(r'https?://\S+', '', text)

# remove punctuation
def rm_punct(text):
    return re.sub(r'[^\w\s]', '', text)

# create class that lemmatizes tweet tokens
# this will be used when creating the term matrix
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
        self.tt = TweetTokenizer(preserve_case=False, reduce_len=True,
                                 strip_handles=True, match_phone_numbers=False)
    def __call__(self, docs):
        return [self.wnl.lemmatize(t) for t in self.tt.tokenize(rm_link(rm_punct(docs))) if t not in stop]
# creates a term matrix
def train_vectorizer(text_data, vectorizer=CountVectorizer, tokenizer=LemmaTokenizer(),
                     ngram_range_lower = 1, ngram_range_upper = 1, min_df = 1):
    """
    Trains a vectorizer on the provided text data and returns the vectorizer instance,
    the document-term matrix, and the feature names.

    Parameters:
    - text_data: List of text documents to be vectorized.
    - vectorizer: Vectorizer class to be used for text vectorization. Defaults to CountVectorizer.
    - tokenizer: Tokenizer class to be used for tokenizing the text documents. Defaults to TweetTokenizer.
    - ngram_range_lower: What's the minimum length of n-grams we want.
    - ngram_range_upper: What's the maximum length of n-grams we want.
    - min_df: Minimum data frequency.
    - max_df: Maximum data frequency.

    Returns:
    - instance: The trained vectorizer instance.
    - matrix: The document-term matrix resulting from fitting the vectorizer on `text_data`.
    - features: An array of feature names generated by the vectorizer.
    """
    # Initialize the vectorizer with specified configurations
    instance = vectorizer(
        strip_accents=None,  # Do not strip accents
        lowercase=True,  # Do not convert characters to lowercase
        tokenizer=tokenizer,  # Use the tokenize method of the tokenizer instance
        token_pattern=None,  # Since a tokenizer is provided, token_pattern is not used
        ngram_range=(ngram_range_lower, ngram_range_upper),  # Consider only single words (1-grams)
        min_df=min_df,  # Minimum document frequency for filtering terms
        max_features=None  # No limit on the number of features
    )

    # Fit the vectorizer on the provided text data and transform the data into a matrix
    vector = instance.fit(text_data)

    return vector, instance

In [21]:
tfidf_v, instance = train_vectorizer(text_data = X_train,
                           vectorizer = TfidfVectorizer,
                           ngram_range_lower = 1,
                           ngram_range_upper = 3,
                           min_df = 5)

In [22]:
features = '|\n|'.join(instance.get_feature_names())
file = open('/home/datallah/datallah-jaymefis-gibsonce/bgmm/features.txt', "w")
file.write(features)
file.close()  # to change file access modes

In [23]:
# normalize and truncate

print("Transforming X_train")
tfidf_m = tfidf_v.transform(X_train)
sparse.save_npz('/home/datallah/datallah-jaymefis-gibsonce/bgmm/tfidf_m.npz', tfidf_m)
del X_train
# truncate
svd = TruncatedSVD(n_components = 100, random_state = random_state)
svd.fit(normalize(tfidf_m))
tfidf_m = svd.transform(normalize(tfidf_m))
np.save('/home/datallah/datallah-jaymefis-gibsonce/bgmm/tfidf_trunc.npy', tfidf_m)

print("Transforming X_val")
tfidf_m_val = tfidf_v.transform(X_val)
del X_val
sparse.save_npz('/home/datallah/datallah-jaymefis-gibsonce/bgmm/tfidf_m_val.npz', tfidf_m_val)
tfidf_m_val = svd.transform(normalize(tfidf_m_val))
np.save('/home/datallah/datallah-jaymefis-gibsonce/bgmm/tfidf_trunc_val.npy', tfidf_m_val)

print("Transforming X_test")
tfidf_m_test = tfidf_v.transform(X_test)
del X_test
sparse.save_npz('/home/datallah/datallah-jaymefis-gibsonce/bgmm/tfidf_m_test.npz', tfidf_m_test)
tfidf_m_test = svd.transform(normalize(tfidf_m_test))
np.save('/home/datallah/datallah-jaymefis-gibsonce/bgmm/tfidf_trunc_test.npy', tfidf_m_test)

Transforming X_train
Transforming X_val
Transforming X_test


In [24]:
tfidf_m.shape

(139982, 100)

## Train Base Model

In [None]:
# init variables
best_model = None
best_log_likelihood = -np.inf
log_lik_lst = []
time_start_lst = []
time_end_lst = []
max_components = np.arange(1, 30 + 1)
breaker = 0

# iterate over different numbers of components
for n_components in tqdm.tqdm(max_components):
    time_start_lst.append(datetime.now())
    # Initialize and fit the BGMM with current number of components
    bgmm = BayesianGaussianMixture(n_components = n_components,
                                   random_state = random_state, 
                                   max_iter = 1000,
                                   warm_start = True)
    bgmm.fit(tfidf_m)
    
    # Calculate log likelihood of the current model
    log_likelihood = bgmm.score(tfidf_m_val)
    log_lik_lst.append(log_likelihood)
    print(f"Number of components: {n_components}, Log likelihood: {log_likelihood}")
    
    time_end_lst.append(datetime.now())
    # Check if log likelihood is better than the best so far
    if log_likelihood > best_log_likelihood:
        best_log_likelihood = log_likelihood
        best_model = bgmm
        breaker = 0
    elif breaker < 3:
        breaker ++ 1
        # If log likelihood starts to dip, stop iterating
    else:
        break

  3%|▎         | 1/30 [00:02<01:12,  2.49s/it]

Number of components: 1, Log likelihood: 196.5713518830757


  7%|▋         | 2/30 [01:36<26:15, 56.27s/it]

Number of components: 2, Log likelihood: 243.11340103156027


 10%|█         | 3/30 [05:31<1:02:02, 137.87s/it]

Number of components: 3, Log likelihood: 260.99605489375665


 13%|█▎        | 4/30 [08:36<1:07:47, 156.45s/it]

Number of components: 4, Log likelihood: 290.68748356450806


 17%|█▋        | 5/30 [14:30<1:34:57, 227.92s/it]

Number of components: 5, Log likelihood: 302.53934624201463


 20%|██        | 6/30 [18:57<1:36:22, 240.95s/it]

Number of components: 6, Log likelihood: 311.82450073233014


 23%|██▎       | 7/30 [25:00<1:47:41, 280.93s/it]

Number of components: 7, Log likelihood: 319.21092295241766


 27%|██▋       | 8/30 [28:35<1:35:18, 259.91s/it]

Number of components: 8, Log likelihood: 316.05980845011544


 30%|███       | 9/30 [36:06<1:51:53, 319.67s/it]

Number of components: 9, Log likelihood: 324.16009071553844


 33%|███▎      | 10/30 [43:32<1:59:36, 358.81s/it]

Number of components: 10, Log likelihood: 329.4315077583327


 37%|███▋      | 11/30 [49:25<1:53:00, 356.86s/it]

Number of components: 11, Log likelihood: 329.75458349761567


 40%|████      | 12/30 [1:01:46<2:22:09, 473.86s/it]

Number of components: 12, Log likelihood: 335.2015776342215


 43%|████▎     | 13/30 [1:20:33<3:10:19, 671.75s/it]

Number of components: 13, Log likelihood: 338.5252807338983


 47%|████▋     | 14/30 [1:48:10<4:18:27, 969.22s/it]

Number of components: 14, Log likelihood: 340.07432639411326


 50%|█████     | 15/30 [2:07:35<4:17:01, 1028.10s/it]

Number of components: 15, Log likelihood: 344.4102559634466


In [16]:
max_components = np.arange(1, 30 + 1)
log_lik_lst = [196.57135188307578, 243.11340103156016, 260.9960548937567, 290.687483564508, 302.5393462420144, 311.82450073233025, 319.2109229524175, 316.05980845011544, 324.16009071553856, 329.4315077583328, 329.7545834976157, 335.2015776342216, 338.5252807338984, 340.07432639411326, 344.4102559634466, 346.4648811376254, 349.14685280030614, 349.6416119071051, 351.64877838142047, 351.8723286693082, 353.37432998827825]
times_str = ['00:00:00', '00:00:02', '00:01:36', '00:05:31', '00:08:36', '00:14:30', '00:18:57', '00:25:00', '00:28:35', '00:36:06', '00:43:32', '00:49:25', '01:01:46', '01:20:33', '01:48:10', '02:07:35', '02:32:11', '03:14:35', '03:42:39', '04:29:00', '04:57:43', '05:23:08']

In [20]:
times = [datetime.strptime(time_str, '%H:%M:%S') for time_str in times_str]
time_diff_lst = [(times[i + 1] - times[i]).total_seconds() for i in range(len(times) - 1)]
print(time_diff_lst)

[2.0, 94.0, 235.0, 185.0, 354.0, 267.0, 363.0, 215.0, 451.0, 446.0, 353.0, 741.0, 1127.0, 1657.0, 1165.0, 1476.0, 2544.0, 1684.0, 2781.0, 1723.0, 1525.0]


In [21]:
i = len(log_lik_lst) - 1
# create and write sensitivity df
sens_df = pd.DataFrame()
sens_df['n_componenets'] = max_components[0:i]
sens_df['max_log_likelihood'] = log_lik_lst[0:i]
sens_df['train_time'] = time_diff_lst[0:i]
sens_df
# write df
sens_df.to_csv('/home/datallah/datallah-jaymefis-gibsonce/bgmm/sensitivity.csv', index = False) 

## Train Base Model (Larger step)

In [None]:
!python bgmm_multi_train.py

Already trained for 1 components.
Already trained for 5 components.
Already trained for 13 components.
Already trained for 3 components.
Already trained for 15 components.
Already trained for 7 components.
Already trained for 21 components.
Already trained for 17 components.
Already trained for 23 components.
Already trained for 25 components.
Already trained for 29 components.
Already trained for 31 components.
Already trained for 33 components.
Already trained for 35 components.
Already trained for 9 components.
Already trained for 11 components.
Initialization 0
Initialization 0
Initialization 0
Initialization 0
  Iteration 10
  Iteration 10
  Iteration 20
  Iteration 10
  Iteration 10
  Iteration 30
  Iteration 20
  Iteration 40
  Iteration 20
  Iteration 50
  Iteration 30
  Iteration 20
  Iteration 60
  Iteration 40
  Iteration 30
  Iteration 70
  Iteration 30
  Iteration 50
  Iteration 80
  Iteration 40
  Iteration 40
  Iteration 90
  Iteration 60
  Iteration 50
  Iteration 100
 