# Explore BGMM

## Dependencies

In [1]:
import pandas as pd
import numpy as np
import joblib
import os
import json
from collections import Counter
import re
import tqdm
from datetime import datetime

### sklearn dependencies
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import BayesianGaussianMixture
from sklearn.model_selection import GridSearchCV
from scipy import sparse

### text preprocessing dependencies
import nltk
from nltk.tokenize.casual import TweetTokenizer
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

### gensim dependencies
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath

[nltk_data] Downloading package wordnet to /home/datallah/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Root directory of the project's data artifacts. Paths below are built by plain string
# concatenation, so the trailing slash is required.
filepath = '/home/datallah/datallah-jaymefis-gibsonce/'
# Seed handed to BayesianGaussianMixture so the fit is repeatable.
random_state = 42

In [50]:
size = 'one'
# Two columns may arrive with a leading space in their header; map them to clean names.
column_fixes = {' response_text': 'response_text', ' op_gender': 'op_gender'}
# Read the three sample files the same way: fix the headers, then drop rows with any missing value.
train, val, test = (
    pd.read_csv(filepath + f'samples/{split}_{size}_million.csv')
      .rename(columns = column_fixes)
      .dropna()
    for split in ('train', 'validate', 'test')
)

In [51]:
# Row masks selecting the records whose `source` is 'TED', computed once per split.
ted_train = train.source == 'TED'
ted_val   = val.source == 'TED'
ted_test  = test.source == 'TED'

# X_* holds the response text, y_* the matching op_gender value.
X_train, y_train = train.loc[ted_train, 'response_text'], train.loc[ted_train, 'op_gender']
X_val,   y_val   = val.loc[ted_val, 'response_text'],     val.loc[ted_val, 'op_gender']
X_test,  y_test  = test.loc[ted_test, 'response_text'],   test.loc[ted_test, 'op_gender']

In [52]:
# Drop the full split frames; the cells below only use the TED subsets (X_* / y_*).
del train, val, test

In [3]:
# Pre-computed feature matrices for the three splits (presumably the dimensionality-reduced
# TF-IDF features, going by the `tfidf_trunc` file names — confirm against the producing notebook).
# Paths are derived from `filepath` so the data root is configured in a single place.
tfidf_m = np.load(filepath + 'bgmm/tfidf_trunc.npy')
tfidf_m_val = np.load(filepath + 'bgmm/tfidf_trunc_val.npy')
tfidf_m_test = np.load(filepath + 'bgmm/tfidf_trunc_test.npy')

In [5]:
# Saved results of a sweep over the number of mixture components
# (one row per setting, with its log-likelihood and training time).
# The path is derived from `filepath` rather than repeating the absolute directory.
sens_df = pd.read_csv(filepath + 'bgmm/sensitivity.csv')
sens_df

Unnamed: 0,n_componenets,max_log_likelihood,train_time
0,1,196.571352,2.0
1,2,243.113401,94.0
2,3,260.996055,235.0
3,4,290.687484,185.0
4,5,302.539346,354.0
5,6,311.824501,267.0
6,7,319.210923,363.0
7,8,316.059808,215.0
8,9,324.160091,451.0
9,10,329.431508,446.0


## Fit Model with Best Base Params

In [24]:
# Number of mixture components handed to BayesianGaussianMixture.
# NOTE(review): the sensitivity table loaded above only covers 1-10 components; confirm that 20
# was chosen deliberately (e.g. as an upper bound for the Bayesian mixture).
n_components = 20

In [25]:
# Build the mixture model with the configured component count and seed; allow up to 1000
# iterations before the fit gives up.
bgmm = BayesianGaussianMixture(
    n_components = n_components,
    max_iter = 1000,
    random_state = random_state,
)
# Fit on the training feature matrix (the estimator repr is the cell output).
bgmm.fit(tfidf_m)

BayesianGaussianMixture(max_iter=1000, n_components=20, random_state=42)

## Analyze Component Gender Makeup

In [26]:
# Assign every training document to a mixture component.
train_preds = bgmm.predict(tfidf_m)
# Binary label: 0 where op_gender is 'W', 1 for every other value.
train_preds_df = pd.DataFrame({'preds' : train_preds,
                               'label' : np.where(y_train == 'W', 0, 1)})
# Seed the preview so that a fresh run of the notebook shows the same five rows.
train_preds_df.sample(5, random_state = random_state)

Unnamed: 0,preds,label
20715,6,1
62647,15,0
94194,16,1
109690,18,0
67685,13,0


In [49]:
# Per-component summary of the training assignments.
# `label` is 0/1, so its group mean is the share of label-1 ("men") documents in the component.
label_by_comp = train_preds_df.groupby('preds')['label']
men_ratio = label_by_comp.mean()
cnts = label_by_comp.count()
# Take the component ids from the grouped index itself so they are guaranteed to line up with
# the ratios, instead of relying on a separately sorted array matching by position.
components = men_ratio.index.to_numpy()
grpd = pd.DataFrame({'component'   : components,
                     'men_ratio'   : men_ratio,
                     'women_ratio' : 1 - men_ratio,
                     'comp_cnts'   : cnts})
grpd.sort_values(by = 'men_ratio', ascending = False)

Unnamed: 0_level_0,component,men_ratio,women_ratio,comp_cnts
preds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0.623349,0.376651,5602
14,14,0.611401,0.388599,14928
16,16,0.544816,0.455184,7687
9,9,0.52437,0.47563,13767
19,19,0.522467,0.477533,3583
13,13,0.515214,0.484786,6836
5,5,0.514547,0.485453,6462
11,11,0.508679,0.491321,5300
18,18,0.493118,0.506882,4577
17,17,0.490485,0.509515,2575


In [57]:
# A component counts as "significant" when either gender share reaches 0.6.
is_skewed = grpd[['men_ratio', 'women_ratio']].ge(0.6).any(axis = 1)
sig_comp_df = grpd.loc[is_skewed].sort_values(by = 'men_ratio', ascending = False)
# Component ids, ordered from the most men-heavy to the most women-heavy.
sig_comps = list(sig_comp_df['component'].unique())
print(sig_comps)

[1, 14, 3]


In [55]:
# Posterior probability of each component for every training document
# (indexed below as [document, component]).
posterior_probs = bgmm.predict_proba(tfidf_m)

In [65]:
# Keep the same number of documents per component: the size of the smallest significant one.
n_top_docs = sig_comp_df.comp_cnts.min()
sig_docs = {}
for comp in sig_comps:
    # Document positions ordered by descending posterior probability for this component.
    ranked = np.argsort(posterior_probs[:, comp])[::-1]
    # Look up the raw text of the highest-ranked documents.
    sig_docs[comp] = list(X_train.iloc[ranked[:n_top_docs]])

In [73]:
# Print up to `n_examples` of the top-ranked documents for each significant component.
n_examples = 10
for key in sig_comps:
    print(f'Examples for component {str(key)}:')
    # Slice instead of indexing 0..9 so a component with fewer kept documents
    # does not raise an IndexError.
    for i, doc in enumerate(sig_docs[key][:n_examples], start = 1):
        print(f'{str(i)}: {doc}')
    print('\n')

Examples for component 1:
1: OH YES!  That is a great perspective and attitude. Promoting the commensalism is a great way to strengthen us. Targeting specific bacteria is wonderful.   I am curious if there is also a a friend receptor to know when symbiotic bacteria are present to work together with other groups of bacteria. That would be like the an organ detecting sugar and releasing chemicals to help other cells process the sugar.  Is any of this communications or virulence blocking valid for Viruses?
2: By complete coincidence I started watching this talk by Nathan Myhrvold on Ted.com and as it started an episode of Charlie Rose came on TV with Nathan Myhrvold speaking for an hour....which was by far more interesting and very much worth watching. I believe Myhrvolds point here on this Ted Talk was thats it is ok to follow your varied interests in life. I would suggest that most people are afraid to do so mostly cause of societal pressures.
3: Very inspirational talk. Does anyone kno

## Find Significant Words

In [74]:
# Load the sparse TF-IDF matrix saved for the training documents and rescale it with
# sklearn's `normalize` (default settings). The path is derived from `filepath` so the
# data root is configured in one place.
full_train_tfidf = normalize(sparse.load_npz(filepath + 'bgmm/tfidf_m.npz'))

In [85]:
# Read the vocabulary: the file stores the terms separated by the literal delimiter '|\n|'.
# The context manager guarantees the handle is closed even if the read fails.
with open(filepath + 'bgmm/features.txt', "r") as file:
    features = file.read().split('|\n|')
features[:5]

['0', '0 1', '000', '000 000', '01']

In [89]:
# Same cut-off as for the example documents: size of the smallest significant component.
n_top_docs = sig_comp_df.comp_cnts.min()
# For each significant component, collect the TF-IDF rows (one single-row sparse matrix per
# document) of the documents with the highest posterior probability for that component.
sig_vec = {
    comp: [full_train_tfidf[idx]
           for idx in np.argsort(posterior_probs[:, comp])[::-1][:n_top_docs]]
    for comp in sig_comps
}

In [107]:
sig_terms = {}
# Number of highest-weighted terms reported per component.
n_top_terms = 50
for key in sig_comps:
    print(f'Significant terms for component {str(key)}:')
    # Stack the component's single-row TF-IDF vectors into one sparse matrix and average each
    # column. This replaces an unbound csr_matrix.mean() call on an object ndarray, which only
    # worked by accident of SciPy internals and summed the rows one by one in Python.
    doc_rows = sparse.vstack(sig_vec[key])
    avg_tfidf_scores = np.asarray(doc_rows.mean(axis = 0)).ravel()
    # Term positions ordered from highest to lowest average weight.
    top_term_indx = np.argsort(avg_tfidf_scores)[::-1]
    top_terms = [features[idx] for idx in top_term_indx[:n_top_terms]]
    sig_terms[key] = top_terms

    print(top_terms)

Significant terms for component 1:
['first', 'ok', 'mr', 'yes', 'know', 'cool', 'he', 'funny', 'wow', 'technology', 'true', 'talk', 'people', 'opinion', 'really', 'like', 'think', 'oh', 'his', 'would', 'unfortunately', 'amazing', 'one', 'see', 'guy', 'dont', 'interesting', 'thing', 'video', 'god', 'love', 'hello', 'thanks', 'u', 'make', 'world', 'time', 'say', 'idea', 'way', 'good', 'nice', 'agree', 'need', 'thought', 'want', 'much', 'work', 'great', 'hi']
Significant terms for component 14:
['he', 'his', 'god', 'talk', 'im', 'technology', 'actually', 'people', 'video', 'great', 'comment', 'like', 'think', 'dont', 'one', 'say', 'man', 'ted', 'guy', 'know', 'question', 'make', 'u', 'time', 'great talk', 'thing', 'would', 'good', 'point', 'said', 'book', 'get', 'right', 'many', 'sorry', 'world', 'year', 'way', 'yeah', 'problem', 'want', 'need', 'agree', 'life', 'first', 'see', 'watching', 'she', 'could', 'believe']
Significant terms for component 3:
['beautiful', 'talk', 'amazing', 'true

In [108]:
# Terms that show up in the top list of more than one component are not distinctive,
# so they are filtered out before printing each component's terms.
top_terms = sum(sig_terms.values(), [])
term_counts = Counter(top_terms)
exclude_terms = {term for term in term_counts if term_counts[term] > 1}
for key in sig_comps:
    print(f'Significant (non-redundant) terms for component {str(key)}:')
    temp_lst = list(filter(lambda term: term not in exclude_terms, sig_terms[key]))
    print(temp_lst)

Significant (non-redundant) terms for component 1:
['ok', 'mr', 'yes', 'cool', 'funny', 'wow', 'opinion', 'oh', 'unfortunately', 'interesting', 'love', 'hello', 'idea', 'nice', 'thought', 'much', 'work', 'hi']
Significant (non-redundant) terms for component 14:
['im', 'actually', 'comment', 'question', 'great talk', 'point', 'said', 'book', 'get', 'right', 'sorry', 'year', 'yeah', 'problem', 'life', 'could', 'believe']
Significant (non-redundant) terms for component 3:
['beautiful', 'inspiring', 'powerful', 'story', 'thank', 'hey', 'absolutely', 'word', 'sharing', 'speech', 'speaker', 'truly', 'inspirational', 'amazing talk', 'her', 'best', 'loved', 'watched', 'ted talk', 'thank sharing', 'absolutely amazing', 'job', 'presentation', 'touching', 'watching video', 'well', 'thanks sharing', 'ive', 'beautiful talk', 'great story', 'moving', 'great job', 'seen', 'amazing speech', 'ever', 'inspiring speech']


## Analyze Component Gender Makeup (Test)

In [109]:
# Assign every test document to a mixture component of the model fitted on the training data.
test_preds = bgmm.predict(tfidf_m_test)
# Binary label: 0 where op_gender is 'W', 1 for every other value.
test_preds_df = pd.DataFrame({'preds' : test_preds,
                              'label' : np.where(y_test == 'W', 0, 1)})
# Seed the preview so that a fresh run of the notebook shows the same five rows.
test_preds_df.sample(5, random_state = random_state)

Unnamed: 0,preds,label
16446,5,1
3314,12,0
12278,11,0
5166,0,0
23243,19,0


In [110]:
# Per-component summary of the test assignments.
# `label` is 0/1, so its group mean is the share of label-1 ("men") documents in the component.
label_by_comp = test_preds_df.groupby('preds')['label']
men_ratio = label_by_comp.mean()
cnts = label_by_comp.count()
# Take the component ids from the grouped index itself so they are guaranteed to line up with
# the ratios, instead of relying on a separately sorted array matching by position.
components = men_ratio.index.to_numpy()
grpd = pd.DataFrame({'component'   : components,
                     'men_ratio'   : men_ratio,
                     'women_ratio' : 1 - men_ratio,
                     'comp_cnts'   : cnts})
grpd.sort_values(by = 'men_ratio', ascending = False)

Unnamed: 0_level_0,component,men_ratio,women_ratio,comp_cnts
preds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0.615449,0.384551,1191
14,14,0.600064,0.399936,3133
19,19,0.545932,0.454068,762
16,16,0.532365,0.467635,1653
9,9,0.522601,0.477399,2876
5,5,0.512857,0.487143,1400
13,13,0.510204,0.489796,1470
11,11,0.509044,0.490956,1161
18,18,0.504149,0.495851,964
15,15,0.502582,0.497418,581


In [111]:
# A component counts as "significant" when either gender share reaches 0.6.
is_skewed = grpd[['men_ratio', 'women_ratio']].ge(0.6).any(axis = 1)
sig_comp_df = grpd.loc[is_skewed].sort_values(by = 'men_ratio', ascending = False)
# Component ids, ordered from the most men-heavy to the most women-heavy.
sig_comps = list(sig_comp_df['component'].unique())
print(sig_comps)

[1, 14, 3]


In [113]:
# Posterior probability of each component for every test document
# (indexed below as [document, component]). This rebinds the name used for the training posteriors.
posterior_probs = bgmm.predict_proba(tfidf_m_test)

In [114]:
# Keep the same number of documents per component: the size of the smallest significant one.
n_top_docs = sig_comp_df.comp_cnts.min()
sig_docs = {}
for comp in sig_comps:
    # Document positions ordered by descending posterior probability for this component.
    ranked = np.argsort(posterior_probs[:, comp])[::-1]
    # Look up the raw text of the highest-ranked test documents.
    sig_docs[comp] = list(X_test.iloc[ranked[:n_top_docs]])

In [115]:
# Print up to `n_examples` of the top-ranked test documents for each significant component.
n_examples = 10
for key in sig_comps:
    print(f'Examples for component {str(key)}:')
    # Slice instead of indexing 0..9 so a component with fewer kept documents
    # does not raise an IndexError.
    for i, doc in enumerate(sig_docs[key][:n_examples], start = 1):
        print(f'{str(i)}: {doc}')
    print('\n')

Examples for component 1:
1: ok lets clear things up 1.Hell is not a place of fire and flames.       how do we know this? The bible uses imagery very often because that was the way people spoke.  Mathew 25:30 describes hell as a dark place’ Revelations 20:14 says hell is a place of fire are they contradictory? NO Hebrews 12:29 says God is a consuming fire
2: In my opinion
3: To share in the joy of that womans first steps in years was fantastic! Yesterday we saw a brief clip of a monkey controlling a robot arm with its mind.It is only a matter of time before these technologies combine to create amazing new devices that will  enrich our lives.
4: Yes we need to increase law enforcement in the world to stop the violence.  It has worked out so well here in the U.S.A.  https://www.youtube.com/watch?v=eiq4Cht49o8
5: This is wonderful technology that is likely to revolutionize manufacturing. Very soon we may see 3D printers all around the world...... that are likely to be made in China!
6: Un

## Find Significant Words (Test)

In [116]:
# Load the sparse TF-IDF matrix saved for the test documents and rescale it with
# sklearn's `normalize` (default settings). The path is derived from `filepath` so the
# data root is configured in one place.
full_test_tfidf = normalize(sparse.load_npz(filepath + 'bgmm/tfidf_m_test.npz'))

In [117]:
# NOTE(review): no separate vocabulary is loaded for the test matrix. The cells below index the
# `features` list that was read for the training matrix, which presumably requires
# tfidf_m_test.npz to share the training column order — TODO confirm.
# file = open('/home/datallah/datallah-jaymefis-gibsonce/bgmm/features_test.txt', "r")
# features = file.read().split('|\n|')
# file.close()
# features[:5]

In [118]:
# Same cut-off as for the example documents: size of the smallest significant component.
n_top_docs = sig_comp_df.comp_cnts.min()
# For each significant component, collect the TF-IDF rows (one single-row sparse matrix per
# document) of the test documents with the highest posterior probability for that component.
sig_vec = {
    comp: [full_test_tfidf[idx]
           for idx in np.argsort(posterior_probs[:, comp])[::-1][:n_top_docs]]
    for comp in sig_comps
}

In [119]:
sig_terms = {}
# Number of highest-weighted terms reported per component.
n_top_terms = 50
for key in sig_comps:
    print(f'Significant terms for component {str(key)}:')
    # Stack the component's single-row TF-IDF vectors into one sparse matrix and average each
    # column. This replaces an unbound csr_matrix.mean() call on an object ndarray, which only
    # worked by accident of SciPy internals and summed the rows one by one in Python.
    doc_rows = sparse.vstack(sig_vec[key])
    avg_tfidf_scores = np.asarray(doc_rows.mean(axis = 0)).ravel()
    # Term positions ordered from highest to lowest average weight.
    top_term_indx = np.argsort(avg_tfidf_scores)[::-1]
    top_terms = [features[idx] for idx in top_term_indx[:n_top_terms]]
    sig_terms[key] = top_terms

    print(top_terms)

Significant terms for component 1:
['first', 'ok', 'mr', 'opinion', 'cool', 'yes', 'he', 'wow', 'funny', 'know', 'oh', 'unfortunately', 'his', 'people', 'think', 'like', 'would', 'technology', 'talk', 'true', 'god', 'really', 'world', 'time', 'one', 'way', 'thing', 'love', 'see', 'interesting', 'say', 'amazing', 'hello', 'idea', 'make', 'need', 'u', 'video', 'guy', 'dont', 'good', 'want', 'work', 'actually', 'human', 'hey', 'right', 'much', 'something', 'use']
Significant terms for component 14:
['he', 'his', 'god', 'talk', 'im', 'video', 'dont', 'technology', 'think', 'great', 'actually', 'comment', 'people', 'say', 'like', 'ted', 'one', 'guy', 'u', 'make', 'said', 'time', 'get', 'know', 'great talk', 'thing', 'would', 'question', 'need', 'want', 'sorry', 'agree', 'book', 'good', 'point', 'right', 'year', 'first', 'human', 'man', 'many', 'world', 'see', 'life', 'game', 'much', 'go', 'thank', 'way', 'watching']
Significant terms for component 3:
['beautiful', 'amazing', 'talk', 'inspir

In [120]:
# Terms that show up in the top list of more than one component are not distinctive,
# so they are filtered out before printing each component's terms.
top_terms = sum(sig_terms.values(), [])
term_counts = Counter(top_terms)
exclude_terms = {term for term in term_counts if term_counts[term] > 1}
for key in sig_comps:
    print(f'Significant (non-redundant) terms for component {str(key)}:')
    temp_lst = list(filter(lambda term: term not in exclude_terms, sig_terms[key]))
    print(temp_lst)

Significant (non-redundant) terms for component 1:
['ok', 'mr', 'opinion', 'cool', 'yes', 'wow', 'funny', 'oh', 'unfortunately', 'love', 'interesting', 'hello', 'idea', 'work', 'something', 'use']
Significant (non-redundant) terms for component 14:
['im', 'comment', 'said', 'get', 'great talk', 'question', 'sorry', 'agree', 'book', 'point', 'year', 'life', 'game', 'go']
Significant (non-redundant) terms for component 3:
['beautiful', 'inspiring', 'story', 'she', 'word', 'sharing', 'powerful', 'truly', 'absolutely', 'amazing talk', 'thanks', 'speech', 'touching', 'her', 'speaker', 'presentation', 'watched', 'ted talk', 'loved', 'thank sharing', 'inspirational', 'truly inspiring', 'best', 'thanks sharing', 'ive', 'job', 'well', 'seen', 'watching video', 'ever', 'heard', 'great ted', 'really inspiring', 'amazing speech', 'inspiring talk', 'brave']
