In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from nltk import RegexpTokenizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from src.model_insights import get_word_covariance, get_class_features

In [2]:
from src.model_insights import get_word_covariance, get_class_features

In [3]:
# having to define tokenizer again here
# because its not easy to pickle classes
# and the vectorizer we need to load
# takes it as a parameter

with open('pickle_files/stop_words.pickle', 'rb') as file:
    stop_words_complete = pickle.load(file)
    
class Tokenizer(object):
    def __init__(self):
        self.pt = PorterStemmer()
        self.wnl = WordNetLemmatizer()
        self.tk = RegexpTokenizer(r'\b[a-zA-Z]{3,}\b')
        self.stpwrd = set(stop_words_complete)
        
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in self.tk.tokenize(doc) if not t in self.stpwrd]

my_tokenizer = Tokenizer()

with open('pickle_files/vectorizer.pickle', 'rb') as file:
    tfdif_vectorizer = pickle.load(file)
    X_train_vect = pickle.load(file)
    X_val_vect = pickle.load(file)
    
    nb = pickle.load(file)

In [4]:
df, cov = get_word_covariance(tfdif_vectorizer, nb, n=100, top=True)
df.index.to_list()

['claim',
 'contribution',
 'reliable',
 'trying',
 'review',
 'last',
 'try',
 'nothing',
 'delete',
 'anyone',
 'using',
 'understand',
 'hello',
 'however',
 'though',
 'subject',
 'give',
 'thought',
 'book',
 'thing',
 'free',
 'opinion',
 'believe',
 'vandalize',
 'ask',
 'post',
 'show',
 'best',
 'already',
 'part',
 'template',
 'yes',
 'reverted',
 'account',
 'hope',
 'request',
 'state',
 'actually',
 'feel',
 'remove',
 'without',
 'agree',
 'seems',
 'comment',
 'might',
 'tag',
 'history',
 'another',
 'year',
 'note',
 'case',
 'two',
 'policy',
 'done',
 'issue',
 'sure',
 'problem',
 'used',
 'since',
 'content',
 'redirect',
 'welcome',
 'continue',
 'added',
 'removed',
 'deleted',
 'list',
 'find',
 'first',
 'point',
 'reference',
 'add',
 'stop',
 'question',
 'new',
 'editor',
 'deletion',
 'work',
 'information',
 'discussion',
 'could',
 'section',
 'help',
 'utc',
 'page',
 'image',
 'get',
 'may',
 'editing',
 'use',
 'thank',
 'link',
 'also',
 'source',
 '

In [5]:
df, cov = get_word_covariance(tfdif_vectorizer, nb, n=1000, top=True)
# cov['hate'].sort_values(ascending=False).to_frame()
cov['trying'].sort_values(ascending=False).to_frame()

Unnamed: 0,trying
fuck,0.469063
fucking,0.447279
shit,0.434526
suck,0.429108
bitch,0.423916
...,...
consensus,-0.352352
contribs,-0.354575
review,-0.356513
tag,-0.365332


In [6]:
df

Unnamed: 0,non_toxic_coefs,toxic_coefs
election,-8.395956,
intended,-8.395831,
send,-8.392951,-7.995728
basic,-8.392132,
telling,-8.391973,-7.538181
...,...,...
bitch,,-5.235449
suck,,-5.101332
shit,,-4.961371
fucking,,-4.631957


In [7]:
# neg, pos = get_class_features(tfdif_vectorizer, nb, n=10, top=True, indices=False)
# pos

In [8]:
from sklearn.decomposition import NMF

topics = 10
cols = ['topic' + str(i) for i in range(topics)]
nmf = NMF(n_components=topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(X_train_vect)



In [9]:
with open('nmf_model.pickle', 'wb') as file:
    # pickling vectorizer and vectorized data
    pickle.dump(nmf, file)

In [10]:
def get_topic_words(model, feature_names, n_top_words):
    topic_words = []
    for topic_idx, topic in enumerate(model.components_):
        message = ""
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        topic_words.append(message)
    return topic_words

In [91]:
tfidf_feature_names = tfdif_vectorizer.get_feature_names()
n_top_words = 300
all_topic_words = get_topic_words(nmf, tfidf_feature_names, n_top_words)  
all_topic_words


import re
from collections import defaultdict
def get_topic_matches(input_str):
    idxs = []
    for item in all_topic_words:
        if input_str in item:
            idxs.append(all_topic_words.index(item))

    matches = []
    for i in idxs:
        # finding surrounding words with re i know its sloppy but deadlines
        sub = '(\w*)\W*(\w*)\W*(\w*)\W*(\w*)\W*({})\W*(\w*)\W*(\w*)\W*(\w*)\W*(\w*)'.format(input_str)
        str1 = all_topic_words[i]
        #printing the topic we are on
        for j in re.findall(sub, str1, re.I):
            words = " ".join([x for x in j if x != ""])
            matches.append([str(i), words])
            
            
    return matches

pd.set_option('display.max_colwidth', None)
df = pd.DataFrame(get_topic_matches('hate'))
df.columns = ['topic', 'document text']
# f = pd.DataFrame([[k] + v[0] for k, v in a.items()], 
#                    columns=['id', 'score'])
df


Unnamed: 0,topic,document text
0,0,unless community obviously w hate ver great day week
1,8,care man son prick hate kill pussy face eat
2,8,guy mom dirty w hate ver retard twat cocksucker


In [None]:
matches

In [None]:
4*'\W*(\w*)\W*(\w*)'

In [None]:
all_topic_words[8]

In [None]:
"(\w*)"*10

In [None]:
_str = 'hey'
# def get_surrounding_words(your_str):
    
idxs = []
for item in all_topic_words:
    if _str in item:
        idxs.append(all_topic_words.index(item))
        
for i in idxs:
    print(all_topic_words[i])
    print()


In [92]:
topic_df = pd.DataFrame(nmf.components_, index=cols, columns=tfdif_vectorizer.get_feature_names()).T
neg, pos = get_class_features(tfdif_vectorizer, nb, n=20, top=True)
topic_formatted = topic_df.T[pos].T
topic_formatted.head()

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
hate,0.194625,0.001932,0.0,0.0,0.0,0.0,0.0,0.0,0.106518,0.0
hey,0.28293,0.015418,0.053642,0.008052,0.0,0.0,0.360388,0.030796,0.296126,0.019841
little,0.701948,0.0,0.0,0.0,0.027076,0.0,0.021624,0.0,0.199716,0.0
hell,0.181384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.174353,0.0
cunt,0.028361,0.008298,0.0,0.0,0.0,0.0,0.001444,0.0,0.525553,0.0


In [None]:
import seaborn as sns

def graph_topic(words,topic=1):
    topic_formatted = topic_df.T[words].T
    cols = 'topic'+str(topic)
    plt.figure(figsize=[5,12])
    plt.barh(topic_formatted.index, topic_formatted[cols])
        
graph_topic(pos,8)

In [104]:
import plotly.express as px
topic=8
topic_formatted = topic_df.T[pos].T
cols = 'topic'+str(topic)
data_canada = px.data.gapminder().query("country == 'Canada'")
fig = px.bar(data_canada, x=topic_formatted.index, y=topic_formatted[cols])
fig.show()