In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle

from nltk import RegexpTokenizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import plotly.express as px

from src.model_insights import get_word_covariance, get_class_features

In [3]:
import os

# Directory holding the Jigsaw toxic-comment CSV files. The default is the
# location used when this notebook was written; set the JIGSAW_DATA_DIR
# environment variable to read the same three files from somewhere else.
DATA_DIR = os.environ.get(
    'JIGSAW_DATA_DIR',
    '/Users/collinswestnedge/programming/Metis_Online/project_03/data/jigsaw-toxic-comment-classification-challenge',
)

train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test_df_labels = pd.read_csv(os.path.join(DATA_DIR, 'test_labels.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [None]:
[]

In [16]:
# Share of the comments containing the substring 'gay' that are labelled toxic.
# The substring mask is built once and reused for both counts.
contains_gay = train_df.comment_text.str.contains('gay')
toxic_gay = len(train_df[(train_df.toxic == 1) & contains_gay])
total_gay = len(train_df[contains_gay])
toxic_gay / total_gay

0.5522012578616352

In [9]:
# Number of toxic-labelled comments whose text contains the substring 'gay'.
len(train_df[(train_df.toxic == 1) & (train_df.comment_text.str.contains('gay'))])

439

In [None]:
# The vectorizer loaded further down takes a tokenizer object as a parameter,
# and classes are awkward to pickle, so the tokenizer class has to be declared
# again in this notebook before the vectorizer pickle is read.
# The stop-word collection that the tokenizer filters on is restored first.
with open('pickle_files/stop_words.pickle', 'rb') as stop_words_file:
    stop_words_complete = pickle.load(stop_words_file)
    
class Tokenizer(object):
    """Callable that turns a document into a list of lemmatized tokens.

    Calling an instance regex-tokenizes the document, discards tokens found
    in the stop-word set, and lemmatizes what is left.

    NOTE(review): presumably the pickled vectorizer holds an instance of this
    class; if so the attribute names below must stay as they are - confirm.
    """

    def __init__(self):
        # Created here but not used by __call__.
        self.pt = PorterStemmer()
        self.wnl = WordNetLemmatizer()
        # Runs of three or more ASCII letters delimited by word boundaries.
        self.tk = RegexpTokenizer(r'\b[a-zA-Z]{3,}\b')
        # Set for constant-time membership checks; read from the module-level
        # `stop_words_complete`.
        self.stpwrd = set(stop_words_complete)

    def __call__(self, doc):
        """Return the lemmatized, non-stop-word tokens of `doc` in order."""
        kept = []
        for token in self.tk.tokenize(doc):
            # The stop-word check is applied to the raw token, before lemmatizing.
            if token in self.stpwrd:
                continue
            kept.append(self.wnl.lemmatize(token))
        return kept

# Module-level tokenizer instance.
my_tokenizer = Tokenizer()

# NOTE(review): pickle.load can run arbitrary code - only read files you trust.
# Four objects are read back to back from the same handle, so the order of the
# loads must match the order in which they were dumped. Presumably these are
# the fitted vectorizer, the vectorized train/validation data and the trained
# classifier - confirm against the notebook that wrote this file.
with open('pickle_files/vectorizer.pickle', 'rb') as vectorizer_file:
    tfdif_vectorizer = pickle.load(vectorizer_file)
    X_train_vect = pickle.load(vectorizer_file)
    X_val_vect = pickle.load(vectorizer_file)

    nb = pickle.load(vectorizer_file)

In [None]:
# Quick check: run the single token 'gay' through the vectorizer and classifier.
test = ['gay']

hey = tfdif_vectorizer.transform(test)
nb.predict(hey)

In [None]:
# Dense 1-D copy of row 1 of the validation matrix. The row is sliced out of
# the sparse matrix before densifying, so only that row is materialised
# instead of the whole matrix.
# NOTE(review): assumes X_val_vect is a SciPy sparse type that supports row
# indexing (e.g. CSR) - confirm.
X_val_vect[1].toarray().ravel()

In [None]:

# Per-word coefficient table (`df`) and covariance table (`cov`) for 100 words,
# plus the word collections returned for the two classes.
df, cov = get_word_covariance(tfdif_vectorizer, nb, n=100, top=True)
neg, pos = get_class_features(tfdif_vectorizer, nb, n=100)

# Materialise the class word lists once; the row-wise lambdas below used to
# rebuild them for every row.
neg_words = list(neg)
pos_words = list(pos)

temp = df.reset_index()
temp['label'] = temp['index'].apply(lambda x: 0 if x in neg_words else 1)

# Missing coefficients are replaced with log(1e-7) so that the subtraction
# below is defined for every word.
MISSING_COEF_FILL = np.log(.0000001)
df = df.fillna(MISSING_COEF_FILL)
df['Difference'] = df['non_toxic_coefs'] - df['toxic_coefs']

df2 = df.reset_index().rename(columns={'index': 'word'})
df2['label'] = df2.word.apply(lambda x: 1 if x in pos_words else 0)
df3 = df2.sort_values(by=['Difference'], ascending=False)

# Keep only the words whose two coefficients differ by at most 4 either way.
smallest_difference = df3[(df3.Difference <= 4) & (df3.Difference >= -4)]

fig = px.bar(smallest_difference, x='Difference', y='word', orientation='h')
fig.update_layout(
    width=500,
    title_text="Coefficient Differences",
    font_family="Times New Roman",
    showlegend=True,
    hovermode='x',
)
fig.show()

In [None]:
# Display the `df` table.
# NOTE(review): `df` is assigned by more than one cell in this notebook, so
# what is shown here depends on which of those cells ran last.
df

In [None]:
# Smallest value in the `toxic_coefs` column of `df3`.
# NOTE(review): `df3` is derived from `df` after its missing values were filled
# with log(1e-7), so this minimum may simply be that fill value - confirm.
df3.toxic_coefs.min()

In [None]:
word_num = 50
word = 'please'

# NOTE(review): this rebinds `df`, `cov`, `neg` and `pos`, which other cells
# also assign; later cells see whichever assignment ran last.
df, cov = get_word_covariance(tfdif_vectorizer, nb, n=word_num, top=True)
# cov['hate'].sort_values(ascending=False).to_frame()

# Column of `cov` for the chosen word, largest values first, as a two-column
# frame ('index' plus the word's own column).
testing = cov[word].sort_values(ascending=False).to_frame().reset_index()

neg, pos = get_class_features(tfdif_vectorizer, nb, n=word_num)
# Build the list once instead of once per row inside the lambda.
neg_words = list(neg)
testing['label'] = testing['index'].apply(lambda x: 0 if x in neg_words else 1)

In [None]:
# neg, pos = get_class_features(tfdif_vectorizer, nb, n=10, top=True, indices=False)
# pos

In [None]:
from sklearn.decomposition import NMF

# Number of NMF components and one column label per component
# ('topic0', 'topic1', ...).
topics = 6
cols = ['topic{}'.format(i) for i in range(topics)]

# NOTE(review): scikit-learn 1.0 deprecated NMF's `alpha` argument in favour of
# `alpha_W` / `alpha_H` (and later removed it); it is left as written here
# because the replacements may not be numerically equivalent - confirm against
# the installed scikit-learn version.
nmf_settings = dict(n_components=topics, random_state=1, alpha=.2, l1_ratio=.5)
nmf = NMF(**nmf_settings).fit(X_train_vect)



In [None]:
# cols

In [None]:
# Persist the fitted NMF model so it can be reloaded without refitting.
with open('nmf_model.pickle', 'wb') as model_file:
    pickle.dump(nmf, model_file)

In [None]:
def get_topic_words(model, feature_names, n_top_words):
    """Return one space-joined string of top words per row of `model.components_`.

    Parameters
    ----------
    model : object exposing `components_`, an iterable of weight arrays
    feature_names : sequence indexable by the positions returned by argsort
    n_top_words : int, how many of the highest-weighted entries to keep

    Returns
    -------
    list of str, one per component, words ordered from highest weight down.
    """
    topic_words = []
    for weights in model.components_:
        # argsort is ascending: reverse it, then keep the first n_top_words.
        top_positions = weights.argsort()[::-1][:n_top_words]
        topic_words.append(" ".join(feature_names[i] for i in top_positions))
    return topic_words

In [None]:
# all_topic_words[1]
# Shape of the fitted component matrix. Elsewhere in this notebook its rows are
# labelled with `cols` and its columns with the vectorizer's feature names.
nmf.components_.shape

In [None]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Use whichever one exists so
# this cell runs on both older and newer installations.
_feature_name_getter = getattr(tfdif_vectorizer, 'get_feature_names_out', None)
if _feature_name_getter is None:
    _feature_name_getter = tfdif_vectorizer.get_feature_names
# list(...) keeps the result a plain list, which is what the older method returned.
tfidf_feature_names = list(_feature_name_getter())

# One space-joined string of the top `n_top_words` terms per NMF component.
n_top_words = 1000
all_topic_words = get_topic_words(nmf, tfidf_feature_names, n_top_words)
# all_topic_words


# import re
# from collections import defaultdict
# def get_topic_matches(input_str):
#     idxs = []
#     for item in all_topic_words:
#         if input_str in item:
#             idxs.append(all_topic_words.index(item))

#     matches = []
#     for i in idxs:
#         # finding surrounding words with re i know its sloppy but deadlines
#         sub = '(\w*)\W*(\w*)\W*(\w*)\W*(\w*)\W*({})\W*(\w*)\W*(\w*)\W*(\w*)\W*(\w*)'.format(input_str)
#         str1 = all_topic_words[i]
#         #printing the topic we are on
#         for j in re.findall(sub, str1, re.I):
#             words = " ".join([x for x in j if x != ""])
#             matches.append([str(i), words])
            
            
#     return matches

# pd.set_option('display.max_colwidth', None)
# df = pd.DataFrame(get_topic_matches('hate'))
# df.columns = ['topic', 'nearest_words']
# # f = pd.DataFrame([[k] + v[0] for k, v in a.items()], 
# #                    columns=['id', 'score'])
# df


In [None]:
_str = 'fuck'

# Positions of the topics whose top-word string contains `_str`.
# enumerate gives each topic's own position directly; looking it up with
# list.index() rescanned the list on every hit and would return the first
# position for two identical topic strings.
# Note this is a substring test on the joined string, so it also matches
# `_str` occurring inside a longer word.
idxs = [position for position, words in enumerate(all_topic_words) if _str in words]

for i in idxs:
    print(all_topic_words[i])
    print()


In [None]:
# topic_df = pd.DataFrame(nmf.components_, index=cols, columns=tfdif_vectorizer.get_feature_names()).T
# neg, pos = get_class_features(tfdif_vectorizer, nb, n=20, top=True)
# topic_formatted = topic_df.T[neg].T
# topic_formatted.head()

In [None]:
# import seaborn as sns

# def graph_topic(words,topic=1):
#     topic_formatted = topic_df.T[words].T
#     cols = 'topic'+str(topic)
#     plt.figure(figsize=[5,12])
#     plt.barh(topic_formatted.index, topic_formatted[cols])
        
# graph_topic(pos,8)

# from sklearn.decomposition import NMF

# topics = 10
# cols = ['topic' + str(i) for i in range(topics)]
# nmf = NMF(n_components=topics, random_state=1,
#           alpha=.1, l1_ratio=.5).fit(X_train_vect)

In [None]:
# topic_df = pd.DataFrame(nmf.components_, index=cols, columns=tfdif_vectorizer.get_feature_names()).T
# neg, pos = get_class_features(tfdif_vectorizer, nb, n=100, top=True)
# all_words = list(neg) + list(pos)

# topic_formatted = topic_df.T[all_words].T.reset_index()
# topic_formatted.rename(columns={'index':'word'},inplace=True)
# topic_formatted['toxic'] = topic_formatted.word.apply(lambda x: predict_label(x))
# topic_formatted.head()

In [None]:
# Largest 'topic3' weight among the rows of `topic_formatted`.
# NOTE(review): every earlier assignment of `topic_formatted` is commented out;
# the first live one is in a cell further down, so on a fresh top-to-bottom run
# this line raises NameError - TODO move below that cell or remove.
topic_formatted['topic3'].max()

In [None]:
def predict_label(word):
    """Return the classifier's prediction for the first token of `word`.

    `word` is split on whitespace, each token is vectorized as its own
    document, and only the prediction for the first token is returned.
    Relies on the module-level `tfdif_vectorizer` and `nb`.
    """
    tokens = word.split()
    return nb.predict(tfdif_vectorizer.transform(tokens))[0]

In [None]:
# Term-by-topic weights: one row per vocabulary term, one column per label in `cols`.
# `tfidf_feature_names` (built in an earlier cell) is reused here instead of
# calling the vectorizer's `get_feature_names()` again; that method no longer
# exists in scikit-learn 1.2+.
topic_df = pd.DataFrame(nmf.components_.T, index=tfidf_feature_names, columns=cols)
neg, pos = get_class_features(tfdif_vectorizer, nb, n=100, top=True)
all_words = list(neg) + list(pos)

# Rows for the selected words only, with the word moved into its own column.
topic_formatted = (
    topic_df.loc[all_words]
    .reset_index()
    .rename(columns={'index': 'word'})
)

# `predict_label` is defined twice in this notebook: one version returns a
# single label, the other an array with one label per token. np.ravel(...)[0]
# yields the first token's label with either of them, so this column holds
# scalars regardless of which definition is live.
topic_formatted['toxic'] = topic_formatted.word.apply(
    lambda x: np.ravel(predict_label(x))[0]
)
topic_formatted.head()

In [None]:
def predict_label(word):
    """Return the classifier's predictions for every token of `word`.

    `word` is split on whitespace and each token is vectorized as its own
    document; the result of `nb.predict` for all of them is returned as is.
    Relies on the module-level `tfdif_vectorizer` and `nb`.

    NOTE(review): an earlier cell defines `predict_label` too, returning only
    the first prediction; once this cell runs it replaces that version.
    """
    tokens = word.split()
    return nb.predict(tfdif_vectorizer.transform(tokens))

In [None]:
# With the `predict_label` defined in the cell directly above, this returns one
# prediction per whitespace-separated token of the sentence.
predict_label('i want to hurt you badly you butt')

In [None]:
# Term-by-topic weights: one row per vocabulary term, one column per label in `cols`.
# `tfidf_feature_names` (built in an earlier cell) is reused here instead of
# calling the vectorizer's `get_feature_names()` again; that method no longer
# exists in scikit-learn 1.2+.
topic_df = pd.DataFrame(nmf.components_.T, index=tfidf_feature_names, columns=cols)
neg, pos = get_class_features(tfdif_vectorizer, nb, n=100, top=True)

# De-duplicate while keeping first-seen order, so the row order is the same on
# every run (iteration order of a set of strings is not).
all_words = list(dict.fromkeys(list(neg) + list(pos)))

topic_formatted = (
    topic_df.loc[all_words]
    .reset_index()
    .rename(columns={'index': 'word'})
)

# The `predict_label` live at this point returns an array with one label per
# token; np.ravel(...)[0] stores the first token's label as a scalar rather
# than a one-element array (it also works with the scalar-returning version).
topic_formatted['toxic'] = topic_formatted.word.apply(
    lambda x: np.ravel(predict_label(x))[0]
)

# Ten words with the highest weight on the chosen topic.
topic = 1
topic_col = 'topic' + str(topic)
filtered_topics = topic_formatted.sort_values(by=[topic_col], ascending=False).head(10)

In [None]:
# Display `filtered_topics` as left by the most recently run cell that assigned it.
filtered_topics

In [None]:
# NOTE(review): this cell repeats the table-building steps of the cells above;
# a shared helper function would remove the duplication.

# Term-by-topic weights: one row per vocabulary term, one column per label in `cols`.
# `tfidf_feature_names` (built in an earlier cell) is reused here instead of
# calling the vectorizer's `get_feature_names()` again; that method no longer
# exists in scikit-learn 1.2+.
topic_df = pd.DataFrame(nmf.components_.T, index=tfidf_feature_names, columns=cols)
neg, pos = get_class_features(tfdif_vectorizer, nb, n=100, top=True)

# De-duplicate while keeping first-seen order, so the row order is the same on
# every run (iteration order of a set of strings is not).
all_words = list(dict.fromkeys(list(neg) + list(pos)))

topic_formatted = (
    topic_df.loc[all_words]
    .reset_index()
    .rename(columns={'index': 'word'})
)

# np.ravel(...)[0] stores the first token's label as a scalar whether
# `predict_label` returns a single label or an array of labels.
topic_formatted['toxic'] = topic_formatted.word.apply(
    lambda x: np.ravel(predict_label(x))[0]
)

# Twenty-five words with the highest weight on the chosen topic.
topic = 1
topic_col = 'topic' + str(topic)
filtered_topics = topic_formatted.sort_values(by=[topic_col], ascending=False).head(25)

fig = px.bar(y=filtered_topics.word,
             x=filtered_topics[topic_col],
#              color=filtered_topics.toxic,
             orientation='h',
             width=500,
             height=500)
fig.show()

In [None]:
# Display the `word` column of `topic_formatted`.
topic_formatted.word

In [None]:
import plotly.express as px

# NOTE(review): the NMF cell sets `topics = 6` and builds `cols` from
# range(topics); if that is the configuration in use, `topic_formatted` has no
# 'topic7' column and the lookup below raises KeyError - TODO confirm the
# intended topic number.
topic = 7
topic_col = 'topic{}'.format(topic)

# Horizontal bar per word, bar length = weight on the chosen topic, coloured by
# the `toxic` column.
fig = px.bar(
    y=topic_formatted.word,
    x=topic_formatted[topic_col],
    color=topic_formatted.toxic,
    orientation='h',
    width=500,
)
fig.show()

In [None]:
# Membership check for the word 'get' in `pos`.
# NOTE(review): if `pos` is a pandas Series, `in` tests its index rather than
# its values - confirm the type returned by get_class_features.
'get' in pos

In [None]:
# NOTE(review): `pred` is not assigned in any visible cell of this notebook, so
# on a fresh kernel this line raises NameError - TODO define it or drop the cell.
pred