In [1]:
import pandas as pd
import numpy as np
import re
from urlextract import URLExtract
import matplotlib.pyplot as plt
import pickle 

from src.model_insights import get_word_covariance, get_class_features

extractor = URLExtract()

In [2]:
train_df = pd.read_csv('/Users/collinswestnedge/programming/Metis_Online/project_03/data/jigsaw-toxic-comment-classification-challenge/train.csv')
test_df_labels = pd.read_csv('/Users/collinswestnedge/programming/Metis_Online/project_03/data/jigsaw-toxic-comment-classification-challenge/test_labels.csv')
test_df = pd.read_csv('/Users/collinswestnedge/programming/Metis_Online/project_03/data/jigsaw-toxic-comment-classification-challenge/test.csv')

In [None]:
# replacing the -1 labels in test labels and 
# and joining it with the test.csv 
test_labels_temp = test_df_labels.replace(-1,np.nan)
test_labels_clean = test_labels_temp.dropna()
test_data = test_df.merge(test_labels_clean)

In [None]:
def preprocess(data):
    
    def replace_urls(x):
        urls = extractor.find_urls(x)
        if urls:
            x_new = replace_urls(x.replace(urls[0],''))
            return x_new
        else:
            return x

    data['comment_text'] = data.comment_text.map(lambda x: replace_urls(x))
    #get rid of duplicate letters just trying this out may comment out later
    data['comment_text'] = data.comment_text.map(lambda x: re.sub(r'(.)\1{2,}', '', str(x).lower()))
    # removing indents
    data['comment_text'] = data.comment_text.map(lambda x: re.sub('\\n',' ',str(x)))
    # remove weird user occurence 
    data['comment_text'] = data.comment_text.map(lambda x: re.sub("\[\[User.*",'',str(x)))
    # remove ip address
    data['comment_text'] = data.comment_text.map(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",'',str(x)))
    
    return data  

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import plot_roc_curve

In [None]:
def get_metrics(classifier, X_train, y_train, X_test, y_test, plot_curve=False):
    
    # literally making this for my own practice 
    # i know theres a lot quicker/simpler ways
    
    model = classifier
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    actuals = y_test
    
    print('-'*20 + ' ' + type(model).__name__  + ' ' + '-'*20)
        
    err_df = pd.DataFrame({'pred':preds, 'actual':actuals})

    TP = err_df.actual[(err_df.actual==1) & (err_df.pred==1)].count()
    print('TP: ', TP)
    # False Positives:
    FP = err_df.actual[(err_df.actual==0) & (err_df.pred==1)].count()
    print('FP: ', FP)
    # #True Negatives:
    TN = err_df.actual[(err_df.actual==0) & (err_df.pred==0)].count()
    print('TN: ', TN)
    #False Negatives:
    FN = err_df.actual[(err_df.actual==1) & (err_df.pred==0)].count()
    print('FN: ', FN)
    print()

    precision = TP/(TP+FP)
    recall =  TP/(TP+FN)
    print('Precision:', precision)
    print('Recall:', recall)
    print('Accuracy:', model.score(X_test, y_test))
    print('F1 Score:', 2*((precision*recall)/(precision+recall)))
    
    if plot_curve:
        print('Area Under ROC Curve:', roc_auc_score(y_test, model.predict_proba(X_test)[:,1]))
        fig, ax = plt.subplots(figsize=(6,4))
        test_disp = plot_roc_curve(model, X_test, y_test, ax=ax, color='red', linewidth=3)
        plt.show()

In [None]:
train_clean = preprocess(train_df)

In [None]:
test_clean = preprocess(test_data)

In [None]:
# ---------------------- playing with count vectorizer ------------------------

In [None]:
from nltk import RegexpTokenizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [None]:
# found sorted/ the highest coefficients for both labels and then
# looked where the the difference between the 
new_stop_words = ['anything',
 'person',
 'day',
 'even',
 'wrong',
 'said',
 'personal',
 'message',
 'site',
 'vandalism',
 'thing',
 'keep',
 'right',
 'really',
 'know',
 'make',
 'back',
 'let',
 'put',
 'take',
 'better',
 'something',
 'mean',
 'say',
 'want',
 'never',
 'think',
 'fact',
 'time',
 'attack',
 'warning',
 'world',
 'blocked',
 'still',
 'got',
 'edits',
 'someone',
 'way',
 'people',
 'going',
 'well',
 'come',
 'user',
 'one',
 'like',
 'look',
 'change',
 'much',
 'good',
 'tell',
 'day',
 'even',
 'said',
 'vandalism',
 'thing',
 'keep',
 'right',
 'really',
 'know',
 'make',
 'back',
 'let',
 'put',
 'take',
 'better',
 'something',
 'mean',
 'say',
 'want',
 'think',
 'fact',
 'time',
 'blocked',
 'still',
 'edits',
 'someone',
 'way',
 'people',
 'going',
 'well',
 'word',
 'user',
 'one',
 'like',
 'look',
 'change',
 'much',
 'good',
 'comment',
 'read',
 'many',
 'reason',
 'sorry',
 'page',
 'need',
 'made',
 'edit',
 'place',
 'name',
 'block',
 'wikipedia',
 'wiki']



In [None]:
from nltk import RegexpTokenizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

stop_words_complete = stopwords.words('english') + new_stop_words

# pickling stop words so i can do this in different notebook
# unfortunately pickling a class is hard so im going to have to
# copy paste the tokenizer class over 

with open('stop_words.pickle', 'wb') as file:
#     pickle.dump(my_tokenizer, file)
    pickle.dump(stop_words_complete, file)
    
class Tokenizer(object):
    def __init__(self):
        self.pt = PorterStemmer()
        self.wnl = WordNetLemmatizer()
        self.tk = RegexpTokenizer(r'\b[a-zA-Z]{3,}\b')
        self.stpwrd = set(stop_words_complete)
        
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in self.tk.tokenize(doc) if not t in self.stpwrd]

my_tokenizer = Tokenizer()

In [None]:
# train_df.sentiment_neutral.value_counts(normalize=True)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [None]:
# X, y = train_clean['comment_text'], train_clean['toxic']
# # X, y = train_df['text'], train_df['sentiment_positive']

# # X, y = train_balanced['comment_text'], train_balanced['toxic']

# metrics = []
# skf = StratifiedKFold(n_splits=5)
# for train_index, test_index in skf.split(X, y):
    
#     X_train, X_val = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    
#     vect = TfidfVectorizer(

# #                           lowercase = True,
# #                           token_pattern=None,
# #                           tokenizer=my_tokenizer,
# #                           ngram_range=(1, 1), 
# #                           max_features=8000
        
#                           lowercase = True,
#                           strip_accents='unicode',
#                           stop_words=stop_words_complete,
#                           analyzer='word',
#                           token_pattern = r'\b[a-zA-Z]{3,}\b',
#                           ngram_range = (1,1),
#                           max_features=8000
#                           )
    
#     X_train_vect = vect.fit_transform(X_train)
#     X_val_vect = vect.transform(X_val)
    
#     nb = MultinomialNB()
#     nb.fit(X_train_vect, y_train)

#     y_pred_class = nb.predict(X_val_vect)
#     metrics.append(accuracy_score(y_val, y_pred_class))

# metrics = np.array(metrics)
# print('Mean accuracy: ', np.mean(metrics, axis=0))
# print('Std for accuracy: ', np.std(metrics, axis=0))

In [None]:
# def vectorize_data()

In [None]:
X, y = train_clean['comment_text'], train_clean['toxic']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=1)

vect = TfidfVectorizer(

                      lowercase = True,
                      token_pattern=None,
                      tokenizer=my_tokenizer,
                      ngram_range=(1, 1),
                      max_features=8000

                      )

X_train_vect = vect.fit_transform(X_train)
X_val_vect = vect.transform(X_val)

nb = MultinomialNB()
nb.fit(X_train_vect, y_train)
nb.score(X_val_vect, y_val)

In [None]:
# with open('vectorizer.pickle', 'wb') as file:
#     # pickling vectorizer and vectorized data
#     pickle.dump(vect, file)
#     pickle.dump(X_train_vect, file)
#     pickle.dump(X_val_vect, file)
    
#     # pickling fitted model
#     pickle.dump(nb, file)

In [None]:
get_metrics(MultinomialNB(), X_train_vect, y_train, X_val_vect, y_val, plot_curve=True)

In [None]:
df, cov = get_word_covariance(vect, nb, n=5, top=True)
temp = df.reset_index()
filtered_temp = temp.sort_values(by=['toxic_coefs'], ascending=False)
filtered_temp

In [None]:
df, cov = get_word_covariance(vect, nb, n=2000)
temp = df.reset_index()
filtered_temp = temp.sort_values(by=['non_toxic_coefs'], ascending=False)
filtered_temp

In [None]:
# get highest coefficients for model
# sort dataframe by highest toxic coefficients and THEN
# look where coefficients have small difference for both labels
# and use this to create stop words

df, cov = get_word_covariance(vect, nb, n=100)
temp = df.reset_index()
filtered_temp = temp.sort_values(by=['toxic_coefs'], ascending=False)
filtered_temp = filtered_temp[(filtered_temp.non_toxic_coefs.notnull()) & (filtered_temp.toxic_coefs.notnull())].head(100).copy()
filtered_temp['coefficient_diff'] = (filtered_temp['non_toxic_coefs'] - filtered_temp['toxic_coefs'])**2
filtered_temp.sort_values(by=['coefficient_diff']).head(10)


In [None]:
df, cov = get_word_covariance(vect, nb, n=100)
temp = df.reset_index()
filtered_temp = temp.sort_values(by=['non_toxic_coefs'], ascending=False)
filtered_temp = filtered_temp[(filtered_temp.non_toxic_coefs.notnull()) & (filtered_temp.toxic_coefs.notnull())].head(100).copy()
filtered_temp['coefficient_diff'] = (filtered_temp['non_toxic_coefs'] - filtered_temp['toxic_coefs'])**2
filtered_temp.sort_values(by=['coefficient_diff']).head(10)

In [None]:
df = get_word_covariance(vect, nb, n=1000, top=False)
df

In [None]:
from sklearn.decomposition import NMF

topics = 20
cols = ['topic' + str(i+1) for i in range(topics)]
nmf = NMF(n_components=topics, random_state=1,
          alpha=.1, l1_ratio=.5).fit(X_train_vect)

topic_df = pd.DataFrame(nmf.components_, index=cols, columns=vect.get_feature_names()).T

In [None]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

tfidf_feature_names = vect.get_feature_names()
n_top_words = 20
print_top_words(nmf, tfidf_feature_names, n_top_words)  

In [None]:
neg, pos = get_class_features(vect, nb, n=10, top=True)
topic_formatted = topic_df.T[pos].T

In [None]:
pos

In [None]:
topic_formatted

In [None]:
import seaborn as sns


def graph_topic(words,topic=[1,2]):
    topic_formatted = topic_df.T[words].T
    if len(topic) == 2:
        cols = ['topic'+str(val) for val in topic]
        plt.figure(figsize=[5,12])
        plt.barh(topic_formatted.index, topic_formatted[cols[0]])
        plt.barh(topic_formatted.index, topic_formatted[cols[1]])
    else:
        print('redo')
# plt.xticks(rotation=90);

In [None]:
graph_topic(pos,[1,17])

In [None]:
from sklearn.decomposition import TruncatedSVD, PCA

topics = 5
cols = ['topic' + str(i+1) for i in range(topics)]
svd = TruncatedSVD(n_components=topics)
lsa = svd.fit_transform(X_vect)
lsa_df = pd.DataFrame(lsa, columns=cols)
lsa_df['document'] = train_df['comment_text']
lsa_df['toxic'] = train_df.toxic
lsa_df

In [None]:
# i think i may be indexing into these wrong look/fix later
neg, pos = get_class_features(vect, nb, n=20, top=True, indices=True)
lsa_df.iloc[neg]

In [None]:
pd.DataFrame(svd.components_, index=cols, columns=vect.get_feature_names()).T

In [None]:
# pca = PCA(2)
# X = pca.fit_transform(df)

In [None]:
df, cov = get_word_covariance(vect, nb, n=1000, top=True)
# cov['hate'].sort_values(ascending=False).to_frame()
cov['please'].sort_values(ascending=False).to_frame()



In [None]:
# testing = train_df[train_df['comment_text'].str.contains('nazi') & (train_df.toxic==1)].comment_text.values[6]
# print(testing)
# print('-'*100)
# tokenized_test = np.array(my_tokenizer(testing), dtype=np.object)
# idx = np.where(tokenized_test=='nazi')[0]

# if len(idx) > 0:
#     for item in idx:
#         if item > 0:
#             print(tokenized_test[item-1])
#         print(tokenized_test[item])
#         if item < len(tokenized_test) -1:
#             print(tokenized_test[item+1])


# testing = train_df[train_df['comment_text'].str.contains('nazi')&(train_df.toxic==0)].comment_text.values[3]
# print(testing[-700::])
# print('-'*100)
# tokenized_test = np.array(my_tokenizer(testing), dtype=np.object)
# idx = np.where(tokenized_test=='nazi')[0]

# if len(idx) > 0:
#     for item in idx:
#         if item > 0:
#             print(tokenized_test[item-1])
#         print(tokenized_test[item])
#         if item < len(tokenized_test) -1:
#             print(tokenized_test[item+1])


In [None]:
X_holdout = test_clean.comment_text
y_holdout = test_clean.toxic

vect = TfidfVectorizer(
                      lowercase = True,
                      token_pattern=None,
                      tokenizer=my_tokenizer,
                      ngram_range=(1, 1),
                      max_features=8000
                      )

X_train_vect = vect.fit_transform(X)
X_holdout_vect = vect.transform(X_holdout)

nb = MultinomialNB()
nb.fit(X_train_vect, y)
nb.score(X_holdout_vect, y_holdout)

# pickling the vectorized data from training set so we can do 
# eda in another notebook


In [None]:
# with open('vectorizer.pickle', 'wb') as file:
# #     pickling trained and vectorized data
#     pickle.dump(vect, file)
#     pickle.dump(X_train_vect, file)
#     pickle.dump(X_holdout_vect, file)
    
#     # pickling fitted model
#     pickle.dump(nb, file)

In [None]:
get_metrics(MultinomialNB(), X_train_vect, y, X_holdout_vect, y_holdout, plot_curve=True)

In [None]:
# TP:  3588
# FP:  1579
# TN:  56309
# FN:  2502

# Precision: 0.694406812463712
# Recall: 0.5891625615763547
# Accuracy: 0.9362124480290099
# F1 Score: 0.637470018655059
# Area Under ROC Curve: 0.9470568584508582