In [None]:
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import numpy as np
from wordcloud import WordCloud
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
# stopwords = nltk.corpus.stopwords.words('english')
import string
import spacy
from PIL import Image

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

In [None]:
# Length passed as `maxlen` when the tokenized tweets are padded.
MAX_SEQUENCE_LENGTH = 100

# Label columns, in the order the model's prediction columns are assigned to them.
possible_labels = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [None]:
new_model = load_model('model_tokenizer/pre_trained_toxic_model.h5')

In [None]:
# NOTE(security): unpickling executes code embedded in the file, so only load a
# tokenizer pickle that comes from a trusted source.
with open('model_tokenizer/tokenizer.pickle', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# Load the tweets and convert their text into fixed-length integer sequences.
test = pd.read_csv("Twitter_data_files/clean_tweet.csv")

# Missing text is replaced by a single space before tokenizing.
test_sentences = test["comment_text"].fillna(" ")
test_sequences = tokenizer.texts_to_sequences(test_sentences)

x_test = pad_sequences(
    test_sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [None]:
len(test['comment_text'])

In [None]:
# Score every padded tweet; the result is assigned below to the six label
# columns, so it is expected to have one column per entry of possible_labels.
y_test = new_model.predict(x_test)

# The sample submission file is used as a template that supplies the
# non-label columns of the output.
predicted_file = pd.read_csv("data/sample_submission.csv")

# Size the template from the predictions instead of a hard-coded 14000 rows,
# so the cell keeps working when the number of tweets changes.
n_predictions = len(y_test)
if len(predicted_file) < n_predictions:
    raise ValueError(
        "data/sample_submission.csv has {} rows but {} predictions were produced".format(
            len(predicted_file), n_predictions
        )
    )

# .copy() makes an independent frame; assigning into a bare slice of
# predicted_file triggers pandas' SettingWithCopyWarning.
predictor = predicted_file.iloc[:n_predictions].copy()
predictor[possible_labels] = y_test
predictor.to_csv('Twitter_data_files/brexit_submission.csv', index=False)

In [None]:
df1 = pd.read_csv("Twitter_data_files/new_clean_tweet.csv")
df2 = pd.read_csv("Twitter_data_files/brexit_submission.csv")

In [None]:
result = pd.concat([df1, df2], axis=1)

In [None]:
result.head()

In [None]:
# Keep the tweet id and text, followed by the six label score columns.
result_columns = ["id", "comment_text"] + possible_labels
result = result[result_columns]

In [None]:
result.head()

In [None]:
result.to_csv('Twitter_data_files/toxic_brexit.csv', index=False)

In [None]:
train = pd.read_csv('Twitter_data_files/toxic_brexit.csv')

In [None]:
train.loc[train['severe_toxic'].idxmax()]

In [None]:
train["toxic"].sum()

In [None]:
# Per-label probability cut-offs (values unchanged from the original cell).
LABEL_THRESHOLDS = {
    'toxic': 0.3,
    'severe_toxic': 0.1,
    'obscene': 0.2,
    'threat': 0.05,
    'insult': 0.3,
    'identity_hate': 0.08,
}

# Binarise each score column: strictly above the cut-off -> 1, otherwise -> 0.
# Using `<=` for the negative side means a score exactly equal to the cut-off
# is no longer left behind as a raw probability. Both masks are computed
# before any write, and missing values match neither mask so they stay missing.
for label, threshold in LABEL_THRESHOLDS.items():
    is_positive = train[label] > threshold
    is_negative = train[label] <= threshold
    train.loc[is_positive, label] = 1
    train.loc[is_negative, label] = 0

In [None]:
train["toxic"].values[146]

In [None]:
train.head()

In [None]:
train.to_csv('Twitter_data_files/final_brexit_toxic.csv', index=False)

In [None]:
df_train = pd.read_csv('Twitter_data_files/final_brexit_toxic.csv')

# Index every row by its tweet id so features and labels stay aligned.
df_train.index = df_train['id']

# An empty text cell is read back from CSV as NaN; replace it with a blank
# string so later string operations (.lower(), .split()) do not fail on a float.
x_train = df_train['comment_text'].fillna(" ")

# Select the label columns by name rather than by position, and take a copy so
# that adding columns to y_train later does not write into a slice of df_train.
y_train = df_train[possible_labels].copy()

In [None]:
# A comment is "clean" when none of the six labels is set.
# The previous expression `1 - sum >= 1` parsed as `(1 - sum) >= 1`, i.e.
# `sum <= 0`, which gave the same flag only by operator precedence. Summing the
# named label columns also keeps the cell idempotent: a re-run no longer
# includes the 'clean' column itself in the sum.
y_train['clean'] = (y_train[possible_labels].sum(axis=1) == 0).astype(int)

In [None]:
# Column totals of y_train: class names in `kinds`, matching totals in `counts`.
label_totals = y_train.sum(axis=0)
kinds = tuple(label_totals.index)
counts = tuple(label_totals)

In [None]:
# Pie chart of the per-class totals.
trace = go.Pie(
    values=counts,
    labels=kinds,
)
iplot([trace], filename='basic_pie_chart')

In [None]:
# Bar chart of the per-class totals: one bar per class name.
layout = go.Layout(title="Class distribution")
bars = go.Bar(x=kinds, y=counts)

fig = go.Figure(data=[bars], layout=layout)
iplot(fig, filename='bar')

In [None]:
# spaCy pipeline with the parser, tagger and NER components switched off.
# NOTE(review): the "en" shortcut name is only resolvable on spaCy 2.x; spaCy 3
# requires the full package name (e.g. "en_core_web_sm") — confirm the
# installed version before upgrading.
nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])

# Stored as a set so the per-lemma stopword test is a constant-time lookup
# rather than a scan of the whole list.
stops = set(stopwords.words("english"))

In [None]:
def normalize(comment, lowercase, remove_stopwords):
    """Lemmatize a comment with the module-level spaCy pipeline ``nlp``.

    Args:
        comment: Text to normalize. Anything that is not a ``str`` (for
            example the NaN pandas produces for an empty CSV cell) is
            treated as empty text instead of raising.
        lowercase: If true, lowercase the text before it is passed to ``nlp``.
        remove_stopwords: If true, drop lemmas found in the module-level
            ``stops`` collection.

    Returns:
        The kept lemmas joined by single spaces; ``""`` when nothing is kept.
    """
    if not isinstance(comment, str):
        return ""
    if lowercase:
        comment = comment.lower()

    kept = []
    for token in nlp(comment):
        lemma = token.lemma_.strip()
        if not lemma:
            # Whitespace-only tokens carry no word.
            continue
        # Compare case-insensitively so a capitalised stopword is still
        # removed when lowercase is False (assumes `stops` holds lowercase words).
        if remove_stopwords and lemma.lower() in stops:
            continue
        kept.append(lemma)
    return " ".join(kept)

In [None]:
x_train_lemmatized = x_train.apply(normalize, lowercase=True, remove_stopwords=True)

In [None]:
x_train_lemmatized.sample(1).iloc[0]

In [None]:
# word_counts[kind] maps each lemma to how often it occurs in the comments
# whose `kind` column equals 1.
word_counts = dict()

for kind in y_train.columns:
    comments = x_train_lemmatized[y_train[kind] == 1]
    # Iterate the Series values directly: Series.iteritems() was removed in
    # pandas 2.0. split() without an argument also avoids counting an
    # empty-string "word" for comments that normalised to "".
    word_counts[kind] = Counter(
        token for comment in comments for token in comment.split()
    )

In [None]:
def most_common_words(kind, num_words=15):
    """Show a horizontal bar chart of the most frequent words of one class.

    Args:
        kind: Key into the module-level ``word_counts`` mapping.
        num_words: Maximum number of words to plot.

    A class without any counted words produces an empty chart instead of
    raising ``ValueError``.
    """
    # Reversed so the most frequent word comes last in the y list; presumably
    # this places it at the top of the horizontal chart — confirm visually.
    ranked = word_counts[kind].most_common(num_words)[::-1]
    if ranked:
        words, counts = zip(*ranked)
    else:
        # zip(*[]) yields nothing, so the two-name unpacking would fail.
        words, counts = (), ()

    bars = go.Bar(
        y=words,
        x=counts,
        orientation="h"
    )

    layout = go.Layout(
        title="Most common words of the class \"{}\"".format(kind),
        yaxis=dict(
            ticklen=8
        )
    )

    fig = go.Figure(data=[bars], layout=layout)
    iplot(fig, filename='bar')

In [None]:
most_common_words("toxic")

In [None]:
most_common_words("severe_toxic")

In [None]:
most_common_words("threat")

In [None]:
most_common_words("obscene")

In [None]:
most_common_words("insult")

In [None]:
most_common_words("identity_hate")

In [None]:
most_common_words("clean")

In [None]:
df_train = pd.read_csv('Twitter_data_files/final_brexit_toxic.csv')

COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# 'none' is 1 for rows whose largest label value is 0, i.e. no label is set.
no_label_set = df_train[COLUMNS].max(axis=1).eq(0)
df_train['none'] = no_label_set.astype(int)

# CATEGORIES is an independent copy of the label list including 'none'.
COLUMNS += ['none']
CATEGORIES = list(COLUMNS)

In [None]:
word_counter = {}

def clean_text(text):
    """Lowercase text, replace punctuation with spaces and drop stopwords.

    Args:
        text: Raw comment. A non-string value (such as the NaN pandas uses
            for an empty CSV cell) is treated as empty text.

    Returns:
        The remaining words joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    # re.escape is required: unescaped, the `\]` inside string.punctuation is
    # read as an escaped bracket, so backslashes were never part of the class.
    punctuation_class = '[{}]'.format(re.escape(string.punctuation))
    text = re.sub(punctuation_class, ' ', text.lower())
    return ' '.join(word for word in text.split() if word not in stop)

# word_counter[categ] is a one-column frame ('count') indexed by word and
# sorted by descending frequency, built from the comments flagged with categ.
for categ in CATEGORIES:
    d = Counter()
    # dropna(): an empty text cell is read from CSV as NaN, which is not text.
    for text in df_train.loc[df_train[categ] == 1, 'comment_text'].dropna():
        d.update(clean_text(text).split())
    # Building from a named Series guarantees the 'count' column exists even
    # when no comment belongs to the category; DataFrame.from_dict on an empty
    # Counter has no columns and sort_values('count') would raise KeyError.
    word_counter[categ] = pd.Series(d, name='count', dtype='int64')\
                            .to_frame()\
                            .sort_values('count', ascending=False)

In [None]:
# Mask for the word clouds: channel index 1 of the image 'twitter.png'.
mask = np.array(Image.open('twitter.png'))[:, :, 1]

In [None]:
# Stopword set handed to WordCloud. It gets its own name on purpose: rebinding
# `stopwords` here would replace the `nltk.corpus.stopwords` module imported at
# the top of the notebook with a list and break any later
# `stopwords.words(...)` call (for example when an earlier cell is re-run).
WORDCLOUD_STOPWORDS = set(nltk.corpus.stopwords.words('english'))


def wordcloud_plot(category, name):
    """Draw a masked word cloud and save it as ``<name>_wc.png``.

    Args:
        category: Either the name of a label column of ``df_train`` (the cloud
            is then built from the comments whose value in that column is 1),
            or an iterable of comment strings to use directly.
        name: Text inserted into the plot title and used as the prefix of the
            saved file name.

    Returns:
        True once the figure has been drawn and saved.

    Raises:
        ValueError: If there is no text to build the cloud from.
    """
    # The previous version looped over CATEGORIES with `category` as the loop
    # variable, which overwrote the argument, and then joined the characters
    # of the last category name. The argument is now used as given.
    if isinstance(category, str):
        comments = df_train.loc[df_train[category] == 1, 'comment_text']
    else:
        comments = category

    # Skip non-string entries such as NaN from empty CSV cells.
    text = " ".join(comment for comment in comments if isinstance(comment, str))
    if not text.strip():
        raise ValueError("no text available for word cloud {!r}".format(name))

    plt.figure(figsize=(20, 15))
    wc = WordCloud(background_color="black", max_words=500, mask=mask, min_font_size=6,
                   stopwords=WORDCLOUD_STOPWORDS, max_font_size=60)
    wc.generate(text)
    plt.title("Twitter Wordlcloud " + name + " Comments", fontsize=30)
    plt.imshow(wc.recolor(colormap='Set1', random_state=21), alpha=0.98)
    plt.axis('off')
    plt.savefig(name + '_wc.png')
    return True

In [None]:
# One word cloud per category, sized by the word frequencies in word_counter.
for w, wc in word_counter.items():
    # A category without any words cannot be drawn; skip it instead of letting
    # the frequency lookup / WordCloud raise.
    if wc.empty:
        continue

    wordcloud = WordCloud(
          background_color='black',
          max_words=1000,
          max_font_size=90,
          mask=mask
         ).generate_from_frequencies(wc['count'].to_dict())

    fig = plt.figure(figsize=(16, 13))
    plt.title(w)
    plt.imshow(wordcloud.recolor(colormap='Set1', random_state=21), alpha=0.98)
    plt.axis('off')

    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)