In [56]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import sys
import nltk
import string
import os

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, models


In [2]:
sys.version

'3.7.5 (default, Apr 19 2020, 20:18:17) \n[GCC 9.2.1 20191008]'

In [3]:
tf.test.is_built_with_cuda()

True

In [4]:
tf.test.is_gpu_available(cuda_only=True)

True

In [5]:
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

In [6]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
train.isnull().any()

id               False
comment_text     False
toxic            False
severe_toxic     False
obscene          False
threat           False
insult           False
identity_hate    False
dtype: bool

In [8]:
test.isnull().any()

id              False
comment_text    False
dtype: bool

In [9]:
train["is_toxic"] = train.apply(
    lambda x: x.toxic or x.severe_toxic or x.obscene or x.threat or x.insult or x.identity_hate, axis=1
)

train = train.drop(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"], axis=1)

In [10]:
train.head()

Unnamed: 0,id,comment_text,is_toxic
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0


## Data Cleaning

In [11]:
# https://towardsdatascience.com/cleaning-text-data-with-python-b69b47b97b76

stop_words = nltk.corpus.stopwords.words("english")

def clean_text(x):
    x = x.lower()
    x = x.replace("\n", " ")
    x = x.replace("\t", " ")
    x = x.replace("'", "")
    x = x.replace('"', "")
    x = re.sub('[%s]' % re.escape(string.punctuation), "", x)
    x = ' '.join([word for word in x.split(' ') if word not in stop_words])
    x = x.encode('ascii', 'ignore').decode()
    x = re.sub(r'https*\S+', ' ', x)
    x = re.sub(r'@\S+', ' ', x)
    x = re.sub(r'#\S+', ' ', x)
    x = re.sub(r'\w*\d+\w*', '', x)
    x = re.sub(r'\s{2,}', ' ', x)
    return x

train["comment_text"] = train["comment_text"].apply(lambda x: clean_text(x))

In [12]:
train.head()

Unnamed: 0,id,comment_text,is_toxic
0,0000997932d777bf,explanation edits made username hardcore metal...,0
1,000103f0d9cfb60f,daww matches background colour im seemingly st...,0
2,000113f07ec002fd,hey man im really trying edit war guy constant...,0
3,0001b41b1c6bb37e,cant make real suggestions improvement wonder...,0
4,0001d958c54c6e35,sir hero chance remember page thats,0


In [21]:
#vectorizer = CountVectorizer(analyzer='char', ngram_range=(1,1), max_features=1000)
vectorizer = CountVectorizer(analyzer='word', max_features=1000)
vectorized_X = vectorizer.fit_transform(train["comment_text"])

In [22]:
X = vectorized_X.toarray()
y = train["is_toxic"].to_numpy()

print(X.shape)
print(y.shape)

print(X[0])
print(y[0])

(159571, 1000)
(159571,)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0


## Model

In [35]:
model = models.Sequential([
    layers.Dense(32, input_shape=(X.shape[1],)),
    layers.Dropout(0.3),
    layers.Activation('relu'),
    layers.Dropout(0.3),
    layers.Dense(32),
    layers.Activation('relu'),
    layers.Dense(1),
    layers.Activation('sigmoid')
])

adagrad = optimizers.Adagrad()
adam = optimizers.Adam()

model.compile(optimizer=adam,
              loss="binary_crossentropy",
              metrics=['accuracy'])

fit = model.fit(X, y, epochs=6, batch_size=64, verbose=True, validation_split=0.1)

Train on 143613 samples, validate on 15958 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


## Test model on Twitch data

In [63]:
sys.path.append("../twitch_reader")
from chat_file import ChannelFile

messages = []
labels = []

labeled_message_file = ChannelFile("../twitch_reader/hand_labeled.json", custom_json=True)

while True:
    message = labeled_message_file.next()
    if not message:
        break
        
    messages.append(message)
    labels.append(1 if message.toxicity == "negative" else 0)
    
print(messages[:5])
print(labels[:5])

[<chat_message.ChatMessage object at 0x7fb4a11995d0>, <chat_message.ChatMessage object at 0x7fb476a94510>, <chat_message.ChatMessage object at 0x7fb476acd450>, <chat_message.ChatMessage object at 0x7fb476acdf10>, <chat_message.ChatMessage object at 0x7fb476acea50>]
[0, 0, 0, 0, 0]


In [64]:
cleaned_message_content = [clean_text(message.content) for message in messages]
messages_X = vectorizer.transform(cleaned_message_content).toarray()

labels = np.array(labels)

predictions = model.predict_classes(messages_X)

confusion_matrix = metrics.confusion_matrix(labels, predictions)
print(confusion_matrix)

accuracy = metrics.accuracy_score(labels, predictions)
print(accuracy)

[[1202   19]
 [  61   18]]
0.9384615384615385


### Compare NN model to VADER score

In [74]:
vader_predictions = [message.vader_score for message in messages]
encoded_vader_predictions = [1 if vader_prediction["compound"] > 0.8 else 0 for vader_prediction in vader_predictions]

confusion_matrix = metrics.confusion_matrix(labels, encoded_vader_predictions)
print(confusion_matrix)

accuracy = metrics.accuracy_score(labels, encoded_vader_predictions)
print(accuracy)

[[1208   13]
 [  77    2]]
0.9307692307692308
