# CyberTrolls Classifier Prediction app
### Author : Evergreen Technologies
### This script loads pre-trained word embeddings (GloVe embeddings) into a frozen Keras Embedding layer, and uses it to train a cybertroll classifier (postive and negative sentiment)
### GloVe embedding data can be found at: http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)

#### Cybertrolls dataset can be found at:
https://www.kaggle.com/dataturks/dataset-for-detection-of-cybertrolls 
Columns: 
Text: Tweet
Labels: 
1 - Tweet is classified as troll 
0 - Tweet is classified as not a troll

In [1]:
from __future__ import print_function
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from matplotlib import pyplot
from keras import backend as K
import pandas as pd
from sklearn.utils import shuffle
from keras.models import model_from_json


Using TensorFlow backend.


In [2]:

BASE_DIR = '/Volumes/My Passport for Mac/data'
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'cybertrolls_dataset')
CYBERTROLLS_FILE_NAME = "cybertrolls_dataset.csv"
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 125000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2




In [3]:
def read_csv(filepath):
     if os.path.splitext(filepath)[1] != '.csv':
          return  # or whatever
     seps = [',', ';', '\t']                    # ',' is default
     encodings = [None, 'utf-8', 'ISO-8859-1']  # None is default
     for sep in seps:
         for encoding in encodings:
              try:
                  return pd.read_csv(filepath, encoding=encoding, sep=sep)
              except Exception:  # should really be more specific 
                  pass
     raise ValueError("{!r} is has no encoding in {} or seperator in {}"
                      .format(filepath, encodings, seps))

In [4]:

# second, prepare text samples and their labels
print('Processing text dataset')
index_to_label_dict = {}
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

input_df = read_csv(os.path.join(TEXT_DATA_DIR, CYBERTROLLS_FILE_NAME))




Processing text dataset


In [5]:
ct_df = input_df[['text','label']]
print("Here are Few Samples in data")
print(ct_df.head)

print("Here total number of positive, negative and unsupported (neutral) samples")
print(ct_df.groupby(['label']).count())

print("Converting pandas dataframe into lists")
texts = ct_df['text'].values.tolist()
labels = []
labels_text = []
labels_text_unique = ct_df.label.unique().tolist()
labels_text = ct_df['label'].values.tolist()

idxCounter = 0
for label in labels_text_unique:
    labels_index[label] = idxCounter
    index_to_label_dict[idxCounter] = label
    idxCounter = idxCounter + 1;

idxCounter = 0    
for label in labels_text:
    if idxCounter%100==0:
        print("processing row " + str(idxCounter))
    labels.append(labels_index[label])
    idxCounter = idxCounter + 1;
    

print("Labels Array")
print(len(labels))
print("Labels Dictionary")
print(labels_index)
print("Done")

Here are Few Samples in data
<bound method NDFrame.head of                                                     text  label
0                                 Get fucking real dude.      1
1       She is as dirty as they come  and that crook ...      1
2       why did you fuck it up. I could do it all day...      1
3       Dude they dont finish enclosing the fucking s...      1
4       WTF are you talking about Men? No men thats n...      1
...                                                  ...    ...
19996    I dont. But what is complaining about it goi...      0
19997   Bahah  yeah i&;m totally just gonna&; get pis...      0
19998       hahahahaha >:) im evil mwahahahahahahahahaha      0
19999            What&;s something unique about Ohio? :)      0
20000              Who is the biggest gossiper you know?      0

[20001 rows x 2 columns]>
Here total number of positive, negative and unsupported (neutral) samples
        text
label       
0      12179
1       7822
Converting pandas da

In [6]:
print("loading model .....")
# load json and create model
json_file = open('/Volumes/My Passport for Mac/model/cybertrolls/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("/Volumes/My Passport for Mac/model/cybertrolls/model.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
loaded_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print("done")


loading model .....
Loaded model from disk
done


In [7]:
#score = loaded_model.evaluate(X, Y, verbose=0)
#print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

In [8]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [52]:
test_texts = ["I am your worst nightmare. I'll come after you an make your life miserable" ,
              "You are complete train wreck amigo",
               "Let’s approach change with an attitude of self-compassion & curiosity as we explore & find our way back to – or into – what works for us & our own, dear, beloved, painful, shining hearts"
             ]
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)


In [53]:
nn_output = loaded_model.predict(test_data)
print(nn_output)

[[0.70061845 0.29938164]
 [0.75756073 0.2424393 ]
 [0.00156466 0.9984353 ]]


In [54]:
i=0
for idx in np.argmax(nn_output, axis=1):
    print("Category: ", index_to_label_dict[idx])
    print("text: " , test_texts[i])
    print("=====================================")
    i = i + 1

Category:  1
text:  I am your worst nightmare. I'll come after you an make your life miserable
Category:  1
text:  You are complete train wreck amigo
Category:  0
text:  Let’s approach change with an attitude of self-compassion & curiosity as we explore & find our way back to – or into – what works for us & our own, dear, beloved, painful, shining hearts
