In [1]:
import csv
import pandas as pd
import numpy as np
import re
import nltk
from gensim.models import word2vec, KeyedVectors
import logging
import math
import random

# Prepare data (toxic comments dataset)

##### train data 

In [45]:
# Load the labelled training split.
# NOTE(review): the path is an absolute, machine-specific Windows path. 'latin1' maps every
# byte to a character and therefore never raises, so non-ASCII text would be silently
# mis-decoded if the file is actually UTF-8 — confirm the file encoding.
data = pd.read_csv('C:\\Users\\gande\\Desktop\\ProjektKlassifikation\\toxic_comments_data\\train.csv', 
                   sep=',', header=0, quotechar= '"', quoting=csv.QUOTE_MINIMAL, encoding='latin1')

##### test data and labels

In [3]:
# Load the test comments (labels are read separately from test_labels.csv).
# NOTE(review): absolute local path and 'latin1' decoding — confirm the file encoding.
testpath = 'C:\\Users\\gande\\Desktop\\ProjektKlassifikation\\toxic_comments_data\\test.csv'
test_df = pd.read_csv(testpath, sep=',', header=0, quotechar= '"', quoting=csv.QUOTE_MINIMAL, encoding='latin1')

In [4]:
# Load the labels for the test comments; rows are later matched to test_df by index.
# NOTE(review): absolute local path and 'latin1' decoding — confirm the file encoding.
testlabelpath = 'C:\\Users\\gande\\Desktop\\ProjektKlassifikation\\toxic_comments_data\\test_labels.csv'
test_label_df = pd.read_csv(testlabelpath, sep=',', header=0, quotechar= '"', quoting=csv.QUOTE_MINIMAL,
                            encoding='latin1')

### remove comments with [-1, -1, -1, -1, -1, -1] labeling from test data

In [5]:
def drop_minus_ones_comments(comments_df, label_df):
    """Drop every comment whose label row contains a -1 placeholder.

    Rows of ``comments_df`` and ``label_df`` are matched by index label, so
    both frames must share the same index (slices of the full frames work).

    Parameters
    ----------
    comments_df : pandas.DataFrame
        Frame holding the comments.
    label_df : pandas.DataFrame
        Frame with the columns 'toxic', 'severe_toxic', 'obscene', 'threat',
        'insult' and 'identity_hate'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``comments_df`` without the flagged rows and with a fresh
        0..n-1 index. The input frames are not modified.

    Raises
    ------
    KeyError
        If a flagged index label does not exist in ``comments_df``.
    """
    label_columns = ['toxic', 'severe_toxic', 'obscene',
                     'threat', 'insult', 'identity_hate']
    # A single vectorised mask replaces the row-by-row drop: every
    # DataFrame.drop call copies the frame, which made the loop quadratic.
    has_minus_one = (label_df[label_columns] == -1).any(axis=1)
    flagged_index = label_df.index[has_minus_one.values]

    return comments_df.drop(index=flagged_index).reset_index(drop=True)

In [6]:
def drop_minus_ones_labels(label_df):
    """Drop every label row that contains a -1 placeholder.

    Parameters
    ----------
    label_df : pandas.DataFrame
        Frame with the columns 'toxic', 'severe_toxic', 'obscene', 'threat',
        'insult' and 'identity_hate'.

    Returns
    -------
    pandas.DataFrame
        Copy of ``label_df`` without the flagged rows and with a fresh
        0..n-1 index. The input frame is not modified.
    """
    label_columns = ['toxic', 'severe_toxic', 'obscene',
                     'threat', 'insult', 'identity_hate']
    # Boolean filtering in one pass; the former per-row DataFrame.drop copied
    # the whole frame for every flagged row.
    has_minus_one = (label_df[label_columns] == -1).any(axis=1)

    return label_df[~has_minus_one].reset_index(drop=True)

In [7]:
test_df1 = drop_minus_ones_comments(test_df[0:70000], test_label_df[0:70000])

In [8]:
test_df2 = drop_minus_ones_comments(test_df[70000:], test_label_df[70000:])

In [9]:
test_label_df1 = drop_minus_ones_labels(test_label_df[0:70000])

In [10]:
test_label_df2 = drop_minus_ones_labels(test_label_df[70000:])

In [11]:
# Stitch the two cleaned chunks back together; ignore_index renumbers the rows 0..n-1
test_df = pd.concat([test_df1, test_df2], ignore_index=True)
test_label_df = pd.concat([test_label_df1, test_label_df2], ignore_index=True)

### Prepare labels

In [46]:
# Label columns, in the order used for every label vector in this notebook
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# Selecting the columns once yields the same nested lists as a row-wise
# iterrows() loop, without building a Series for every row.
y_train_1 = data[labels].values.tolist()

In [13]:
# One label vector per test comment; column selection replaces the slow iterrows() loop
y_test = test_label_df[labels].values.tolist()

### check distribution of labels in train data:

In [14]:
# share (0..1) of training comments that carry none of the six labels
non_toxic = sum(1 for y in y_train_1 if not np.any(y))
non_toxic / len(y_train_1)

0.8983211235124177

In [15]:
# number of training comments per toxicity type (position in the label vector -> column name)
types = {
    name: [sum(1 for y in y_train_1 if y[position] == 1)]
    for position, name in enumerate(
        ("toxic", "severe toxic", "obscene", "threat", "insult", "identity hate")
    )
}
types_distribution = pd.DataFrame.from_dict(types)
types_distribution

Unnamed: 0,toxic,severe toxic,obscene,threat,insult,identity hate
0,15294,1595,8449,478,7877,1405


### upsample "toxic" data  with rare labels

In [47]:
# NOTE: this is an alias of `data`, not a reduced copy; no non-toxic rows are removed here.
data_less_nontoxic = data

# Duplicate the rows positionally with iloc instead of round-tripping through
# `.values`: the latter turns every column (including the integer labels)
# into dtype object.
threats = data_less_nontoxic[data_less_nontoxic["threat"] == 1]
threats_upsampled = threats.iloc[np.repeat(np.arange(len(threats)), 2)].reset_index(drop=True)

severe_identity = data_less_nontoxic[(data_less_nontoxic["severe_toxic"] == 1)
                                     | (data_less_nontoxic["identity_hate"] == 1)]

# Threats (already doubled) plus severe/identity-hate rows are then tripled as a group
rare_types = pd.concat([threats_upsampled, severe_identity])
rare_types_upsampled = rare_types.iloc[np.repeat(np.arange(len(rare_types)), 3)].reset_index(drop=True)

upsampled_data = pd.concat([data_less_nontoxic, rare_types_upsampled])
# shuffle; the fixed seed makes the row order reproducible across kernel restarts
resampled_data = upsampled_data.sample(frac=1, random_state=42)
resampled_data = resampled_data.reset_index(drop=True)

In [48]:
# rows that carry at least one of the six toxicity labels
resampled_data[
    (resampled_data[["toxic", "severe_toxic", "obscene", "insult", "threat", "identity_hate"]] == 1).any(axis=1)
]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
12,e42607741bd5800c,Oh of fuck you asshole die im going to kill yo...,1,1,1,1,1,0
14,d54bd890c1c0eb16,i think this is mocking jesus because he was a...,0,0,1,0,1,0
25,34fe4bc9b7d3f227,You are a polesmoker \nPlease get run over by ...,1,0,0,0,0,0
29,01d5964243f5fdf8,"""\n\nYou may do that if you want to and I sugg...",1,0,1,0,0,0
42,471fa1324a14a0cc,STOP IT! \n\nStop putting stupid things on my ...,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
170457,b23bc7b738119a88,ATTENTION!!! \n\n IF YOU GOT A PROBLEM EMAIL M...,1,1,1,0,1,0
170465,eb999be610587eec,My Life Would Suck Without You \n\nMy Life Wou...,1,0,0,0,0,0
170471,dceb9def8888dde5,Goodbye \nI'm going to kill you,1,0,0,1,0,0
170489,dceb9def8888dde5,Goodbye \nI'm going to kill you,1,0,0,1,0,0


##### Prepare labels for resampled train data

In [50]:
# One label vector per resampled comment; column selection replaces the slow iterrows() loop
y_train = resampled_data[labels].values.tolist()

### Preprocess

In [51]:
def prepare_data(df):
    """Normalise the raw comments of ``df``.

    Each entry of the 'comment_text' column is lower-cased, and every
    character that is not an ASCII letter or an apostrophe (punctuation,
    digits, line breaks, ...) is replaced by a single space. Characters are
    replaced, not removed, so the string length is unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column 'comment_text'.

    Returns
    -------
    list of str
        One cleaned string per row, in row order.
    """
    non_letter = re.compile("[^a-zA-Z']")
    # A plain list keeps memory proportional to the text size; passing the
    # strings through a NumPy array would pad every comment to the length of
    # the longest one.
    return [non_letter.sub(' ', text.lower()) for text in df['comment_text']]

In [52]:
traindata = prepare_data(resampled_data)
X = traindata
y = y_train

In [53]:
# Pair every cleaned comment with its label vector
traindata_labels = list(zip(X, y))
# Show only a small preview; echoing the full list would dump the whole corpus into the output
traindata_labels[:3]

[('       many     hardly   you seem to believe that if you believe your edits improve the article  then ipso facto the article is improved   however  your edits clearly have not improved the article and in fact have resulted in the article being downgraded from   b   to   start   class    so far  there are two editors besides myself who have reverted your edits    and     again  as for   causing trouble    see wp agf        ',
  [0, 0, 0, 0, 0, 0]),
 (" this article has some strange pov problems   several times it unreservedly claims that rods are only insects  which is rather pov   not only that  but it then confusingly goes on to contradict this view by giving descriptions of rods according to the   skyfish   theory  also pov    it's not really presenting both theories as opinions on the matter  instead it seems to be presenting  both  of these mutually exclusive theories as fact       ",
  [0, 0, 0, 0, 0, 0]),
 ('english in puerto rico   hi  there is a new talk page discussion at t

In [26]:
prepared_testdata = prepare_data(test_df)
prepared_testlabels = y_test

##### Load Google's 300-dimensional vector embeddings

In [27]:
path = 'C:\\Users\\gande\\GoogleNews-vectors-negative300.bin'
googlevecs = KeyedVectors.load_word2vec_format(path, binary=True)

### Split the test dataset into validation and test data (stratified split with k=2)

The following algorithm was implemented following the suggestions of Sechidis et al. (2011)* for stratified sampling of multi-label data. 

\* see README for full reference

In [29]:
def iterative_stratification(dataset, datalabels, labelnames, k=6):
    """Split multi-label data into ``k`` label-balanced, mutually disjoint subsets.

    Follows the iterative stratification procedure of Sechidis et al. (2011):
    labelled samples are handed out label by label, starting with the label
    that has the fewest remaining samples, always to the subset that still
    wants the most samples of that label. Ties are broken by the subset that
    still wants the most samples overall, then at random among the subsets
    that are still tied. Samples without any label are spread evenly
    afterwards.

    Parameters
    ----------
    dataset : sequence
        The samples (e.g. preprocessed comment strings).
    datalabels : sequence of sequences
        One 0/1 label vector per sample, ordered like ``labelnames``.
    labelnames : list of str
        Name of each position in a label vector.
    k : int, default 6
        Number of subsets to build.

    Returns
    -------
    dict
        Maps ``1 .. k`` to a shuffled list of ``(sample, labelvector)``
        tuples. Every subset receives ``len(negatives) // k`` unlabelled
        samples, each of which appears in exactly one subset; a remainder of
        fewer than ``k`` unlabelled samples is left out. The inputs are not
        modified.
    """
    # Separate samples without any label from the labelled ones in one pass
    all_neg = []
    labelled = []
    for pair in zip(dataset, datalabels):
        if np.any(pair[1]):
            labelled.append(pair)
        else:
            all_neg.append(pair)
    dataset = [pair[0] for pair in labelled]
    datalabels = [pair[1] for pair in labelled]

    subset_keys = list(range(1, k + 1))

    # Dictionary that will contain the actual subsets
    actual_subsets = {key: [] for key in subset_keys}

    # Desired number of (labelled) samples per subset
    proportion = 1 / k
    subset_size = len(dataset) * proportion
    subsets = {key: {'current_size': subset_size} for key in subset_keys}

    # Desired number of samples of each label in each subset
    current_labelcount = dict()
    for l in labelnames:
        labelindex = labelnames.index(l)
        total_count_label = len([1 for labelset in datalabels if labelset[labelindex]])
        current_labelcount[l] = total_count_label
        for key in subset_keys:
            # we want the same number in all subsets if possible
            subsets[key][l] = proportion * total_count_label

    while len(dataset) > 0:
        # Find the label with the fewest (but at least one) remaining samples
        nonempty = {label: count for label, count in current_labelcount.items() if count > 0}
        if not nonempty:
            break
        # min() returns the first minimal entry, matching argmin on the counts
        name_of_label = min(nonempty, key=nonempty.get)
        index_of_label = labelnames.index(name_of_label)

        # Distribute every remaining sample that carries this label
        distributed = set()
        for idx, s in enumerate(dataset):
            if datalabels[idx][index_of_label] != 1:
                continue

            # Subset(s) with the largest number of desired samples for this label
            desired_numbers_label = [subsets[key][name_of_label] for key in subset_keys]
            max_desired_number = max(desired_numbers_label)
            cand = [key for key, desired in zip(subset_keys, desired_numbers_label)
                    if desired == max_desired_number]

            if len(cand) > 1:
                # among the tying subsets, keep those with the highest
                # number of desired samples overall
                max_desired_total = max(subsets[key]['current_size'] for key in cand)
                cand = [key for key in cand
                        if subsets[key]['current_size'] == max_desired_total]

            # Remaining ties are broken at random, but only among the subsets
            # that are still tied after both criteria
            put_in_subset = cand[0] if len(cand) == 1 else random.choice(cand)

            # Add the sample to the chosen subset and mark it for removal
            actual_subsets[put_in_subset].append((s, datalabels[idx]))
            distributed.add(idx)

            # Decrement the total number of desired samples for the subset ...
            subsets[put_in_subset]['current_size'] -= 1
            # ... and the desired number for each label of this sample
            for labelind, lab in enumerate(datalabels[idx]):
                if lab == 1:
                    name = labelnames[labelind]
                    subsets[put_in_subset][name] -= 1
                    current_labelcount[name] -= 1

        if not distributed:
            # No remaining sample is marked with exactly 1 for this label
            # (e.g. other truthy values); stop instead of looping forever.
            break

        # Drop the distributed samples by position instead of list.remove()
        remaining = [idx for idx in range(len(dataset)) if idx not in distributed]
        dataset = [dataset[idx] for idx in remaining]
        datalabels = [datalabels[idx] for idx in remaining]

    # Samples without any label are dealt out in equal, non-overlapping shares
    # so that no sample ends up in more than one subset.
    negs_per_subset = len(all_neg) // k
    if negs_per_subset:
        shuffled_negs = random.sample(all_neg, len(all_neg))
        for position, key in enumerate(subset_keys):
            start = position * negs_per_subset
            actual_subsets[key].extend(shuffled_negs[start:start + negs_per_subset])

    # Shuffle the subsets so labelled and unlabelled samples are mixed
    for v in actual_subsets.values():
        random.shuffle(v)

    return actual_subsets

In [30]:
test_val = iterative_stratification(prepared_testdata, prepared_testlabels, labelnames=labels, k=2)

In [31]:
# check distribution: positives per label and total size of each subset
distribution = {}
for key in test_val:
    distribution[key] = {}
    for labelindex, l in enumerate(labels):
        number = np.count_nonzero([x[1][labelindex] for x in test_val[key]])
        distribution[key][l] = number
    distribution[key]['total'] = len(test_val[key])
distribution

{1: {'toxic': 3045,
  'severe_toxic': 183,
  'obscene': 1846,
  'threat': 106,
  'insult': 1713,
  'identity_hate': 356,
  'total': 31993},
 2: {'toxic': 3045,
  'severe_toxic': 184,
  'obscene': 1845,
  'threat': 105,
  'insult': 1714,
  'identity_hate': 356,
  'total': 31984}}

### tokenize + lemmatize words and convert to vector embeddings

In [32]:
import nltk
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
from nltk.tokenize import TweetTokenizer

In [54]:
def convert_to_w2v_nonstrat(x, model, lemmatize=True, pretrained=googlevecs):
    """Turn (comment, labels) pairs into (embedding matrix, labels) pairs.

    Each comment is tokenized with NLTK's ``TweetTokenizer``; optionally every
    token is lemmatized first as a verb and then as a noun. Tokens are looked
    up in ``model`` and, if missing there, in ``pretrained``. Tokens unknown
    to both are skipped.

    Parameters
    ----------
    x : iterable of (str, labels)
        Comment text and its label vector.
    model : mapping
        Primary word -> vector lookup; must raise ``KeyError`` for unknown words.
    lemmatize : bool, default True
        Whether to lemmatize the tokens before the lookup.
    pretrained : mapping
        Fallback word -> vector lookup (defaults to the Google News vectors).

    Returns
    -------
    list of (numpy.ndarray, labels)
        One entry per comment with at least one known token; the array stacks
        the token vectors in order. Comments without any known token are
        dropped, so the result can be shorter than ``x``.
    """
    datalist = []

    # The tokenizer only carries configuration, so one instance serves all comments
    tk = TweetTokenizer()
    # If both lookups are the same object, retrying a failed lookup cannot succeed
    use_fallback = pretrained is not model

    for pair in x:
        splitcomment = tk.tokenize(pair[0])

        if lemmatize:
            verbs_lemmatized = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in splitcomment]
            splitcomment = [wordnet_lemmatizer.lemmatize(word, pos='n') for word in verbs_lemmatized]

        commentlist = []
        for word in splitcomment:
            try:
                commentlist.append(model[word])
            except KeyError:
                if use_fallback:
                    try:
                        commentlist.append(pretrained[word])
                    except KeyError:
                        # out-of-vocabulary in both lookups: skip the token on purpose
                        pass

        if commentlist:
            datalist.append((np.array(commentlist), pair[1]))

    return datalist

In [55]:
traindata_labels = convert_to_w2v_nonstrat(traindata_labels, googlevecs, lemmatize=True)

In [35]:
testdata_labels = convert_to_w2v_nonstrat(test_val[1], googlevecs, lemmatize=True)

In [36]:
valdata_labels = convert_to_w2v_nonstrat(test_val[2], googlevecs, lemmatize=True)

### Prepare input for Keras neural network

In [57]:
# Separate the embedding matrices from their label vectors for each split.
# dtype=object stores every comment as its own array instead of one rectangular
# tensor — presumably because the comments differ in token count.
# NOTE(review): if every comment in a split had the same number of tokens,
# np.array(..., dtype=object) would build a multi-dimensional object array instead.
train_text = np.array([pair[0] for pair in traindata_labels], dtype=object)
train_labels = np.array([pair[1] for pair in traindata_labels])

val_text = np.array([pair[0] for pair in valdata_labels], dtype=object)
val_labels = np.array([pair[1] for pair in valdata_labels])

test_text = np.array([pair[0] for pair in testdata_labels], dtype=object)
test_labels = np.array([pair[1] for pair in testdata_labels])

# Train model

In [41]:
from keras.models import Model
from keras.layers import Input, Conv1D, Dense, GlobalMaxPooling1D, GlobalAveragePooling1D, Concatenate, Dropout
from keras import optimizers
import tensorflow as tf

In [56]:
# Stop training once the validation precision has not improved for 2 consecutive
# epochs and roll the model back to the weights of the best epoch.
stop_callback = tf.keras.callbacks.EarlyStopping(monitor='val_precision', 
                                                 mode="max", patience=2, 
                                                 restore_best_weights=True)

In [58]:
# Build the model: a 1D CNN over sequences of word vectors.
# Plain SGD with momentum and a constant learning rate (decay=0.0).
learning_rate = 1e-3
my_sgd = optimizers.SGD(learning_rate=learning_rate, decay=0.0, momentum=0.9)

# Sequence length is left open (None); every time step is a 300-dimensional vector
i = Input((None, 300))

# Three stacked convolutions with growing kernel size (1, 2, 3) and filter count;
# global max pooling then collapses the variable-length sequence to a fixed-size vector.
conv1 = Conv1D(128, kernel_size=1, padding='valid', activation='relu')(i)
conv2 = Conv1D(256, kernel_size=2, padding='same', activation='relu')(conv1)
conv3 = Conv1D(512, kernel_size=3, strides=2, padding='same', activation='relu')(conv2)
poolmax = GlobalMaxPooling1D()(conv3)
dropmax = Dropout(0.25)(poolmax)

d = Dense(400, activation='relu')(dropmax)
drop2 = Dropout(0.25)(d)

# Six sigmoid units: one independent score per label (multi-label output)
o = Dense(6, activation='sigmoid')(drop2)

New_CNN = Model(i, o)

# binary_crossentropy matches the per-label sigmoid outputs
New_CNN.compile(loss='binary_crossentropy', optimizer=my_sgd, metrics=['Precision', 'Recall'])

def generate_inputs():
    """Endlessly yield the training samples one by one as batches of size 1.

    Reads the module-level ``train_text`` and ``train_labels`` and restarts
    from the first sample after the last one. Each item is a tuple of an array
    of shape (1, n_tokens, 300) and an array of shape (1, 6).
    """
    while True:
        for embedded_comment, target in zip(train_text, train_labels):
            n_tokens = embedded_comment.shape[0]
            batch_x = embedded_comment.reshape(1, n_tokens, 300)
            batch_y = target.reshape(1, 6)
            yield batch_x, batch_y

def generate_vals():
    """Endlessly yield the validation samples one by one as batches of size 1.

    Reads the module-level ``val_text`` and ``val_labels`` and restarts from
    the first sample after the last one. Each item is a tuple of an array of
    shape (1, n_tokens, 300) and an array of shape (1, 6).
    """
    while True:
        for embedded_comment, target in zip(val_text, val_labels):
            n_tokens = embedded_comment.shape[0]
            batch_x = embedded_comment.reshape(1, n_tokens, 300)
            batch_y = target.reshape(1, 6)
            yield batch_x, batch_y
            
# Train the model.
# The generators yield one comment per step, so steps_per_epoch / validation_steps
# equal to the number of samples make every epoch cover each sample exactly once.
train_steps = len(train_labels)
val_steps = len(val_labels)
# At most 8 epochs; stop_callback may end training earlier.
New_CNN.fit(generate_inputs(), steps_per_epoch=train_steps, verbose=1, epochs=8, 
            validation_data=generate_vals(), validation_steps=val_steps, callbacks=[stop_callback])
#New_CNN.summary()

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8


<keras.callbacks.History at 0x21fcc6a5ed0>

In [65]:
New_CNN.save("C:/Users/gande/Desktop/ProjektKlassifikation/Bautzen_toxic_CNN.h5")

# Test model

In [81]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss

In [67]:
def all_predictions(x_test, model):
    """Predict the six binary labels for every comment in ``x_test``.

    Comments are fed to the model one at a time (batch size 1) because their
    token counts differ. The predicted scores are rounded to 0 or 1.

    Parameters
    ----------
    x_test : iterable of numpy.ndarray
        One array of shape (n_tokens, 300) per comment.
    model : object
        Anything with a Keras-style ``predict(x, steps=..., verbose=...)``
        method that returns an array of shape (1, 6).

    Returns
    -------
    numpy.ndarray
        Float64 array of shape (len(x_test), 6) holding the rounded
        predictions; shape (0, 6) if ``x_test`` is empty.
    """
    rounded = []

    for x in x_test:
        comment = x.reshape(1, x.shape[0], 300)
        rounded.append(np.round(model.predict(comment, steps=1, verbose=0)))

    if not rounded:
        return np.zeros((0, 6))

    # Stack once at the end; growing the array with np.concatenate inside the
    # loop copied all earlier predictions on every iteration.
    y_pred = np.concatenate(rounded).astype(np.float64)

    return y_pred

In [68]:
y_pred = all_predictions(test_text, New_CNN)

### Precision

In [73]:
#weighted macro-averaged precision
prec_weighted = precision_score(test_labels, y_pred, average="weighted")
prec_weighted

0.6901659802349205

In [74]:
#micro-averaged
prec_micro = precision_score(test_labels, y_pred, average='micro')
prec_micro

0.6629640456007015

### Recall

In [76]:
#weighted macro-averaged recall
rec_weighted = recall_score(test_labels, y_pred, average='weighted')
rec_weighted

0.6261733848702374

In [78]:
#micro-averaged recall
rec_micro = recall_score(test_labels, y_pred, average='micro')
rec_micro

0.6261733848702374

### F1 score

In [79]:
#weighted macro-averaged F1 score
f1_weighted = f1_score(test_labels, y_pred, average="weighted")
f1_weighted

0.6493197546376549

In [80]:
#micro-averaged F1 score
f1_mic = f1_score(test_labels, y_pred, average='micro')
f1_mic

0.6440437313644753