In [1]:
import csv
import pandas as pd
import numpy as np
import re
import nltk
from gensim.models import word2vec, KeyedVectors
import logging
import math
import random

## Lade Daten

In [2]:
data = pd.read_csv('C:\\Users\\gande\\Desktop\\ProjektKlassifikation\\toxic_comments_data\\train.csv', 
                   sep=',', header=0, quotechar= '"', quoting=csv.QUOTE_MINIMAL, encoding='latin1')

In [3]:
testpath = 'C:\\Users\\gande\\Desktop\\ProjektKlassifikation\\toxic_comments_data\\test.csv'
test_df = pd.read_csv(testpath, sep=',', header=0, quotechar= '"', quoting=csv.QUOTE_MINIMAL, encoding='latin1')

In [4]:
testlabelpath = 'C:\\Users\\gande\\Desktop\\ProjektKlassifikation\\toxic_comments_data\\test_labels.csv'
test_label_df = pd.read_csv(testlabelpath, sep=',', header=0, quotechar= '"', quoting=csv.QUOTE_MINIMAL,
                            encoding='latin1')

### Entferne Kommentare mit [-1, -1, -1, -1, -1, -1] Labeling in Testdaten

In [5]:
def drop_minus_ones_comments(comments_df, label_df):
    
    for index, row in label_df.iterrows():
        rowlabels = [row['toxic'], row['severe_toxic'],
                     row['obscene'], row['threat'],
                     row['insult'], row['identity_hate']]
        if -1 in rowlabels:
            comments_df = comments_df.drop([index], axis=0)
            
    return comments_df.reset_index(drop=True)

In [6]:
def drop_minus_ones_labels(label_df):
    for index, row in label_df.iterrows():
        rowlabels = [row['toxic'], row['severe_toxic'],
                     row['obscene'], row['threat'],
                     row['insult'], row['identity_hate']]
        
        if -1 in rowlabels:
            label_df = label_df.drop([index], axis=0)
            
    return label_df.reset_index(drop=True)

In [7]:
test_df1 = drop_minus_ones_comments(test_df[0:70000], test_label_df[0:70000])

In [8]:
test_df2 = drop_minus_ones_comments(test_df[70000:], test_label_df[70000:])

In [9]:
test_label_df1 = drop_minus_ones_labels(test_label_df[0:70000])

In [10]:
test_label_df2 = drop_minus_ones_labels(test_label_df[70000:])

In [11]:
test_df = pd.concat([test_df1, test_df2])
test_df = test_df.reset_index(drop=True)
test_label_df = pd.concat([test_label_df1, test_label_df2])
test_label_df = test_label_df.reset_index(drop=True)

#### Prepare train and test labels

In [12]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_train = [[row[l] for l in labels] for index, row in data.iterrows()]

In [15]:
y_test = [[row[l] for l in labels] for index, row in test_label_df.iterrows()]

### Concatenate train and test data for cross validation

In [17]:
all_data = pd.concat([data, test_df], sort=False)
all_data = all_data.reset_index(drop=True)
all_labels = y_train
all_labels.extend(y_test)

### Preprocess

In [20]:
def prepare_data(df):
    
    """remove punctuation, 
    convert to lowercase
    """
    
    corpus = []
    for index, row in df.iterrows():
        corpus.append([re.sub("[^a-zA-Z']", ' ', 
                      row['comment_text'].lower())])
        
    return np.ravel(corpus).tolist()

In [21]:
traindata = prepare_data(all_data)
X = traindata
y = all_labels

#### Load Google vectors

In [23]:
path = 'C:\\Users\\gande\\GoogleNews-vectors-negative300.bin'
googlevecs = KeyedVectors.load_word2vec_format(path, binary=True)

#### define functions for evaluation

In [24]:
def all_predictions(x_test, model):

    predarr = np.zeros(6).reshape(1, 6)

    for x in x_test:
        comment = x.reshape(1, x.shape[0], 300)
        prediction = np.round(model.predict(comment, steps=1))
        predarr = np.concatenate((predarr, prediction))    
    
    y_pred = predarr[1:]
    
    return y_pred

In [25]:
def exact_match_ratio(test_data, y_true, mymodel):
    
    comparisons = []
    toxic = []

    for idx, cl in enumerate(y_true):
        
        comment = test_data[idx]
        
        x = comment.reshape(1, comment.shape[0], 300)
        prediction = np.round(mymodel.predict(x, steps=1))
        prediction = prediction[0]
            
        same = np.array_equal(cl, prediction)
        
        if same and np.any(prediction):
            toxic.append(cl)
        
        comparisons.append(same)
    
    comp = np.array(comparisons)
    correct = np.count_nonzero(comp)
    total = len(comp)
    
    print("correctly classified toxic comments: ", len(toxic))
    
    return (correct/total) * 100

In [26]:
def get_scores(x_test, model):
    
    """compute probability scores
    for each label
    
    """
    
    predarr = np.zeros(6).reshape(1, 6)

    for x in x_test:
        comment = x.reshape(1, x.shape[0], 300)
        prediction = model.predict(comment, steps=1)
        predarr = np.concatenate((predarr, prediction))
    
    y_pred = predarr[1:]
    
    return y_pred

In [27]:
def compare_labels(predictions, truelabels, labelname):
    
    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    labelindex = labels.index(labelname)
    
    count = 0
    
    for idx, labelset in enumerate(predictions):
        if labelset[labelindex] and truelabels[idx][labelindex]:
            count+= 1
    
    return count

## Prepare data for cross validation

Generate stratified folds. The following algorithm for stratified sampling of multi-label data was implemented following Sechidis et al.'s (2011) proposals.

In [28]:
def iterative_stratification(dataset, datalabels, labelnames, k=6):
    
    #put all negative samples in extra list
    all_neg = []
    data_and_labels = [x for x in zip(dataset, datalabels)]
    
    for d in data_and_labels:
        if not np.any(d[1]):
            all_neg.append(d)
    for f in all_neg:
        data_and_labels.remove(f)
    dataset = [z[0] for z in data_and_labels]
    datalabels = [z[1] for z in data_and_labels]
        
    #build dictionary that will contain actual subsets
    actual_subsets = dict()
    for n in range(1,k+1):
        actual_subsets[n] = []
    
    #Calculate desired number of samples per subset
    subsets = dict()
    proportion = 1/k
    subset_size = len(dataset) * proportion
    
    for i in range(1,k+1):
        
        subsets[i] = dict()
        subsets[i]['current_size'] = subset_size
        
    #Calculate desired number of samples of each label in each subset
    current_labelcount = dict()
    for l in labelnames:
        #Find the examples of each label in the initial set
        labelindex = labelnames.index(l)
        total_count_label = len([labelset[labelindex] for labelset in datalabels if labelset[labelindex]])
        current_labelcount[l] = total_count_label
        for k in subsets.keys():
            #we want the same number in all subsets if possible
            subsets[k][l] = proportion * total_count_label
            
    while len(dataset) > 0:
        #Find label with the fewest (but at least one) remaining samples, 
        nonempty = {label:count for (label, count) in current_labelcount.items() if current_labelcount[label] > 0}
        nonzero_counts = np.array(list(nonempty.values()))
        try:
            sparsest = np.argmin(nonzero_counts)
            number = np.min(nonzero_counts)
            name_of_label = list(nonempty.keys())[sparsest]
            index_of_label = list(current_labelcount.keys()).index(name_of_label)
            #Then, for each sample (x, Y ) with this label, select
            #an appropriate subset for distribution.
            distributed_pairs = []
            for idx, s in enumerate(dataset):
                if datalabels[idx][index_of_label] == 1:
                    #Find the subset with the largest number of desired samples for this label
                    desired_numbers_label = [subsets[k][name_of_label] for k in subsets.keys()]
                    max_desired_number = np.max(desired_numbers_label)
                    indices_maxima = np.where(desired_numbers_label == max_desired_number)[0]
                    howmany = len(indices_maxima)
                    if howmany == 1:
                        index_subset = np.argmax(desired_numbers_label)
                        put_in_subset = list(subsets.keys())[index_subset]
                    else:
                        #among the tying subsets, the one with the 
                        #highest number of desired examples gets selected
                        cand = [key for key in subsets.keys() if list(subsets.keys()).index(key) in indices_maxima]
                        desired_numbers_total = [subsets[j]['current_size'] for j in cand]
                        max_desired_total = np.max(desired_numbers_total)
                        indices_maxima_total = np.where(desired_numbers_total == max_desired_total)[0]
                        howmany_total = len(indices_maxima_total)
                        if howmany_total == 1:
                            index_subset_in_cand = np.argmax(desired_numbers_total)
                            put_in_subset = cand[index_subset_in_cand]
                        else:
                            #pick random element of cand
                            put_in_subset = random.choice(cand)
                    #Once the appropriate subset is selected, we add the sample (x, Y ) 
                    #to it and remove it from D.
                    actual_subsets[put_in_subset].append((s, datalabels[idx]))
                    distributed_pairs.append((s, datalabels[idx]))
                    #At the end of the iteration, we decrement the total number 
                    #of desired examples for subset m, cm
                    subsets[put_in_subset]['current_size'] = subsets[put_in_subset]['current_size'] - 1
                    #decrement the number of desired samples 
                    #for each label of this example in chosen subset
                    for labelind, lab in enumerate(datalabels[idx]):
                        if lab == 1:
                            name = labelnames[labelind]
                            subsets[put_in_subset][name] = subsets[put_in_subset][name] - 1
                            current_labelcount[name] = current_labelcount[name] - 1
                        
            data_and_labels = [x for x in zip(dataset, datalabels)]
            for p in distributed_pairs:
                data_and_labels.remove(p)
            dataset = [z[0] for z in data_and_labels]
            datalabels = [z[1] for z in data_and_labels]
        
        except ValueError:
            break
            
    #Samples that are not annotated with any label are distributed so as to 
    #balance the desired number of examples at each subset. 
    negs_per_subset = math.floor(len(all_neg) * proportion)
    if negs_per_subset:
        for everykey in actual_subsets.keys():
            add_negatives = random.sample(all_neg,  negs_per_subset)
            actual_subsets[everykey].extend(add_negatives)
        
    #SHUFFLE ACTUAL SUBSETS
    for v in actual_subsets.values():
        random.shuffle(v)
    
    return actual_subsets

In [29]:
stratified = iterative_stratification(X, y, labelnames=labels, k=5)

In [33]:
#control distribution
distribution = dict()
for key in stratified.keys():
    distribution[key] = dict()
    for l in labels:
        labelindex = labels.index(l)
        number = np.count_nonzero([x[1][labelindex] for x in stratified[key]])
        distribution[key][l] = number
    distribution[key]['total'] = len(stratified[key])
distribution

{1: {'toxic': 4277,
  'severe_toxic': 392,
  'obscene': 2428,
  'threat': 137,
  'insult': 2261,
  'identity_hate': 424,
  'total': 44713},
 2: {'toxic': 4277,
  'severe_toxic': 392,
  'obscene': 2428,
  'threat': 138,
  'insult': 2261,
  'identity_hate': 423,
  'total': 44699},
 3: {'toxic': 4277,
  'severe_toxic': 393,
  'obscene': 2428,
  'threat': 138,
  'insult': 2261,
  'identity_hate': 423,
  'total': 44701},
 4: {'toxic': 4276,
  'severe_toxic': 392,
  'obscene': 2428,
  'threat': 138,
  'insult': 2261,
  'identity_hate': 423,
  'total': 44727},
 5: {'toxic': 4277,
  'severe_toxic': 393,
  'obscene': 2428,
  'threat': 138,
  'insult': 2260,
  'identity_hate': 424,
  'total': 44708}}

## convert words in comments to vector embeddings

In [31]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gande\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
def convert_to_w2v_rep(stratdict, model, lemmatize=True, pretrained=googlevecs):    
    
    newstratdict = dict()
    
    for key in stratdict.keys():
        newstratdict[key] = []
        
        for pair in stratdict[key]:
            splitcomment = pair[0].split()
            commentlist = []
            
            if lemmatize:
                verbs_lemmatized = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in splitcomment]
                splitcomment = [wordnet_lemmatizer.lemmatize(word, pos='n') for word in verbs_lemmatized]
            
            for word in splitcomment:
                try:
                    commentlist.append(model[word])

                except KeyError:
                    try:
                        commentlist.append(pretrained[word])
                    except KeyError:
                        pass
            
            if len(commentlist) != 0:
                commentarr = np.array(commentlist)
                newstratdict[key].append((commentarr, pair[1]))
            
    return newstratdict

In [33]:
all_folds = convert_to_w2v_rep(stratified, googlevecs, lemmatize=True)

## Perform cross validation

In [37]:
from keras.models import Model
from keras.layers import Input, Conv1D, Dense, GlobalMaxPooling1D, Dropout
from keras import optimizers

Using TensorFlow backend.


In [31]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import hamming_loss

In [56]:
def cross_validate(stratified, epochs=5):

    accuracy = []
    precision_macro_weighted = []
    precision_micro = []
    recall_macro_weighted = []
    recall_micro = []
    f1_macro_weighted = []
    f1_micro = []
    my_hamming_loss = []
    
    correct_toxic = []
    correct_severe_toxic = []
    correct_obscene = []
    correct_threat = []
    correct_insult = []
    correct_hate = []
    

    for fold in stratified.keys():
        trainfolds = [f for f in stratified.keys() if f != fold]
        train_data = []
        train_labels = []
        for t in trainfolds:
            data_in_t = [pair[0] for pair in stratified[t]]
            labels_in_t = [pair[1] for pair in stratified[t]]
            train_data.extend(data_in_t)
            train_labels.extend(labels_in_t)
        train_data = np.array(train_data)
        train_labels = np.array(train_labels)
        train_steps = len(train_labels)
        
        val_data = np.array([p[0] for p in stratified[fold]])
        val_labels = np.array([p[1] for p in stratified[fold]])
        
        #build model
        learning_rate = 1e-3
        my_sgd = optimizers.SGD(lr=learning_rate, decay=0.0, momentum=0.9)

        i = Input((None, 300))

        conv1 = Conv1D(200, kernel_size=1, padding='valid', activation='relu')(i)
        conv2 = Conv1D(200, kernel_size=2, padding='same', activation='relu')(conv1)
        pool = GlobalMaxPooling1D()(conv2)
        drop = Dropout(0.25)(pool)

        d = Dense(400, activation='relu')(drop)
        drop2 = Dropout(0.25)(d)

        o = Dense(6, activation='sigmoid')(drop2)      

        My_CNN = Model(i, o)

        My_CNN.compile(loss='binary_crossentropy', optimizer=my_sgd, metrics=['accuracy'])

        def generate_inputs():

            while True:

                for pair in zip(train_data, train_labels):
                    x_train = pair[0].reshape(1, pair[0].shape[0], 300)
                    y_train = pair[1].reshape(1, 6)
                    yield x_train, y_train
                    
        #train_model
        My_CNN.fit_generator(generate_inputs(), steps_per_epoch=train_steps, verbose=0, epochs=epochs)


        #evaluate model
        y_pred = all_predictions(val_data, My_CNN)
        acc = exact_match_ratio(val_data, val_labels, My_CNN)
        accuracy.append(acc)
        prec_weighted = precision_score(val_labels, y_pred, average='weighted')
        precision_macro_weighted.append(prec_weighted)
        prec_micro = precision_score(val_labels, y_pred, average='micro')
        precision_micro.append(prec_micro)
        rec_weighted = recall_score(val_labels, y_pred, average='weighted')
        recall_macro_weighted.append(rec_weighted)
        rec_micro = recall_score(val_labels, y_pred, average='micro')
        recall_micro.append(rec_micro)
        f1_weighted = f1_score(val_labels, y_pred, average='weighted')
        f1_macro_weighted.append(f1_weighted)
        f1_mic = f1_score(val_labels, y_pred, average='micro')
        f1_micro.append(f1_mic)
        hamm_loss = hamming_loss(val_labels, y_pred)
        my_hamming_loss.append(hamm_loss)
        
        #calculate number of correctly predicted comments for each label
        corr_toxic = compare_labels(y_pred, val_labels, 'toxic')
        correct_toxic.append(corr_toxic)
        corr_sev_toxic = compare_labels(y_pred, val_labels, 'severe_toxic')
        correct_severe_toxic.append(corr_sev_toxic)
        corr_obscene = compare_labels(y_pred, val_labels, 'obscene')
        correct_obscene.append(corr_obscene)
        corr_threat = compare_labels(y_pred, val_labels, 'threat')
        correct_threat.append(corr_threat)
        corr_insult = compare_labels(y_pred, val_labels, 'insult')
        correct_insult.append(corr_insult)
        corr_hate = compare_labels(y_pred, val_labels, 'identity_hate')
        correct_hate.append(corr_hate)
        
        print("Finished training fold")
    
    print("\n")
    print("mean accuracy: %.2f%% (+/- %.2f)" % (np.mean(accuracy), np.std(accuracy)))
    print("mean weighted macro-averaged precision: %.3f (+/- %.3f)" % (np.mean(precision_macro_weighted), np.std(precision_macro_weighted)))
    print("mean micro-averaged precision: %.3f (+/- %.3f)" % (np.mean(precision_micro), np.std(precision_micro)))
    print("mean weighted macro-averaged recall: %.3f (+/- %.3f)" % (np.mean(recall_macro_weighted), np.std(recall_macro_weighted)))
    print("mean micro-averaged recall: %.3f (+/- %.3f)" % (np.mean(recall_micro), np.std(recall_micro)))
    print("mean weighted macro-averaged F1 score: %.3f (+/- %.3f)" % (np.mean(f1_macro_weighted), np.std(f1_macro_weighted)))
    print("mean micro-averaged F1 score: %.3f (+/- %.3f)" % (np.mean(f1_micro), np.std(f1_micro)))
    print("mean hamming loss: %.3f (+/- %.3f)" % (np.mean(my_hamming_loss), np.std(my_hamming_loss)))
    print("\n")
    
    print("Average number of correctly predicted samples per label: ")
    print("\n")
    print("toxic: ", int(np.mean(correct_toxic)))
    print("severe_toxic: ", int(np.mean(correct_severe_toxic)))
    print("obscene: ", int(np.mean(correct_obscene)))
    print("threat: ", int(np.mean(correct_threat)))
    print("insult: ", int(np.mean(correct_insult)))
    print("identity_hate: ", int(np.mean(correct_hate)))

In [57]:
cross_validate(all_folds, epochs=5)

correctly classified toxic comments:  1515
Finished training fold
correctly classified toxic comments:  1459
Finished training fold
correctly classified toxic comments:  1411
Finished training fold
correctly classified toxic comments:  1389
Finished training fold
correctly classified toxic comments:  1492
Finished training fold


mean accuracy: 91.89% (+/- 0.08)
mean weighted macro-averaged precision: 0.795 (+/- 0.012)
mean micro-averaged precision: 0.801 (+/- 0.012)
mean weighted macro-averaged recall: 0.661 (+/- 0.014)
mean micro-averaged recall: 0.661 (+/- 0.014)
mean weighted macro-averaged F1 score: 0.713 (+/- 0.005)
mean micro-averaged F1 score: 0.724 (+/- 0.005)
mean hamming loss: 0.019 (+/- 0.000)


Average number of correctly predicted samples per label: 


toxic:  3029
severe_toxic:  60
obscene:  1781
threat:  12
insult:  1513
identity_hate:  161


# Compare results to dummy classifier

In [35]:
from sklearn.dummy import DummyClassifier

In [27]:
stratified_dummy = iterative_stratification(X, y, labelnames=labels, k=5)

In [34]:
all_folds_dummy = convert_to_w2v_rep(stratified_dummy, googlevecs, lemmatize=True)

In [213]:
def cross_validate_dummy(stratified, pred_strategy):

    accuracy = []
    precision_macro_weighted = []
    precision_micro = []
    recall_macro_weighted = []
    recall_micro = []
    f1_macro_weighted = []
    f1_micro = []
    my_hamming_loss = []
    
    correct_toxic = []
    correct_severe_toxic = []
    correct_obscene = []
    correct_threat = []
    correct_insult = []
    correct_hate = []
    

    for fold in stratified.keys():
        trainfolds = [f for f in stratified.keys() if f != fold]
        train_data = []
        train_labels = []
        for t in trainfolds:
            data_in_t = [pair[0] for pair in stratified[t]]
            labels_in_t = [pair[1] for pair in stratified[t]]
            train_data.extend(data_in_t)
            train_labels.extend(labels_in_t)
        train_data = np.array(train_data)
        train_labels = np.array(train_labels)
        #train_steps = len(train_labels)
        
        val_data = np.array([p[0] for p in stratified[fold]])
        val_labels = np.array([p[1] for p in stratified[fold]])
        
        #initiate classifier
        clf = DummyClassifier(strategy=pred_strategy)
                    
        #fit classifier
        clf.fit(train_data, train_labels)

        #evaluate model
        y_pred = clf.predict(val_data)
        acc = clf.score(val_data, val_labels)
        accuracy.append(acc)
        prec_weighted = precision_score(val_labels, y_pred, average='weighted')
        precision_macro_weighted.append(prec_weighted)
        prec_micro = precision_score(val_labels, y_pred, average='micro')
        precision_micro.append(prec_micro)
        rec_weighted = recall_score(val_labels, y_pred, average='weighted')
        recall_macro_weighted.append(rec_weighted)
        rec_micro = recall_score(val_labels, y_pred, average='micro')
        recall_micro.append(rec_micro)
        f1_weighted = f1_score(val_labels, y_pred, average='weighted')
        f1_macro_weighted.append(f1_weighted)
        f1_mic = f1_score(val_labels, y_pred, average='micro')
        f1_micro.append(f1_mic)
        hamm_loss = hamming_loss(val_labels, y_pred)
        my_hamming_loss.append(hamm_loss)
        
        #calculate number of correctly predicted comments for each label
        corr_toxic = compare_labels(y_pred, val_labels, 'toxic')
        correct_toxic.append(corr_toxic)
        corr_sev_toxic = compare_labels(y_pred, val_labels, 'severe_toxic')
        correct_severe_toxic.append(corr_sev_toxic)
        corr_obscene = compare_labels(y_pred, val_labels, 'obscene')
        correct_obscene.append(corr_obscene)
        corr_threat = compare_labels(y_pred, val_labels, 'threat')
        correct_threat.append(corr_threat)
        corr_insult = compare_labels(y_pred, val_labels, 'insult')
        correct_insult.append(corr_insult)
        corr_hate = compare_labels(y_pred, val_labels, 'identity_hate')
        correct_hate.append(corr_hate)
        
               
    print("\n")
    print("mean accuracy: %.2f%% (+/- %.2f)" % (np.mean(accuracy), np.std(accuracy)))
    print("mean weighted macro-averaged precision: %.3f (+/- %.3f)" % (np.mean(precision_macro_weighted), np.std(precision_macro_weighted)))
    print("mean micro-averaged precision: %.3f (+/- %.3f)" % (np.mean(precision_micro), np.std(precision_micro)))
    print("mean weighted macro-averaged recall: %.3f (+/- %.3f)" % (np.mean(recall_macro_weighted), np.std(recall_macro_weighted)))
    print("mean micro-averaged recall: %.3f (+/- %.3f)" % (np.mean(recall_micro), np.std(recall_micro)))
    print("mean weighted macro-averaged F1 score: %.3f (+/- %.3f)" % (np.mean(f1_macro_weighted), np.std(f1_macro_weighted)))
    print("mean micro-averaged F1 score: %.3f (+/- %.3f)" % (np.mean(f1_micro), np.std(f1_micro)))
    print("mean hamming loss: %.3f (+/- %.3f)" % (np.mean(my_hamming_loss), np.std(my_hamming_loss)))
    print("\n")
    
    print("Average number of correctly predicted samples per label: ")
    print("\n")
    print("toxic: ", int(np.mean(correct_toxic)))
    print("severe_toxic: ", int(np.mean(correct_severe_toxic)))
    print("obscene: ", int(np.mean(correct_obscene)))
    print("threat: ", int(np.mean(correct_threat)))
    print("insult: ", int(np.mean(correct_insult)))
    print("identity_hate: ", int(np.mean(correct_hate)))

In [210]:
cross_validate_dummy(all_folds_dummy, 'stratified')



mean accuracy: 0.72% (+/- 0.00)
mean weighted macro-averaged precision: 0.068 (+/- 0.002)
mean micro-averaged precision: 0.068 (+/- 0.002)
mean weighted macro-averaged recall: 0.068 (+/- 0.002)
mean micro-averaged recall: 0.068 (+/- 0.002)
mean weighted macro-averaged F1 score: 0.068 (+/- 0.002)
mean micro-averaged F1 score: 0.068 (+/- 0.002)
mean hamming loss: 0.069 (+/- 0.000)


Average number of correctly predicted samples per label: 


toxic:  416
severe_toxic:  5
obscene:  136
threat:  0
insult:  114
identity_hate:  4


In [211]:
cross_validate_dummy(all_folds_dummy, 'uniform')



mean accuracy: 0.02% (+/- 0.00)
mean weighted macro-averaged precision: 0.067 (+/- 0.000)
mean micro-averaged precision: 0.037 (+/- 0.000)
mean weighted macro-averaged recall: 0.501 (+/- 0.005)
mean micro-averaged recall: 0.501 (+/- 0.005)
mean weighted macro-averaged F1 score: 0.116 (+/- 0.001)
mean micro-averaged F1 score: 0.069 (+/- 0.001)
mean hamming loss: 0.500 (+/- 0.000)


Average number of correctly predicted samples per label: 


toxic:  2127
severe_toxic:  200
obscene:  1222
threat:  69
insult:  1133
identity_hate:  212
