In [1]:
import numpy as np
import torch
from torch import nn
import math
from pprint import pprint
import pandas as pd
from tqdm.notebook import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
train = pd.read_csv("./train.csv")
test_comments = pd.read_csv("./test.csv")
test_labels = pd.read_csv("./test_labels.csv")

In [3]:
test = pd.merge(test_comments, test_labels, on="id")

In [4]:
test.toxic = test_labels.toxic.clip(0)
test.severe_toxic = test_labels.severe_toxic.clip(0)
test.obscene = test_labels.obscene.clip(0)
test.threat = test_labels.threat.clip(0)
test.insult = test_labels.insult.clip(0)
test.identity_hate = test_labels.identity_hate.clip(0)

def make_nontoxic(df):
  df["nontoxic"] = (1-df.toxic) * (1-df.severe_toxic) * (1 - df.obscene) * (1 - df.threat) * (1 - df.insult)

make_nontoxic(train)
make_nontoxic(test)
test

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,nontoxic
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,0,0,0,0,0,0,1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,0,0,0,0,0,0,1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",0,0,0,0,0,0,1
3,00017563c3f7919a,":If you have a look back at the source, the in...",0,0,0,0,0,0,1
4,00017695ad8997eb,I don't anonymously edit articles at all.,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu...",0,0,0,0,0,0,1
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...,0,0,0,0,0,0,1
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ...",0,0,0,0,0,0,1
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the...",0,0,0,0,0,0,1


In [5]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams

In [6]:
columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate", "nontoxic"]

In [43]:
counts = dict()
tokenize = lambda x: list(ngrams(list(x), 3))

tokens = train.comment_text.apply(tokenize)
targets = train[columns].to_numpy()
n_tokens = tokens.apply(len)

In [44]:
for i, comment_tokens in enumerate(tokens):
    if i % 10000 == 0:
        print(i)
    for token in comment_tokens:
        if token not in counts:
            counts[token] = np.zeros(7)
        counts[token] = counts[token] + targets[i]

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000


In [45]:
words = []
probs = []
for i, (k, v) in enumerate(counts.items()):
    words.append(k)
    probs.append(v)

In [46]:
p = np.concatenate([prob.reshape(1, -1) for prob in probs], axis=0)
p

array([[2.2000e+01, 0.0000e+00, 1.5000e+01, ..., 1.0000e+01, 1.0000e+00,
        8.2700e+02],
       [2.2000e+02, 3.0000e+00, 9.8000e+01, ..., 7.5000e+01, 1.2000e+01,
        8.6720e+03],
       [1.3470e+03, 5.7000e+01, 6.2300e+02, ..., 5.1100e+02, 9.0000e+01,
        3.4215e+04],
       ...,
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00],
       [0.0000e+00, 0.0000e+00, 0.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        1.0000e+00]])

In [47]:
total_tokens = np.sum(targets*n_tokens.to_numpy().reshape(-1, 1), axis=0)
total_tokens

array([ 4484905,   720358,  2406147,   146144,  2168418,   430690,
       57702764], dtype=int64)

In [48]:
label_counts = np.sum(targets, axis=0)
label_counts

array([ 15294,   1595,   8449,    478,   7877,   1405, 143400],
      dtype=int64)

In [49]:
p = p / np.sum(targets*n_tokens.to_numpy().reshape(-1, 1), axis=0)

In [50]:
top = np.argsort(p, axis=0)[:500:-1]

for i in range(50):
    print(top[i])


[   15  4877   623  4877   623 17106    15]
[623 819 622 135 622  15  16]
[4877  288   15  623   15  623   17]
[ 622 3871  624  622 4877  622  356]
[   16   832   288  5156   624 36030   280]
[  624   831    16   624    16 17604   405]
[  280  3870  4877 36477   288 36029   357]
[   17   623   280   365   280 36031   140]
[ 354  622  354 8396  530 7841  354]
[405 289 819 832 354 822 257]
[ 530 3843   17  831  819  624  167]
[ 167  833  530 1919  405  819   97]
[   97  1524   289 10516    17    16   398]
[  257  3872   405 24440   167   257   424]
[ 356 4485 3843  796  289  160    8]
[  357   624   167   280   832 12474   120]
[ 135  818   97 2071   97  405  143]
[ 55 272 257  15 831 280 623]
[ 288 1340 1524  302  272  530  622]
[  372   135   356 11958    55  3870   425]
[  438 10380   832   354   257    34   185]
[ 143  280 3871 2861 3843  288   55]
[  272   530  3870 10514   438   378   438]
[140 354 272 582 356 354 372]
[424 796 831 356 135 167  75]
[ 289  755  135  357 1524   17  5

In [54]:
words[623]

('y', 'o', 'u')

In [81]:
p[182]

array([8.88998091e-03, 3.41545353e-02, 1.28144669e-02, 1.46902609e-02,
       1.45218352e-02, 4.91852957e-03, 9.24038592e-05])

In [55]:
label_probs = np.sum(train[columns].to_numpy(), axis=0) / len(train)
label_probs

array([0.09584448, 0.00999555, 0.05294822, 0.00299553, 0.04936361,
       0.00880486, 0.89865953])

In [60]:
def predict(comment):
    tokens = tokenize(comment)
    prob = np.ones(7)
    for token in tokens:
        if token in counts:
            token_prob = counts[token] / total_tokens
            prob *= token_prob
        
    pred = prob[:6] > prob[6]
    return pred

results = test.comment_text.apply(predict)
results

0         [False, False, False, False, False, False]
1         [False, False, False, False, False, False]
2         [False, False, False, False, False, False]
3         [False, False, False, False, False, False]
4         [False, False, False, False, False, False]
                             ...                    
153159    [False, False, False, False, False, False]
153160    [False, False, False, False, False, False]
153161    [False, False, False, False, False, False]
153162    [False, False, False, False, False, False]
153163    [False, False, False, False, False, False]
Name: comment_text, Length: 153164, dtype: object

In [61]:
pred = np.concatenate([result.reshape(1, -1).astype(np.int32) for result in results], axis=0)
true = test[columns[:-1]].to_numpy()

In [34]:
from sklearn.metrics import roc_auc_score

In [58]:
def calc_auc(pred, target):
    result = []
    for i in range(pred.shape[1]):
        if len(np.unique(target[:,i])) == 2:
            result.append(roc_auc_score(target[:,i], pred[:,i], labels=[0,1]))
        else:
            extra = np.array([1-target[0,i]])
            target_i = np.concatenate((target[:,i], extra))
            pred_i = np.concatenate((pred[:,i], extra))
            result.append(roc_auc_score(target_i, pred_i, labels=[0,1]))
            
    
    return result

In [62]:
calc_auc(pred, true)

[0.6140203261729466,
 0.5593260645604856,
 0.6114013956553215,
 0.5403327131777277,
 0.6000316547621218,
 0.5680884784660163]