# Toxic Spans Detection

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
from termcolor import colored
import string
from nltk.tokenize import word_tokenize 
from tqdm import trange

![title](pipeline.jpeg)

- [1. Exploración](#1)
  - [1.1 Mal etiquetados](#1.1)
- [2. Preprocesamiento](#2)
- [3. Modelos](#3)
  - [3.1 Hidden Markov Model](#3.1)
  - [3.2 Conditional Random Fields for Sequence Prediction](#3.2)
  - [3.3 LSTM](#3.3)

<a name = "1"></a>
## Exploración

In [2]:
from utils import color_toxic_words

In [3]:
train = pd.read_csv('Datos/tsd_train.csv', converters={'spans':ast.literal_eval})
test = pd.read_csv('Datos/tsd_trial.csv', converters={'spans':ast.literal_eval})

In [4]:
train.head()

Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",What a knucklehead. How can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38]","""who do you think should do the killing?""\n\nA..."


Veamos las secciones tóxicas de algunas oraciones:

In [5]:
# Show the first five training comments with their annotated toxic characters highlighted.
# The columns are selected by name so the unpacking does not depend on how many
# columns `train` currently has (the cell stays re-runnable).
for i,(index,text) in enumerate(train[['spans', 'text']].values[0:5]):
    print('{} - '.format(i) + color_toxic_words(index, text) + '\n'*2)

0 - A[0mn[0mo[0mt[0mh[0me[0mr[0m [0m[41mv[0m[41mi[0m[41mo[0m[41ml[0m[41me[0m[41mn[0m[41mt[0m[41m [0m[41ma[0m[41mn[0m[41md[0m[41m [0m[41ma[0m[41mg[0m[41mg[0m[41mr[0m[41me[0m[41ms[0m[41ms[0m[41mi[0m[41mv[0m[41me[0m[41m [0m[41mi[0m[41mm[0m[41mm[0m[41mi[0m[41mg[0m[41mr[0m[41ma[0m[41mn[0m[41mt[0m [0mk[0mi[0ml[0ml[0mi[0mn[0mg[0m [0ma[0m [0mi[0mn[0mn[0mo[0mc[0me[0mn[0mt[0m [0ma[0mn[0md[0m [0mi[0mn[0mt[0me[0ml[0ml[0mi[0mg[0me[0mn[0mt[0m [0mU[0mS[0m [0mC[0mi[0mt[0mi[0mz[0me[0mn[0m.[0m.[0m.[0m.[0m [0mS[0ma[0mr[0mc[0ma[0ms[0mm[0m


1 - I[0m [0ma[0mm[0m [0m5[0m6[0m [0my[0me[0ma[0mr[0ms[0m [0mo[0ml[0md[0m,[0m [0mI[0m [0ma[0mm[0m [0mn[0mo[0mt[0m [0my[0mo[0mu[0mr[0m [0m[41mf[0m[41mu[0m[41mc[0m[41mk[0m[41mi[0m[41mn[0m[41mg[0m [0mj[0mu[0mn[0mi[0mo[0mr[0m [0mp[0ma[0ml[0m.[0m [0mW[0mh[0ma[0mt[0m [0my[0mo[0mu[0

<a name = "1.1"></a>
### ¿Mal etiquetados?

En algunas oraciones se marcan los espacios, comas y otros signos como caracteres tóxicos; hay que tener cuidado y al final concatenarlos si es que hay dos o más palabras tóxicas juntas. En la competencia parecen concatenarlos, pero no son consistentes en otras partes.

In [6]:
# Index labels of the first five comments whose annotation covers more than 100 characters.
# They are used below as row positions, which assumes the default RangeIndex.
very_toxic = [i for i,x in zip(train.index, train['spans']) if len(x)>100][0:5]

# Columns are selected by name so the unpacking does not depend on how many
# columns `train` currently has (the cell stays re-runnable).
for i,(index,text) in zip(very_toxic, train[['spans', 'text']].values[very_toxic]):
    print('{} - '.format(i) + color_toxic_words(index, text) + '\n'*2)

158 - [41mT[0m[41mh[0m[41me[0m[41ms[0m[41me[0m[41m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m[41mi[0m[41mc[0m[41m [0m[41ms[0m[41mt[0m[41mu[0m[41md[0m[41mi[0m[41me[0m[41ms[0m[41m [0m[41ma[0m[41mr[0m[41me[0m[41m [0m[41mu[0m[41mp[0m[41ms[0m[41me[0m[41mt[0m[41mt[0m[41mi[0m[41mn[0m[41mg[0m[41m.[0m[41m [0m[41mW[0m[41mh[0m[41me[0m[41mr[0m[41me[0m[41m [0m[41mi[0m[41ms[0m[41m [0m[41mt[0m[41mh[0m[41me[0m[41m [0m[41mi[0m[41mn[0m[41mf[0m[41mo[0m[41mr[0m[41mm[0m[41ma[0m[41mt[0m[41mi[0m[41mo[0m[41mn[0m[41m [0m[41mt[0m[41mh[0m[41ma[0m[41mt[0m[41m [0m[41mi[0m[41ms[0m[41m [0m[41ms[0m[41mu[0m[41mr[0m[41mp[0m[41mr[0m[41mi[0m[41ms[0m[41mi[0m[41mn[0m[41mg[0m[41m.[0m[41m [0m[41mA[0m[41mn[0m[41my[0m[41m [0m[41mp[0m[41me[0m[41mr[0m[41ms[0m[41mo[0m[41mn[0m[41m [0m[41mi[0m[41mn[0m[41m [0m[41ma[0m[41mn[0m[41my[0m[41

Y hay otros que definitivamente están mal etiquetados, donde no ponen lenguaje tóxico donde realmente sí lo hay.

In [7]:
# Hand-picked row positions to display.
actually_toxic = [27, 60, 82, 134, 382]

# Columns are selected by name so the unpacking does not depend on how many
# columns `train` currently has (the cell stays re-runnable).
for index,text in train[['spans', 'text']].values[actually_toxic]:
    print(color_toxic_words(index, text) + '\n'*2)

T[0mh[0me[0m [0mm[0ma[0ml[0mi[0mg[0mn[0ma[0mn[0mt[0m [0mn[0ma[0mr[0mc[0mi[0ms[0ms[0mi[0ms[0mt[0m,[0m [0mm[0me[0mn[0mt[0ma[0ml[0ml[0my[0m [0md[0me[0mr[0ma[0mn[0mg[0me[0md[0m [0ms[0mo[0mc[0mi[0mo[0mp[0ma[0mt[0mh[0m,[0m [0mb[0mi[0mg[0mo[0mt[0m,[0m [0mr[0ma[0ms[0mc[0mi[0ms[0mt[0m,[0m [0md[0mi[0ms[0mi[0mn[0mg[0me[0mn[0mu[0mo[0mu[0ms[0m [0mb[0ml[0mo[0mw[0mh[0ma[0mr[0md[0m [0ma[0mn[0md[0m [0mp[0ma[0mt[0mh[0mo[0ml[0mo[0mg[0mi[0mc[0ma[0ml[0m [0ml[0mi[0ma[0mr[0m,[0m [0mh[0ma[0ms[0m [0mn[0mo[0mw[0m [0mc[0me[0mr[0mt[0mi[0mf[0mi[0me[0md[0m [0mh[0mi[0mm[0ms[0me[0ml[0mf[0m [0ma[0ms[0m [0ma[0m [0mf[0ma[0ms[0mc[0mi[0ms[0mt[0m.[0m [0mH[0mo[0mw[0m [0mm[0mu[0mc[0mh[0m [0ml[0mo[0mn[0mg[0me[0mr[0m [0mb[0me[0mf[0mo[0mr[0me[0m [0mh[0mi[0ms[0m [0mG[0mO[0mP[0m [0ms[0my[0mc[0mo[0mp[0mh[0ma[0mn[0mt[0ms[0m [0ma[0mn[0m

<a name = "2"></a>
## Preprocesamiento
Primero sería mejor trabajar con las palabras tóxicas en lugar de los caracteres. También hay que tener cuidado cuando marcan un espacio o algún signo de puntuación como tóxico; al final sólo hay que concatenar si hay dos palabras tóxicas juntas (por hacer). También hay que pasar todo a minúsculas y hacer el análisis en minúsculas.

In [8]:
import string
from utils import remove_symbols

In [9]:
text = 'fuck you idiot!!!'
# Mark every character offset of the sample sentence as toxic.
index = list(range(len(text)))

# Highlight the sentence with the raw offsets, then with the offsets
# returned by remove_symbols.
print(color_toxic_words(index, text))
index_without_symbols = remove_symbols(index, text)
print(color_toxic_words(index_without_symbols, text))

[41mf[0m[41mu[0m[41mc[0m[41mk[0m[41m [0m[41my[0m[41mo[0m[41mu[0m[41m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m[41m![0m[41m![0m[41m![0m
[41mf[0m[41mu[0m[41mc[0m[41mk[0m [0m[41my[0m[41mo[0m[41mu[0m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m![0m![0m![0m


In [10]:
# Apply remove_symbols to every (spans, text) pair and store the results
# in a new `spans_clean` column.
indices_clean = list(map(remove_symbols, train['spans'], train['text']))
train['spans_clean'] = indices_clean

Veamos cómo se ven si lo hacemos así:

In [11]:
# Index labels of the first three comments whose annotation covers more than 100 characters.
# They are used below as row positions, which assumes the default RangeIndex.
very_toxic = [i for i,x in zip(train.index, train['spans']) if len(x)>100][0:3]

# Print each comment twice: with the raw offsets and with the cleaned offsets.
# Columns are selected by name so the unpacking does not rely on column position.
for i, (index,text,index_clean) in zip(very_toxic, train[['spans', 'text', 'spans_clean']].values[very_toxic]):
    print('{} - '.format(i) + color_toxic_words(index, text))
    print('{} - '.format(i) + color_toxic_words(index_clean, text) + '\n'*2)

158 - [41mT[0m[41mh[0m[41me[0m[41ms[0m[41me[0m[41m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m[41mi[0m[41mc[0m[41m [0m[41ms[0m[41mt[0m[41mu[0m[41md[0m[41mi[0m[41me[0m[41ms[0m[41m [0m[41ma[0m[41mr[0m[41me[0m[41m [0m[41mu[0m[41mp[0m[41ms[0m[41me[0m[41mt[0m[41mt[0m[41mi[0m[41mn[0m[41mg[0m[41m.[0m[41m [0m[41mW[0m[41mh[0m[41me[0m[41mr[0m[41me[0m[41m [0m[41mi[0m[41ms[0m[41m [0m[41mt[0m[41mh[0m[41me[0m[41m [0m[41mi[0m[41mn[0m[41mf[0m[41mo[0m[41mr[0m[41mm[0m[41ma[0m[41mt[0m[41mi[0m[41mo[0m[41mn[0m[41m [0m[41mt[0m[41mh[0m[41ma[0m[41mt[0m[41m [0m[41mi[0m[41ms[0m[41m [0m[41ms[0m[41mu[0m[41mr[0m[41mp[0m[41mr[0m[41mi[0m[41ms[0m[41mi[0m[41mn[0m[41mg[0m[41m.[0m[41m [0m[41mA[0m[41mn[0m[41my[0m[41m [0m[41mp[0m[41me[0m[41mr[0m[41ms[0m[41mo[0m[41mn[0m[41m [0m[41mi[0m[41mn[0m[41m [0m[41ma[0m[41mn[0m[41my[0m[41

También pasemos a minúscula el texto para el entrenamiento:

In [12]:
# Lower-case the training text with the vectorised string accessor.
# NOTE(review): the spans are character offsets, so this assumes lower-casing
# never changes the length of a comment.
train['text'] = train['text'].str.lower()

<a name = "3"></a>
## Modelos

En la competencia usarán el F1 score para evaluar como sigue:

<blockquote cite="http://www.worldwildlife.org/who/index.html">
To evaluate the responses of a system participating in the challenge, we employ the $F_{1}$ score, as in [1]. Let system $A_i$ return a set $S^{t}_{A_{i}}$ of character offsets, for parts of the post $t$ found to be toxic. Let $S^{t}_{G}$ be the character offsets of the ground truth annotations of $t$. We compute the $F_{1}$ score of system $A_i$ with respect to the ground truth $G$ for post $t$ as follows, where $|\cdot|$ denotes set cardinality.
    
$$ F_{1}^{t}(A_{i}, G) = \dfrac{2 \cdot P^{t}(A_{i}, G) \cdot R^{t}(A_{i}, G)}{P^{t}(A_{i}, G) + R^{t}(A_{i}, G)}$$
    
$$ P^{t}(A_{i}, G) = \dfrac{|S^{t}_{A_{i}} \cap S^{t}_{G}|}{|S^{t}_{A_{i}}|} $$
    
$$ R^{t}(A_{i}, G) = \dfrac{|S^{t}_{A_{i}} \cap S^{t}_{G}|}{|S^{t}_{G}|} $$    
    
If $S^{t}_{G}$ is empty for some post $t$ (no gold spans are given for $t$), we set $F_{1}^{t}(A_{i}, G) = 1$ if $S^{t}_{A_{i}}$ is also empty, and $F_{1}^{t}(A_{i}, G) = 0$ otherwise. We finally average $F_{1}^{t}(A_{i}, G)$ over all the posts $t$ of an evaluation dataset $T$ to obtain a single score for system $A_{i}$.

</blockquote>

In [13]:
from utils import separate_words, get_index_toxic_words

In [14]:
# The following function comes from the competition's GitHub repository.
def f1(predictions, gold):
    """
    F1 (a.k.a. DICE) between two lists of offsets (e.g., character offsets).
    >>> assert f1([0, 1, 4, 5], [0, 1, 6]) == 0.5714285714285714
    :param predictions: a list of predicted offsets
    :param gold: a list of offsets serving as the ground truth
    :return: a score between 0 and 1; when ``gold`` is empty the score is 1
             if ``predictions`` is empty too and 0 otherwise
    """
    # No gold offsets: only an empty prediction counts as correct.
    if not len(gold):
        return 0. if len(predictions) else 1.
    # Gold offsets exist but nothing was predicted.
    if not len(predictions):
        return 0.
    predicted = set(predictions)
    truth = set(gold)
    overlap = len(predicted & truth)
    return float(2 * overlap) / float(len(predicted) + len(truth))

<a name = "3.1"></a>
### Hidden Markov Model

In [15]:
import nltk
from nltk.corpus import treebank
from nltk.tag import HiddenMarkovModelTrainer
from nltk.tokenize import word_tokenize

En nuestro caso debemos etiquetar en `toxic` y `non_toxic`.

In [16]:
# Build the HMM training set: one list of (token, tag) pairs per comment.
train_data = []
for index, text in zip(train['spans_clean'], train['text']):
    # Substrings covered by each group returned by separate_words (first to last
    # offset, inclusive); nothing is extracted when the comment has no offsets.
    groups = separate_words(index)
    toxic_words = [text[group[0]:group[-1] + 1] for group in groups] if len(index) > 0 else []

    tokens = word_tokenize(text)
    tagged_tokens = []

    for token in tokens:
        is_toxic = token in toxic_words
        if is_toxic:
            # Consume the match so each extracted toxic word tags at most one token;
            # a later repetition of the token is then tagged 'non_toxic'.
            toxic_words.remove(token)
        tagged_tokens.append((token, 'toxic' if is_toxic else 'non_toxic'))

    train_data.append(tagged_tokens)

Entrenamos:

In [17]:
# Fit a supervised HMM tagger on the (token, tag) sequences stored in train_data.
trainer = HiddenMarkovModelTrainer()
tagger_HMM = trainer.train_supervised(train_data)

Ahora hagamos algunos experimentos:

In [18]:
text = 'fucking piece of shit, suck my dick'
# Tokenise the sample sentence, tag it with the HMM, then highlight the
# indices that get_index_toxic_words derives from the tagged tokens.
sentence_tokens = word_tokenize(text)
tagged_sentence = tagger_HMM.tag(sentence_tokens)
toxic_index = get_index_toxic_words(text, tagged_sentence)

print(tagged_sentence)
print(color_toxic_words(toxic_index, text))

[('fucking', 'toxic'), ('piece', 'toxic'), ('of', 'toxic'), ('shit', 'toxic'), (',', 'non_toxic'), ('suck', 'toxic'), ('my', 'toxic'), ('dick', 'toxic')]
[41mf[0m[41mu[0m[41mc[0m[41mk[0m[41mi[0m[41mn[0m[41mg[0m [0m[41mp[0m[41mi[0m[41me[0m[41mc[0m[41me[0m [0m[41mo[0m[41mf[0m [0m[41ms[0m[41mh[0m[41mi[0m[41mt[0m,[0m [0m[41ms[0m[41mu[0m[41mc[0m[41mk[0m [0m[41mm[0m[41my[0m [0m[41md[0m[41mi[0m[41mc[0m[41mk[0m


In [19]:
text = 'have a nice day, my dear nigger'
# Tokenise the sample sentence, tag it with the HMM, then highlight the
# indices that get_index_toxic_words derives from the tagged tokens.
sentence_tokens = word_tokenize(text)
tagged_sentence = tagger_HMM.tag(sentence_tokens)
toxic_index = get_index_toxic_words(text, tagged_sentence)

print(tagged_sentence)
print(color_toxic_words(toxic_index, text))

[('have', 'non_toxic'), ('a', 'non_toxic'), ('nice', 'non_toxic'), ('day', 'non_toxic'), (',', 'non_toxic'), ('my', 'non_toxic'), ('dear', 'non_toxic'), ('nigger', 'toxic')]
h[0ma[0mv[0me[0m [0ma[0m [0mn[0mi[0mc[0me[0m [0md[0ma[0my[0m,[0m [0mm[0my[0m [0md[0me[0ma[0mr[0m [0m[41mn[0m[41mi[0m[41mg[0m[41mg[0m[41me[0m[41mr[0m


Veamos qué score tenemos en el dataset de `test` y comparemos nuestras predicciones:

In [20]:
# Per-comment F1 of the HMM tagger over the rows of `test`.
scores_HMM = []

for gold_index, text in test.values:
    # The tagger is fed lower-cased text; the original text is kept for display.
    lowered = text.lower()
    tagged_sentence = tagger_HMM.tag(word_tokenize(lowered))
    prediction_index = get_index_toxic_words(lowered, tagged_sentence)

    pred_label = colored('Pred: ', color='cyan', attrs=['bold'])
    gold_label = colored('Gold: ', color='yellow', attrs=['bold'])
    print(pred_label + color_toxic_words(prediction_index, text))
    print(gold_label + color_toxic_words(gold_index, text) + '\n'*2)

    scores_HMM.append(f1(prediction_index, gold_index))

[1m[36mPred: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0mb[0mi[0mg[0mo[0mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m
[1m[33mGold: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0m[41mb[0m[41mi[0m[41mg[0m[41mo[0m[41mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m


[1m[36mPred: [0mH[0mo[0mw[0m [0ma[0mb[0mo[0mu[0mt[0m [0mw[0me[0m [0ms[0mt[0mo[0mp[0m [0mp[0mr[0mo[0mt[0me[0mc[0mt[0

[1m[36mPred: [0mT[0mh[0ma[0mt[0m'[0ms[0m [0mc[0mo[0mn[0mt[0mr[0ma[0md[0mi[0mc[0mt[0me[0md[0m [0mb[0my[0m [0mt[0mh[0me[0m [0mf[0ma[0mc[0mt[0ms[0m.[0m [0mF[0mi[0mr[0ms[0mt[0m [0mo[0mf[0m [0ma[0ml[0ml[0m,[0m [0mD[0me[0mm[0mo[0mc[0mr[0ma[0mt[0ms[0m [0mh[0ma[0mv[0me[0m [0ma[0mc[0mk[0mn[0mo[0mw[0ml[0me[0md[0mg[0me[0md[0m [0mt[0mh[0me[0m [0mp[0ma[0ms[0mt[0m [0ma[0mn[0md[0m [0mt[0mh[0me[0m [0mp[0ma[0mr[0mt[0my[0m [0me[0mv[0mo[0ml[0mv[0me[0md[0m [0mt[0mo[0m [0mt[0mh[0me[0m [0ma[0mn[0mt[0mi[0m-[0mr[0ma[0mc[0mi[0ms[0mt[0m [0ms[0mi[0md[0me[0m,[0m [0mw[0mh[0mi[0mc[0mh[0m [0mp[0mr[0mo[0mm[0mp[0mt[0me[0md[0m [0ms[0mo[0mu[0mt[0mh[0me[0mr[0mn[0m [0mr[0ma[0mc[0mi[0ms[0mt[0ms[0m [0mw[0mh[0mo[0m [0mu[0ms[0me[0md[0m [0mt[0mo[0m [0mc[0mo[0mn[0ms[0mt[0mi[0mt[0mu[0mt[0me[0m [0mt[0mh[0me[0m [0mS[0mo[0ml[0mi[0md[0m 

[1m[36mPred: [0mL[0mi[0mk[0me[0m [0mI[0m [0ms[0ma[0mi[0md[0m [0my[0mo[0mu[0m [0mc[0ma[0mn[0m'[0mt[0m [0mg[0mr[0ma[0ms[0mp[0m [0mi[0mt[0m.[0m [0mW[0me[0m [0mh[0ma[0mv[0me[0m [0mt[0mo[0m [0mp[0mu[0mt[0m [0mu[0mp[0m [0mw[0mi[0mt[0mh[0m [0mi[0mn[0mt[0mo[0ml[0me[0mr[0ma[0mn[0mt[0m [0mp[0me[0mo[0mp[0ml[0me[0m [0mh[0me[0mr[0me[0m [0mb[0me[0mc[0ma[0mu[0ms[0me[0m [0m [0mt[0mh[0me[0my[0m [0mw[0me[0mr[0me[0m [0mb[0mo[0mr[0mn[0m [0mh[0me[0mr[0me[0m.[0m [0mI[0ms[0m [0mi[0mt[0m [0mt[0mh[0ma[0mt[0m [0mh[0ma[0mr[0md[0m [0mo[0mf[0m [0ma[0m [0mc[0mo[0mn[0mc[0me[0mp[0mt[0m.[0m [0mW[0me[0m [0md[0mo[0m [0mn[0mo[0mt[0m [0ml[0me[0mt[0m [0mi[0mn[0m [0mt[0mh[0mo[0ms[0me[0m [0mw[0mh[0mo[0m [0ma[0mr[0me[0m [0mi[0mn[0mt[0mo[0ml[0me[0mr[0ma[0mn[0mt[0m [0ma[0mn[0md[0m [0mg[0mi[0mv[0me[0m [0mt[0mh[0me[0mm[0m [0mC[0ma[0mn

[1m[36mPred: [0mI[0m [0mt[0mh[0mi[0mn[0mk[0m [0mA[0mm[0me[0mr[0mi[0mc[0ma[0mn[0ms[0m [0mb[0me[0ml[0mi[0me[0mv[0me[0m [0mt[0mh[0me[0mi[0mr[0m [0mp[0mr[0me[0ms[0mi[0md[0me[0mn[0mt[0m [0ms[0mh[0mo[0mu[0ml[0md[0m [0mb[0me[0m [0mf[0mr[0me[0me[0m [0mt[0mo[0m [0ms[0mp[0me[0ma[0mk[0m [0mt[0mo[0m [0mw[0mh[0mo[0me[0mv[0me[0mr[0m [0mh[0me[0m [0mt[0mh[0mi[0mn[0mk[0ms[0m [0mb[0me[0ms[0mt[0m.[0m
[0m
[0mS[0mo[0m [0mP[0mR[0mC[0m [0mc[0ma[0mn[0m [0mg[0mo[0m [0ms[0mu[0mc[0mk[0m [0me[0mg[0mg[0ms[0m.[0m
[1m[33mGold: [0mI[0m [0mt[0mh[0mi[0mn[0mk[0m [0mA[0mm[0me[0mr[0mi[0mc[0ma[0mn[0ms[0m [0mb[0me[0ml[0mi[0me[0mv[0me[0m [0mt[0mh[0me[0mi[0mr[0m [0mp[0mr[0me[0ms[0mi[0md[0me[0mn[0mt[0m [0ms[0mh[0mo[0mu[0ml[0md[0m [0mb[0me[0m [0mf[0mr[0me[0me[0m [0mt[0mo[0m [0ms[0mp[0me[0ma[0mk[0m [0mt[0mo[0m [0mw[0mh[0mo[0me[0mv[0me

[1m[36mPred: [0mM[0mo[0mn[0ms[0me[0mf[0m [0mi[0ms[0m [0ma[0mn[0m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m [0ma[0mn[0md[0m [0ms[0mh[0mo[0mu[0ml[0md[0m [0mb[0me[0m [0mt[0mu[0mr[0mf[0me[0md[0m [0mf[0mr[0mo[0mm[0m [0mt[0mh[0me[0m [0mC[0ma[0mb[0mi[0mn[0me[0mt[0m [0m.[0m.[0m.[0m.[0mb[0mu[0mt[0m [0mC[0mo[0mm[0mr[0ma[0md[0me[0m [0mS[0mu[0mn[0mn[0my[0m [0mD[0ma[0my[0ms[0m [0mi[0ms[0m [0mt[0mo[0mo[0m [0me[0mn[0ma[0mm[0mo[0mu[0mr[0me[0md[0m [0mw[0mi[0mt[0mh[0m [0mh[0me[0mr[0m.[0m.[0m.[0m.[0m
[1m[33mGold: [0m[41mM[0m[41mo[0m[41mn[0m[41ms[0m[41me[0m[41mf[0m[41m [0m[41mi[0m[41ms[0m[41m [0m[41ma[0m[41mn[0m[41m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m[41m [0m[41ma[0m[41mn[0m[41md[0m[41m [0m[41ms[0m[41mh[0m[41mo[0m[41mu[0m[41ml[0m[41md[0m[41m [0m[41mb[0m[41me[0m[41m [0m[41mt[0m[41mu[0m[41mr[0m[41mf[0m

[1m[36mPred: [0mY[0mo[0mu[0mr[0m [0mn[0mo[0mt[0m [0mu[0mn[0md[0me[0mr[0ms[0mt[0ma[0mn[0mi[0mn[0mg[0m [0mg[0mr[0ma[0mm[0mm[0me[0mr[0m,[0m [0mf[0mo[0mo[0ml[0m.[0m [0mY[0mo[0mu[0mr[0m [0ma[0m [0mL[0me[0mf[0mt[0mi[0me[0m [0mp[0mo[0ms[0mt[0me[0mr[0m [0mw[0mi[0mt[0mh[0m [0mn[0mo[0m [0mc[0mo[0mm[0mo[0mn[0m [0ms[0me[0mn[0mz[0me[0m.[0m
[1m[33mGold: [0mY[0mo[0mu[0mr[0m [0mn[0mo[0mt[0m [0m[41mu[0m[41mn[0m[41md[0m[41me[0m[41mr[0m[41ms[0m[41mt[0m[41ma[0m[41mn[0m[41mi[0m[41mn[0m[41mg[0m [0mg[0mr[0ma[0mm[0mm[0me[0mr[0m,[0m [0m[41mf[0m[41mo[0m[41mo[0m[41ml[0m.[0m [0mY[0mo[0mu[0mr[0m [0ma[0m [0mL[0me[0mf[0mt[0mi[0me[0m [0mp[0mo[0ms[0mt[0me[0mr[0m [0mw[0mi[0mt[0mh[0m [0mn[0mo[0m [0mc[0mo[0mm[0mo[0mn[0m [0ms[0me[0mn[0mz[0me[0m.[0m


[1m[36mPred: [0mT[0mh[0ma[0mn[0mk[0ms[0m [0mt[0mo[0m [0mD[0mo[0mn[0ma[0ml[0md[0

[1m[33mGold: [0mT[0mr[0mu[0mm[0mp[0m [0mi[0ms[0m [0ma[0m [0m[41md[0m[41mi[0m[41ms[0m[41mg[0m[41mu[0m[41ms[0m[41mt[0m[41mi[0m[41mn[0m[41mg[0m[41m [0m[41mp[0m[41mi[0m[41mg[0m.[0m [0mI[0m [0ma[0mm[0m [0ma[0ms[0mh[0ma[0mm[0me[0md[0m [0mt[0mh[0ma[0mt[0m [0mh[0me[0m [0mi[0ms[0m [0mp[0mr[0me[0ms[0mi[0md[0me[0mn[0mt[0m.[0m


[1m[36mPred: [0mK[0mi[0ml[0ml[0m [0ma[0ml[0ml[0m [0mt[0mh[0me[0m [0mn[0mi[0mg[0mg[0mg[0me[0mr[0ms[0m [0ma[0mn[0md[0m [0mp[0me[0md[0mo[0mp[0mh[0mi[0ml[0me[0ms[0m![0m
[1m[33mGold: [0mK[0mi[0ml[0ml[0m [0ma[0ml[0ml[0m [0mt[0mh[0me[0m [0m[41mn[0m[41mi[0m[41mg[0m[41mg[0m[41mg[0m[41me[0m[41mr[0m[41ms[0m [0ma[0mn[0md[0m [0mp[0me[0md[0mo[0mp[0mh[0mi[0ml[0me[0ms[0m![0m


[1m[36mPred: [0mI[0mf[0m [0mt[0mh[0me[0m [0mp[0me[0md[0me[0ms[0mt[0mr[0mi[0ma[0mn[0ms[0m [0ma[0mr[0me[0m [0mt[0mo[0mo[0m 

[1m[36mPred: [0mW[0mh[0mo[0m [0mi[0ms[0m [0mt[0mh[0me[0m [0m[41ms[0m[41mt[0m[41mu[0m[41mp[0m[41mi[0m[41md[0m [0m[41mh[0m[41me[0m[41ma[0m[41md[0m [0mw[0mh[0mo[0m [0mb[0mu[0mi[0ml[0mt[0m [0ma[0m [0mr[0mo[0mc[0mk[0m [0mw[0ma[0ml[0ml[0m [0mt[0mo[0m [0me[0mn[0mc[0ml[0mo[0ms[0me[0m [0mt[0mh[0me[0m [0mb[0mo[0ma[0mt[0m [0md[0mo[0mc[0mk[0m [0ma[0mc[0mr[0mo[0ms[0ms[0m [0mt[0mh[0me[0m [0ms[0mt[0mr[0me[0me[0mt[0m [0mf[0mr[0mo[0mm[0m [0mt[0mh[0me[0m [0mO[0ma[0mh[0mu[0m [0mC[0ml[0mu[0mb[0m.[0m [0mT[0mh[0ma[0mt[0m [0mw[0ma[0ms[0m [0ms[0mu[0mp[0mp[0mo[0ms[0me[0md[0m [0mt[0mo[0m [0mb[0me[0m [0ml[0me[0mf[0mt[0m [0mo[0mp[0me[0mn[0m [0ms[0mo[0m [0mf[0ml[0mo[0mo[0md[0ms[0m [0mc[0ma[0mn[0m [0md[0mr[0ma[0mi[0mn[0m [0mn[0ma[0mt[0mu[0mr[0ma[0ml[0ml[0my[0m.[0m [0mN[0mo[0mw[0m [0mt[0mh[0mi[0ms[0m [0mw[0mi[0ml[0ml[0m 

In [21]:
# Mean of the per-comment F1 scores collected in scores_HMM.
print('HMM score: {:.3f}'.format(np.mean(scores_HMM)))

HMM score: 0.321


<a name = "3.2"></a>
### Conditional Random Fields for Sequence Prediction

In [22]:
from itertools import chain
from nltk import pos_tag
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
from tqdm import tqdm
from utils import word2features, sent2features, sent2labels, sent2tokens, token_postag_label

In [23]:
# Extend each tuple returned by token_postag_label with the toxic/non_toxic tag
# of the matching entry in train_data.
train_data_crf = []
for text, toxic_tags in tqdm(zip(train['text'], train_data), total=len(train_data)):
    pos_tags = token_postag_label(text.lower())
    # zip stops at the shorter sequence, so both tokenisations are assumed to
    # line up one-to-one — TODO confirm.
    sentence = [token_info + (tagged[1],) for token_info, tagged in zip(pos_tags, toxic_tags)]
    train_data_crf.append(sentence)

100%|██████████| 7939/7939 [00:09<00:00, 812.50it/s]


In [24]:
# Feature sequences and label sequences, one per training sentence.
X_train = list(map(sent2features, train_data_crf))
y_train = list(map(sent2labels, train_data_crf))

trainer = pycrfsuite.Trainer(verbose=False)

# Register every (features, labels) pair with the CRF trainer.
for features, labels in zip(X_train, y_train):
    trainer.append(features, labels)

Probablemente sea buena idea hacer una búsqueda de los mejores hiperparámetros, ya que el performance cambia significativamente dependiendo de los hiperparámetros.

In [27]:
# Training hyper-parameters handed to the CRF trainer.
crf_params = {
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 0.01,  # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
}
trainer.set_params(crf_params)

In [31]:
# Train the CRF (timed with the %time magic); the model file name passed here
# is the same one opened by the tagger below.
%time trainer.train('toxic_speech.crfsuite')

# Open the trained model file in a tagger used for prediction.
tagger_crf = pycrfsuite.Tagger()
tagger_crf.open('toxic_speech.crfsuite')

CPU times: user 11.8 s, sys: 3.23 ms, total: 11.8 s
Wall time: 11.8 s


<contextlib.closing at 0x7f85f4ed6978>

In [29]:
# Build the evaluation sentences: each tuple from token_postag_label is extended
# with an empty string in the final (label) position.
test_data_crf =  []
for text in tqdm(test['text']):
    pos_tags = token_postag_label(text.lower())
    sentence = [token_info + ('',) for token_info in pos_tags]
    test_data_crf.append(sentence)

100%|██████████| 690/690 [00:00<00:00, 875.49it/s]


In [32]:
# Per-comment F1 of the CRF tagger over the rows of `test`.
scores_CRF = []

for text_crf, (gold_index, text) in zip(test_data_crf, test.values):
    # First field of every entry is the token itself.
    words = [entry[0] for entry in text_crf]
    tags = tagger_crf.tag(sent2features(text_crf))

    tagged_sentence = list(zip(words, tags))
    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)

    pred_label = colored('Pred: ', color='cyan', attrs=['bold'])
    gold_label = colored('Gold: ', color='yellow', attrs=['bold'])
    print(pred_label + color_toxic_words(prediction_index, text))
    print(gold_label + color_toxic_words(gold_index, text) + '\n'*2)

    scores_CRF.append(f1(prediction_index, gold_index))

[1m[36mPred: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0mb[0mi[0mg[0mo[0mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m
[1m[33mGold: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0m[41mb[0m[41mi[0m[41mg[0m[41mo[0m[41mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m


[1m[36mPred: [0mH[0mo[0mw[0m [0ma[0mb[0mo[0mu[0mt[0m [0mw[0me[0m [0ms[0mt[0mo[0mp[0m [0mp[0mr[0mo[0mt[0me[0mc[0mt[0

[1m[33mGold: [0m[41mA[0m[41mw[0m[41mf[0m[41mu[0m[41ml[0m [0mc[0mo[0mm[0mm[0me[0mn[0mt[0m


[1m[36mPred: [0mW[0mh[0me[0mn[0m [0mw[0mi[0ml[0ml[0m [0mn[0ma[0mt[0mi[0mv[0me[0ms[0m [0mc[0ml[0me[0ma[0mn[0m [0mu[0mp[0m [0mt[0mh[0me[0mi[0mr[0m [0ma[0mc[0mt[0m [0ma[0mn[0md[0m [0ms[0mt[0mo[0mp[0m [0mu[0ms[0mi[0mn[0mg[0m [0mt[0mh[0me[0m [0mc[0mr[0me[0ma[0mt[0mi[0mo[0mn[0ms[0m [0mo[0mf[0m [0mw[0mh[0mi[0mt[0me[0mm[0ma[0mn[0m?[0m [0m [0m [0mT[0mh[0me[0my[0m [0md[0mo[0mn[0m'[0mt[0m [0mt[0mh[0mi[0mn[0mk[0m [0mt[0mh[0me[0my[0m [0mh[0ma[0mv[0me[0m [0mt[0mo[0m.[0m [0mT[0mh[0me[0my[0m [0mt[0mh[0mi[0mn[0mk[0m [0mi[0mt[0m [0mi[0ms[0m [0mt[0mh[0me[0mi[0mr[0m [0mm[0mo[0mr[0ma[0ml[0m [0mr[0mi[0mg[0mh[0mt[0m [0mt[0mo[0m [0mt[0me[0ml[0ml[0m [0mu[0ms[0m [0me[0mv[0me[0mr[0my[0mt[0mh[0mi[0mn[0mg[0m [0mt[0mh[0me[0my[0m [0

[1m[33mGold: [0mT[0ma[0ml[0mk[0me[0me[0mt[0mn[0ma[0m [0mh[0ma[0ms[0m [0mn[0me[0mv[0me[0mr[0m [0mh[0ma[0md[0m [0ma[0mn[0my[0m [0mp[0mr[0mo[0mb[0ml[0me[0mm[0m [0mw[0mi[0mt[0mh[0m [0mM[0ma[0mr[0mi[0mj[0mu[0ma[0mn[0ma[0m.[0m [0mI[0m [0ms[0ma[0my[0m [0m"[0mO[0mp[0me[0mn[0m [0ma[0m [0mp[0mo[0mt[0m [0ms[0mh[0mo[0mp[0m [0mt[0mh[0me[0m [0ms[0mi[0mz[0me[0m [0mo[0mf[0m [0mw[0ma[0ml[0mm[0ma[0mr[0mt[0m [0ma[0mn[0md[0m [0mt[0me[0ml[0ml[0m [0mt[0mh[0mo[0ms[0me[0m [0ms[0mo[0mc[0mc[0me[0mr[0m [0mm[0mo[0mm[0ms[0m [0mt[0mo[0m [0m[41mk[0m[41mi[0m[41ms[0m[41ms[0m[41m [0m[41my[0m[41mo[0m[41mu[0m[41mr[0m[41m [0m[41mb[0m[41mu[0m[41mt[0m[41mt[0m.[0m"[0m [0mI[0mt[0m'[0ms[0m [0ma[0m [0mf[0mr[0me[0me[0m [0mc[0mo[0mu[0mn[0mt[0mr[0my[0m [0ma[0mn[0md[0m [0mp[0mo[0mt[0m [0mi[0ms[0m [0ml[0me[0mg[0ma[0ml[0m.[0m [0mH[0me[0my

[1m[36mPred: [0mH[0me[0m [0ms[0mh[0mo[0mu[0ml[0md[0m [0mh[0ma[0mv[0me[0m [0md[0mi[0me[0md[0m [0mi[0mn[0m [0mA[0mf[0mg[0mh[0ma[0mn[0mi[0ms[0mt[0ma[0mn[0m [0ma[0mn[0md[0m [0mb[0me[0m [0mw[0mi[0mt[0mh[0m [0mS[0ma[0mt[0ma[0mn[0m [0mn[0mo[0mw[0m-[0m-[0mb[0mu[0mt[0m [0mw[0me[0m [0ma[0mr[0me[0m [0ms[0mt[0mu[0mc[0mk[0m [0mw[0mi[0mt[0mh[0m [0mt[0mh[0mi[0ms[0m [0md[0me[0mv[0mi[0ma[0mn[0mt[0m [0mk[0mi[0ml[0ml[0me[0mr[0m.[0m
[1m[33mGold: [0m[41mH[0m[41me[0m[41m [0m[41ms[0m[41mh[0m[41mo[0m[41mu[0m[41ml[0m[41md[0m[41m [0m[41mh[0m[41ma[0m[41mv[0m[41me[0m [0md[0mi[0me[0md[0m [0mi[0mn[0m [0mA[0mf[0mg[0mh[0ma[0mn[0mi[0ms[0mt[0ma[0mn[0m [0ma[0mn[0md[0m [0mb[0me[0m [0mw[0mi[0mt[0mh[0m [0mS[0ma[0mt[0ma[0mn[0m [0mn[0mo[0mw[0m-[0m-[0mb[0mu[0mt[0m [0mw[0me[0m [0ma[0mr[0me[0m [0ms[0mt[0mu[0mc[0mk[0m [0mw[0mi[0mt[0mh

[1m[33mGold: [0mL[0mo[0ml[0m [0mr[0me[0ma[0ml[0ml[0my[0m.[0m [0mN[0mo[0m [0mo[0mf[0mf[0me[0mn[0mc[0me[0m [0mb[0mu[0mt[0m [0mt[0mh[0ma[0mt[0m’[0ms[0m [0ma[0mn[0m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m[41mi[0m[41mc[0m [0ms[0mt[0ma[0mt[0me[0mm[0me[0mn[0mt[0m.[0m [0mI[0m’[0mm[0m [0mn[0mo[0mt[0m [0ml[0mo[0mo[0mk[0mi[0mn[0mg[0m [0mf[0mo[0mr[0m [0ml[0mi[0mk[0me[0ms[0m [0mf[0mr[0mo[0mm[0m [0mt[0mh[0me[0m [0md[0mo[0mu[0mc[0mh[0me[0ms[0m [0mo[0mn[0m [0mt[0mh[0me[0m [0ml[0me[0mf[0mt[0m/[0m [0mr[0mi[0mg[0mh[0mt[0m [0mp[0mo[0ml[0mi[0mt[0mi[0mc[0ma[0ml[0m [0ms[0mi[0md[0me[0m [0mo[0mf[0m [0mt[0mh[0mi[0mn[0mg[0ms[0m.[0m [0mY[0mo[0mu[0m [0mc[0ma[0mn[0m’[0mt[0m [0mh[0mo[0mn[0me[0ms[0mt[0ml[0my[0m [0mt[0mh[0mi[0mn[0mk[0m [0mt[0mh[0mi[0ms[0m [0mt[0mh[0mo[0mu[0mg[0mh[0m.[0m [0mO[0mu[0mr[0m [0mg[0mo[0mv[0me

[1m[33mGold: [0mY[0mo[0mu[0m [0mw[0mo[0mu[0ml[0md[0mn[0m'[0mt[0m [0mk[0mn[0mo[0mw[0m [0mt[0mr[0mu[0mt[0mh[0m [0mi[0mf[0m [0mi[0mt[0m [0mc[0ma[0mm[0me[0m [0mu[0mp[0m [0ma[0mn[0md[0m [0ms[0mm[0ma[0mc[0mk[0m [0my[0mo[0mu[0m [0mo[0mn[0m [0mt[0mh[0me[0m [0m [0m[41mb[0m[41mu[0m[41mt[0m[41mt[0m.[0m


[1m[36mPred: [0mT[0mh[0me[0m [0mo[0mn[0ml[0my[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m[41ms[0m [0mI[0m [0ms[0me[0me[0m [0mh[0me[0ma[0mr[0m [0ma[0mr[0me[0m [0mt[0mh[0mo[0ms[0me[0m [0mw[0mh[0mo[0m [0mr[0ma[0mt[0me[0md[0m [0my[0mo[0mu[0mr[0m [0mc[0mo[0mm[0mm[0me[0mn[0mt[0m [0ma[0ms[0m [0m"[0mc[0mi[0mv[0mi[0ml[0m"[0m.[0m
[1m[33mGold: [0mT[0mh[0me[0m [0mo[0mn[0ml[0my[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m[41ms[0m [0mI[0m [0ms[0me[0me[0m [0mh[0me[0ma[0mr[0m [0ma[0mr[0me[0m [0mt[0mh[0mo[0ms[0me[0

[1m[36mPred: [0mY[0mo[0mu[0m [0ma[0mr[0me[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m.[0m [0m [0mL[0me[0ma[0mr[0mn[0m [0mt[0mo[0m [0mr[0me[0ma[0md[0m.[0m
[1m[33mGold: [0mY[0mo[0mu[0m [0ma[0mr[0me[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m.[0m [0m [0mL[0me[0ma[0mr[0mn[0m [0mt[0mo[0m [0mr[0me[0ma[0md[0m.[0m


[1m[36mPred: [0mT[0mh[0ma[0mn[0mk[0ms[0m [0mf[0mo[0mr[0m [0mh[0ma[0mt[0mi[0mn[0mg[0m,[0m [0m[41mc[0m[41mo[0m[41mw[0m[41ma[0m[41mr[0m[41md[0m.[0m
[1m[33mGold: [0mT[0mh[0ma[0mn[0mk[0ms[0m [0mf[0mo[0mr[0m [0m[41mh[0m[41ma[0m[41mt[0m[41mi[0m[41mn[0m[41mg[0m,[0m [0mc[0mo[0mw[0ma[0mr[0md[0m.[0m


[1m[36mPred: [0mI[0m [0md[0mo[0mn[0m'[0mt[0m [0mr[0me[0ms[0mp[0mo[0mn[0md[0m [0mt[0mo[0m [0md[0mu[0mm[0mb[0m [0mr[0mh[0me[0mt[0mo[0mr[0mi[0mc[0ma[0ml[0m [0mq[0mu[0me[0ms[0mt[0mi[0mo

In [33]:
# Mean of the per-comment F1 scores collected in scores_CRF.
print('CRF score: {:.3f}'.format(np.mean(scores_CRF)))

CRF score: 0.551


<a name = "3.3"></a>
### LSTM

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Fix the PyTorch RNG seed so weight initialisation is repeatable.
torch.manual_seed(1)

# Display the active CUDA device when one exists. Querying torch.cuda without a
# usable GPU raises, which would stop the notebook on CPU-only machines.
(torch.cuda.current_device(), torch.cuda.get_device_name(0)) if torch.cuda.is_available() else 'CUDA not available'

(0, 'GeForce RTX 2070')

In [39]:
# Build the LSTM training set: one (tokens, tags) pair per comment.
# NOTE(review): this rebinds `train_data`, which earlier cells fill with lists of
# (token, tag) tuples; re-running those cells afterwards would see this layout.
train_data = []
for index, text in zip(train['spans_clean'], train['text']):
    # Substrings covered by each group returned by separate_words (first to last
    # offset, inclusive); nothing is extracted when the comment has no offsets.
    groups = separate_words(index)
    toxic_words = [text[group[0]:group[-1] + 1] for group in groups] if len(index) > 0 else []

    tokens = word_tokenize(text)
    tagged_tokens = []

    for token in tokens:
        is_toxic = token in toxic_words
        if is_toxic:
            # Consume the match so each extracted toxic word tags at most one token;
            # a later repetition of the token is then tagged 'non_toxic'.
            toxic_words.remove(token)
        tagged_tokens.append('toxic' if is_toxic else 'non_toxic')

    train_data.append((tokens, tagged_tokens))

In [40]:
def prepare_sequence(seq, to_ix):
    """Convert a sequence of items into a 1-D ``torch.long`` tensor of indices.

    :param seq: iterable of keys (e.g. tokens or tag names)
    :param to_ix: mapping from key to integer index
    :return: tensor with one index per item; keys missing from ``to_ix`` map to 0
    """
    indices = []
    for item in seq:
        # Fall back to index 0 when the key is unknown.
        indices.append(to_ix.get(item, 0))
    return torch.tensor(indices, dtype=torch.long)

In [41]:
# Vocabulary: index 0 is taken by the 'UNK' entry, every new word gets the next index.
word_to_ix = {'UNK': 0}

for sent, tags in train_data:
    for word in sent:
        # setdefault only inserts unseen words; len(word_to_ix) is evaluated before
        # the insertion, so it is the next free index.
        word_to_ix.setdefault(word, len(word_to_ix))

# One index per tag.
tag_to_ix = {"non_toxic": 0, "toxic": 1}

Creamos el modelo:

In [42]:
class LSTMTagger(nn.Module):
    """Per-token tagger: embedding lookup -> LSTM -> linear layer -> log-softmax."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        """Create the three layers of the tagger.

        Args:
            embedding_dim: Size of each word embedding vector.
            hidden_dim: Size of the LSTM hidden state.
            vocab_size: Number of rows in the embedding table.
            tagset_size: Number of output tags.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Layers are built in the order embedding -> LSTM -> linear; keeping
        # this order keeps seeded parameter initialisation reproducible.
        self.word_embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim)
        self.hidden2tag = nn.Linear(in_features=hidden_dim, out_features=tagset_size)

    def forward(self, sentence):
        """Score every token of one sentence.

        Args:
            sentence: 1-D tensor of word indices for a single sentence.

        Returns:
            Tensor with one row per token holding log-probabilities over tags.
        """
        n_tokens = len(sentence)
        # Insert a batch axis of size 1 between the sequence and feature axes.
        embedded = self.word_embeddings(sentence).view(n_tokens, 1, -1)
        hidden_states, _ = self.lstm(embedded)
        # Drop the batch axis again before projecting to tag space.
        logits = self.hidden2tag(hidden_states.view(n_tokens, -1))
        return F.log_softmax(logits, dim=1)

Entrenamos el modelo:

In [43]:
# Width of the word embeddings and of the LSTM hidden state.
EMBEDDING_DIM = 32
HIDDEN_DIM = 32

model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
)
# Negative log-likelihood loss, optimised with plain SGD.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [44]:
# Number of full passes over the training set.
NUM_EPOCHS = 10

# Convert every training example to index tensors once. The vocabularies do
# not change during training, so redoing this on every epoch is wasted work.
# (The untrained forward pass that used to precede the loop was removed: its
# result was never used.)
train_tensors = [
    (prepare_sequence(sentence, word_to_ix), prepare_sequence(tags, tag_to_ix))
    for sentence, tags in train_data
]

for epoch in trange(NUM_EPOCHS):
    for sentence_in, targets in train_tensors:
        # PyTorch accumulates gradients across backward() calls, so clear
        # them before processing each sentence.
        model.zero_grad()

        # Forward pass: per-token tag scores for this sentence.
        tag_scores = model(sentence_in)

        # Compute the loss, back-propagate, and take one optimiser step
        # (one update per sentence).
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

100%|██████████| 10/10 [07:08<00:00, 42.87s/it]


In [46]:
def tagger_LSTM(text):
    """Tag every token of ``text`` as ``'toxic'`` or ``'non_toxic'`` with the LSTM.

    The text is lower-cased and tokenised with ``word_tokenize``; tokens that
    are missing from ``word_to_ix`` fall back to the unknown-word index inside
    ``prepare_sequence``.

    Args:
        text: Raw input string.

    Returns:
        List of ``(token, tag)`` tuples, one per token, in order. Empty when
        the text contains no tokens.
    """
    # NOTE(review): input is lower-cased here; confirm the training text used
    # to build word_to_ix was lower-cased as well, otherwise capitalised
    # training words can never be looked up at inference time.
    ix_to_tag = {0: 'non_toxic', 1: 'toxic'}
    words = word_tokenize(text.lower())

    # Nothing to tag; also avoids sending a zero-length sequence to the model,
    # whose forward pass reshapes with -1 and cannot handle an empty tensor.
    if not words:
        return []

    with torch.no_grad():
        inputs = prepare_sequence(words, word_to_ix)
        tag_scores = model(inputs)
        # Highest-scoring tag index for every token, in a single call.
        tags = tag_scores.argmax(dim=1).tolist()

    return [(word, ix_to_tag[tag]) for word, tag in zip(words, tags)]

In [47]:
scores_LSTM = []

# The two coloured labels are identical for every example, so render them once.
pred_label = colored('Pred: ', color='cyan', attrs=['bold'])
gold_label = colored('Gold: ', color='yellow', attrs=['bold'])

for gold_index, text in test.values:
    # Tagging and offset recovery work on the lower-cased text, while the
    # highlighting below is applied to the original text.
    tagged_sentence = tagger_LSTM(text)
    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)

    print(pred_label + color_toxic_words(prediction_index, text))
    print(gold_label + color_toxic_words(gold_index, text) + '\n\n')

    # One score per test example; the next cell averages them.
    scores_LSTM.append(f1(prediction_index, gold_index))
    

[1m[36mPred: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0mb[0mi[0mg[0mo[0mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m
[1m[33mGold: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0m[41mb[0m[41mi[0m[41mg[0m[41mo[0m[41mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m


[1m[36mPred: [0mH[0mo[0mw[0m [0ma[0mb[0mo[0mu[0mt[0m [0mw[0me[0m [0ms[0mt[0mo[0mp[0m [0mp[0mr[0mo[0mt[0me[0mc[0mt[0


[1m[36mPred: [0mT[0mh[0me[0mr[0me[0m'[0ms[0m [0ma[0m [0md[0mi[0mf[0mf[0me[0mr[0me[0mn[0mc[0me[0m [0mb[0me[0mt[0mw[0me[0me[0mn[0m [0ms[0ma[0my[0mi[0mn[0mg[0m [0mt[0mh[0me[0m [0mc[0mh[0mo[0mi[0mc[0me[0m [0mi[0ms[0m [0ma[0m [0mm[0mi[0ms[0mt[0ma[0mk[0me[0m [0ma[0mn[0md[0m [0mi[0mn[0mf[0me[0mr[0mr[0mi[0mn[0mg[0m [0ma[0m [0mu[0ms[0me[0mr[0m [0mi[0ms[0m [0m[41ms[0m[41mt[0m[41mu[0m[41mp[0m[41mi[0m[41md[0m.[0m [0mN[0mo[0mt[0mh[0mi[0mn[0mg[0m [0mp[0me[0mr[0ms[0mo[0mn[0ma[0ml[0m [0m"[0mf[0mr[0mi[0me[0mn[0md[0m"[0m.[0m
[1m[33mGold: [0mT[0mh[0me[0mr[0me[0m'[0ms[0m [0ma[0m [0md[0mi[0mf[0mf[0me[0mr[0me[0mn[0mc[0me[0m [0mb[0me[0mt[0mw[0me[0me[0mn[0m [0ms[0ma[0my[0mi[0mn[0mg[0m [0mt[0mh[0me[0m [0mc[0mh[0mo[0mi[0mc[0me[0m [0mi[0ms[0m [0ma[0m [0mm[0mi[0ms[0mt[0ma[0mk[0me[0m [0ma[0mn[0md[0m [0mi[0mn[0mf[0me[0m


[1m[36mPred: [0mI[0m [0md[0mo[0mn[0m'[0mt[0m [0mg[0me[0mt[0m [0mi[0mt[0m.[0m [0mW[0mh[0ma[0mt[0m'[0ms[0m [0mw[0mr[0mo[0mn[0mg[0m [0mw[0mi[0mt[0mh[0m [0mh[0ma[0mv[0mi[0mn[0mg[0m [0ma[0m [0ml[0my[0mi[0mn[0mg[0m,[0m [0m[41mi[0m[41mg[0m[41mn[0m[41mo[0m[41mr[0m[41ma[0m[41mn[0m[41mt[0m,[0m [0mu[0mn[0me[0mt[0mh[0mi[0mc[0ma[0ml[0m [0m[41mi[0m[41mn[0m[41mc[0m[41mo[0m[41mm[0m[41mp[0m[41me[0m[41mt[0m[41me[0m[41mn[0m[41mt[0m [0mw[0mi[0mt[0mh[0m [0mt[0mh[0me[0m [0ma[0mt[0mt[0me[0mn[0mt[0mi[0mo[0mn[0m [0ms[0mp[0ma[0mn[0m [0ma[0mn[0md[0m [0me[0mm[0mo[0mt[0mi[0mo[0mn[0ma[0ml[0m [0mi[0mm[0mm[0ma[0mt[0mu[0mr[0mi[0mt[0my[0m [0mo[0mf[0m [0ma[0m [0mc[0mh[0mi[0ml[0md[0m [0mf[0mo[0mr[0m [0mp[0mr[0me[0ms[0mi[0md[0me[0mn[0mt[0m?[0m
[1m[33mGold: [0mI[0m [0md[0mo[0mn[0m'[0mt[0m [0mg[0me[0mt[0m [0mi[0mt[0m.[0m [0mW[0m

[1m[36mPred: [0mM[0ma[0mr[0mk[0m [0mH[0mu[0mm[0me[0m [0mh[0ma[0ms[0m [0ma[0ml[0mw[0ma[0my[0ms[0m [0mb[0me[0me[0mn[0m [0mt[0mh[0me[0m [0ms[0mt[0me[0mr[0me[0mo[0mt[0my[0mp[0mi[0mc[0ma[0ml[0m [0mV[0ma[0mn[0mc[0mo[0mu[0mv[0me[0mr[0mi[0mt[0me[0m.[0m [0mS[0mc[0mr[0me[0ma[0mm[0m [0mh[0mo[0mw[0m [0ml[0mo[0mg[0mg[0mi[0mn[0mg[0m [0mm[0mu[0ms[0mt[0m [0mb[0me[0m [0ms[0mt[0mo[0mp[0mp[0me[0md[0m,[0m [0mw[0mh[0mi[0ml[0me[0m [0ml[0mi[0mv[0mi[0mn[0mg[0m [0mi[0mn[0m [0ma[0m [0mh[0mo[0mu[0ms[0me[0m [0mm[0ma[0md[0me[0m [0mo[0mf[0m [0mw[0mo[0mo[0md[0m.[0m [0mI[0mt[0m'[0ms[0m [0ms[0mo[0m [0mf[0ma[0mr[0m [0mp[0ma[0ms[0mt[0m [0mh[0my[0mp[0mo[0mc[0mr[0mi[0mt[0mi[0mc[0ma[0ml[0m [0mt[0mo[0m [0mb[0me[0m [0m[41mr[0m[41mi[0m[41md[0m[41mi[0m[41mc[0m[41mu[0m[41ml[0m[41mo[0m[41mu[0m[41ms[0m.[0m
[1m[33mGold: [0mM[0ma[0mr[0mk

[1m[36mPred: [0mT[0me[0ml[0ml[0m [0mt[0mh[0ma[0mt[0m [0mt[0mo[0m [0mt[0mh[0me[0m [0mI[0ms[0ml[0ma[0mm[0mi[0ms[0mt[0ms[0m [0mw[0mh[0mo[0m [0mp[0mr[0ma[0my[0m [0mf[0mi[0mv[0me[0m [0mt[0mi[0mm[0me[0ms[0m [0ma[0m [0md[0ma[0my[0m.[0m [0m [0mT[0me[0ml[0ml[0m [0mt[0mh[0ma[0mt[0m [0mt[0mo[0m [0mt[0mh[0me[0m [0ms[0me[0mx[0m [0mf[0mi[0me[0mn[0md[0ms[0m [0mw[0mh[0mo[0m [0ma[0mr[0me[0m [0m[41mt[0m[41me[0m[41ma[0m[41mc[0m[41mh[0m[41mi[0m[41mn[0m[41mg[0m [0mG[0mr[0ma[0md[0me[0m [0m3[0m [0ms[0mt[0mu[0md[0me[0mn[0mt[0ms[0m [0mh[0mo[0mm[0mo[0ms[0me[0mx[0mu[0ma[0ml[0mi[0mt[0my[0m [0ma[0mn[0md[0m [0mg[0me[0mn[0md[0me[0mr[0m [0mi[0md[0me[0mn[0mt[0mi[0mt[0my[0m [0m [0mW[0mh[0ma[0mt[0m [0mk[0mi[0mn[0md[0m [0mo[0mf[0m [0mm[0me[0mn[0m [0ma[0mn[0md[0m [0mb[0mo[0my[0ms[0m [0mw[0mi[0ml[0ml[0m [0mt[0mh[0me[0ms[0me[0m 

[1m[36mPred: [0mT[0mh[0me[0m [0mA[0mn[0mc[0mh[0mo[0mr[0ma[0mg[0me[0m [0mC[0mr[0mi[0mm[0me[0m [0mr[0me[0mp[0mo[0mr[0mt[0m [0mt[0mh[0mi[0ms[0m [0my[0me[0ma[0mr[0m [0ms[0ma[0mi[0md[0m [0mw[0mh[0mi[0mt[0me[0m [0m[41mw[0m[41mo[0m[41mm[0m[41me[0m[41mn[0m [0ma[0mr[0me[0m [0m[41mr[0m[41ma[0m[41mp[0m[41me[0m[41md[0m [0mi[0mn[0m [0mm[0mo[0ms[0mt[0m [0mc[0ma[0ms[0me[0ms[0m [0mb[0my[0m [0m[41mw[0m[41mh[0m[41mi[0m[41mt[0m[41me[0m [0m[41mm[0m[41me[0m[41mn[0m,[0m [0mn[0mo[0mt[0m [0mm[0mi[0mn[0mo[0mr[0mi[0mt[0my[0m [0mm[0me[0mn[0m.[0m [0m [0m
[0mT[0mh[0mi[0ms[0m [0mi[0ms[0m [0mf[0mo[0mr[0m [0my[0mo[0mu[0m [0mF[0mr[0mo[0mm[0m [0mL[0mi[0ms[0ma[0m [0mB[0ml[0mo[0mo[0mm[0m.[0m [0mL[0mi[0ms[0ma[0m [0mB[0ml[0mo[0mo[0mm[0m [0mi[0ms[0m [0ma[0m [0mL[0me[0mg[0ma[0ml[0m [0ma[0mn[0ma[0ml[0my[0ms[0mt[0m [0mf[0mo[0mr[0m 

[1m[36mPred: [0mT[0mh[0mi[0ms[0m [0mb[0mi[0ml[0ml[0m [0mi[0ms[0m [0ma[0m [0mt[0ma[0mx[0m [0m[41mc[0m[41mu[0m[41mt[0m [0md[0mi[0ms[0mg[0mu[0mi[0ms[0me[0md[0m [0ma[0ms[0m [0ma[0m [0mh[0me[0ma[0ml[0mt[0mh[0m [0mc[0ma[0mr[0me[0m [0mb[0mi[0ml[0ml[0m.[0m [0mP[0me[0mo[0mp[0ml[0me[0m [0mw[0mi[0ml[0ml[0m [0md[0mi[0me[0m [0mi[0mf[0m [0mt[0mh[0mi[0ms[0m [0mb[0mi[0ml[0ml[0m [0mp[0m[41ma[0m[41ms[0m[41ms[0me[0ms[0m.[0m [0mT[0mo[0m [0md[0me[0mn[0my[0m [0mt[0mh[0mi[0ms[0m [0mf[0ma[0mc[0mt[0m [0mi[0ms[0m [0mt[0mo[0m [0mb[0mu[0mr[0my[0m [0my[0mo[0mu[0mr[0m [0mh[0me[0ma[0md[0m [0me[0mv[0me[0mn[0m [0mf[0ma[0mr[0mt[0mh[0me[0mr[0m [0mu[0mp[0m [0mT[0mr[0mu[0mm[0mp[0ms[0m [0ma[0ms[0ms[0m.[0m [0mM[0ma[0mk[0me[0ms[0m [0mm[0me[0m [0ms[0mi[0mc[0mk[0m.[0m
[1m[33mGold: [0mT[0mh[0mi[0ms[0m [0mb[0mi[0ml[0ml[0m [0mi[0ms[0m 

[1m[36mPred: [0mB[0ma[0mc[0mk[0m [0mp[0me[0md[0ma[0ml[0mi[0mn[0mg[0m [0m[41mc[0m[41mo[0m[41mw[0m[41ma[0m[41mr[0m[41md[0m.[0m [0mC[0ml[0me[0ma[0mr[0ml[0my[0m [0mM[0mu[0mr[0mk[0mo[0mw[0ms[0mk[0mi[0m [0mw[0mo[0mu[0ml[0md[0m [0m[41mk[0m[41mi[0m[41ml[0m[41ml[0m [0m[41mh[0m[41mi[0m[41mm[0m.[0m
[1m[33mGold: [0mB[0ma[0mc[0mk[0m [0mp[0me[0md[0ma[0ml[0mi[0mn[0mg[0m [0m[41mc[0m[41mo[0m[41mw[0m[41ma[0m[41mr[0m[41md[0m.[0m [0mC[0ml[0me[0ma[0mr[0ml[0my[0m [0mM[0mu[0mr[0mk[0mo[0mw[0ms[0mk[0mi[0m [0mw[0mo[0mu[0ml[0md[0m [0m[41mk[0m[41mi[0m[41ml[0m[41ml[0m [0mh[0mi[0mm[0m.[0m


[1m[36mPred: [0mC[0mo[0mm[0me[0m [0mo[0mn[0m [0mM[0mo[0mt[0ml[0me[0my[0m,[0m [0mt[0mh[0ma[0mt[0m'[0ms[0m [0mg[0ma[0mr[0mb[0ma[0mg[0me[0m.[0m [0mI[0m [0me[0mx[0mp[0me[0mc[0mt[0m [0mb[0me[0mt[0mt[0me[0mr[0m [0mf[0mr[0mo[0mm[0m [0my[0mo[0

In [48]:
# Mean of the per-example scores collected in scores_LSTM, shown with three decimals.
print(f'LSTM score: {np.mean(scores_LSTM):.3f}')

LSTM score: 0.561
