# Toxic Spans Detection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import ast
from termcolor import colored
import string

from nltk import pos_tag
from nltk.corpus import treebank
from nltk.tag import HiddenMarkovModelTrainer
from nltk.tokenize import word_tokenize

import pycrfsuite

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer

from IPython.display import clear_output
from itertools import chain
from tqdm import trange, tqdm

import string
from utils.processing import color_toxic_words, remove_symbols, completely_toxic, separate_words, get_index_toxic_words, f1
from utils.basic_models import word2features, sent2features, sent2labels, sent2tokens, token_postag_label

sns.set_style('darkgrid')

## Resumen

1. Quitando símbolos tóxicos (", espacios, !, etc) tenemos los siguientes resultados:
  - Hidden Markov Model: 0.321
  - Conditional Random Fields: 0.551
  - LSTM: 0.561 `(EMBEDDING_DIM = 32, HIDDEN_DIM = 32, EPOCHS = 10)`
2. Quitando símbolos tóxicos (", espacios, !, etc) y agregando como completamente tóxicos todos los que no tienen índices (los que viene como [ ]):
  - Hidden Markov Model: 0.299
  - Conditional Random Fields: 0.546
  - LSTM: 0.556 `(EMBEDDING_DIM = 32, HIDDEN_DIM = 32, EPOCHS = 10)`
  
3. Sin quitar símbolos tóxicos (", espacios, !, etc), es decir, dataset original:
  - Hidden Markov Model: 0.367
  - Conditional Random Fields: 0.501
  - LSTM: 0.535 `(EMBEDDING_DIM = 32, HIDDEN_DIM = 32, EPOCHS = 10)`
  
4. Sin quitar símbolos tóxicos (", espacios, !, etc) y agregando como completamente tóxicos todos los que no tienen índices (los que viene como [ ]):
  - Hidden Markov Model: 0.367
  - Conditional Random Fields: 0.501
  - LSTM: 0.535 `(EMBEDDING_DIM = 32, HIDDEN_DIM = 32, EPOCHS = 10)`
  
Para la LSTM lo mejor parece ser la opción 1. Lo mejor hasta ahora es con `(EMBEDDING_DIM = 20 HIDDEN_DIM = 20 EPOCHS = 25)` (0.581).

- [1. Exploración](#1)
  - [1.1 Mal etiquetados](#1.1)
- [2. Preprocesamiento](#2)
- [3. Modelos](#3)
  - [3.1 Hidden Markov Model](#3.1)
  - [3.2 Conditional Random Fields for Sequence Prediction](#3.2)
  - [3.3 LSTM](#3.3)
- [4. Evaluation](#4)

<a name = "1"></a>
# Exploración

In [2]:
train = pd.read_csv('../data/tsd_train.csv', converters={'spans':ast.literal_eval})
test = pd.read_csv('../data/tsd_trial.csv', converters={'spans':ast.literal_eval})

In [3]:
train.head()

Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",What a knucklehead. How can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38]","""who do you think should do the killing?""\n\nA..."


Veamos las secciones tóxicas de algunas oraciones:

In [4]:
for i,(index,text) in enumerate(train.head(5).values):
    print('{} - '.format(i) + color_toxic_words(index, text) + '\n'*2)

0 - A[0mn[0mo[0mt[0mh[0me[0mr[0m [0m[41mv[0m[41mi[0m[41mo[0m[41ml[0m[41me[0m[41mn[0m[41mt[0m[41m [0m[41ma[0m[41mn[0m[41md[0m[41m [0m[41ma[0m[41mg[0m[41mg[0m[41mr[0m[41me[0m[41ms[0m[41ms[0m[41mi[0m[41mv[0m[41me[0m[41m [0m[41mi[0m[41mm[0m[41mm[0m[41mi[0m[41mg[0m[41mr[0m[41ma[0m[41mn[0m[41mt[0m [0mk[0mi[0ml[0ml[0mi[0mn[0mg[0m [0ma[0m [0mi[0mn[0mn[0mo[0mc[0me[0mn[0mt[0m [0ma[0mn[0md[0m [0mi[0mn[0mt[0me[0ml[0ml[0mi[0mg[0me[0mn[0mt[0m [0mU[0mS[0m [0mC[0mi[0mt[0mi[0mz[0me[0mn[0m.[0m.[0m.[0m.[0m [0mS[0ma[0mr[0mc[0ma[0ms[0mm[0m


1 - I[0m [0ma[0mm[0m [0m5[0m6[0m [0my[0me[0ma[0mr[0ms[0m [0mo[0ml[0md[0m,[0m [0mI[0m [0ma[0mm[0m [0mn[0mo[0mt[0m [0my[0mo[0mu[0mr[0m [0m[41mf[0m[41mu[0m[41mc[0m[41mk[0m[41mi[0m[41mn[0m[41mg[0m [0mj[0mu[0mn[0mi[0mo[0mr[0m [0mp[0ma[0ml[0m.[0m [0mW[0mh[0ma[0mt[0m [0my[0mo[0mu[0

<a name = "1.1"></a>
### ¿Mal etiquetados?

En algunas oraciones pone los espacios, comas y otros signos como caracteres tóxicos, hay que tener cuidado y al final concatenarlos si es que hay dos o más palabras tóxicas juntas. En la competencia parecen concatenarlos, pero no son consistententes en otras partes.

In [5]:
very_toxic = [i for i,x in zip(train.index, train['spans']) if len(x)>100][0:5]

for i,(index,text) in zip(very_toxic, train.values[very_toxic]):
    print('{} - '.format(i) + color_toxic_words(index, text) + '\n'*2)

158 - [41mT[0m[41mh[0m[41me[0m[41ms[0m[41me[0m[41m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m[41mi[0m[41mc[0m[41m [0m[41ms[0m[41mt[0m[41mu[0m[41md[0m[41mi[0m[41me[0m[41ms[0m[41m [0m[41ma[0m[41mr[0m[41me[0m[41m [0m[41mu[0m[41mp[0m[41ms[0m[41me[0m[41mt[0m[41mt[0m[41mi[0m[41mn[0m[41mg[0m[41m.[0m[41m [0m[41mW[0m[41mh[0m[41me[0m[41mr[0m[41me[0m[41m [0m[41mi[0m[41ms[0m[41m [0m[41mt[0m[41mh[0m[41me[0m[41m [0m[41mi[0m[41mn[0m[41mf[0m[41mo[0m[41mr[0m[41mm[0m[41ma[0m[41mt[0m[41mi[0m[41mo[0m[41mn[0m[41m [0m[41mt[0m[41mh[0m[41ma[0m[41mt[0m[41m [0m[41mi[0m[41ms[0m[41m [0m[41ms[0m[41mu[0m[41mr[0m[41mp[0m[41mr[0m[41mi[0m[41ms[0m[41mi[0m[41mn[0m[41mg[0m[41m.[0m[41m [0m[41mA[0m[41mn[0m[41my[0m[41m [0m[41mp[0m[41me[0m[41mr[0m[41ms[0m[41mo[0m[41mn[0m[41m [0m[41mi[0m[41mn[0m[41m [0m[41ma[0m[41mn[0m[41my[0m[41

Y está el problema de los etiquetados como [ ], que pueden ser completamente tóxicos, sarcásticos o simplemente son no tóxicos.

In [6]:
actually_toxic = [i for i in train.index if len(train.loc[i]['spans']) == 0][0:5]

for i,(index,text) in enumerate(train.values[actually_toxic]):
    print(str(i) + ' ' + color_toxic_words(index, text) + '\n'*2)

0 B[0mu[0mt[0m,[0m [0mb[0mu[0mt[0m,[0m [0mb[0mu[0mt[0m,[0m [0mi[0ms[0m [0mN[0mO[0mT[0m [0ma[0m [0md[0me[0mf[0me[0mn[0ms[0me[0m.[0m [0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0me[0mv[0me[0mn[0m [0ma[0m [0mg[0mo[0mo[0md[0m [0md[0me[0mf[0ml[0me[0mc[0mt[0mi[0mo[0mn[0m.[0m [0m [0mI[0mn[0m [0mA[0mm[0me[0mr[0mi[0mc[0ma[0m [0mt[0mo[0md[0ma[0my[0m [0mw[0me[0m [0mh[0ma[0mv[0me[0m [0mN[0ma[0mz[0mi[0ms[0m [0mw[0ma[0mv[0mi[0mn[0mg[0m [0mt[0mh[0me[0m [0mN[0ma[0mz[0mi[0m [0mf[0ml[0ma[0mg[0m [0ma[0mt[0m [0mr[0ma[0ml[0ml[0mi[0me[0ms[0m [0mi[0mn[0m [0mo[0mu[0mr[0m [0mc[0mi[0mt[0mi[0me[0ms[0m.[0m [0mI[0mn[0m [0mw[0mh[0ma[0mt[0m [0mc[0ma[0mp[0ma[0mc[0mi[0mt[0my[0m [0md[0mo[0me[0ms[0m [0ma[0mn[0my[0mo[0mn[0me[0m [0mt[0mh[0mi[0mn[0mk[0m [0mt[0mh[0mi[0ms[0m [0mi[0ms[0m [0mo[0mk[0m [0ma[0mn[0md[0m [0mw[0mh[0mo[0m [

<a name = "2"></a>
## Preprocesamiento
Primero sería mejor tener las palabras tóxicas que los caracteres, también hay que tener cuidado cuando marcan un espacio o algún signo de puntuación como tóxico, al final sólo hay que concatenar si hay dos palabras tóxicas juntas (por hacer). También hay que pasar todo a minúscula y hacer el análisis en minúscula.

In [7]:
text = 'fuck you idiot!!!'
index = np.arange(len(text)).tolist()

print(color_toxic_words(index, text))
print(color_toxic_words(remove_symbols(index, text), text))

[41mf[0m[41mu[0m[41mc[0m[41mk[0m[41m [0m[41my[0m[41mo[0m[41mu[0m[41m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m[41m![0m[41m![0m[41m![0m
[41mf[0m[41mu[0m[41mc[0m[41mk[0m [0m[41my[0m[41mo[0m[41mu[0m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m[41m![0m[41m![0m[41m![0m


Quitamos símbolos.

In [8]:
indices_clean = [remove_symbols(index, text) for index,text in 
                 zip(train['spans'], train['text'])]

train['spans_clean'] = indices_clean

Veamos como se ven si lo hacemos así:

In [9]:
very_toxic = [i for i,x in zip(train.index, train['spans']) if len(x)>100][0:3]

for i, (index,text,index_clean) in zip(very_toxic, train.values[very_toxic]):
    print('{} - '.format(i) + color_toxic_words(index, text))
    print('{} - '.format(i) + color_toxic_words(index_clean, text) + '\n'*2)

158 - [41mT[0m[41mh[0m[41me[0m[41ms[0m[41me[0m[41m [0m[41mi[0m[41md[0m[41mi[0m[41mo[0m[41mt[0m[41mi[0m[41mc[0m[41m [0m[41ms[0m[41mt[0m[41mu[0m[41md[0m[41mi[0m[41me[0m[41ms[0m[41m [0m[41ma[0m[41mr[0m[41me[0m[41m [0m[41mu[0m[41mp[0m[41ms[0m[41me[0m[41mt[0m[41mt[0m[41mi[0m[41mn[0m[41mg[0m[41m.[0m[41m [0m[41mW[0m[41mh[0m[41me[0m[41mr[0m[41me[0m[41m [0m[41mi[0m[41ms[0m[41m [0m[41mt[0m[41mh[0m[41me[0m[41m [0m[41mi[0m[41mn[0m[41mf[0m[41mo[0m[41mr[0m[41mm[0m[41ma[0m[41mt[0m[41mi[0m[41mo[0m[41mn[0m[41m [0m[41mt[0m[41mh[0m[41ma[0m[41mt[0m[41m [0m[41mi[0m[41ms[0m[41m [0m[41ms[0m[41mu[0m[41mr[0m[41mp[0m[41mr[0m[41mi[0m[41ms[0m[41mi[0m[41mn[0m[41mg[0m[41m.[0m[41m [0m[41mA[0m[41mn[0m[41my[0m[41m [0m[41mp[0m[41me[0m[41mr[0m[41ms[0m[41mo[0m[41mn[0m[41m [0m[41mi[0m[41mn[0m[41m [0m[41ma[0m[41mn[0m[41my[0m[41

También pasemos a minúscula el texto para el entrenamiento:

In [10]:
train['text'] = train['text'].apply(lambda x:x.lower())

<a name = "3"></a>
## Modelos

En la competencia usarán el F1 score para evaluar como sigue:

<blockquote cite="http://www.worldwildlife.org/who/index.html">
To evaluate the responses of a system participating in the challenge, we employ the $F_{1}$ score, as in [1]. Let system $A_i$ return a set $S^{t}_{A_{i}}$ of character offsets, for parts of the post found to be toxic. Let $G^{t}$ be the character offsets of the ground truth annotations of $t$. We compute the $F_{1}$ score of system $A_i$ with respect to the ground truth $G$ for post $t$ as follows, where $|·|$ denotes set cardinality.
    
$$ F_{1}^{t}(A_{i}, G) = \dfrac{2 \cdot P^{t}(A_{i}, G) \cdot R^{t}(A_{i}, G)}{P^{t}(A_{i}, G) + R^{t}(A_{i}, G)}$$
    
$$ P^{t}(A_{i}, G) = \dfrac{|S^{t}_{A_{i}} \cap S^{t}_{A_{i}}|}{|S^{t}_{A_{i}}|} $$
    
$$ R^{t}(A_{i}, G) = \dfrac{|S^{t}_{A_{i}} \cap S^{t}_{A_{i}}|}{|S^{t}_{G}|} $$    
    
If $S^{t}_{G}$ is empty for some post $t$ (no gold spans are given for $t$), we set $F_{1}^{t}(A_{i}, G)$ if $S^{t}_{A_{i}}$ is also empty, and $F_{1}^{t}(A_{i}, G) = 0$ otherwise. We finally average $F_{1}^{t}(A_{i}, G)$ over all the posts $t$ of an evaluation dataset $T$ to obtain a single score for system $A_{i}$.

</blockquote>

<a name = "3.1"></a>
### Hidden Markov Model

En nuestro caso debemos etiquetar en `toxic` y `non_toxic`.

In [11]:
train_data = []
for index, text in zip(train['spans_clean'], train['text']):
    toxic_words = [text[i[0]:i[-1]+1] for i in separate_words(index) if len(index) > 0]
    # print(toxic_words)
    
    tokens = word_tokenize(text)
    tagged_tokens = []
    
    for token in tokens:
        if token in toxic_words:
            tagged_tokens.append((token, 'toxic'))
            # Removemos en caso de que se repita posteriormente pero esté como 'non_toxic'
            toxic_words.remove(token) 
        else:
            tagged_tokens.append((token, 'non_toxic'))
            
    train_data.append(tagged_tokens)

Entrenamos:

In [12]:
trainer = HiddenMarkovModelTrainer()
tagger_HMM = trainer.train_supervised(train_data)

Veamos que score tenemos en el dataset de `test` y comparemos nuestras predicciones:

In [13]:
scores_HMM = []
tag_to_ix = {"non_toxic": 0, "toxic": 1}

for i, (gold_index, text) in enumerate(test.values):
    tagged_sentence = tagger_HMM.tag(word_tokenize(text.lower()))   
    tagged_sentence = [(x[0], tag_to_ix[x[1]]) for x in tagged_sentence]
    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)
    
    if i < 5:
        print(colored('Pred: ', color='cyan', attrs=['bold']) + 
          color_toxic_words(prediction_index, text))   
        print(colored('Gold: ', color='yellow', attrs=['bold']) + 
              color_toxic_words(gold_index, text) + '\n'*2)
    
    scores_HMM.append(f1(prediction_index, gold_index))

[1m[36mPred: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0mb[0mi[0mg[0mo[0mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m
[1m[33mGold: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0m[41mb[0m[41mi[0m[41mg[0m[41mo[0m[41mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m


[1m[36mPred: [0mH[0mo[0mw[0m [0ma[0mb[0mo[0mu[0mt[0m [0mw[0me[0m [0ms[0mt[0mo[0mp[0m [0mp[0mr[0mo[0mt[0me[0mc[0mt[0

No es muy bueno desafortunadamente.

In [14]:
print('HMM score: {:.3f}'.format(np.mean(scores_HMM)))

HMM score: 0.319


<a name = "3.2"></a>
### Conditional Random Fields for Sequence Prediction

In [15]:
train_data_crf = []
for text, toxic_tags in tqdm(zip(train['text'], train_data), total=len(train_data)):
    pos_tags = token_postag_label(text.lower())
    sentence = []
    for x,y in zip(pos_tags, toxic_tags):
        sentence.append(x + (y[1],))
        
    train_data_crf.append(sentence)

100%|██████████| 7939/7939 [00:09<00:00, 839.87it/s]


In [16]:
X_train = [sent2features(s) for s in train_data_crf]
y_train = [sent2labels(s) for s in train_data_crf]

trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)

Probablemente sea buena idea hacer una búsqueda de los mejores hiperparámetros, ya que el performance cambia significativamente dependiendo de los hiperparámetros.

In [17]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 0.01,  # coefficient for L2 penalty
    'max_iterations': 100,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

# trainer.params()

In [18]:
%time trainer.train('models/toxic_speech.crfsuite')

tagger_crf = pycrfsuite.Tagger()
tagger_crf.open('models/toxic_speech.crfsuite')

CPU times: user 13.8 s, sys: 2.63 ms, total: 13.8 s
Wall time: 13.8 s


<contextlib.closing at 0x7f173f223390>

In [19]:
test_data_crf =  []
for text in tqdm(test['text']):
    pos_tags = token_postag_label(text.lower())
    sentence = []
    for x in pos_tags:
        sentence.append(x + ('',))
        
    test_data_crf.append(sentence)

100%|██████████| 690/690 [00:00<00:00, 857.18it/s]


In [20]:
scores_CRF = []
tag_to_ix = {"non_toxic": 0, "toxic": 1}

for i, (text_crf, (gold_index,text)) in enumerate(zip(test_data_crf, test.values)):
    words = [x[0] for x in text_crf]
    tags = tagger_crf.tag(sent2features(text_crf))
    
    tagged_sentence = [(word, tag_to_ix[tag]) for word,tag in zip(words,tags)]
    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)
    
    if i < 5:
        print(colored('Pred: ', color='cyan', attrs=['bold']) + 
          color_toxic_words(prediction_index, text))   
        print(colored('Gold: ', color='yellow', attrs=['bold']) + 
              color_toxic_words(gold_index, text) + '\n'*2)
    
    scores_CRF.append(f1(prediction_index, gold_index))

[1m[36mPred: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0mb[0mi[0mg[0mo[0mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m
[1m[33mGold: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0m[41mb[0m[41mi[0m[41mg[0m[41mo[0m[41mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m


[1m[36mPred: [0mH[0mo[0mw[0m [0ma[0mb[0mo[0mu[0mt[0m [0mw[0me[0m [0ms[0mt[0mo[0mp[0m [0mp[0mr[0mo[0mt[0me[0mc[0mt[0

Mejora pero sigue siendo bajo.

In [21]:
print('CRF score: {:.3f}'.format(np.mean(scores_CRF)))

CRF score: 0.552


In [22]:
evaluation = pd.read_csv('../Datos/tsd_test.csv')
evaluation.head()

FileNotFoundError: [Errno 2] No such file or directory: '../Datos/tsd_test.csv'

In [None]:
indices_evaluation = []

for text in evaluation['text']:
    tagged_sentence = tagger_LSTM(text)   
    tagged_sentence = [(x[0], tag_to_ix[x[1]]) for x in tagged_sentence]
    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)
    indices_evaluation.append(prediction_index)
    print(colored('Pred: ', color='cyan', attrs=['bold']) + 
          color_toxic_words(prediction_index, text)) 

In [None]:
evaluation['spans'] = indices_evaluation
evaluation = evaluation[['spans', 'text']]
evaluation.head()

Para la evaluación se debe subir un zip con un archivo txt de la siguiente manera (al final subir el archivo `spans-pred.zip` que se produce):

In [None]:
predictions = evaluation['spans'].tolist()
ids = evaluation.index.tolist()

In [None]:
with open("spans-pred.txt", "w") as out:
    for uid, text_scores in zip(ids, predictions):
        out.write(f"{str(uid)}\t{str(text_scores)}\n")
        
# Zip the predictions
! zip -r spans-pred.zip ./spans-pred.* 
! rm spans-pred.txt
! mv spans-pred.zip ../spans-pred.zip