# Toxic Spans Detection

In [1]:
import ast
import os
import shutil
import zipfile

import gdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import torch
import torch.nn as nn
from termcolor import colored
from tqdm import tqdm

from utils.processing import get_index_toxic_words, color_toxic_words, f1
from utils.lstm import spacy_tokenizer, get_vocab

# Seaborn's dark-grid theme for any figure drawn later in the notebook.
sns.set_style('darkgrid')

# Device string used when creating tensors and placing the model:
# the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    dev = 'cuda:0'
else:
    dev = 'cpu'

In [2]:
# Render figure text with LaTeX only when a `latex` executable is on PATH.
# usetex needs a working LaTeX installation (the original note warned that it
# "sometimes gives trouble"), so without one we keep matplotlib's built-in text
# renderer instead of requiring these lines to be commented out by hand.
if shutil.which('latex'):
    plt.rc('text', usetex=True)
plt.rc('font', family='serif')

## Resultados

De los distintos preprocesamientos, el que obtiene el mayor F1 en `test` es marcar los posts anotados con `[ ]` (lista de spans vacía) como completamente tóxicos (**best-model-try2.pt**, *train*=0.6498, *test*=0.6526), así que usaremos ese.

In [3]:
# Labelled splits. The `spans` column is parsed from its Python-literal text
# into a Python object while each file is read.
train, test = (
    pd.read_csv(csv_path, converters={'spans': ast.literal_eval})
    for csv_path in ('../data/tsd_train.csv', '../data/tsd_trial.csv')
)

In [None]:
# Our own embeddings: vocabulary object derived from the training split.
# It is used below by prepare_sequence to turn tokens into indices.
# NOTE(review): how get_vocab builds it lives in utils.lstm - not visible here.
vocab = get_vocab(train)

In [5]:
class LSTMTagger(nn.Module):
    """Bidirectional LSTM that produces one sigmoid score per token of a sentence.

    Args:
        embedding_dim: Input feature size given to the LSTM; it has to equal
            the width of `weight`, since the embeddings feed the LSTM directly.
        stacked_layers: Number of stacked LSTM layers.
        dropout_p: Dropout probability passed to `nn.LSTM`.
        weight: Pretrained embedding matrix handed to
            `nn.Embedding.from_pretrained`.
        hidden_dim: Hidden-state size of each LSTM direction.
        vocab_size: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, embedding_dim, stacked_layers, dropout_p, weight, hidden_dim, vocab_size):
        super().__init__()
        self.hidden_dim = hidden_dim          # hidden-state size per direction
        self.stacked_layers = stacked_layers  # number of LSTM layers

        # Token index -> pretrained embedding vector.
        self.word_embeddings = nn.Embedding.from_pretrained(weight)

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=stacked_layers,
            dropout=dropout_p,
            bidirectional=True,
        )

        # The LSTM is bidirectional, so every token carries 2 * hidden_dim
        # features; project them down to a single value.
        self.fc1 = nn.Linear(2 * hidden_dim, 1)

    def forward(self, sentence):
        """Score one sentence given as a 1-D tensor of token indices.

        Returns a tensor with one row per token and a single column holding
        the sigmoid of the linear layer's output.
        """
        n_tokens = len(sentence)
        # Shape the embeddings as (sequence, batch of 1, features) for the LSTM.
        lstm_in = self.word_embeddings(sentence).view(n_tokens, 1, -1)
        lstm_out, _ = self.lstm(lstm_in)
        # Flatten away the batch axis before the per-token projection.
        logits = self.fc1(lstm_out.view(n_tokens, -1))
        return torch.sigmoid(logits)

In [6]:
def prepare_sequence(seq):
    """Turn a sequence of tokens into a long tensor of vocabulary indices on `dev`.

    Per the original author's note, a token missing from `vocab` is mapped to
    index 0 (NOTE(review): this depends on how `vocab` was built - confirm).
    """
    return torch.tensor(vocab.lookup_indices(seq), device=dev, dtype=torch.long)

def prepare_sequence_tags(seq):
    """Encode tag names as a long tensor on `dev`: "non_toxic" -> 0, "toxic" -> 1.

    Raises:
        KeyError: if `seq` contains any other tag name.
    """
    tag_to_ix = {"non_toxic": 0, "toxic": 1}
    encoded = list(map(tag_to_ix.__getitem__, seq))
    return torch.tensor(encoded, device=dev, dtype=torch.long)

def tagger_LSTM(text, threshold=0.5):
    """
    Tag every token of `text` as toxic (1) or non-toxic (0) with the trained model.

    Uses the notebook-level `model`; put it in inference mode with
    `model.eval()` before calling so that dropout is inactive.

    Args:
        text: Raw post. It is lower-cased before tokenization.
        threshold: A token is tagged 1 when its score is strictly greater
            than this value, otherwise 0.

    Returns:
        A list of (token, tag) pairs in token order, or an empty list when
        the tokenizer produces no tokens.
    """
    words = spacy_tokenizer(text.lower())  # lower-casing seemed to work better (author's note)

    # The model reshapes its input with view(len, 1, -1), which cannot infer
    # the -1 dimension for a zero-length sequence, so there is nothing to run.
    if len(words) == 0:
        return []

    with torch.no_grad():
        tag_scores = model(prepare_sequence(words))
        tags = [1 if score > threshold else 0 for score in tag_scores]

    return list(zip(words, tags))

El archivo del modelo es algo pesado para GitHub, así que está alojado en Google Drive y se descarga desde ahí.

In [None]:
# Google Drive location of the trained model and the local path it is saved to.
url = 'https://drive.google.com/uc?id=1KO-QXUBfwzjauWLhiVi9StD3y0GtiBbj'
output = 'models/best-model.pt'

# Make sure the target folder exists, and skip the download when the
# checkpoint is already on disk so that re-running the notebook does not fetch
# the large file again. Delete the file to force a fresh download.
os.makedirs(os.path.dirname(output), exist_ok=True)
if not os.path.exists(output):
    gdown.download(url, output, quiet=False)

In [8]:
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code - only load checkpoints that come from a trusted source.
# NOTE(review): the file holds a whole pickled module, so recent PyTorch
# releases may additionally require weights_only=False here - confirm against
# the installed version.
# map_location lets a checkpoint that was saved on a GPU load on a CPU-only
# machine, matching the CPU fallback used for `dev`.
model = torch.load(output, map_location=dev)
model.to(torch.device(dev))

# Switch to inference mode: a pickled module keeps whatever train/eval flag it
# had when it was saved, and in train mode the LSTM's dropout stays active,
# which makes predictions noisy and non-reproducible.
model.eval()

LSTMTagger(
  (word_embeddings): Embedding(19611, 200)
  (lstm): LSTM(200, 600, num_layers=6, dropout=0.2, bidirectional=True)
  (fc1): Linear(in_features=1200, out_features=1, bias=True)
)

In [9]:
# Predict toxic character offsets for every post of the labelled test split.
# The first five predictions are printed next to the gold annotation.
indices_test = []
for i, (gold_index, text) in enumerate(zip(test['spans'], test['text'])):
    tagged_sentence = tagger_LSTM(text)
    # The tagger lower-cases the text, so the lower-cased text is passed along
    # with its tokens.
    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)
    indices_test.append(prediction_index)

    if i >= 5:
        continue

    print(i,
          colored(' Pred: ', color='cyan', attrs=['bold']),
          color_toxic_words(prediction_index, text),
          sep='')
    print(i,
          colored(' Gold: ', color='yellow', attrs=['bold']),
          color_toxic_words(gold_index, text),
          '\n\n',
          sep='')

0[1m[36m Pred: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0m[41mb[0m[41mi[0m[41mg[0m[41mo[0m[41mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m
0[1m[33m Gold: [0mB[0me[0mc[0ma[0mu[0ms[0me[0m [0mh[0me[0m'[0ms[0m [0ma[0m [0m[41mm[0m[41mo[0m[41mr[0m[41mo[0m[41mn[0m [0ma[0mn[0md[0m [0ma[0m [0m[41mb[0m[41mi[0m[41mg[0m[41mo[0m[41mt[0m.[0m [0mI[0mt[0m'[0ms[0m [0mn[0mo[0mt[0m [0ma[0mn[0my[0m [0mm[0mo[0mr[0me[0m [0mc[0mo[0mm[0mp[0ml[0mi[0mc[0ma[0mt[0me[0md[0m [0mt[0mh[0ma[0mn[0m [0mt[0mh[0ma[0mt[0m.[0m


1[1m[36m Pred: [0mH[0mo[0mw[0m [0ma[0mb[0mo[0mu[0mt[0m [0mw[0me[0m [0ms[0mt[0mo[0mp[0m [0mp[

In [10]:
# Per-post F1 between predicted and gold offsets, then the mean over the split.
score_test = list(map(f1, indices_test, test['spans']))
print(f'F1 in test: {np.mean(score_test):.6f}')

F1 in test: 0.648722


## Predicción en el dataset de *evaluation*

In [11]:
# Evaluation split that predictions are generated for below; it is read
# without a `spans` converter, and only its `text` column is used.
evaluation = pd.read_csv('../data/tsd_test.csv')

In [12]:
# Predict toxic character offsets for every post of the evaluation split.
# The first five predictions are printed for a quick visual check.
indices_evaluation = []
for i, text in enumerate(evaluation['text']):
    tagged_sentence = tagger_LSTM(text)
    # The tagger lower-cases the text, so the lower-cased text is passed along
    # with its tokens.
    prediction_index = get_index_toxic_words(text.lower(), tagged_sentence)
    indices_evaluation.append(prediction_index)

    if i >= 5:
        continue

    print(i,
          colored(' Pred: ', color='cyan', attrs=['bold']),
          color_toxic_words(prediction_index, text),
          '\n',
          sep='')

0[1m[36m Pred: [0mT[0mh[0ma[0mt[0m'[0ms[0m [0mr[0mi[0mg[0mh[0mt[0m.[0m [0mT[0mh[0me[0my[0m [0ma[0mr[0me[0m [0mn[0mo[0mt[0m [0mn[0mo[0mr[0mm[0ma[0ml[0m.[0m [0mA[0mn[0md[0m [0mI[0m [0ma[0mm[0m [0ms[0mt[0ma[0mr[0mt[0mi[0mn[0mg[0m [0mf[0mr[0mo[0mm[0m [0mt[0mh[0me[0m [0mp[0mr[0me[0mm[0mi[0ms[0me[0m [0mt[0mh[0ma[0mt[0m [0mt[0mh[0me[0my[0m [0ma[0mr[0me[0m [0mA[0mB[0mN[0mO[0mR[0mM[0mA[0mL[0m.[0m [0mP[0mr[0mo[0mc[0me[0me[0md[0m [0mw[0mt[0mh[0m [0mt[0mh[0me[0m [0mt[0my[0mp[0mi[0mc[0ma[0ml[0m [0m[41mr[0m[41ma[0m[41mc[0m[41mi[0m[41ms[0m[41mt[0m[41m,[0m[41m [0m[41mb[0m[41mi[0m[41mg[0m[41mo[0m[41mt[0m[41m,[0m[41m [0m[41ms[0m[41me[0m[41mx[0m[41mi[0m[41ms[0m[41mt[0m[41m [0m[41mr[0m[41mu[0m[41mb[0m[41mb[0m[41mi[0m[41ms[0m[41mh[0m.[0m
[0mT[0mh[0ma[0mn[0mk[0ms[0m![0m

1[1m[36m Pred: [0m"[0mW[0ma[0mt[0mc[0mh[0m 

In [13]:
# Attach the predicted offsets as a `spans` column and keep only the two
# columns of interest, with `spans` first.
evaluation = evaluation.assign(spans=indices_evaluation)[['spans', 'text']]
evaluation.head()

Unnamed: 0,spans,text
0,"[118, 119, 120, 121, 122, 123, 124, 125, 126, ...",That's right. They are not normal. And I am st...
1,"[81, 82, 83, 84, 85, 86]","""Watch people die from taking away their healt..."
2,"[483, 484, 485, 486, 487, 488, 489, 490]",tens years ago i contacted the PDR and suggest...
3,"[413, 414, 415, 416, 417, 418, 419, 420]",The parallels between the ANC and the Sicilian...
4,"[150, 151, 152, 153, 154, 155, 156, 271, 272, ...",Intel Community: ‘How can we work for a Presid...


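Para la evaluación se debe subir un zip que contenga un archivo txt con una línea por post en el formato `id<TAB>spans`. La celda siguiente lo genera; al final hay que subir el archivo `spans-pred.zip` resultante, que queda en el directorio padre: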

In [14]:
# Build the submission text: one line per post, "<row id>\t<predicted offsets>".
predictions = evaluation['spans'].tolist()
ids = evaluation.index.tolist()

submission = "".join(
    f"{uid}\t{text_scores}\n" for uid, text_scores in zip(ids, predictions)
)

# Write spans-pred.txt straight into ../spans-pred.zip with the standard
# library. The previous zip/rm/mv shell pipeline needed those tools to be
# installed and left a stray spans-pred.txt behind if any step failed.
with zipfile.ZipFile("../spans-pred.zip", "w", compression=zipfile.ZIP_DEFLATED) as archive:
    archive.writestr("spans-pred.txt", submission)

  adding: spans-pred.txt (deflated 84%)
