## Contextualized model

Let's work with a model that, this time, **takes the context into account**: the cells below load a saved checkpoint and evaluate it on the dev set.

In [1]:
%load_ext autoreload
%autoreload 2

from hatedetection import load_datasets

# Unpack the three splits (train / dev / test) returned by the project helper.
# NOTE(review): presumably HuggingFace `datasets` objects, since later cells
# call `.map` and `.set_format` on them — confirm against `hatedetection`.
train_dataset, dev_dataset, test_dataset = load_datasets()


In [2]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Local checkpoint directory; both the classifier and its tokenizer are read from it.
model_name = "../models/bert-contextualized-hate-speech-es/"

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Two-label sequence classifier, moved to the selected device right after loading.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    return_dict=True,
).to(device)

# Switch the module to evaluation mode.
model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Maximum sequence length the tokenizer uses when padding/truncating to max length.
tokenizer.model_max_length = 256

In [3]:
def tokenize(batch, context=True, padding='max_length', truncation=True):
    """
    Tokenize a batch of examples with the notebook-level `tokenizer`.

    Arguments:
    ---------

    batch: mapping
        Batch holding a 'text' entry and, when `context` is True, a
        'context' entry as well.

    context: boolean (default True)
        Whether to encode the context together with the text. When True the
        tokenizer receives (context, text) as two positional arguments;
        otherwise it receives only the text.

    padding: (default 'max_length')
        Forwarded unchanged as the tokenizer's `padding` argument.

    truncation: (default True)
        Forwarded unchanged as the tokenizer's `truncation` argument.

    Returns:
    -------

    Whatever the tokenizer call returns for the batch.
    """

    if context:
        args = [batch['context'], batch['text']]
    else:
        args = [batch['text']]

    # Forward the caller's padding/truncation choices rather than fixed values,
    # so that the keyword arguments of this function actually take effect.
    return tokenizer(*args, padding=padding, truncation=truncation)

# Batch sizes: used for the tokenization `map` calls below and reused when the
# training arguments are built later in the notebook.
batch_size, eval_batch_size = 32, 16


def my_tokenize(x):
    """Tokenize a batch with the context included."""
    return tokenize(x, context=True)


# Tokenize every split in batches; each name is rebound to the dataset returned by `map`.
train_dataset = train_dataset.map(my_tokenize, batch_size=batch_size, batched=True)
dev_dataset = dev_dataset.map(my_tokenize, batch_size=eval_batch_size, batched=True)
test_dataset = test_dataset.map(my_tokenize, batch_size=eval_batch_size, batched=True)



HBox(children=(FloatProgress(value=0.0, max=1139.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=570.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=709.0), HTML(value='')))




In [4]:
# Decode one tokenized training example to inspect how context and text are joined.
# The row is indexed first so that only this example is read, instead of
# materializing the whole `input_ids` column to pick a single entry.
tokenizer.decode(train_dataset[4]["input_ids"])

'[CLS] Les darán DNI provisorio a personas en situación vulnerable, para que puedan empezar a cobrar planes sociales [SEP] usuario Seguimos alimentando vagos [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [5]:

def format_dataset(dataset):
    """
    Add a 'labels' column copied from 'HATEFUL' and set the torch output format.

    The dataset returned by `map` has its format set to 'torch', restricted to
    the columns 'input_ids', 'token_type_ids', 'attention_mask' and 'labels',
    and is then returned.
    """
    def _with_labels(examples):
        # Copy the 'HATEFUL' value under the name 'labels'.
        return {'labels': examples['HATEFUL']}

    model_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']

    labelled = dataset.map(_with_labels)
    labelled.set_format(type='torch', columns=model_columns)
    return labelled

# Apply the same formatting step to every split, in train / dev / test order.
train_dataset, dev_dataset, test_dataset = map(
    format_dataset,
    (train_dataset, dev_dataset, test_dataset),
)

The history saving thread hit an unexpected error (OperationalError('database is locked')).History will not be written to the database.


HBox(children=(FloatProgress(value=0.0, max=36420.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=9106.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11343.0), HTML(value='')))




We load it only for evaluation 🤗

In [6]:
from hatedetection.metrics import compute_hate_metrics
from transformers import Trainer, TrainingArguments

# Only the output directory and the per-device batch sizes are set explicitly;
# the batch sizes are the same values used for the tokenization `map` calls.
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
)


# Wrap the loaded model in a Trainer, with the dev split as the evaluation set.
# NOTE(review): `compute_hate_metrics` presumably produces the accuracy / F1 /
# precision / recall entries shown in the evaluation output — confirm in
# `hatedetection.metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_hate_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)


In [7]:
import pandas as pd
def _five_decimals(x):
    """Render a float with five digits after the decimal point."""
    return '%.5f' % x


# Display settings: show up to 40 columns and format floats with five decimals.
pd.set_option('display.max_columns', 40)
pd.set_option('display.float_format', _five_decimals)

# Evaluate on the dev split and wrap the result in a one-row frame.
df_results = pd.DataFrame([trainer.evaluate(dev_dataset)])

# Transposed so that each metric is shown on its own row.
df_results.T

Unnamed: 0,0
eval_loss,0.5971
eval_accuracy,0.91808
eval_f1,0.83918
eval_precision,0.844
eval_recall,0.83457
eval_runtime,104.9399
eval_samples_per_second,86.773
init_mem_cpu_alloc_delta,54803.0
init_mem_gpu_alloc_delta,0.0
init_mem_cpu_peaked_delta,18258.0
