## Contextualized model

Let's train a model, but this time **taking** the context into account.

In [1]:
import os

# Expose only GPU index 1 to this process; set here, before torch is imported in a later cell.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

In [2]:
%load_ext autoreload
%autoreload 2

from hatedetection import load_datasets

# Load the data; the value returned by load_datasets() is unpacked, in order,
# into the train, dev and test splits used throughout the rest of the notebook.
train_dataset, dev_dataset, test_dataset = load_datasets()


In [3]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hugging Face checkpoint identifier shared by the model and its tokenizer.
model_name = 'dccuchile/bert-base-spanish-wwm-cased'

# Prefer the GPU whenever torch can see one.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Binary label mapping (and its inverse) stored on the model config below.
id2label = {0: 'Not hateful', 1: 'Hateful'}
label2id = {name: index for index, name in id2label.items()}

# Tokenizer matching the checkpoint, capped at 256 tokens per example.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 256

# Pretrained encoder with a two-class sequence classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    return_dict=True,
)
model.config.id2label = id2label
model.config.label2id = label2id

# NOTE: the model is not explicitly moved to `device` here.
# Switch to training mode; the trailing ';' keeps the notebook from echoing the model repr.
model.train();

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased a

In [4]:
def tokenize(batch, context=True, padding='max_length', truncation=True):
    """
    Apply tokenization

    Arguments:
    ---------

    batch: mapping
        Batch of examples. Must provide a 'text' entry and, when `context`
        is True, a 'context' entry as well.

    context: boolean (default True)
        Whether to add the context to the input. When True the tokenizer is
        called with the (context, text) pair; otherwise with the text only.

    padding: (default 'max_length')
        Forwarded unchanged to the tokenizer's `padding` argument.

    truncation: (default True)
        Forwarded unchanged to the tokenizer's `truncation` argument.

    Returns:
    -------

    Whatever the module-level `tokenizer` returns for the given inputs.
    """

    if context:
        args = [batch['context'], batch['text']]
    else:
        args = [batch['text']]

    # Forward the caller's options instead of hard-coding them; the defaults
    # match the previously hard-coded values, so existing calls are unaffected.
    return tokenizer(*args, padding=padding, truncation=truncation)

# Batch sizes, reused below as the chunk size handed to `map`.
batch_size = 32
eval_batch_size = 16


def my_tokenize(batch):
    """Tokenize a batch with the context enabled."""
    return tokenize(batch, context=True)


# Tokenize every split in batches: train uses `batch_size`, dev/test use `eval_batch_size`.
train_dataset = train_dataset.map(
    my_tokenize,
    batched=True,
    batch_size=batch_size,
)
dev_dataset = dev_dataset.map(
    my_tokenize,
    batched=True,
    batch_size=eval_batch_size,
)
test_dataset = test_dataset.map(
    my_tokenize,
    batched=True,
    batch_size=eval_batch_size,
)



HBox(children=(FloatProgress(value=0.0, max=1047.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=524.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=658.0), HTML(value='')))




In [5]:
# Sanity check: decode one tokenized example back to text to inspect how the
# context and the text were joined. Index the row first so only that example
# is read, rather than materializing the whole `input_ids` column.
tokenizer.decode(train_dataset[4]["input_ids"])

'[CLS] Sergio Massa : [UNK] Mauricio Macri despreció a los argentinos al irse a Francia [UNK] [SEP] usuario Lo único bueno del gobierno de macri fue la definición hacia massa : el politico menos confiable de la argentina.. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PA

In [6]:

def format_dataset(dataset):
    """
    Prepare a tokenized dataset for training.

    Copies the 'HATEFUL' column into a new 'labels' column, then sets the
    format of the resulting dataset to torch, exposing only 'input_ids',
    'token_type_ids', 'attention_mask' and 'labels'.

    Returns the dataset produced by `map`.
    """
    def add_labels(examples):
        return {'labels': examples['HATEFUL']}

    model_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']

    with_labels = dataset.map(add_labels)
    with_labels.set_format(type='torch', columns=model_columns)
    return with_labels

# Rebind each split to its formatted version: format_dataset adds the 'labels'
# column and sets the torch format on the dataset it returns.
train_dataset = format_dataset(train_dataset)
dev_dataset = format_dataset(dev_dataset)
test_dataset = format_dataset(test_dataset)

HBox(children=(FloatProgress(value=0.0, max=33501.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8376.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10513.0), HTML(value='')))




In [7]:
# Bare expression: have the notebook display the training split's summary.
train_dataset

Dataset({
    features: ['APPEARANCE', 'CALLS', 'CLASS', 'CRIMINAL', 'DISABLED', 'HATEFUL', 'LGBTI', 'POLITICS', 'RACISM', 'WOMEN', 'attention_mask', 'context', 'input_ids', 'labels', 'text', 'token_type_ids'],
    num_rows: 33501
})

In [8]:
# Bare expression: have the notebook display the dev split's summary.
dev_dataset

Dataset({
    features: ['APPEARANCE', 'CALLS', 'CLASS', 'CRIMINAL', 'DISABLED', 'HATEFUL', 'LGBTI', 'POLITICS', 'RACISM', 'WOMEN', 'attention_mask', 'context', 'input_ids', 'labels', 'text', 'token_type_ids'],
    num_rows: 8376
})

In [9]:
from hatedetection.metrics import compute_hate_metrics
from transformers import Trainer, TrainingArguments
epochs = 10

# Steps per epoch: the final, partially filled batch still counts as a step,
# so round up rather than flooring across all epochs.
# NOTE(review): assumes a single device and no gradient accumulation — confirm
# if the training setup changes.
steps_per_epoch = (len(train_dataset) + batch_size - 1) // batch_size
total_steps = epochs * steps_per_epoch
# Warm up over the first 10% of the training steps.
warmup_steps = total_steps // 10

# NOTE(review): recent transformers releases require the checkpoint save
# strategy to match the evaluation strategy when load_best_model_at_end=True —
# confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_steps=warmup_steps,
    # Evaluate on the dev set after every epoch; do_eval agrees with that
    # instead of contradicting it.
    evaluation_strategy="epoch",
    do_eval=True,
    weight_decay=0.01,
    logging_dir='./logs',
    # Keep the checkpoint with the best F1 once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

results = []

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_hate_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
1,0.2956,0.28146,0.892431,0.797992,0.805527,0.791074,63.2263,132.477
2,0.2146,0.296483,0.88479,0.803208,0.785343,0.826371,63.2237,132.482
3,0.1269,0.281939,0.909742,0.826154,0.844316,0.810862,63.2941,132.335


KeyboardInterrupt: 

In [None]:
# Evaluate the trained model on the dev split; the returned metrics are displayed as the cell output.
trainer.evaluate(eval_dataset=dev_dataset)

In [None]:
# Persist the fine-tuned model, then store the tokenizer files in the same directory.
trainer.save_model(
    output_dir="../models/bert-contextualized-hate-speech-es",
)
tokenizer.save_pretrained(
    save_directory="../models/bert-contextualized-hate-speech-es/",
)