## Contextualized model

Let's train a model but this time **taking** the context into account

In [1]:
%load_ext autoreload
%autoreload 2
import json

with open("../data/train.json") as f:
    train_articles = json.load(f)

with open("../data/test.json") as f:
    test_articles = json.load(f)

Let's take just the comments

In [2]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, Value, ClassLabel, Features
import pandas as pd

def serialize(article, comment):
    ret = comment.copy()
    ret["context"] = article["title"]
    return ret
    

train_comments = [serialize(article, comment) for article in train_articles for comment in article["comments"]]
test_comments = [serialize(article, comment) for article in test_articles for comment in article["comments"]]

#train_comments = sorted(train_comments, key=lambda x: len(x["text"] + x["context"]))
#test_comments = sorted(train_comments, key=lambda x: len(x["text"] + x["context"]))

train_df = pd.DataFrame(train_comments)
test_df = pd.DataFrame(test_comments)

train_df, dev_df = train_test_split(train_df, test_size=0.2, random_state=20212021)

In [3]:
import re

user_regex = re.compile(r"@[a-zA-Z0-9_]{0,15}")
url_regex = re.compile(
    "((?<=[^a-zA-Z0-9])(?:https?\:\/\/|[a-zA-Z0-9]{1,}\.{1}|\b)(?:\w{1,}\.{1}){1,5}(?:com|co|org|edu|gov|uk|net|ca|de|jp|fr|au|us|ru|ch|it|nl|se|no|es|mil|iq|io|ac|ly|sm){1}(?:\/[a-zA-Z0-9]{1,})*)"
)

def preprocess_tweet(text):
    """
    Basic preprocessing
    """
    text = user_regex.sub("usuario", text)
    text = url_regex.sub("url", text)

    return text

train_df["text"] = train_df["text"].apply(preprocess_tweet)
dev_df["text"] = dev_df["text"].apply(preprocess_tweet)
test_df["text"] = test_df["text"].apply(preprocess_tweet)

In [4]:
features = Features({
    'context': Value('string'),
    'text': Value('string'),
    'is_hateful': ClassLabel(num_classes=2, names=["Not Hateful", "Hateful"])
})

columns = [
    "context",
    "text",
    "is_hateful",
]

train_dataset = Dataset.from_pandas(train_df[columns], features=features)
dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
test_dataset = Dataset.from_pandas(test_df[columns], features=features)


In [5]:
categories = [
    "is_hateful",
    "calls",
    "WOMEN",
    "LGBTI",
    "RACISM",
    "CLASS",
    "POLITICS",
    "DISABLED",
    "APPEARANCE",
    "CRIMINAL",
]

train_df[categories].mean(), train_df[categories].mean() - test_df[categories].mean()

(is_hateful    0.158162
 calls         0.028813
 WOMEN         0.022466
 LGBTI         0.012851
 RACISM        0.047203
 CLASS         0.014158
 POLITICS      0.024333
 DISABLED      0.010237
 APPEARANCE    0.031147
 CRIMINAL      0.033605
 dtype: float64,
 is_hateful   -0.005143
 calls         0.005331
 WOMEN        -0.008032
 LGBTI        -0.003811
 RACISM        0.005207
 CLASS        -0.004161
 POLITICS     -0.004899
 DISABLED     -0.002040
 APPEARANCE   -0.003736
 CRIMINAL      0.011097
 dtype: float64)

It seems that somehow it is evenly distributed...

In [6]:
train_df["text"] = train_df["text"].apply(preprocess_tweet)
dev_df["text"] = dev_df["text"].apply(preprocess_tweet)
test_df["text"] = test_df["text"].apply(preprocess_tweet)


In [7]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = 'dccuchile/bert-base-spanish-wwm-cased'

device = "cuda" if torch.cuda.is_available() else "cpu"

id2label = {0: 'Not hateful', 1: 'Hateful'}
label2id = {v:k for k,v in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=2)

model.config.id2label = id2label
model.config.label2id = label2id

model = model.to(device)
model.train();



tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 256

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased a

In [8]:
def tokenize(batch):
    return tokenizer(batch['context'], batch['text'], padding='max_length', truncation=True)

batch_size = 32
eval_batch_size = 16

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=batch_size)
dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)



HBox(children=(FloatProgress(value=0.0, max=1005.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=503.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=642.0), HTML(value='')))




In [9]:

def format_dataset(dataset):
    dataset = dataset.map(lambda examples: {'labels': examples['is_hateful']})
    dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    return dataset

train_dataset = format_dataset(train_dataset)
dev_dataset = format_dataset(dev_dataset)
test_dataset = format_dataset(test_dataset)

HBox(children=(FloatProgress(value=0.0, max=32138.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8035.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10263.0), HTML(value='')))




In [10]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """
    Compute metrics for Trainer
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [11]:
from transformers import Trainer, TrainingArguments
epochs = 10

total_steps = (epochs * len(train_dataset)) // batch_size
warmup_steps = total_steps // 10
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_steps=warmup_steps,
    evaluation_strategy="epoch",
    do_eval=False,
    weight_decay=0.01,
    logging_dir='./logs',
    group_by_length=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
1,0.2936,0.325564,0.868451,0.647018,0.89941,0.612738,66.9521,120.011
2,0.2184,0.248986,0.907405,0.818037,0.854059,0.791908,67.3894,119.232
3,0.1274,0.322458,0.908027,0.820306,0.853701,0.795559,66.7088,120.449
4,0.0716,0.430035,0.909147,0.825273,0.851649,0.804571,67.4148,119.188
5,0.0405,0.526347,0.911512,0.826879,0.861615,0.801225,67.0573,119.823
6,0.0213,0.559009,0.908899,0.828415,0.845865,0.813656,66.9265,120.057
7,0.0152,0.617748,0.912383,0.828291,0.864033,0.802046,66.4775,120.868
8,0.0088,0.683005,0.911139,0.829779,0.854686,0.809937,66.9669,119.985
9,0.003,0.704842,0.911388,0.830702,0.854483,0.811576,66.9927,119.938
10,0.0027,0.721557,0.911014,0.831805,0.850936,0.81582,66.7634,120.35


TrainOutput(global_step=10050, training_loss=0.08420811005149834, metrics={'train_runtime': 8682.8158, 'train_samples_per_second': 1.157, 'total_flos': 54227512468746240, 'epoch': 10.0})

In [12]:
trainer.evaluate(dev_dataset)

{'eval_loss': 0.7215569019317627,
 'eval_accuracy': 0.911014312383323,
 'eval_f1': 0.8318054828480544,
 'eval_precision': 0.8509359013185828,
 'eval_recall': 0.8158195953108915,
 'eval_runtime': 66.0959,
 'eval_samples_per_second': 121.566,
 'epoch': 10.0}

In [18]:
trainer.save_model("../models/bert-contextualized-hate-speech-es")
tokenizer.save_pretrained("../models/bert-contextualized-hate-speech-es/")

('../models/bert-contextualized-hate-speech-es/tokenizer_config.json',
 '../models/bert-contextualized-hate-speech-es/special_tokens_map.json',
 '../models/bert-contextualized-hate-speech-es/vocab.txt',
 '../models/bert-contextualized-hate-speech-es/added_tokens.json')