## Non-contextualized model

Let's train a model without using the news

In [1]:
%load_ext autoreload
%autoreload 2
import json

with open("../data/train.json") as f:
    train_articles = json.load(f)

with open("../data/test.json") as f:
    test_articles = json.load(f)

Let's take just the comments

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd

train_comments = [c for article in train_articles for c in article["comments"]]
test_comments = [c for article in test_articles for c in article["comments"]]

train_df = pd.DataFrame(train_comments)
test_df = pd.DataFrame(test_comments)

train_df, dev_df = train_test_split(train_df, test_size=0.2)

In [3]:
categories = [
    "is_hateful",
    "calls",
    "WOMEN",
    "LGBTI",
    "RACISM",
    "CLASS",
    "POLITICS",
    "DISABLED",
    "APPEARANCE",
    "CRIMINAL",
]

train_df[categories].mean() - test_df[categories].mean()

is_hateful    0.004586
calls         0.007727
WOMEN         0.005495
LGBTI        -0.007336
RACISM       -0.004984
CLASS        -0.000117
POLITICS     -0.002226
DISABLED      0.001036
APPEARANCE   -0.008904
CRIMINAL      0.017854
dtype: float64

It seems that somehow it is evenly distributed...

In [4]:
import re

user_regex = re.compile(r"@[a-zA-Z0-9_]{0,15}")
url_regex = re.compile(
    "((?<=[^a-zA-Z0-9])(?:https?\:\/\/|[a-zA-Z0-9]{1,}\.{1}|\b)(?:\w{1,}\.{1}){1,5}(?:com|co|org|edu|gov|uk|net|ca|de|jp|fr|au|us|ru|ch|it|nl|se|no|es|mil|iq|io|ac|ly|sm){1}(?:\/[a-zA-Z0-9]{1,})*)"
)

def preprocess_tweet(text):
    """
    Basic preprocessing
    """
    text = user_regex.sub("usuario", text)
    text = url_regex.sub("url", text)

    return text

In [5]:
train_df["text"] = train_df["text"].apply(preprocess_tweet)
dev_df["text"] = dev_df["text"].apply(preprocess_tweet)
test_df["text"] = test_df["text"].apply(preprocess_tweet)


In [6]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

model_name = 'dccuchile/bert-base-spanish-wwm-cased'

device = "cuda" if torch.cuda.is_available() else "cpu"

id2label = {0: 'Not hateful', 1: 'Hateful'}
label2id = {v:k for k,v in id2label.items()}

model = BertForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=2)

model.config.hidden_dropout_prob = 0.20
model.config.id2label = id2label
model.config.label2id = label2id

model = model.to(device)
model.train();



tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased a

In [7]:
from datasets import Dataset, Value, ClassLabel, Features

#examples = pd.concat([train_df, dev_df])

features = Features({
    'text': Value('string'),
    'is_hateful': ClassLabel(num_classes=2, names=["Not Hateful", "Hateful"])
})

train_dataset = Dataset.from_pandas(train_df[["text", "is_hateful"]], features=features)
dev_dataset = Dataset.from_pandas(dev_df[["text", "is_hateful"]], features=features)
test_dataset = Dataset.from_pandas(test_df[["text", "is_hateful"]], features=features)


In [8]:
def tokenize(batch):
        return tokenizer(batch['text'], padding='max_length', truncation=True)

batch_size = 64
eval_batch_size = 16

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=batch_size)
dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)



HBox(children=(FloatProgress(value=0.0, max=288.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=288.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=355.0), HTML(value='')))




In [9]:

def format_dataset(dataset):
    dataset = dataset.map(lambda examples: {'labels': examples['is_hateful']})
    dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    return dataset

train_dataset = format_dataset(train_dataset)
dev_dataset = format_dataset(dev_dataset)
test_dataset = format_dataset(test_dataset)

HBox(children=(FloatProgress(value=0.0, max=18375.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4594.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5668.0), HTML(value='')))




In [10]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """
    Compute metrics for Trainer
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [11]:
from transformers import Trainer, TrainingArguments
epochs = 5

total_steps = (epochs * len(train_dataset)) // batch_size
warmup_steps = total_steps // 10
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_steps=warmup_steps,
    evaluation_strategy="epoch",
    do_eval=False,
    weight_decay=0.01,
    logging_dir='./logs',
)

results = []

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

trainer.train()

f1_scores = torch.Tensor([r["eval_f1"] for r in results])
print(f"Macro F1: {f1_scores.mean():.3f} +- {f1_scores.std():.3f}")

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
1,No log,0.348921,0.875054,0.754097,0.826806,0.71828,16.5723,277.21
2,0.332100,0.335157,0.874184,0.779324,0.797842,0.764578,16.5982,276.778
3,0.332100,0.437481,0.877231,0.761523,0.82773,0.72689,16.5823,277.043
4,0.126800,0.577937,0.8633,0.765138,0.775632,0.756082,16.5891,276.929
5,0.126800,0.67803,0.865912,0.764815,0.782404,0.750864,16.5759,277.15


Macro F1: nan +- nan


In [16]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.70694500207901,
 'eval_accuracy': 0.8600917431192661,
 'eval_f1': 0.741964277051741,
 'eval_precision': 0.7589047146990792,
 'eval_recall': 0.7286028994315549,
 'eval_runtime': 20.1646,
 'eval_samples_per_second': 281.087,
 'epoch': 5.0}