## Non-contextualized model

Let's train a model without using the news

In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [2]:
%load_ext autoreload
%autoreload 2
import json

with open("../data/train.json") as f:
    train_articles = json.load(f)

with open("../data/test.json") as f:
    test_articles = json.load(f)

Let's take just the comments

In [3]:
from sklearn.model_selection import train_test_split
import pandas as pd

train_comments = [c for article in train_articles for c in article["comments"]]
test_comments = [c for article in test_articles for c in article["comments"]]

train_df = pd.DataFrame(train_comments)
test_df = pd.DataFrame(test_comments)

train_df, dev_df = train_test_split(train_df, test_size=0.2, random_state=20212021)

In [4]:
import re

user_regex = re.compile(r"@[a-zA-Z0-9_]{0,15}")
url_regex = re.compile(
    "((?<=[^a-zA-Z0-9])(?:https?\:\/\/|[a-zA-Z0-9]{1,}\.{1}|\b)(?:\w{1,}\.{1}){1,5}(?:com|co|org|edu|gov|uk|net|ca|de|jp|fr|au|us|ru|ch|it|nl|se|no|es|mil|iq|io|ac|ly|sm){1}(?:\/[a-zA-Z0-9]{1,})*)"
)

def preprocess_tweet(text):
    """
    Basic preprocessing
    """
    text = user_regex.sub("usuario", text)
    text = url_regex.sub("url", text)

    return text

train_df["text"] = train_df["text"].apply(preprocess_tweet)
dev_df["text"] = dev_df["text"].apply(preprocess_tweet)
test_df["text"] = test_df["text"].apply(preprocess_tweet)

In [5]:
from datasets import Dataset, Value, ClassLabel, Features

features = Features({
    'text': Value('string'),
    'is_hateful': ClassLabel(num_classes=2, names=["Not Hateful", "Hateful"])
})

columns = [
    "text",
    "is_hateful",
]

train_dataset = Dataset.from_pandas(train_df[columns], features=features)
dev_dataset = Dataset.from_pandas(dev_df[columns], features=features)
test_dataset = Dataset.from_pandas(test_df[columns], features=features)


In [6]:
categories = [
    "is_hateful",
    "calls",
    "WOMEN",
    "LGBTI",
    "RACISM",
    "CLASS",
    "POLITICS",
    "DISABLED",
    "APPEARANCE",
    "CRIMINAL",
]

train_df[categories].mean() - test_df[categories].mean()

is_hateful   -0.005143
calls         0.005331
WOMEN        -0.008032
LGBTI        -0.003811
RACISM        0.005207
CLASS        -0.004161
POLITICS     -0.004899
DISABLED     -0.002040
APPEARANCE   -0.003736
CRIMINAL      0.011097
dtype: float64

It seems that somehow it is evenly distributed...

In [7]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer

model_name = 'dccuchile/bert-base-spanish-wwm-cased'

device = "cuda" if torch.cuda.is_available() else "cpu"

id2label = {0: 'Not hateful', 1: 'Hateful'}
label2id = {v:k for k,v in id2label.items()}

model = BertForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=2)

model.config.id2label = id2label
model.config.label2id = label2id

#model = model.to(device)
model.train();



tokenizer = BertTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased a

In [8]:
def tokenize(batch):
    return tokenizer(batch['text'], padding='max_length', truncation=True)

batch_size = 32
eval_batch_size = 16

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=batch_size)
dev_dataset = dev_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=eval_batch_size)



HBox(children=(FloatProgress(value=0.0, max=1005.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=503.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=642.0), HTML(value='')))




In [9]:

def format_dataset(dataset):
    dataset = dataset.map(lambda examples: {'labels': examples['is_hateful']})
    dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
    return dataset

train_dataset = format_dataset(train_dataset)
dev_dataset = format_dataset(dev_dataset)
test_dataset = format_dataset(test_dataset)

HBox(children=(FloatProgress(value=0.0, max=32138.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=8035.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10263.0), HTML(value='')))




In [10]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """
    Compute metrics for Trainer
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [11]:
from transformers import Trainer, TrainingArguments
epochs = 10

total_steps = (epochs * len(train_dataset)) // batch_size
warmup_steps = total_steps // 10
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_steps=warmup_steps,
    evaluation_strategy="epoch",
    do_eval=False,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

results = []

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

trainer.train()

f1_scores = torch.Tensor([r["eval_f1"] for r in results])
print(f"Macro F1: {f1_scores.mean():.3f} +- {f1_scores.std():.3f}")

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
1,0.3112,0.305759,0.884754,0.7718,0.806112,0.747929,28.5812,281.129
2,0.2517,0.283648,0.898942,0.790178,0.852218,0.753765,28.5777,281.163
3,0.1595,0.326218,0.893964,0.794845,0.821365,0.774606,28.5607,281.331
4,0.1015,0.489298,0.891226,0.788126,0.816944,0.766707,28.5916,281.027
5,0.0632,0.545203,0.875544,0.786151,0.774798,0.799593,28.5948,280.995
6,0.0438,0.623489,0.8972,0.797565,0.832325,0.772676,28.6158,280.789
7,0.0303,0.699792,0.895333,0.797426,0.824316,0.776917,28.5989,280.955
8,0.0229,0.614335,0.889359,0.797108,0.803468,0.791204,28.6349,280.601
9,0.0129,0.784076,0.895333,0.799963,0.821725,0.782577,28.6275,280.674
10,0.0116,0.822676,0.896951,0.799413,0.829013,0.777293,28.6105,280.841


Macro F1: nan +- nan


In [12]:
trainer.evaluate(dev_dataset)

{'eval_loss': 0.7840758562088013,
 'eval_accuracy': 0.8953329184816428,
 'eval_f1': 0.7999629731824067,
 'eval_precision': 0.8217253661720993,
 'eval_recall': 0.782576909879597,
 'eval_runtime': 28.6482,
 'eval_samples_per_second': 280.472,
 'epoch': 10.0}