## Non-contextualized model

Let's train a model without using the news

In [1]:
%load_ext autoreload
%autoreload 2
import json

# Read the annotated articles. The encoding is pinned to UTF-8 so that accented
# characters are decoded the same way on every platform instead of depending on
# the locale's default encoding.
with open("../data/train.json", encoding="utf-8") as f:
    train_articles = json.load(f)

with open("../data/test.json", encoding="utf-8") as f:
    test_articles = json.load(f)

Let's take just the comments

In [2]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, Value, ClassLabel, Features
import pandas as pd

def serialize(article, comment):
    """
    Build a flat record for one comment.

    Returns a new dict holding every field of `comment` plus a "context"
    entry set to the title of `article`. The input `comment` is not modified.
    """
    return {**comment, "context": article["title"]}
    

# Flatten the articles into one record per comment (see `serialize`).
train_comments = [serialize(article, comment) for article in train_articles for comment in article["comments"]]
test_comments = [serialize(article, comment) for article in test_articles for comment in article["comments"]]

train_df = pd.DataFrame(train_comments)
test_df = pd.DataFrame(test_comments)

# Hold out 20% of the training comments as a dev set. The seed is fixed so that
# Restart & Run All produces the same train/dev partition on every run.
train_df, dev_df = train_test_split(train_df, test_size=0.2, random_state=42)

In [3]:
import re

# "@" followed by up to 15 letters, digits or underscores.
user_regex = re.compile(r"@[a-zA-Z0-9_]{0,15}")

# This pattern must be a raw string. In a plain literal "\b" is the backspace
# character rather than a word boundary, so bare domains such as "example.com"
# were never matched, and "\:", "\/", "\.", "\w" are invalid escape sequences.
# NOTE(review): with the word boundary working, any "word.tld" sequence preceded
# by a non-alphanumeric character is now treated as a URL (e.g. "hola.es").
url_regex = re.compile(
    r"((?<=[^a-zA-Z0-9])(?:https?\:\/\/|[a-zA-Z0-9]{1,}\.{1}|\b)(?:\w{1,}\.{1}){1,5}(?:com|co|org|edu|gov|uk|net|ca|de|jp|fr|au|us|ru|ch|it|nl|se|no|es|mil|iq|io|ac|ly|sm){1}(?:\/[a-zA-Z0-9]{1,})*)"
)

def preprocess_tweet(text):
    """
    Basic preprocessing: mask user mentions and URLs.

    Every match of `user_regex` is replaced by the token "usuario", then every
    match of `url_regex` is replaced by the token "url".

    Args:
        text: the raw comment text (a string).

    Returns:
        The text with mentions and URLs replaced by their placeholder tokens.
    """
    text = user_regex.sub("usuario", text)
    text = url_regex.sub("url", text)

    return text

# Mask mentions and URLs in the "text" column of every split.
for frame in (train_df, dev_df, test_df):
    frame["text"] = frame["text"].apply(preprocess_tweet)

In [4]:
# Schema of the columns handed to the model pipeline.
# NOTE(review): the class names here ("Not Hateful") differ in capitalisation
# from the `id2label` mapping set on the model config ('Not hateful') -- confirm
# which spelling is intended.
features = Features({
    'context': Value('string'),
    'text': Value('string'),
    'is_hateful': ClassLabel(num_classes=2, names=["Not Hateful", "Hateful"])
})

columns = [
    "context",
    "text",
    "is_hateful",
]

# train_df / dev_df keep the shuffled index produced by train_test_split. A
# non-default index is exported by `Dataset.from_pandas` as an extra
# `__index_level_0__` column that `features` does not declare, so the index is
# reset to a plain RangeIndex before conversion.
train_dataset = Dataset.from_pandas(train_df[columns].reset_index(drop=True), features=features)
dev_dataset = Dataset.from_pandas(dev_df[columns].reset_index(drop=True), features=features)
test_dataset = Dataset.from_pandas(test_df[columns].reset_index(drop=True), features=features)


In [5]:
# Label columns whose frequency is compared between the train and test splits.
categories = [
    "is_hateful", "calls",
    "WOMEN", "LGBTI", "RACISM", "CLASS", "POLITICS",
    "DISABLED", "APPEARANCE", "CRIMINAL",
]

# Mean of each label column in train, and how far it is from the test mean.
train_rates = train_df[categories].mean()
test_rates = test_df[categories].mean()

train_rates, train_rates - test_rates

(is_hateful    0.178000
 calls         0.032341
 WOMEN         0.024120
 LGBTI         0.008618
 RACISM        0.057501
 CLASS         0.018523
 POLITICS      0.025605
 DISABLED      0.011886
 APPEARANCE    0.037492
 CRIMINAL      0.038136
 dtype: float64,
 is_hateful   -0.005573
 calls        -0.002791
 WOMEN        -0.011171
 LGBTI        -0.002935
 RACISM        0.010025
 CLASS         0.001907
 POLITICS     -0.003988
 DISABLED     -0.000299
 APPEARANCE   -0.004129
 CRIMINAL     -0.000953
 dtype: float64)

The label rates in train and test differ by at most about one percentage point, so the classes seem to be evenly distributed across the two splits.

In [6]:
# NOTE(review): this repeats the preprocessing already applied in In [3], and it
# runs after the Dataset objects were built in In [4], so it only touches the
# DataFrames -- confirm whether this cell is still needed.
for frame in (train_df, dev_df, test_df):
    frame["text"] = frame["text"].apply(preprocess_tweet)


In [7]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = 'dccuchile/bert-base-spanish-wwm-cased'

# Label names attached to the model config below, and the inverse lookup.
id2label = {0: 'Not hateful', 1: 'Hateful'}
label2id = {label: idx for idx, label in id2label.items()}

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pretrained encoder with a two-class sequence classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True, num_labels=2)
model.config.id2label = id2label
model.config.label2id = label2id
model = model.to(device)
model.train();

# Matching tokenizer, with its maximum sequence length set to 256.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 256

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased a

In [8]:
def tokenize(batch):
    """
    Encode a batch as (context, text) pairs with the module-level tokenizer.

    Sequences are padded to the maximum length and truncated when too long.

    NOTE(review): the article title ("context") is passed as the first segment,
    although the notebook introduction says the news is not used -- confirm
    which of the two is intended.
    """
    contexts, comments = batch['context'], batch['text']
    return tokenizer(contexts, comments, padding='max_length', truncation=True)

# Batch sizes, reused later for the training arguments.
batch_size, eval_batch_size = 32, 16

# Tokenize every split; dev and test share the evaluation batch size.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=batch_size)
dev_dataset, test_dataset = (
    split.map(tokenize, batched=True, batch_size=eval_batch_size)
    for split in (dev_dataset, test_dataset)
)



HBox(children=(FloatProgress(value=0.0, max=631.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=316.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=395.0), HTML(value='')))




In [9]:

def format_dataset(dataset):
    """
    Add a 'labels' column copied from 'is_hateful' and switch the result to torch format.

    Only 'input_ids', 'token_type_ids', 'attention_mask' and 'labels' are kept
    in the formatted output. Returns the mapped dataset.
    """
    def add_labels(example):
        return {'labels': example['is_hateful']}

    model_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']

    labelled = dataset.map(add_labels)
    labelled.set_format(type='torch', columns=model_columns)
    return labelled

# Add the labels column and torch formatting to each split, in order.
train_dataset, dev_dataset, test_dataset = [
    format_dataset(split) for split in (train_dataset, dev_dataset, test_dataset)
]

HBox(children=(FloatProgress(value=0.0, max=20191.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=5048.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=6319.0), HTML(value='')))




In [10]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """
    Compute metrics for Trainer

    Reads the gold labels from `pred.label_ids` and takes the argmax over the
    last axis of `pred.predictions` as the predicted class. Returns a dict with
    accuracy plus macro-averaged f1, precision and recall.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': macro_f1,
        'precision': macro_precision,
        'recall': macro_recall,
    }

In [11]:
from transformers import Trainer, TrainingArguments
epochs = 10

# Warm up over the first 10% of the optimizer steps. The step count assumes a
# single device and no gradient accumulation -- TODO confirm if that changes.
total_steps = (epochs * len(train_dataset)) // batch_size
warmup_steps = total_steps // 10

# `do_eval=False` was removed: it contradicted `evaluation_strategy="epoch"`,
# which takes precedence and makes the Trainer evaluate after every epoch.
# NOTE(review): recent transformers releases require the save strategy to match
# the evaluation strategy when `load_best_model_at_end=True` -- confirm against
# the installed version.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=eval_batch_size,
    warmup_steps=warmup_steps,
    evaluation_strategy="epoch",
    weight_decay=0.01,
    logging_dir='./logs',
    group_by_length=True,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


# Train on the train split, scoring the dev split with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
1,0.3933,0.303925,0.878962,0.802495,0.796098,0.809499,42.1691,119.709
2,0.2635,0.290862,0.884905,0.812794,0.805305,0.821092,42.1754,119.691
3,0.184,0.355174,0.897781,0.824957,0.833927,0.816836,42.0843,119.95
4,0.0673,0.420819,0.897583,0.807719,0.85829,0.775686,42.2572,119.459
5,0.0404,0.55121,0.885103,0.820126,0.803115,0.842147,42.3463,119.208
6,0.0225,0.60236,0.905903,0.831612,0.859694,0.810087,42.3242,119.27
7,0.0161,0.711772,0.904319,0.829097,0.85608,0.808279,42.259,119.454
8,0.006,0.7396,0.900357,0.832214,0.835231,0.829298,42.3716,119.137
9,0.0042,0.786964,0.905309,0.833282,0.853959,0.816421,42.2424,119.501
10,0.0021,0.797052,0.90412,0.832869,0.849196,0.819043,42.3969,119.065


NameError: name 'results' is not defined

In [12]:
# Score the model currently held by the trainer on the test split.
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 1.1021950244903564,
 'eval_accuracy': 0.8716569077385662,
 'eval_f1': 0.7624932074993378,
 'eval_precision': 0.7993836464784296,
 'eval_recall': 0.7383036675110788,
 'eval_runtime': 46.9653,
 'eval_samples_per_second': 134.546,
 'epoch': 10.0}