In [1]:
import os
import numpy as np

import pytorch_lightning as pl
from box import Box

from robertuito.trainer import evaluate, predict, train_model
from robertuito.utils import create_folds, import_data

# Switch off tokenizer parallelism through its environment flag; this was added
# as a workaround for process-forking issues.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Seed everything Lightning knows about with 42 (workers=True is passed through
# to the seeding helper).
pl.seed_everything(seed=42, workers=True)

Global seed set to 42


In [2]:
# Label names, in the order they are handed to create_folds and to the
# data-module / model configuration further down.
LABELS = [
    "Odio", "Mujeres", "Comunidad LGBTQ+",
    "Comunidades Migrantes", "Pueblos Originarios",
]

# Relative CSV paths passed to import_data.
TEST_PATH = "Datathon-2022-full/Datos/tweets_test.csv"
TRAIN_PATH = "public_data/tweets_train.csv"
# Read both CSVs, then pass the training frame through create_folds together
# with the label names. The result is what every later cell refers to as `train`.
# NOTE(review): what create_folds adds or changes is not visible here —
# presumably a fold assignment used by train_model; confirm in robertuito.utils.
train_raw, test = import_data(TRAIN_PATH, TEST_PATH)
train = create_folds(train_raw, LABELS)

In [3]:
# Data-side settings: the train/test data, the full label list, the batch size
# and the tokenizer checkpoint name. Wrapped in a Box for attribute access
# (model_config reads dm_config.batch_size below).
dm_config = Box({
    "train": train,
    "test": test,
    "labels": LABELS,
    "batch_size": 32,
    "tokenizer": 'pysentimiento/robertuito-hate-speech',
})

# Model / optimisation settings handed to train_model.
# - n_labels follows the length of LABELS.
# - batch_size is copied from dm_config so the two configs stay in sync.
# NOTE(review): train_size is hard-coded to 56 and its meaning is not visible
# in this notebook — confirm how robertuito.trainer uses it.
# NOTE(review): warmup / w_decay / lr are presumably scheduler warm-up,
# weight decay and learning rate — verify against the trainer code.
model_config = Box({
    "model_name": 'pysentimiento/robertuito-hate-speech',
    "dropout": 0.2,
    "hidden_size": 768,
    "n_labels": len(LABELS),
    "train_size": 56,
    "batch_size": dm_config.batch_size,
    "warmup": 0.2,
    "w_decay": 0.001,
    "lr": 3e-4,
})

# Training-loop settings handed to train_model: an epoch cap, a patience value,
# and two debugging switches that are left disabled (False / 0) here.
# NOTE(review): the names match PyTorch Lightning Trainer / early-stopping
# options, but how train_model forwards them is not visible — confirm.
training_config = Box({
    "max_epochs": 50,
    "patience": 10,
    "fast_dev_run": False,
    "overfit_batches": 0,
})

In [None]:
%%time
# Five-fold loop over all labels: train once per fold index, evaluate at a 0.5
# threshold, and collect the per-fold F1 in `score` (read by the next cell).
# NOTE(review): train_model receives the fold index wrapped in a list —
# presumably the fold(s) held out for validation; confirm in robertuito.trainer.
score = []
for fold in range(5):
    trainer, model, dm = train_model(
        [fold], dm_config, model_config, training_config
    )
    f1 = evaluate(trainer, model, dm, threshold=0.5)
    score.append(f1)

    print(f"Fold {fold} F1: {f1}")

In [None]:
# Average the per-fold F1 values gathered in `score` and report the result.
mean_f1 = np.mean(score)
print(f"Mean 5 Fold CV Score: {mean_f1}")

In [None]:
# Rebuild the data-module settings for a single target: everything matches the
# earlier dm_config except `labels`, which is now only the first entry of LABELS.
# NOTE(review): LABELS[0] is a plain string, whereas the earlier config passed
# a list — confirm the data module accepts both forms.
# NOTE(review): this rebinds `dm_config`, so the multi-label config is no
# longer reachable under that name after this cell runs.
dm_config = Box({
    "train": train,
    "test": test,
    "labels": LABELS[0],
    "batch_size": 32,
    "tokenizer": 'pysentimiento/robertuito-hate-speech',
})

# Same five-fold loop as the multi-label run, now with the single-label
# dm_config and evaluate(..., custom=False); the mean F1 is printed at the end.
# NOTE(review): model_config is reused unchanged, so its n_labels is still
# len(LABELS) — confirm that is intended for a one-label run.
# NOTE(review): `score` is reset here, replacing the multi-label fold scores.
score = []
for fold in range(5):
    trainer, model, dm = train_model(
        [fold], dm_config, model_config, training_config
    )
    f1 = evaluate(trainer, model, dm, threshold=0.5, custom=False)
    score.append(f1)

    print(f"Fold {fold} F1: {f1}")

mean_f1 = np.mean(score)
print(f"Mean 5 Fold CV Score: {mean_f1}")