In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import transformers
%load_ext autoreload
%autoreload 2

# Restrict the transformers library's own logging to error-level messages.
transformers.logging.set_verbosity_error()

In [2]:
# Pretrained model identifier passed to tBERT as `model_name`.
MODEL_NAME = 'bert-base-uncased'

# CSV inputs, relative to the notebook's working directory. They are handed to
# `prepre_data_to_model` in (metadata path, data path) pairs, one pair per split.
TRAIN_METADATA_PATH = 'data/semeval8/full_meta_en_train.csv'
TEST_METADATA_PATH = 'data/semeval8/evaluate_metadata.csv'
TRAIN_DATA_PATH = 'data/semeval8/en_train.csv'
TEST_DATA_PATH = 'data/semeval8/evaluate_dataset.csv'

In [3]:
from utils.seed import set_seed

# Single seed for the notebook; it is also reused as `random_state` for the
# train/validation split further down.
SEED = 146
# Project helper from utils.seed -- presumably seeds the random/NumPy/torch RNGs;
# TODO confirm which generators it actually covers.
set_seed(SEED)

# Load the data

In [4]:
from utils.data_processing import prepre_data_to_model
from sklearn.model_selection import train_test_split
from datasets import SentencesDataset

# Find best parameters

In [5]:
from utils.data_processing import create_sentences_corpus
from sklearn.metrics import f1_score
import torch
from models import tBERT
from transformers import AdamW
from trainer import Trainer
import optuna
import json
import os

def tbert_objective(trail):
    """Optuna objective: train tBERT with sampled hyperparameters and score it.

    Samples the embedding length, the learning rate and the LDA settings from
    ``trail``, trains a fresh model on ``train_dataloader`` and returns the F1
    score obtained on ``val_dataloader`` (the study maximizes this value).

    The function reads notebook-level globals that must be defined before the
    study is started: ``train_dataloader``, ``val_dataloader``, ``MODEL_NAME``,
    ``NUM_EPOCHS`` and ``DEVICE``.

    Args:
        trail: the Optuna trial object used to draw hyperparameter values
            (parameter name kept as-is for backward compatibility).

    Returns:
        The F1 score of the trained model on the validation dataloader, as
        computed by ``sklearn.metrics.f1_score``.
    """
    # --- Hyperparameter sampling -------------------------------------------
    # Embedding: maximum sequence length handed to the model.
    embeddings_length = trail.suggest_categorical('embeddings_length', [256, 512])
    # General model: learning rate sampled on a log scale.
    # `suggest_loguniform` is deprecated in Optuna; `suggest_float(log=True)`
    # is its documented replacement and samples from the same distribution.
    lr = trail.suggest_float('lr', 1e-6, 1e-3, log=True)
    # LDA: number of topics and the `alpha` value forwarded to tBERT.
    n_topics = trail.suggest_int('n_topics', 50, 500)
    alpha = trail.suggest_float('alpha', 1/50, 10)

    # --- Model construction -------------------------------------------------
    # The corpus is built from the training split only.
    corpus = create_sentences_corpus(train_dataloader)
    model = tBERT(corpus, model_name=MODEL_NAME, num_labels=2,
                  max_length=embeddings_length, n_topics=n_topics, alpha=alpha, device=DEVICE)

    # NOTE(review): `transformers.AdamW` is deprecated upstream in favour of
    # `torch.optim.AdamW`; left unchanged here because the defaults differ.
    optimizer = AdamW(model.parameters(), lr=lr)

    trainer = Trainer(model=model, optimizer=optimizer, train_dataloader=train_dataloader,
                      num_epochs=NUM_EPOCHS, device=DEVICE)

    # --- Train, then score on the validation split --------------------------
    trainer.train()

    labels, preds = trainer.evaluate(val_dataloader)
    f1 = f1_score(labels, preds)

    return f1

In [6]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training settings used while searching for hyperparameters.
BATCH_SIZE = 16
NUM_EPOCHS = 5

# Search setup: Optuna trials per configuration, and the data variants to try.
# Each (cutoff, text column) pair is forwarded to `prepre_data_to_model`.
N_TRAILS = 35
CUTOFF_OPTIONS = [2.5, 3]
COL_TEXT_OPTIONS = ['title', 'text']

In [7]:
from itertools import product

# Create the output directory before any search starts. Without this, a missing
# `results/` folder is only discovered when the best parameters are written,
# i.e. after every trial of the first configuration has already been trained.
os.makedirs('results', exist_ok=True)

# Every (cutoff, text column) combination gets its own Optuna study.
all_options = list(product(CUTOFF_OPTIONS, COL_TEXT_OPTIONS))

for cutoff, col_text_to_use in all_options:
    file_name = f'tBERT_cutoff{cutoff}_text{col_text_to_use}_best_params.json'
    results_path = os.path.join('results', file_name)

    # Saved results act as a cache: configurations already searched are skipped.
    if os.path.isfile(results_path):
        continue

    # Create the data
    train_base = prepre_data_to_model(TRAIN_METADATA_PATH, TRAIN_DATA_PATH, cutoff=cutoff, col_text_to_use=col_text_to_use)
    # The test split is prepared here but is not used by the search itself.
    test = prepre_data_to_model(TEST_METADATA_PATH, TEST_DATA_PATH, cutoff=cutoff, col_text_to_use=col_text_to_use)

    # Hold out 20% of the training data for validation (seeded for reproducibility).
    train, val = train_test_split(train_base, test_size=0.2, shuffle=True, random_state=SEED)

    # NOTE(review): `text1` is passed for BOTH sentence arguments of
    # SentencesDataset. If the frame has a second text column (e.g. `text2`),
    # the second argument should probably use it -- confirm against
    # `prepre_data_to_model` before changing.
    train_ds = SentencesDataset(train.text1.tolist(), train.text1.tolist(), train.labels.tolist())
    val_ds = SentencesDataset(val.text1.tolist(), val.text1.tolist(), val.labels.tolist())
    test_ds = SentencesDataset(test.text1.tolist(), test.text1.tolist(), test.labels.tolist())

    # `tbert_objective` reads `train_dataloader` and `val_dataloader` as globals,
    # so they must be (re)bound here before the study runs.
    train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
    val_dataloader = DataLoader(val_ds, shuffle=True, batch_size=BATCH_SIZE)
    test_dataloader = DataLoader(test_ds, shuffle=True, batch_size=BATCH_SIZE)

    # Hyperparameter search: maximize the validation F1 returned by the objective.
    study = optuna.create_study(direction='maximize')
    study.optimize(tbert_objective, n_trials=N_TRAILS)

    # Persist the best parameters together with the F1 score they achieved.
    trail_ = study.best_trial
    best = dict(trail_.params)
    best['f1'] = trail_.values[0]
    with open(results_path, 'w') as f:
        json.dump(best, f, indent=4)
    print(f'BEST TRAIL cutoff - {cutoff}, col - {col_text_to_use}:\n f1:  {trail_.values}\nparams: {trail_.params}')

# Train models and evaluate using the TEST SET

In [8]:
# Epoch budget for the final training runs. This rebinds the NUM_EPOCHS value
# (5) used during the hyperparameter search, so re-running the search cell
# after this one would also train for 12 epochs.
NUM_EPOCHS = 12
# When False, the training/evaluation cell below is skipped entirely; set to
# True to retrain each configuration and re-evaluate it on the test set.
FULL_TRAIN = False

In [9]:
if FULL_TRAIN:
    # Create the output directory before training, so a missing folder is not
    # discovered only after a full training run has completed.
    full_train_dir = os.path.join('results', 'full_train')
    os.makedirs(full_train_dir, exist_ok=True)

    # One best-params JSON per (cutoff, text column) configuration.
    # Sorted because os.listdir() returns entries in arbitrary order, which
    # would make the run order differ between machines.
    files = sorted(f for f in os.listdir('results') if f.endswith('.json'))
    for file_name in files:
        # Recover the configuration from the file name, e.g.
        # 'tBERT_cutoff2.5_texttitle_best_params.json' -> cutoff 2.5, column 'title'.
        outputs_file = f'{file_name.split("_best")[0]}.json'
        cutoff = float(outputs_file.split("_text")[0].split("cutoff")[1])
        col_text_to_use = outputs_file.split("_text")[1].split(".json")[0]

        # load the best HP
        with open(os.path.join('results', file_name), 'r') as f:
            params = json.load(f)

        # load the data (the full training set is used here, no validation split)
        train = prepre_data_to_model(TRAIN_METADATA_PATH, TRAIN_DATA_PATH, cutoff=cutoff, col_text_to_use=col_text_to_use)
        test = prepre_data_to_model(TEST_METADATA_PATH, TEST_DATA_PATH, cutoff=cutoff, col_text_to_use=col_text_to_use)

        # NOTE(review): `text1` is passed for BOTH sentence arguments of
        # SentencesDataset. If the frame has a second text column (e.g. `text2`),
        # the second argument should probably use it -- confirm against
        # `prepre_data_to_model` before changing.
        train_ds = SentencesDataset(train.text1.tolist(), train.text1.tolist(), train.labels.tolist())
        test_ds = SentencesDataset(test.text1.tolist(), test.text1.tolist(), test.labels.tolist())

        train_dataloader = DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
        test_dataloader = DataLoader(test_ds, shuffle=True, batch_size=BATCH_SIZE)

        # train tBERT with the best hyperparameters found for this configuration
        corpus = create_sentences_corpus(train_dataloader)
        model = tBERT(corpus, model_name=MODEL_NAME, num_labels=2,
                      max_length=params['embeddings_length'], n_topics=params['n_topics'],
                      alpha=params['alpha'], device=DEVICE)

        optimizer = AdamW(model.parameters(), lr=params['lr'])

        trainer = Trainer(model=model, optimizer=optimizer,
                          train_dataloader=train_dataloader,
                          num_epochs=NUM_EPOCHS, device=DEVICE)

        trainer.train()

        # evaluate TEST
        labels, preds = trainer.evaluate(test_dataloader)

        # save results: the hyperparameters (without the validation f1 from the
        # search) plus the test labels and predictions.
        run_output = {key: value for key, value in params.items() if key != 'f1'}
        run_output['labels'] = labels
        run_output['preds'] = preds

        # Serialize BEFORE opening the file: if the payload cannot be encoded,
        # no empty or partial JSON file is left behind to break the results cell.
        serialized = json.dumps(run_output, indent=2)
        with open(os.path.join(full_train_dir, outputs_file), 'w') as f:
            f.write(serialized)

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Collect the test-set F1 of every fully trained configuration.
full_train_dir = os.path.join('results', 'full_train')
# Sorted so the row order (and index) does not depend on os.listdir() ordering.
files = sorted(f for f in os.listdir(full_train_dir) if f.endswith('.json'))

records = []
for file in files:
    # File names look like 'tBERT_cutoff<cutoff>_text<column>.json'.
    # The cutoff is parsed as a float (as in the training cell) so that sorting
    # is numeric rather than lexicographic.
    cutoff = float(file.split('cutoff')[1].split('_text')[0])
    text = file.split('_text')[1].split('.json')[0]

    with open(os.path.join(full_train_dir, file), 'r') as f:
        params = json.load(f)
    labels = params['labels']
    preds = params['preds']

    f1 = f1_score(labels, preds)

    records.append({'cutoff': cutoff, 'text_type': text, 'f1': f1})

# Build the frame once from the collected records instead of growing it row by
# row; `columns=` keeps the expected schema even when no result files exist.
results = pd.DataFrame(records, columns=['cutoff', 'text_type', 'f1'])

results.sort_values(by=['cutoff', 'text_type'])

Unnamed: 0,cutoff,text_type,f1
0,2.5,text,0.613636
1,2.5,title,0.614815
2,3.0,text,0.591093
3,3.0,title,0.559387
