In [27]:
import torch 
import optuna
import json
import numpy as np
import pandas as pd
import custom_utils

device = "cuda" if torch.cuda.is_available() else "cpu"
device = "cpu"
torch.set_default_device(device)

from transformers import AutoModel, AutoTokenizer, BertTokenizerFast
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

from torch import nn
from torchmetrics.classification import F1Score
from sklearn.utils.class_weight import compute_class_weight

from torch.utils.data import TensorDataset, DataLoader


In [28]:
class WordFeatureDataset(Dataset):
    def __init__(self, data : pd.DataFrame, tokenizer : AutoTokenizer, max_seq_len : int):
        # gather data
        sentences = data['sentences'].to_list()
        speakers = data['speakers'].to_list()
        labels = data['labels'].to_list()

        # token parameters
        params = {
            'max_length' : max_seq_len,
            'padding' : True,
            'truncation' : True,
            'return_token_type_ids' : False
        }
        tokens = tokenizer.batch_encode_plus(sentences, **params)
        
        # hot encoder for speakers
        switcher = {
            "PM" : [1,0,0,0],
            "ME" : [0,1,0,0],
            "UI" : [0,0,1,0],
            "ID" : [0,0,0,1]
        }

        self.sequences = torch.tensor(tokens['input_ids']).to(device)
        self.attention_masks = torch.tensor(tokens['attention_mask']).to(device)
        self.speakers = torch.Tensor([switcher[el] for el in speakers]).to(device)
        self.lengths = torch.Tensor([[len(sentence.split())] for sentence in sentences]).to(device)
        self.labels = torch.tensor(labels).to(device)
        
    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        attention_mask = self.attention_masks[idx]
        speaker = self.speakers[idx]
        length = self.lengths[idx]
        label = self.labels[idx]

        sample = {
            'sequence': sequence,
            'attention_mask': attention_mask,
            'speaker': speaker,
            'length': length,
            'label': label
        }

        return sample

In [29]:
def data_loader(batch_size : int, train : pd.DataFrame, valid : pd.DataFrame, tokenizer : AutoTokenizer, max_seq_len : int = 80) -> tuple[DataLoader, DataLoader]:
    # create custom datasets
    train_dataset = WordFeatureDataset(train, tokenizer, max_seq_len)
    valid_dataset = WordFeatureDataset(valid, tokenizer, max_seq_len)

    # create dataloaders
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
    valid_loader = DataLoader(valid_dataset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))

    return train_loader, valid_loader

In [30]:
from copy import deepcopy

# generate MLP from parameters
def MLP(params):
    n_layers = params['n_layers']
    layers = []

    in_features = params['input_size']
    for i in range(n_layers):
        out_features = params[f'n_{i}_size']
        layers.append(torch.nn.Linear(in_features, out_features))
        layers.append(torch.nn.ReLU())
        
        # dropout
        p = params['n_p']
        layers.append(torch.nn.Dropout(p))

        # updating next layer size
        in_features = out_features
        
    layers.append(torch.nn.Linear(in_features, params['output_size']))
    model = torch.nn.Sequential(*layers)
    return model

class MLP_FT(nn.Module):
    def __init__(self, base_model, params):
        super(MLP_FT, self).__init__()
        self.base_model = deepcopy(base_model)
        self.dropout = nn.Dropout(0.1)
        self.mlp = MLP(params)

    def forward(self, seq, mask, speakers, lengths):
        # language model pass
        outputs = self.base_model(seq, attention_mask=mask)
        hidden_states = outputs.last_hidden_state
        x = hidden_states[:,0,:]

        # MLP pass
        x = self.dropout(x)
        x = torch.cat((x, speakers, lengths), dim=1)
        x = self.mlp(x) 
        return x

In [31]:
def train_MLP_FT(base_model, params, train, valid, tokenizer, max_seq_len, trial=None):
    n_epochs = 5
    eval_at = 100

    model = MLP_FT(base_model, params).to(device)
    class_weights = compute_class_weight('balanced', classes=np.unique(train['labels'].to_numpy()), y=train['labels'].to_numpy())
    criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float()) 
    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"], weight_decay=params["weight_decay"])
    f1 = F1Score(task='binary', num_classes=2).to(device)

    train_loader, valid_loader = data_loader(params["batch_size"], train, valid, tokenizer, max_seq_len)


    it = 0
    hst_train_loss = [] 
    hst_valid_loss = []
    hst_f1_score = []

    best_valid_loss = float("inf")
    patience = 10
    best_model = None
    
    # itera nas epochs
    for epoch in range(n_epochs):
        if patience == 0: break
        
        # itera nos train batches
        for idx, samples in enumerate(train_loader):
            if patience == 0: break
            it += 1

            # train step
            model.train()
            optimizer.zero_grad()
            out = model(samples['sequence'], samples['attention_mask'], samples['speaker'], samples['length'])
            loss = criterion(out, samples['label'])
            loss.backward()
            optimizer.step()
            
            train_loss = loss.cpu().detach().numpy() / 1

            if it % eval_at == 0:
                model.eval()

                valid_loss = 0
                f1_score = 0
                
                # itera nos valid batches
                for idx, samples in enumerate(valid_loader):
                    out = model(samples['sequence'], samples['attention_mask'], samples['speaker'], samples['length'])
                    loss = criterion(out, samples['label'])
                    valid_loss += loss.cpu().detach().numpy() / len(valid_loader)
                    f1_score += f1(samples['label'], out.argmax(dim=1)).cpu().detach().numpy() / len(valid_loader)
                
                # early stopping
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    best_model = deepcopy(model)
                    patience = 10
                else:
                    patience -= 1 
                
                hst_train_loss.append(train_loss)
                hst_valid_loss.append(valid_loss)
                hst_f1_score.append(f1_score)

                print('Iter: {} | Train Loss: {} | Val Loss: {} | F1-score: {}'.format(it, train_loss, valid_loss, f1_score))

    # objective function criterion
    combined = sorted(zip(hst_valid_loss, hst_f1_score), key=lambda x : x[0])
    _, scores = zip(*combined)
    qtd = 3
    final_score = sum(scores[:qtd]) / qtd

    results = {
        "score" : final_score,
        "params" : params,
        "valid_loss" : hst_valid_loss,
        "train_loss" : hst_train_loss,
        "f1_score" : hst_f1_score, 
    }
    
    return best_model, results

In [32]:
# def objective(trial):
#     # model parameters
#     params = {
#         # "n_layers" : trial.suggest_int("n_layers", 2, 5),
#         "n_layers" : 2,
#         "input_size" : base_model.config.hidden_size + 4 + 1,
#         "output_size" : 2,
#         "n_p" : trial.suggest_float("n_p", 0.2, 0.7),
#         "lr" : trial.suggest_float("lr", 1e-5, 1e-4),
#         "weight_decay" : trial.suggest_float("weight_decay", 1e-5, 1e-4),
#         "batch_size" : 100
#     }
#     for i in range(params["n_layers"]):
#         params[f"n_{i}_size"] = trial.suggest_int(f"n_{i}_size", 200, 800)
    
#     _, results = train_MLP_FT(base_model, params, trial)
    
#     # save results
#     json.dump(results, open(f"models/mlp_results_{trial.number}.json", "w"))

#     return results['score']

In [33]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=100)

In [34]:
# read
sentences, speakers, labels = custom_utils.read_data("training", "training_labels.json")

# split
df = pd.DataFrame({"sentences" : sentences, "speakers" : speakers, "labels" : labels})
train, valid = train_test_split(df, test_size=0.2, random_state=69, stratify=df.labels)

print(f"Train: {len(train)}\nValid: {len(valid)}")

Train: 58098
Valid: 14525


In [35]:
## define base mode (embedder)
# base_model_name = 'roberta-base'
base_model_name = 'bert-base-uncased'
base_model = AutoModel.from_pretrained(base_model_name)
# tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer = BertTokenizerFast.from_pretrained(base_model_name)
max_seq_len = 80

In [36]:
# define params for training
model_params = {
    "input_size" : base_model.config.hidden_size + 4 + 1,
    "output_size" : 2,
    "n_layers" : 2, 
    "n_0_size" : 480,
    "n_1_size" : 352,
    "n_p" : 0.5,
    "lr" : 5e-5,
    "weight_decay" : 5e-4,
    "batch_size" : 64,
}

# model_params = {'output_size' : 2, 'batch_size' : 128, 'input_size' : base_model.config.hidden_size + 4 + 1, 'n_layers': 2, 'n_p': 0.6516533716631351, 'lr': 0.0005595147939796057, 'weight_decay': 5.913216878057305e-05, 'n_0_size': 480, 'n_1_size': 352}

# train model
# model, score = train_MLP_FT(base_model, model_params, train, valid, tokenizer, max_seq_len)

In [37]:
import custom_utils
test_sentences, test_speakers, _  = custom_utils.read_data_by_ID("test", combine = False)

# format test input data
test_data = {}
for id in test_sentences:
    test_data[id] = custom_utils.format_input(test_sentences[id], test_speakers[id], tokenizer, max_seq_len)

In [38]:
model_params = {
    "input_size" : base_model.config.hidden_size + 4 + 1,
    "output_size" : 2,
    "n_layers" : 2, 
    "n_0_size" : 480,
    "n_1_size" : 352,
    "n_p" : 0.5,
    "lr" : 5e-5,
    "weight_decay" : 5e-4,
    "batch_size" : 64,
}

model = MLP_FT(base_model, model_params).to(device)

model.load_state_dict(torch.load("rezero.pt"))

<All keys matched successfully>

In [39]:
# given that we have already a model
test_labels = {}

model.eval()
test_labels = {}
for id in test_sentences.keys():
    print(id)
    out = model(**test_data[id])
    pred = out.argmax(dim=1)
    test_labels[id] = pred.cpu().detach().numpy()
    print(test_labels)

ES2003a
{'ES2003a': array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 