In [None]:
import torch 
import optuna
import json
import os
from pathlib import Path
import numpy as np
import pandas as pd

from transformers import AutoModel, AutoTokenizer, BertTokenizerFast
from torch.utils.data import DataLoader, Dataset
from torchmetrics.classification import F1Score

from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from copy import deepcopy

import src.custom_utils as cu

from tqdm import tqdm
import logging

from typing import Optional

# Pick the compute device once for the whole notebook: the GPU when CUDA is
# available, the CPU otherwise. Newly created tensors then default to that device.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
torch.set_default_device(device)

In [None]:
# Create the log directory from Python instead of shelling out to `mkdir -p`, so the
# cell does not depend on a POSIX shell being available to the kernel.
os.makedirs("logs", exist_ok=True)
# Append INFO-level (and above) records to logs/hyperTuner.log.
logging.basicConfig(level=logging.INFO, filename='logs/hyperTuner.log', encoding='utf-8', filemode='a')

In [None]:
class WordFeatureDataset(Dataset):
    """Dataset that tokenizes every sentence up front and keeps all features as tensors.

    Each item combines the token ids and attention mask of a sentence with two
    hand-crafted features: a 4-way one-hot encoding of the speaker code and the
    whitespace-separated word count of the raw sentence.
    """

    def __init__(self, data : pd.DataFrame, tokenizer : AutoTokenizer, max_seq_len : int):
        """Tokenize `data` and move every feature tensor to the notebook-level `device`.

        Args:
            data: frame with the columns 'sentences', 'speakers' and 'labels'.
            tokenizer: object exposing `batch_encode_plus`.
            max_seq_len: forwarded to the tokenizer as `max_length` (with truncation on).

        Raises:
            KeyError: if a speaker code is not one of "PM", "ME", "UI", "ID".
        """
        texts = data['sentences'].to_list()
        speaker_codes = data['speakers'].to_list()
        targets = data['labels'].to_list()

        # tokenize all sentences in a single call
        encoded = tokenizer.batch_encode_plus(
            texts,
            max_length=max_seq_len,
            padding=True,
            truncation=True,
            return_token_type_ids=False,
        )

        # speaker code -> one-hot vector
        one_hot = {
            "PM" : [1,0,0,0],
            "ME" : [0,1,0,0],
            "UI" : [0,0,1,0],
            "ID" : [0,0,0,1]
        }

        self.sequences = torch.tensor(encoded['input_ids']).to(device)
        self.attention_masks = torch.tensor(encoded['attention_mask']).to(device)
        self.speakers = torch.Tensor([one_hot[code] for code in speaker_codes]).to(device)
        # one-element row per sentence: number of whitespace-separated words
        self.lengths = torch.Tensor([[len(text.split())] for text in texts]).to(device)
        self.labels = torch.tensor(targets).to(device)

    def __len__(self):
        """Number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features and label stored at position `idx` as a dict."""
        return {
            'sequence': self.sequences[idx],
            'attention_mask': self.attention_masks[idx],
            'speaker': self.speakers[idx],
            'length': self.lengths[idx],
            'label': self.labels[idx],
        }

In [None]:
def data_loader(batch_size : int, train : pd.DataFrame, valid : pd.DataFrame, tokenizer : AutoTokenizer, max_seq_len : int = 80) -> tuple[DataLoader, DataLoader]:
    """Wrap the train and validation frames into shuffled `DataLoader`s.

    Args:
        batch_size: batch size used by both loaders.
        train: training frame, handed to `WordFeatureDataset`.
        valid: validation frame, handed to `WordFeatureDataset`.
        tokenizer: tokenizer forwarded to `WordFeatureDataset`.
        max_seq_len: maximum sequence length forwarded to `WordFeatureDataset`.

    Returns:
        `(train_loader, valid_loader)`; both shuffle and both draw their permutation
        from a fresh `torch.Generator` created on the notebook-level `device`.
    """
    # build both datasets first, then one loader per dataset
    datasets = [WordFeatureDataset(frame, tokenizer, max_seq_len) for frame in (train, valid)]
    train_loader, valid_loader = (
        DataLoader(dataset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
        for dataset in datasets
    )
    return train_loader, valid_loader

In [None]:
# build a feed-forward network (torch.nn.Sequential) from a parameter dictionary
def MLP(params):
    """Assemble a stack of Linear -> ReLU -> Dropout blocks followed by a final Linear.

    Args:
        params: dict with
            'n_layers'    - number of hidden blocks,
            'input_size'  - width of the input features,
            'n_{i}_size'  - width of hidden block i, for i in range(n_layers),
            'n_p'         - dropout probability of every hidden block
                            (only read when n_layers > 0),
            'output_size' - width of the final Linear layer.

    Returns:
        torch.nn.Sequential holding the layers in order.
    """
    depth = params['n_layers']
    modules = []

    width = params['input_size']
    for level in range(depth):
        next_width = params[f'n_{level}_size']
        modules += [
            torch.nn.Linear(width, next_width),
            torch.nn.ReLU(),
            torch.nn.Dropout(params['n_p']),
        ]
        # the next block consumes what this one produced
        width = next_width

    modules.append(torch.nn.Linear(width, params['output_size']))
    return torch.nn.Sequential(*modules)

class MLP_FT(torch.nn.Module):
    """Language model with an MLP head fed by its position-0 hidden state plus extra features.

    The constructor keeps a private deep copy of `base_model`, so the instance passed
    in is not the one held (or called) by this module.
    """

    def __init__(self, base_model, params):
        """
        Args:
            base_model: module called as `base_model(seq, attention_mask=mask)` whose
                result exposes `last_hidden_state`.
            params: parameter dict handed to `MLP` to build the head.
        """
        super().__init__()
        self.base_model = deepcopy(base_model)
        self.dropout = torch.nn.Dropout(0.1)
        self.mlp = MLP(params)

    def forward(self, seq, mask, speakers, lengths):
        """Return the MLP output for a batch.

        The hidden state at sequence position 0 goes through dropout, is concatenated
        with `speakers` and `lengths` along dim 1, and is then fed to the head.
        """
        # language model pass: keep only the vector at sequence position 0
        first_token = self.base_model(seq, attention_mask=mask).last_hidden_state[:, 0, :]

        # head pass: dropout is applied to the embedding only, not to the extra features
        features = torch.cat((self.dropout(first_token), speakers, lengths), dim=1)
        return self.mlp(features)

In [None]:
def _run_validation(model, criterion, metric, valid_loader):
    """Return (mean batch loss, mean batch metric) of `model` over `valid_loader`."""
    model.eval()
    n_batches = len(valid_loader)
    valid_loss = 0.0
    f1_score = 0.0

    # validation never back-propagates, so skip building the autograd graph
    with torch.no_grad():
        for samples in valid_loader:
            out = model(samples['sequence'], samples['attention_mask'], samples['speaker'], samples['length'])
            valid_loss += criterion(out, samples['label']).item() / n_batches
            # metric is called as (preds, target)
            f1_score += metric(out.argmax(dim=1), samples['label']).item() / n_batches

    return valid_loss, f1_score


def train_model(model, criterion, optimizer, metric, params, train_loader, valid_loader):
    """Train `model` with periodic validation and patience-based early stopping.

    Args:
        model: module called as `model(sequence, attention_mask, speaker, length)`.
        criterion: loss called as `criterion(output, label)`.
        optimizer: optimizer over the parameters of `model`.
        metric: callable `metric(preds, target)` returning a scalar tensor.
        params: dict with
            'n_epochs'     - maximum number of passes over `train_loader`,
            'eval_at'      - validate every `eval_at` training iterations,
            'max_patience' - number of consecutive non-improving validations
                             tolerated before training stops.
        train_loader: iterable of batches; each batch is a mapping with the keys
            'sequence', 'attention_mask', 'speaker', 'length' and 'label'.
        valid_loader: same batch format; must also support `len()`.

    Returns:
        `(best_weights, results)`.
        `best_weights` is an independent copy of the state dict taken at the lowest
        validation loss (or of the final weights if no validation ever ran).
        `results` is a JSON-serializable dict with 'score' (mean metric over the up to
        three evaluations with the lowest validation loss, 0.0 if there were none),
        'params', and the 'valid_loss' / 'train_loss' / 'f1_score' histories recorded
        at each evaluation.
    """
    n_epochs = params['n_epochs']
    eval_at = params['eval_at']
    max_patience = params['max_patience']

    hst_train_loss = []
    hst_valid_loss = []
    hst_f1_score = []

    best_valid_loss = float("inf")
    patience = max_patience
    best_weights = None

    it = 0
    # iterate over the epochs
    for epoch in range(n_epochs):
        if patience == 0: break

        # iterate over the training batches
        for samples in tqdm(train_loader):
            if patience == 0: break
            it += 1

            # train step
            model.train()
            optimizer.zero_grad()
            out = model(samples['sequence'], samples['attention_mask'], samples['speaker'], samples['length'])
            loss = criterion(out, samples['label'])
            loss.backward()
            optimizer.step()

            # plain Python float so the history can be dumped to JSON
            train_loss = loss.item()

            if it % eval_at == 0:
                # iterate over the validation batches
                valid_loss, f1_score = _run_validation(model, criterion, metric, valid_loader)

                # early stopping
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    # state_dict() hands back tensors that share storage with the live
                    # parameters; copy them, otherwise later optimizer steps would
                    # silently overwrite the "best" snapshot
                    best_weights = deepcopy(model.state_dict())
                    patience = max_patience
                else:
                    patience -= 1

                hst_train_loss.append(train_loss)
                hst_valid_loss.append(valid_loss)
                hst_f1_score.append(f1_score)

                logging.info('Iter: {} | Train Loss: {} | Val Loss: {} | F1-score: {}'.format(it, train_loss, valid_loss, f1_score))

    # training ended before the first validation pass: fall back to the final weights
    if best_weights is None:
        best_weights = deepcopy(model.state_dict())

    # objective function criterion: mean metric over the (up to) `qtd` evaluations with
    # the lowest validation loss, averaged over however many are actually available
    qtd = 3
    ranked = sorted(zip(hst_valid_loss, hst_f1_score), key=lambda pair: pair[0])
    top_scores = [score for _, score in ranked[:qtd]]
    final_score = sum(top_scores) / len(top_scores) if top_scores else 0.0

    results = {
        "score" : final_score,
        "params" : params,
        "valid_loss" : hst_valid_loss,
        "train_loss" : hst_train_loss,
        "f1_score" : hst_f1_score,
    }

    return best_weights, results

In [None]:
class HyperTuner:
    """Optuna search over the MLP head architecture and the optimizer settings.

    Every trial builds a fresh `MLP_FT` on top of `base_model`, trains it with
    `train_model`, writes its parameters and results to `tuner/` as JSON and reports
    the resulting score to Optuna.
    """

    def __init__(self, base_model, train, valid, tokenizer, max_seq_len):
        """
        Args:
            base_model: language model handed to `MLP_FT`; its `config.hidden_size`
                determines the input width of the head.
            train: training frame; its 'labels' column is used for the class weights.
            valid: validation frame.
            tokenizer: tokenizer forwarded to `data_loader`.
            max_seq_len: maximum sequence length forwarded to `data_loader`.
        """
        self.base_model = base_model
        self.train = train
        self.valid = valid
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def objective(self, trial):
        """Run one trial and return its score (higher is better).

        Side effects: writes `tuner/parameters/training_<n>.json`,
        `tuner/parameters/model_<n>.json` and `tuner/results/results_<n>.json`,
        where `<n>` is `trial.number`.
        """
        # make sure the directories this method writes to exist before spending
        # time on training
        os.makedirs("tuner/parameters", exist_ok=True)
        os.makedirs("tuner/results", exist_ok=True)

        model_params = {
            # hidden state + 4 one-hot speaker entries + 1 sentence-length feature
            "input_size" : self.base_model.config.hidden_size + 4 + 1,
            "output_size" : 2,
            "n_layers" : trial.suggest_int("n_layers", 2, 3),
            "n_p" : trial.suggest_float("n_p", 0.2, 0.7),
        }
        for i in range(model_params["n_layers"]):
            model_params[f"n_{i}_size"] = trial.suggest_int(f"n_{i}_size", 200, 800)

        training_params = {
            "batch_size" : 100,
            "lr" : trial.suggest_float("lr", 1e-5, 1e-4),
            "weight_decay" : trial.suggest_float("weight_decay", 1e-5, 1e-4),
            "n_epochs" : 5,
            "eval_at" : 50,
            "max_patience" : 10,
        }

        # model
        model = MLP_FT(self.base_model, model_params)
        train_labels = self.train['labels'].to_numpy()
        class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
        criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float())
        optimizer = torch.optim.Adam(model.parameters(), lr=training_params['lr'], weight_decay=training_params['weight_decay'])
        metric = F1Score(task='binary', num_classes=2).to(device)

        # data loaders
        train_loader, valid_loader = data_loader(training_params['batch_size'], self.train, self.valid, self.tokenizer, self.max_seq_len)

        _, results = train_model(model, criterion, optimizer, metric, training_params, train_loader, valid_loader)

        # save results; `with` closes every file, and `default=float` converts numpy
        # scalars (which the json module cannot serialize by itself) to plain floats
        outputs = (
            (f"tuner/parameters/training_{trial.number}.json", training_params),
            (f"tuner/parameters/model_{trial.number}.json", model_params),
            (f"tuner/results/results_{trial.number}.json", results),
        )
        for path, payload in outputs:
            with open(path, "w") as fh:
                json.dump(payload, fh, default=float)

        return results["score"]

    def optimize(self, n_trials):
        """Run `n_trials` trials maximizing the objective and return the Optuna study."""
        study = optuna.create_study(direction='maximize')
        study.optimize(self.objective, n_trials=n_trials)
        return study

In [None]:
# load the raw training data (three parallel sequences returned by the project helper)
sentences, speakers, labels = cu.read_data("training", "training_labels.json")

# stratified 80/20 train/validation split with a fixed seed
df = pd.DataFrame(dict(sentences=sentences, speakers=speakers, labels=labels))
train, valid = train_test_split(
    df,
    test_size=0.2,
    random_state=69,
    stratify=df["labels"],
)

print(f"Train: {len(train)}\nValid: {len(valid)}")

In [None]:
## define base model (embedder)
# maximum sequence length handed to the tokenizer further down the notebook
max_seq_len = 80

# model and tokenizer are loaded from the same pretrained checkpoint
base_model_name = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(base_model_name)
base_model = AutoModel.from_pretrained(base_model_name)

### 1. Hyper parameter optimization

In [None]:
# Create the output directories from Python rather than through `!mkdir -p`, so the
# cell does not depend on a POSIX shell being available to the kernel.
os.makedirs("tuner/parameters", exist_ok=True)  # save model and training parameters
os.makedirs("tuner/results", exist_ok=True)  # save results

# run 100 tuning trials
hpt = HyperTuner(base_model, train, valid, tokenizer, max_seq_len)
hpt.optimize(100)

### 2. Training from parameters

In [None]:
# dirty, only for this notebook: relies on the notebook-level `base_model`, `train`,
# `valid`, `tokenizer`, `max_seq_len` and `device`
def train_from_params(model_params, training_params, file_name : Optional[str] = None):
    """Train a fresh `MLP_FT` with the given parameters and return it.

    Args:
        model_params: parameter dict for the MLP head (see `MLP`).
        training_params: dict with 'batch_size', 'lr', 'weight_decay' plus the keys
            read by `train_model` ('n_epochs', 'eval_at', 'max_patience').
        file_name: when given, the whole model object is saved there with `torch.save`.

    Returns:
        The trained model, carrying the best weights reported by `train_model`.
    """
    # model
    model = MLP_FT(base_model, model_params)
    train_labels = train['labels'].to_numpy()
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float())
    optimizer = torch.optim.Adam(model.parameters(), lr=training_params['lr'], weight_decay=training_params['weight_decay'])
    metric = F1Score(task='binary', num_classes=2).to(device)

    # data loaders
    train_loader, valid_loader = data_loader(training_params['batch_size'], train, valid, tokenizer, max_seq_len)

    # train model
    trained_weights, _ = train_model(model, criterion, optimizer, metric, training_params, train_loader, valid_loader)

    # reload the best weights; `train_model` reports None when no validation pass
    # ever improved on the initial state, in which case the model keeps its final
    # training weights instead of crashing in load_state_dict
    if trained_weights is not None:
        model.load_state_dict(trained_weights)

    if file_name:
        torch.save(model, file_name)

    return model

### 3. Predictions

In [None]:
# format test input data and predict one label list per id
def predict_labels(test_sentences : dict, test_speakers : dict, model : AutoModel, device : str = device) -> dict:
    """Predict labels for every entry of `test_sentences`.

    Args:
        test_sentences: mapping id -> sentences, passed to `cu.format_input`.
        test_speakers: mapping with the same ids -> speakers, passed to `cu.format_input`.
        model: model called with the keyword arguments produced by `cu.format_input`.
        device: device the model is moved to and the inputs are formatted for;
            defaults to the notebook-level device.

    Returns:
        Mapping id -> list with the argmax (over dim 1) of the model output.
    """
    model.to(device)
    model.eval()
    test_labels = {}

    # inference only: without no_grad every forward pass would keep its autograd graph
    # alive, which is wasted memory here
    with torch.no_grad():
        for key in test_sentences:
            # format and predict one id at a time so that only a single formatted
            # input is resident on the device
            inputs = cu.format_input(test_sentences[key], test_speakers[key], tokenizer, max_seq_len, device)
            out = model(**inputs)
            test_labels[key] = out.argmax(dim=1).cpu().tolist()

    return test_labels

#### 3.1 using a selected model to train (example)

In [None]:
# example head architecture: three hidden layers (400 -> 300 -> 200) with dropout 0.5
model_params = dict(
    # hidden state + 4 one-hot speaker entries + 1 sentence-length feature
    input_size=base_model.config.hidden_size + 4 + 1,
    output_size=2,
    n_layers=3,
    n_p=0.5,
    n_0_size=400,
    n_1_size=300,
    n_2_size=200,
)

# example optimizer and early-stopping settings
training_params = dict(
    batch_size=100,
    lr=5e-5,
    weight_decay=5e-4,
    n_epochs=5,
    eval_at=50,
    max_patience=10,
)

# # uncomment to save and predict labels
# model = train_from_params(model_params, training_params)
# torch.save(model, "selected_model.pt")
# !python3 predict_labels.py --model_path selected_model.pt --labels_path labels_from_selected.json

#### 3.2 using best models from hyper tuning --> majority vote

> pick some params for training (based on the score obtained during hyperparameter tuning)

In [None]:
# trial id (as a string) -> score / training params / model params
all_scores = {}
all_training_params = {}
all_model_params = {}

for item in Path("tuner/results/").iterdir():
    if item.suffix != ".json":
        continue

    # result files are named results_<trial number>.json, so the trial number is the
    # last "_"-separated token of the stem
    idx = item.stem.rsplit('_', 1)[-1]
    with open(item, "r") as fh:
        all_scores[idx] = json.load(fh)["score"]

    tp_path = f"tuner/parameters/training_{idx}.json"
    m_path = f"tuner/parameters/model_{idx}.json"

    with open(tp_path, "r") as fh:
        all_training_params[idx] = json.load(fh)
    with open(m_path, "r") as fh:
        all_model_params[idx] = json.load(fh)

# take 10 trials with best scores
number_of_models = 10
idx_score = sorted(all_scores.items(), key=lambda x : x[1], reverse=True)
print(idx_score[:number_of_models])

# ids of the selected trials, best first (an empty tuple when no results were found)
best_idxs = tuple(trial_id for trial_id, _ in idx_score[:number_of_models])

> train with selected params

In [None]:
# load test data
test_sentences, test_speakers, _  = cu.read_data_by_ID("test", combine = False)

# save predictions for each model
all_predictions = []

for idx in best_idxs:
    model = train_from_params(all_model_params[idx], all_training_params[idx])

    model.eval()
    try:
        prediction = predict_labels(test_sentences, test_speakers, model)
    except RuntimeError as e:
        if device == "cuda" and "CUDA out of memory" in str(e):
            print("Insufficient memory on gpu, predictions will be calculated using cpu")
            # store the fallback result in `prediction`: that is the name appended
            # below, so the CPU predictions must not end up in another variable
            prediction = predict_labels(test_sentences, test_speakers, model, device="cpu")
        else:
            # anything else is unexpected: re-raise with the original traceback
            raise

    all_predictions.append(prediction)


> majority vote itself

In [None]:
# final label lists per id, obtained by voting across all models
test_labels = {}

for key in all_predictions[0].keys():
    pred_for_key = [preds[key] for preds in all_predictions]

    # position-wise vote: a label is 1 when at least half of the models predicted 1
    # (ties therefore resolve to 1)
    major_preds = [1 if sum(x) / len(all_predictions) >= 0.5 else 0 for x in zip(*pred_for_key)]

    # assign to test_labels
    test_labels[key] = major_preds

print(test_labels)

# `with` guarantees the file is flushed and closed once the labels are written
with open("labels_from_majority_vote.json", "w") as fh:
    json.dump(test_labels, fh)