In [1]:
import torch 
import optuna
import json
import numpy as np
import pandas as pd
import custom_utils

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

from transformers import AutoModel, BertTokenizerFast
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split

from torch import nn
from torchmetrics.classification import F1Score
from sklearn.utils.class_weight import compute_class_weight

from torch.utils.data import TensorDataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# read
sentences, speakers, labels = custom_utils.read_data("training", "training_labels.json")

# split
df = pd.DataFrame({"sentences" : sentences, "speakers" : speakers, "labels" : labels})
df['l-sentences'] = df['sentences'].shift(1)
df['r-sentences'] = df['sentences'].shift(-1)

train, test = train_test_split(df, test_size=0.2, random_state=69, stratify=df.labels)
train, valid = train_test_split(train, test_size=0.1, random_state=69, stratify=train.labels)

print(f"Train: {len(train)}\nTest: {len(test)}\nValid: {len(valid)}")

# Getting setences list
train_sentences = train['sentences'].to_list()
train_r_sentences = train['r-sentences'].to_list()
train_l_sentences = train['l-sentences'].to_list()

valid_sentences = valid['sentences'].to_list()
valid_r_sentences = valid['r-sentences'].to_list()
valid_l_sentences = valid['l-sentences'].to_list()

test_sentences = test['sentences'].to_list()
test_r_sentences = test['r-sentences'].to_list()
test_l_sentences = test['l-sentences'].to_list()

Train: 52288
Test: 14525
Valid: 5810


In [3]:
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [4]:
max_seq_len = 80

In [5]:
## Tokenization
params = {
    'max_length' : max_seq_len,
    'padding' : True,
    'truncation' : True,
    'return_token_type_ids' : False
}

# tokenize and encode sequences in the training set
train_tokens = tokenizer.batch_encode_plus(train_sentences, **params)
train_l_tokens = tokenizer.batch_encode_plus(train_sentences, **params)
train_r_tokens = tokenizer.batch_encode_plus(train_sentences, **params)

# tokenize and encode sequences in the validation set
valid_tokens = tokenizer.batch_encode_plus(valid_sentences, **params)
valid_l_tokens = tokenizer.batch_encode_plus(valid_l_sentences, **params)
valid_r_tokens = tokenizer.batch_encode_plus(valid_r_sentences, **params)

# tokenize and encode sequences in the test set
test_tokens = tokenizer.batch_encode_plus(test_sentences, **params)
test_l_tokens = tokenizer.batch_encode_plus(test_l_sentences, **params)
test_r_tokens = tokenizer.batch_encode_plus(test_r_sentences, **params)

# hot encoder for speakers
switcher = {
    "PM" : [1,0,0,0],
    "ME" : [0,1,0,0],
    "UI" : [0,0,1,0],
    "ID" : [0,0,0,1]
}

# for train set
train_seq = {
    "mid" : torch.tensor(train_tokens['input_ids']),
    "l" : torch.tensor(train_l_tokens['input_ids']),
    "r" : torch.tensor(train_r_tokens['input_ids'])
}
train_mask = {
    "mid" : torch.tensor(train_tokens['attention_mask']),
    "l" : torch.tensor(train_l_tokens['attention_mask']),
    "r" : torch.tensor(train_r_tokens['attention_mask'])
}
train_speaker = torch.Tensor([switcher[el] for el in train['speakers']]).to(device)
train_len = torch.Tensor([[len(sentence.split())] for sentence in train['sentences']]).to(device)
train_y = torch.tensor(train['labels'].to_numpy())

# for validation set
valid_seq = {
    "mid": torch.tensor(valid_tokens['input_ids']),
    "l": torch.tensor(valid_l_tokens['input_ids']),
    "r": torch.tensor(valid_r_tokens['input_ids'])
}
valid_mask = {
    "mid": torch.tensor(valid_tokens['attention_mask']),
    "l": torch.tensor(valid_l_tokens['attention_mask']),
    "r": torch.tensor(valid_r_tokens['attention_mask'])
}
valid_speaker = torch.Tensor([switcher[el] for el in valid['speakers']]).to(device)
valid_len = torch.Tensor([[len(sentence.split())] for sentence in valid['sentences']]).to(device)
valid_y = torch.tensor(valid['labels'].to_numpy())

# for test set
test_seq = {
    "mid": torch.tensor(test_tokens['input_ids']),
    "l": torch.tensor(test_l_tokens['input_ids']),
    "r": torch.tensor(test_r_tokens['input_ids'])
}
test_mask = {
    "mid": torch.tensor(test_tokens['attention_mask']),
    "l": torch.tensor(test_l_tokens['attention_mask']),
    "r": torch.tensor(test_r_tokens['attention_mask'])
}
test_speaker = torch.Tensor([switcher[el] for el in test['speakers']]).to(device)
test_len = torch.Tensor([[len(sentence.split())] for sentence in test['sentences']]).to(device)
test_y = torch.tensor(test['labels'].to_numpy())

In [6]:
class WordFeatureDataset(Dataset):
    def __init__(self, sequences, attention_masks, speakers, lengths, labels):
        self.sequences = sequences
        self.attention_masks = attention_masks
        self.speakers = speakers
        self.lengths = lengths
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sequence = {}
        for key in self.sequences:
            sequence[key] = self.sequences[key][idx]

        attention_mask = {}
        for key in self.attention_masks:
            attention_mask[key] = self.attention_masks[key][idx]

        speaker = self.speakers[idx]
        length = self.lengths[idx]
        label = self.labels[idx]

        sample = {
            'sequence': sequence,
            'attention_mask': attention_mask,
            'speaker': speaker,
            'length': length,
            'label': label
        }

        return sample

In [7]:
def data_loader(batch_size):
    # create tensor datasets
    train_dataset = WordFeatureDataset(train_seq, train_mask, train_speaker, train_len, train_y)
    valid_dataset = WordFeatureDataset(valid_seq, valid_mask, valid_speaker, valid_len, valid_y)
    test_dataset = WordFeatureDataset(test_seq, test_mask, test_speaker, test_len, test_y)

    # create dataloaders
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
    valid_loader = DataLoader(valid_dataset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
    test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
    
    return train_loader, valid_loader, test_loader

In [8]:
def MLP(params):
    # Model
    n_layers = params['n_layers']
    layers = []

    in_features = params['input_size']
    for i in range(n_layers):
        out_features = params[f'n_{i}_size']
        layers.append(torch.nn.Linear(in_features, out_features))
        layers.append(torch.nn.ReLU())
        
        # suggest dropout
        p = params['n_p']
        layers.append(torch.nn.Dropout(p))

        # updating next layer size
        in_features = out_features
        
    layers.append(torch.nn.Linear(in_features, params['output_size']))
    model = torch.nn.Sequential(*layers)
    return model

class MLP_Bert(nn.Module):
    def __init__(self, bert, params):
        super(MLP_Bert, self).__init__()

        self.bert = {
            "mid" : AutoModel.from_pretrained('bert-base-uncased'),
            "l" : AutoModel.from_pretrained('bert-base-uncased'),
            "r" : AutoModel.from_pretrained('bert-base-uncased')
        }

        self.dropout = nn.Dropout(0.1)
        self.mlp = MLP(params)

    # define the forward pass
    def forward(self, seq, mask, speakers, lengths):
        out = {}
        for key in self.bert:
            outputs = self.bert[key](seq[key], attention_mask=mask[key])
            out[key] = outputs.last_hidden_state[:,0,:]

        bert_output = torch.cat(tuple(out.values()), dim=1)
        fc_input = torch.cat((bert_output, speakers, lengths), dim=1)
        x = self.mlp(fc_input) 
        return x

In [9]:
def objective(trial):
    n_epochs = 20
    eval_at = 300
    
    params = {
        # "n_layers" : trial.suggest_int("n_layers", 2, 5),
        "n_layers" : 3,
        "input_size" : 3*bert.config.hidden_size + 4 + 1,
        "output_size" : 2,
        "n_p" : trial.suggest_float("n_p", 0.2, 0.7),
        "lr" : trial.suggest_float("lr", 1e-5, 1e-4),
        "weight_decay" : trial.suggest_float("weight_decay", 1e-5, 1e-4),
        "batch_size" : 30
    }
    for i in range(params["n_layers"]):
        params[f"n_{i}_size"] = trial.suggest_int(f"n_{i}_size", 200, 800)

    model = MLP_Bert(bert, params).to(device)
    class_weights = compute_class_weight('balanced', classes=np.unique(train['labels'].to_numpy()), y=train['labels'].to_numpy())
    criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float()) 
    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"], weight_decay=params["weight_decay"])
    f1 = F1Score(task='binary', num_classes=2).to(device)

    train_loader, valid_loader, _ = data_loader(params["batch_size"])

   
    it = 0
    hst_train_loss = [] 
    hst_valid_loss = []
    hst_f1_score = []

    best_valid_loss = float("inf")
    patience = 10
    
    # itera nas epochs
    for epoch in range(n_epochs):
        if patience == 0: break
        
        # itera nos train batches
        for idx, samples in enumerate(train_loader):
            if patience == 0: break
            it += 1

            # train step
            model.train()
            optimizer.zero_grad()
            out = model(samples['sequence'], samples['attention_mask'], samples['speaker'], samples['length'])
            loss = criterion(out, samples['label'])
            loss.backward()
            optimizer.step()
            
            train_loss = loss.cpu().detach().numpy() / 1
            
            if it % 100 == 0:
                print(idx, train_loss)
            if it % eval_at == 0:
                model.eval()

                valid_loss = 0
                f1_score = 0
                
                # itera nos valid batches
                for idx, samples in enumerate(valid_loader):
                    out = model(samples['sequence'], samples['attention_mask'], samples['speaker'], samples['length'])
                    loss = criterion(out, samples['label'])
                    valid_loss += loss.cpu().detach().numpy() / len(valid_loader)
                    f1_score += f1(samples['label'], out.argmax(dim=1)).cpu().detach().numpy() / len(valid_loader)
                
                # early stopping
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    best_weights = model.state_dict()
                    patience = 10
                else:
                    patience -= 1 
                
                hst_train_loss.append(train_loss)
                hst_valid_loss.append(valid_loss)
                hst_f1_score.append(f1_score)

                print('Iter: {} | Train Loss: {} | Val Loss: {} | F1-score: {}'.format(it, train_loss, valid_loss, f1_score))

    # objective function criterion
    combined = sorted(zip(hst_valid_loss, hst_f1_score), key=lambda x : x[0])
    _, scores = zip(*combined)
    qtd = 3
    final_score = sum(scores[:qtd]) / qtd

    results = {
        "score" : final_score,
        "params" : params,
        "valid_loss" : hst_valid_loss,
        "train_loss" : hst_train_loss,
        "f1_score" : hst_f1_score, 
    }
    json.dump(results, open(f"models/mlp_results_{trial.number}.json", "w"))
    
    return final_score


In [10]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-12-05 15:39:04,874] A new study created in memory with name: no-name-a8a08499-52da-4100-82cd-55d6dec3f191


99 0.662815272808075
199 0.5843757390975952
299 0.47338905930519104
Iter: 300 | Train Loss: 0.47338905930519104 | Val Loss: 0.565304029233677 | F1-score: 0.4934636928986024
399 0.4944024384021759
499 0.5007961392402649
599 0.3792397975921631
Iter: 600 | Train Loss: 0.3792397975921631 | Val Loss: 0.5365272754246425 | F1-score: 0.4903187608903213
699 0.2982766628265381
799 0.3855109214782715
899 0.46472597122192383
Iter: 900 | Train Loss: 0.46472597122192383 | Val Loss: 0.5229114019071933 | F1-score: 0.5022879118848703
999 0.7689629197120667
1099 0.4054035246372223
1199 0.5328714847564697
Iter: 1200 | Train Loss: 0.5328714847564697 | Val Loss: 0.5319560886043865 | F1-score: 0.49330339734394485
1299 0.6226460933685303
1399 0.5811951160430908
1499 0.35184311866760254
Iter: 1500 | Train Loss: 0.35184311866760254 | Val Loss: 0.5101152317425641 | F1-score: 0.49932992957609174
1599 0.4939771592617035
1699 0.4182641804218292
56 0.39936646819114685
Iter: 1800 | Train Loss: 0.39936646819114685 | 

In [1]:
import custom_utils

In [7]:
test_sentences, test_speakers, _  = custom_utils.read_data_by_ID("test", combine = False)

# tokenization
params = {
    'max_length' : 80,
    'padding' : True,
    'truncation' : True,
    'return_token_type_ids' : False
}

test_data = {}
for id in test_sentences:
    test_data[id] = custom_utils.format_input(test_sentences[id], test_speakers[id], t_params=params)

TypeError: format_input() missing 1 required positional argument: 'tokenizer'