In [65]:
import torch 
import json
import numpy as np
import pandas as pd
import custom_utils

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU,
# and make that device the default for newly created tensors.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
torch.set_default_device(device)

from transformers import AutoModel, BertTokenizerFast
from torch.utils.data import Dataset


In [66]:
from sklearn.model_selection import train_test_split

# Read the sentences, their speakers and the labels (parsing is delegated to custom_utils).
sentences, speakers, labels = custom_utils.read_data("training", "training_labels.json")

# Split: hold out 20% for testing, then 10% of the remainder for validation
# (about 72% / 8% / 20% overall). Both splits are stratified on the label and
# use a fixed random_state so the partition is repeatable.
df = pd.DataFrame({"sentences" : sentences, "speakers" : speakers, "labels" : labels})

train, test = train_test_split(df, test_size=0.2, random_state=69, stratify=df.labels)
train, valid = train_test_split(train, test_size=0.1, random_state=69, stratify=train.labels)

print(f"Train: {len(train)}\nTest: {len(test)}\nValid: {len(valid)}")

# Plain Python lists of sentences, one list per split, as input for the tokenizer.
train_sentences = train['sentences'].to_list()
valid_sentences = valid['sentences'].to_list()
test_sentences = test['sentences'].to_list()

Train: 52288
Test: 14525
Valid: 5810


In [67]:
# Load the pretrained encoder and its fast tokenizer from the same checkpoint.
checkpoint = 'bert-base-uncased'
bert = AutoModel.from_pretrained(checkpoint)
tokenizer = BertTokenizerFast.from_pretrained(checkpoint)

In [68]:
# Maximum number of tokens per sentence; passed to the tokenizer as `max_length`
# together with truncation, so longer sentences are cut to this size.
max_seq_len = 80

In [69]:
## Tokenization
# Keyword arguments shared by every tokenizer call below.
params = dict(
    max_length=max_seq_len,
    padding=True,
    truncation=True,
    return_token_type_ids=False,
)

# Tokenize and encode the sentences of the training, validation and test sets (in that order).
train_tokens, valid_tokens, test_tokens = (
    tokenizer.batch_encode_plus(split_sentences, **params)
    for split_sentences in (train_sentences, valid_sentences, test_sentences)
)

# One-hot encoder for the speaker role: the position of the 1 follows the order below.
speaker_roles = ("PM", "ME", "UI", "ID")
switcher = {
    role: [int(position == hot) for position in range(len(speaker_roles))]
    for hot, role in enumerate(speaker_roles)
}


def _split_tensors(frame, tokens):
    """Build the five tensors used by the model for one data split.

    Returns, in order: token ids, attention mask, one-hot speaker feature,
    whitespace word count of each sentence (one column), and the labels.
    """
    seq = torch.tensor(tokens['input_ids'])
    mask = torch.tensor(tokens['attention_mask'])
    # torch.Tensor (capital T) yields float tensors, as needed to concatenate
    # these features with the encoder output later on.
    speaker = torch.Tensor([switcher[el] for el in frame['speakers']]).to(device)
    length = torch.Tensor([[len(sentence.split())] for sentence in frame['sentences']]).to(device)
    y = torch.tensor(frame['labels'].to_numpy())
    return seq, mask, speaker, length, y


# for train set
train_seq, train_mask, train_speaker, train_len, train_y = _split_tensors(train, train_tokens)

# for validation set
val_seq, val_mask, valid_speaker, valid_len, valid_y = _split_tensors(valid, valid_tokens)

# for test set
test_seq, test_mask, test_speaker, test_len, test_y = _split_tensors(test, test_tokens)

In [70]:

class WordFeatureDataset(Dataset):
    """Dataset that serves, for each index, one dict with the token sequence,
    attention mask, speaker feature, length feature and label."""

    def __init__(self, sequences, attention_masks, speakers, lengths, labels):
        """Keep references to the five containers; all are indexed with the same idx."""
        self.sequences, self.attention_masks = sequences, attention_masks
        self.speakers, self.lengths = speakers, lengths
        self.labels = labels

    def __len__(self):
        # The number of samples is taken from the labels container.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the sample at position `idx` as a dict keyed by field name."""
        return {
            'sequence': self.sequences[idx],
            'attention_mask': self.attention_masks[idx],
            'speaker': self.speakers[idx],
            'length': self.lengths[idx],
            'label': self.labels[idx],
        }

In [71]:
from torch.utils.data import TensorDataset, DataLoader

def data_loader(batch_size):
    """Build the train, validation and test DataLoaders over the module-level tensors.

    Args:
        batch_size: number of samples per batch, used for all three loaders.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``.
    """
    # Wrap the per-split tensors prepared earlier in the notebook.
    train_dataset = WordFeatureDataset(train_seq, train_mask, train_speaker, train_len, train_y)
    valid_dataset = WordFeatureDataset(val_seq, val_mask, valid_speaker, valid_len, valid_y)
    test_dataset = WordFeatureDataset(test_seq, test_mask, test_speaker, test_len, test_y)

    # Only the training data is shuffled. Evaluation loaders keep a fixed order so
    # that batch-wise validation/test metrics are repeatable from one pass to the next.
    # The generator is created on `device`, matching the default tensor device
    # configured at the top of the notebook.
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
    valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=batch_size, generator=torch.Generator(device=device))
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size, generator=torch.Generator(device=device))

    return train_loader, valid_loader, test_loader

In [72]:
from torch import nn

In [73]:
def MLP(params):
    """Assemble a feed-forward classifier as a ``torch.nn.Sequential``.

    ``params`` must provide ``n_layers``, ``input_size``, ``output_size``, one
    ``n_{i}_size`` entry per hidden layer and, when there is at least one
    hidden layer, the dropout probability ``n_p``. Each hidden layer is
    Linear -> ReLU -> Dropout; a final Linear maps to ``output_size``.
    """
    depth = params['n_layers']
    stack = []

    width_in = params['input_size']
    for layer_idx in range(depth):
        width_out = params[f'n_{layer_idx}_size']
        stack.extend([
            torch.nn.Linear(width_in, width_out),
            torch.nn.ReLU(),
            torch.nn.Dropout(params['n_p']),
        ])
        # The next layer consumes what this one produced.
        width_in = width_out

    stack.append(torch.nn.Linear(width_in, params['output_size']))
    return torch.nn.Sequential(*stack)

class MLP_Bert(nn.Module):
    """Encoder plus MLP head operating on pooled text features and extra features.

    The sentence representation is the mean of the encoder's last hidden
    states over the positions selected by the attention mask. It is
    concatenated with the speaker and length features and passed to the
    network built by ``MLP(params)``.
    """

    def __init__(self, bert, params):
        """
        Args:
            bert: encoder module; ``bert(seq, attention_mask=mask)`` must return
                an object exposing ``last_hidden_state``.
            params: configuration dict forwarded to ``MLP``.
        """
        super(MLP_Bert, self).__init__()

        self.bert = bert
        # NOTE(review): this dropout is never applied in forward(); it is kept
        # only so the attribute stays available to existing code.
        self.dropout = nn.Dropout(0.1)
        self.mlp = MLP(params)

    # define the forward pass
    def forward(self, seq, mask, speakers, lengths):
        """Compute the MLP output for a batch.

        Args:
            seq: token ids passed to the encoder.
            mask: attention mask for ``seq`` (non-zero on real tokens).
            speakers: 2-D speaker features, concatenated along dim 1.
            lengths: 2-D length features, concatenated along dim 1.

        Returns:
            The output of the MLP head for each sample in the batch.
        """
        # pass the inputs to the encoder
        outputs = self.bert(seq, attention_mask=mask)
        hidden_states = outputs.last_hidden_state

        # Mask-aware average pooling: padding positions must not contribute,
        # otherwise the sentence vector depends on how much padding was added.
        token_weights = mask.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * token_weights).sum(dim=1)
        # clamp guards against division by zero for an all-zero mask row
        n_tokens = token_weights.sum(dim=1).clamp(min=1)
        avg_pooled = summed / n_tokens

        cls_input = torch.cat((avg_pooled, speakers, lengths), dim=1)
        x = self.mlp(cls_input)
        return x

In [74]:
# class MLP_Bert(nn.Module):
#     def __init__(self, bert):
#         super(MLP_Bert, self).__init__()

#         self.bert = bert 
        
#         # dropout layer
#         self.dropout = nn.Dropout(0.2)
        
#         # relu activation function
#         self.relu = nn.ReLU()

#         # dense layer 1
#         self.fc1 = nn.Linear(768, 512)
        
#         # dense layer 2 (Output layer)
#         self.fc2 = nn.Linear(512, 2)

#     # define the forward pass
#     def forward(self, sent_id, mask):

#         # pass the inputs to the model  
#         outputs = self.bert(sent_id, attention_mask=mask)
#         hidden_states = outputs.last_hidden_state

#         # Average pooling across the entire sequence
#         avg_pooled = torch.mean(hidden_states, dim=1)

#         x = self.fc1(avg_pooled)

#         x = self.relu(x)

#         x = self.dropout(x)

#         # output layer
#         x = self.fc2(x)
        
#         return x

In [75]:
from sklearn.utils.class_weight import compute_class_weight
from torchmetrics.classification import F1Score

In [76]:
# Freeze the encoder: none of its parameters will receive gradients, so only
# modules added on top of it can be trained.
for param in bert.parameters():
    param.requires_grad_(False)

In [77]:
# params = json.load(open("models/mlp_results_232.json", "r"))["params"]
# params['input_size'] = bert.config.hidden_size + 4

In [80]:
def _validation_pass(model, loader, criterion, f1):
    """Evaluate `model` once over `loader` without tracking gradients.

    Returns:
        Tuple ``(mean_loss, f1_score)``: the loss averaged over batches and the
        F1 score accumulated by `f1` over every batch of the loader.
    """
    model.eval()
    f1.reset()
    mean_loss = 0.0
    n_batches = len(loader)
    with torch.no_grad():
        for batch in loader:
            out = model(batch['sequence'], batch['attention_mask'], batch['speaker'], batch['length'])
            mean_loss += criterion(out, batch['label']).item() / n_batches
            # torchmetrics metrics take (preds, target), in that order.
            f1.update(out.argmax(dim=1), batch['label'])
    return mean_loss, f1.compute().item()


def objective(trial):
    """Optuna objective: train one MLP_Bert configuration and score it.

    The trial suggests the number of hidden layers and their sizes, the dropout
    probability, the learning rate and the weight decay. The model is trained
    with a class-weighted cross-entropy loss and validated every 75 iterations.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        Mean F1 of the (up to) three validation checkpoints with the lowest
        validation loss.
    """
    params = {
        "n_layers" : trial.suggest_int("n_layers", 2, 4),
        # encoder features + 4 speaker one-hot columns + 1 length column
        "input_size" : bert.config.hidden_size + 4 + 1,
        "output_size" : 2,
        "n_p" : trial.suggest_float("n_p", 0.5, 0.8),
        "lr" : trial.suggest_float("lr", 1e-4, 1e-3),
        "weight_decay" : trial.suggest_float("weight_decay", 1e-5, 1e-4),
        "batch_size" : 250
    }
    for i in range(trial.params["n_layers"]):
        params[f"n_{i}_size"] = trial.suggest_int(f"n_{i}_size", 200, 800)

    model = MLP_Bert(bert, params).to(device)
    train_labels = train['labels'].to_numpy()
    class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float())
    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"], weight_decay=params["weight_decay"])
    f1 = F1Score(task='binary', num_classes=2).to(device)

    train_loader, valid_loader, _ = data_loader(params["batch_size"])

    # NOTE(review): each trial is capped at a single epoch. The previous loop
    # declared 20 epochs but unconditionally broke out after the first one;
    # raise this value to train longer per trial.
    n_epochs = 1
    max_patience = 10
    top_k = 3

    it = 0
    hst_valid_loss = []
    hst_f1_score = []

    best_valid_loss = float("inf")
    patience = max_patience

    # iterate over the epochs
    for _ in range(n_epochs):
        if patience == 0: break

        # iterate over the training batches
        for samples in train_loader:
            if patience == 0: break
            it += 1

            # train step
            model.train()
            # The encoder is frozen: keep it in eval mode so that any dropout
            # inside it stays disabled and its features are deterministic.
            model.bert.eval()
            optimizer.zero_grad()
            out = model(samples['sequence'], samples['attention_mask'], samples['speaker'], samples['length'])
            loss = criterion(out, samples['label'])
            loss.backward()
            optimizer.step()

            train_loss = loss.item()

            if it % 25 == 0:
                print(f"{it} : {train_loss}")

            if it % 75 == 0:
                valid_loss, f1_score = _validation_pass(model, valid_loader, criterion, f1)

                # early stopping: reset patience on improvement, otherwise spend one unit
                if valid_loss < best_valid_loss:
                    best_valid_loss = valid_loss
                    patience = max_patience
                else:
                    patience -= 1

                hst_valid_loss.append(valid_loss)
                hst_f1_score.append(f1_score)

                print('Iter: {} | Train Loss: {} | Val Loss: {} | F1-score: {}'.format(it, train_loss, valid_loss, f1_score))

    # Make sure there is at least one validation point to score (e.g. when
    # training ended before the first scheduled validation).
    if not hst_valid_loss:
        valid_loss, f1_score = _validation_pass(model, valid_loader, criterion, f1)
        hst_valid_loss.append(valid_loss)
        hst_f1_score.append(f1_score)

    # objective function criterion: average the F1 of the checkpoints with the
    # lowest validation loss, dividing by how many were actually available.
    ranked = sorted(zip(hst_valid_loss, hst_f1_score), key=lambda pair: pair[0])
    best_scores = [score for _, score in ranked[:top_k]]
    final_score = sum(best_scores) / len(best_scores)

    return final_score


In [81]:
import optuna

# Run the hyper-parameter search; higher values returned by `objective` are better.
N_TRIALS = 100
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=N_TRIALS)

[I 2023-12-04 19:42:46,175] A new study created in memory with name: no-name-c150634f-00ca-4c64-a023-0b5e82c5d463
