In [1]:
import torch 
import json
import numpy as np
import pandas as pd
import custom_utils

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

from transformers import AutoModel, BertTokenizerFast
from torch.utils.data import Dataset


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.model_selection import train_test_split

# read
sentences, speakers, labels = custom_utils.read_data("training", "training_labels.json")

# split
df = pd.DataFrame({"sentences" : sentences, "speakers" : speakers, "labels" : labels})

train, test = train_test_split(df, test_size=0.2, random_state=69, stratify=df.labels)
train, valid = train_test_split(train, test_size=0.3, random_state=69, stratify=train.labels)

print(f"Train: {len(train)}\nTest: {len(test)}\nValid: {len(valid)}")
train.head()

# Getting setences list
train_sentences = train['sentences'].to_list()
valid_sentences = valid['sentences'].to_list()
test_sentences = test['sentences'].to_list()

Train: 40668
Test: 14525
Valid: 17430


In [3]:
bert = AutoModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [4]:
max_seq_len = 80

In [5]:
## Tokenization
params = {
    'max_length' : max_seq_len,
    'padding' : True,
    'truncation' : True,
    'return_token_type_ids' : False
}

# tokenize and encode sequences in the training set
train_tokens = tokenizer.batch_encode_plus(train_sentences, **params)

# tokenize and encode sequences in the validation set
valid_tokens = tokenizer.batch_encode_plus(valid_sentences, **params)

# tokenize and encode sequences in the test set
test_tokens = tokenizer.batch_encode_plus(test_sentences, **params)

# hot encoder for speakers
switcher = {
    "PM" : [1,0,0,0],
    "ME" : [0,1,0,0],
    "UI" : [0,0,1,0],
    "ID" : [0,0,0,1]
}

# for train set
train_seq = torch.tensor(train_tokens['input_ids'])
train_mask = torch.tensor(train_tokens['attention_mask'])
train_speaker = torch.Tensor([switcher[el] for el in train['speakers']]).to(device)
train_len = torch.Tensor([[len(sentence.split())] for sentence in train['sentences']]).to(device)
train_y = torch.tensor(train['labels'].to_numpy())

# for validation set
val_seq = torch.tensor(valid_tokens['input_ids'])
val_mask = torch.tensor(valid_tokens['attention_mask'])
valid_speaker = torch.Tensor([switcher[el] for el in valid['speakers']]).to(device)
valid_len = torch.Tensor([[len(sentence.split())] for sentence in valid['sentences']]).to(device)
valid_y = torch.tensor(valid['labels'].to_numpy())

# for test set
test_seq = torch.tensor(test_tokens['input_ids'])
test_mask = torch.tensor(test_tokens['attention_mask'])
test_speaker = torch.Tensor([switcher[el] for el in test['speakers']]).to(device)
test_len = torch.Tensor([[len(sentence.split())] for sentence in test['sentences']]).to(device)
test_y = torch.tensor(test['labels'].to_numpy())

In [6]:

class WordFeatureDataset(Dataset):
    def __init__(self, sequences, attention_masks, speakers, lengths, labels):
        self.sequences = sequences
        self.attention_masks = attention_masks
        self.speakers = speakers
        self.lengths = lengths
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        attention_mask = self.attention_masks[idx]
        speaker = self.speakers[idx]
        length = self.lengths[idx]
        label = self.labels[idx]

        sample = {
            'sequence': sequence,
            'attention_mask': attention_mask,
            'speaker': speaker,
            'length': length,
            'label': label
        }

        return sample

In [7]:
from torch.utils.data import TensorDataset, DataLoader

def data_loader(batch_size):
    # create tensor datasets
    train_dataset = WordFeatureDataset(train_seq, train_mask, train_speaker, train_len, train_y)
    valid_dataset = WordFeatureDataset(val_seq, val_mask, valid_speaker, valid_len, valid_y)
    test_dataset = WordFeatureDataset(test_seq, test_mask, test_speaker, test_len, test_y)

    # create dataloaders
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
    valid_loader = DataLoader(valid_dataset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
    test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
    
    return train_loader, valid_loader, test_loader

In [8]:
from torch import nn

In [83]:
class MLP_Bert(nn.Module):
    def __init__(self, bert):
        super(MLP_Bert, self).__init__()

        self.bert = bert 
        
        # dropout layer
        self.dropout = nn.Dropout(0.1)
        
        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(768, 512)
        
        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512, 2)

        # softmax activation function
        # self.softmax = nn.LogSoftmax(dim=1)

    # define the forward pass
    def forward(self, sent_id, mask):

        # pass the inputs to the model  
        outputs = self.bert(sent_id, attention_mask=mask)
        hidden_states = outputs.last_hidden_state

        # Average pooling across the entire sequence
        avg_pooled = torch.mean(hidden_states, dim=1)

        x = self.fc1(avg_pooled)

        x = self.relu(x)

        x = self.dropout(x)

        # output layer
        x = self.fc2(x)
        
        # apply softmax activation
        # x = self.softmax(x)
        return x

In [84]:
from sklearn.utils.class_weight import compute_class_weight
from torchmetrics.classification import F1Score

In [85]:
# freeze all the parameters
for param in bert.parameters():
    param.requires_grad = False

In [95]:
model = MLP_Bert(bert=bert)
class_weights = compute_class_weight('balanced', classes=np.unique(train['labels'].to_numpy()), y=train['labels'].to_numpy())
criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(class_weights).float()) 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
f1 = F1Score(task='binary', num_classes=2).to(device)

train_loader, valid_loader, test_loader = data_loader(100)

n_epochs = 20
it = 0
hst_train_loss = [] 
hst_valid_loss = []
hst_f1_score = []

best_valid_loss = float("inf")
patience = 10

for epoch in range(n_epochs):
    if epoch > 0:
        break
    if patience == 0: break
    
    for samples in train_loader:
        if patience == 0: break
        it += 1

        # train step
        model.train()
        optimizer.zero_grad()
        out = model(sent_id = samples['sequence'], mask = samples['attention_mask'])
        loss = criterion(out, samples['label'])
        loss.backward()
        optimizer.step()
        
        train_loss = loss.cpu().detach().numpy() / 1
        print(train_loss)

        if it % 10 == 0:
            model.eval()

            valid_loss = 0
            f1_score = 0
            
            qnt = 2
            for idx, samples in enumerate(valid_loader):
                if idx >= qnt:
                    break 
                out = model(sent_id = samples['sequence'], mask = samples['attention_mask'])
                loss = criterion(out, samples['label'])
                # valid_loss += loss.cpu().detach().numpy() / len(valid_loader)
                valid_loss += loss.cpu().detach().numpy() / qnt
                # f1_score += f1(samples['label'], out.argmax(dim=1)).cpu().detach().numpy() / len(valid_loader)
                f1_score += f1(samples['label'], out.argmax(dim=1)).cpu().detach().numpy() / qnt
            
            # early stopping
            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                best_weights = model.state_dict()
                patience = 10
            else:
                patience -= 1 
            
            hst_train_loss.append(train_loss)
            hst_valid_loss.append(valid_loss)
            hst_f1_score.append(f1_score)

            print('Iter: {} | Train Loss: {} | Val Loss: {} | F1-score: {}'.format(it, train_loss, valid_loss, f1_score))

# objective function criterion
combined = sorted(zip(hst_valid_loss, hst_f1_score), key=lambda x : x[0])
_, scores = zip(*combined)
qtd = 3
final_score = sum(scores[:qtd]) / qtd

# torch.save(best_weights, f"models/mlp_{trial.number}.pt")
# results = {
#     "score" : final_score,
#     "params" : params,
#     "valid_loss" : hst_valid_loss,
#     "train_loss" : hst_train_loss,
#     "f1_score" : hst_f1_score, 
# }
# json.dump(results, open(f"models/mlp_results_{trial.number}.json", "w"))


0.6957338452339172
0.6357496380805969
0.6081460118293762
0.5195344686508179
0.5750989317893982
0.6164846420288086
0.47410687804222107
0.44061678647994995
0.5451452732086182
0.5210814476013184
Iter: 10 | Train Loss: 0.5210814476013184 | Val Loss: 0.4649866223335266 | F1-score: 0.5434243381023407
0.47182002663612366
0.6088654398918152
0.4705297648906708
0.4009742736816406
0.48865222930908203
0.38142427802085876
0.4440515637397766
0.4908559024333954
0.42344650626182556
0.4834602177143097
Iter: 20 | Train Loss: 0.4834602177143097 | Val Loss: 0.434982031583786 | F1-score: 0.5966748893260956
0.4789697527885437
0.5518483519554138
0.4203784465789795
0.44918760657310486
0.5708299875259399
0.41181573271751404
0.4768314063549042
0.3348318338394165
0.4104020297527313
0.42413556575775146
Iter: 30 | Train Loss: 0.42413556575775146 | Val Loss: 0.4026656746864319 | F1-score: 0.4749823957681656
0.37824708223342896
0.5124050974845886
0.319325715303421
0.41193506121635437
0.4364067018032074
0.56813430786