In [1]:
import torch 
import json
import numpy as np
import pandas as pd
import custom_utils

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Make the detected device the default for newly created tensors. The default
# must follow `device`: forcing "cuda" here breaks every tensor creation on a
# CPU-only machine.
torch.set_default_device(device)

In [2]:
from sklearn.model_selection import train_test_split

# Load the three parallel sequences returned by the project helper.
sentences, speakers, labels = custom_utils.read_data("training", "training_labels.json")

# Collect them column-wise so that one split keeps the rows aligned.
df = pd.DataFrame({"sentences" : sentences, "speakers" : speakers, "labels" : labels})

# Hold out 20% of the rows; stratifying on the label column keeps the class
# proportions the same in both parts, and the fixed seed makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=69,
    stratify=df["labels"],
)

# Optional validation split (currently disabled):
# train, valid = train_test_split(train, test_size=0.3, random_state=69, stratify=train.labels)


# print(f"Train: {len(train)}\nTest: {len(test)}\nValid: {len(valid)}")
train.head()

Unnamed: 0,sentences,speakers,labels
45504,Smiling fish <vocalsound> .,PM,0
47790,"Robustness , uh-huh .",ME,1
44498,Because the the the electronic device's price ...,ID,0
49154,so it's very lousy .,UI,1
12142,Okay .,PM,0


In [3]:
from sentence_transformers import SentenceTransformer

# One-hot vector for each of the four speaker codes handled here; any other
# code raises a KeyError during the lookup below.
switcher = {
    "PM" : [1,0,0,0],
    "ME" : [0,1,0,0],
    "UI" : [0,0,1,0],
    "ID" : [0,0,0,1]
}

# Sentence encoder used to embed the "sentences" column.
bert = SentenceTransformer('all-MiniLM-L6-v2')


def _featurize(frame):
    """Convert one split of the data frame into tensors.

    Returns a 5-tuple: sentence embeddings, speaker one-hot vectors,
    whitespace-token counts (one column), the three concatenated along
    dim 1, and the labels.
    """
    embedded = bert.encode(frame['sentences'].to_numpy(), convert_to_tensor=True, show_progress_bar=True).to(device)
    one_hot = torch.Tensor([switcher[el] for el in frame['speakers']]).to(device)
    n_words = torch.Tensor([[len(sentence.split())] for sentence in frame['sentences']]).to(device)
    # Feature layout: [embedding | speaker one-hot | word count].
    # (Variant without the length column: torch.cat((embedded, one_hot), dim=1).)
    features = torch.cat((embedded, one_hot, n_words), dim=1)
    targets = torch.tensor(frame['labels'].to_numpy())
    return embedded, one_hot, n_words, features, targets


# Training split first, then the held-out split. A validation split, if it is
# re-enabled in the split cell, can be embedded with the same helper.
train_sentences, train_speaker, train_len, train_X, train_y = _featurize(train)
test_sentences, test_speaker, test_len, test_X, test_y = _featurize(test)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 1816/1816 [01:25<00:00, 21.34it/s]
Batches: 100%|██████████| 454/454 [00:14<00:00, 30.67it/s]


In [4]:
from torch.utils.data import TensorDataset, DataLoader

def data_loader(batch_size):
    """Build the training and held-out DataLoaders from the module-level tensors.

    Reads the globals ``train_X``, ``train_y``, ``test_X``, ``test_y`` and
    ``device``.

    Args:
        batch_size: number of samples per batch, used for both loaders.

    Returns:
        ``(train_loader, test_loader)``. The training loader reshuffles on
        every pass; the held-out loader always yields the samples in their
        stored order.
    """
    # Both datasets are moved to `device` once, so batches need no extra copy.
    trainset = TensorDataset(train_X.to(device), train_y.to(device))
    # validset = TensorDataset((valid_X).to(device), (valid_y).to(device))
    testset = TensorDataset(test_X.to(device), test_y.to(device))

    # The generators are created on `device` so that they match the device of
    # the tensors the loaders create internally.
    train_loader = DataLoader(trainset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
    # valid_loader = DataLoader(validset, shuffle=False, batch_size=batch_size, generator=torch.Generator(device=device))
    # Evaluation must not shuffle: the metrics are averaged per batch, so a
    # different batch composition on every pass would make successive
    # evaluations (and the early stopping built on them) non-comparable.
    test_loader = DataLoader(testset, shuffle=False, batch_size=batch_size, generator=torch.Generator(device=device))

    return train_loader, test_loader

In [5]:
def MLP(params):
    """Assemble a feed-forward network described by ``params``.

    Args:
        params: mapping with ``input_size``, ``output_size``, ``n_layers``,
            one ``n_{i}_size`` entry per hidden layer and the dropout
            probability ``n_p`` (only read when there is at least one
            hidden layer).

    Returns:
        A ``torch.nn.Sequential`` made of ``n_layers`` groups of
        Linear -> ReLU -> Dropout followed by a final Linear layer that
        maps to ``output_size``.
    """
    depth = params['n_layers']
    modules = []

    width_in = params['input_size']
    for idx in range(depth):
        width_out = params[f'n_{idx}_size']
        modules.extend([
            torch.nn.Linear(width_in, width_out),
            torch.nn.ReLU(),
            # the same dropout probability is applied after every hidden layer
            torch.nn.Dropout(params['n_p']),
        ])
        # the next layer consumes what this one produced
        width_in = width_out

    # output projection, no activation
    modules.append(torch.nn.Linear(width_in, params['output_size']))
    return torch.nn.Sequential(*modules)

In [65]:
import optuna
from torchmetrics.classification import F1Score

def _evaluate_MLP(model, criterion, f1, loader):
    """Return (mean per-batch loss, mean per-batch F1) of ``model`` over ``loader``."""
    model.eval()
    mean_loss = 0
    mean_f1 = 0
    n_batches = len(loader)
    # No gradients are needed for scoring; skipping them saves memory and time.
    with torch.no_grad():
        for eval_samples, eval_labels in loader:
            eval_out = model(eval_samples)
            mean_loss += criterion(eval_out, eval_labels).item() / n_batches
            # The metric is called as (preds, target).
            mean_f1 += f1(eval_out.argmax(dim=1), eval_labels).item() / n_batches
    return mean_loss, mean_f1


def train_MLP(params):
    """Train a network built by ``MLP(params)`` and return its best weights.

    The model is optimised with Adam on a cross-entropy loss for at most 20
    epochs. Every 100 optimisation steps it is scored on the second loader
    returned by ``data_loader``:

    * the weights with the highest F1 seen so far are snapshotted;
    * training stops after 10 consecutive evaluations that do not improve
      the best held-out loss.

    Note that both reported metrics are means of per-batch values, not
    dataset-level values.

    Args:
        params: mapping with the keys read by ``MLP`` plus ``lr``,
            ``weight_decay``, ``batch_size`` and ``output_size``.

    Returns:
        A state dict holding an independent copy of the best-F1 weights. If
        no evaluation ever produced a positive F1 (including runs shorter
        than 100 steps), a copy of the final weights is returned instead.
    """
    import copy  # local import keeps this cell self-contained

    model = MLP(params)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"], weight_decay=params["weight_decay"])
    f1 = F1Score(task='binary', num_classes=params['output_size']).to(device)

    train_loader, test_loader = data_loader(params["batch_size"])

    n_epochs = 20
    eval_every = 100     # optimisation steps between two evaluations
    max_patience = 10    # evaluations without loss improvement before stopping

    it = 0
    best_test_loss = float("inf")
    best_f1_score = 0
    best_weights = None
    patience = max_patience

    for epoch in range(n_epochs):
        if patience == 0: break
        for samples, labels in train_loader:
            if patience == 0: break
            it += 1

            # train step
            model.train()
            optimizer.zero_grad()
            out = model(samples)
            loss = criterion(out, labels)
            loss.backward()
            optimizer.step()

            if it % eval_every != 0:
                continue

            # Loss of the batch that was just trained on.
            train_loss = loss.item()
            test_loss, f1_score = _evaluate_MLP(model, criterion, f1, test_loader)

            if f1_score > best_f1_score:
                best_f1_score = f1_score
                # state_dict() returns references to the live parameters, so
                # it has to be deep-copied; otherwise later optimiser steps
                # would overwrite the "best" checkpoint in place.
                best_weights = copy.deepcopy(model.state_dict())

            # early stopping
            if test_loss < best_test_loss:
                best_test_loss = test_loss
                patience = max_patience
            else:
                patience -= 1

            print('Iter: {} | Train Loss: {} | Val Loss: {} | F1-score: {}'.format(it, train_loss, test_loss, f1_score))

    if best_weights is None:
        # No checkpoint was ever selected; fall back to the trained weights
        # instead of failing on an unbound name.
        best_weights = copy.deepcopy(model.state_dict())

    return best_weights

In [2]:
import json
from pathlib import Path

# Collect the score and hyper-parameters stored in every JSON file under
# "models", keyed by the third "_"-separated token of the file stem.
score = {}
params = {}
for item in Path("models").iterdir():
    if item.suffix != ".json":
        continue

    trial_id = item.stem.split('_')[2]
    # Read each file once and close it deterministically.
    with open(item, "r") as fh:
        record = json.load(fh)
    score[trial_id] = record["score"]
    params[trial_id] = record["params"]

# Number of top-scoring trials kept for the ensemble.
qtd = 15
# (id, score) pairs, best score first.
ll = sorted(score.items(), key=lambda x : x[1], reverse=True)
# Built with a comprehension rather than zip(*...) so that an empty directory
# yields an empty tuple instead of an unpacking error, and `_` is not rebound.
best_ids = tuple(trial_id for trial_id, _trial_score in ll[:qtd])
ll

[('232', 0.589133054469571),
 ('212', 0.588377456773411),
 ('138', 0.5879984432604255),
 ('222', 0.5878143520517782),
 ('132', 0.5876851022716553),
 ('267', 0.5876206554698221),
 ('210', 0.5875831268953555),
 ('442', 0.587565704371578),
 ('238', 0.5875579280446664),
 ('385', 0.587500210286993),
 ('100', 0.5874619231960637),
 ('211', 0.5874350281186805),
 ('214', 0.5873827083092746),
 ('152', 0.5873703920259709),
 ('66', 0.5870380125939847),
 ('71', 0.5870271862456294),
 ('322', 0.5869192556729393),
 ('231', 0.5868994530403254),
 ('209', 0.5868964544100355),
 ('124', 0.5868047824720056),
 ('335', 0.5867338250532296),
 ('380', 0.5866987007532934),
 ('374', 0.5865984380696759),
 ('241', 0.5865795743284803),
 ('69', 0.5865025361852041),
 ('144', 0.586411763531293),
 ('449', 0.5863994325984988),
 ('166', 0.5863632651848523),
 ('406', 0.5863480911110387),
 ('134', 0.5863240716003238),
 ('153', 0.5862676180899143),
 ('466', 0.5862553270928789),
 ('403', 0.5862011093468892),
 ('402', 0.5861185

In [21]:
# Load the "test" data through the project helper, keeping the first two of
# its three return values. Both are indexed by the same ids further down
# (test_sentences[id], test_speakers[id]).
# NOTE(review): this rebinds `test_sentences`, which the embedding cell used
# for the held-out sentence embeddings tensor.
test_sentences, test_speakers, _  = custom_utils.read_data_by_ID("test", combine = False)

def format_input(sentences, speakers, include_length=True):
    """Build the model input matrix for one group of sentences.

    The columns follow the layout used for ``train_X`` in the embedding
    cell: sentence embedding, speaker one-hot vector and, when
    ``include_length`` is true, the number of whitespace-separated tokens.
    The networks are trained on ``train_X``, so inference inputs must have
    the same columns.

    Args:
        sentences: sequence of sentence strings (assumed; they are passed to
            ``bert.encode`` and split on whitespace).
        speakers: sequence of speaker codes, one per sentence; each must be
            one of "PM", "ME", "UI", "ID".
        include_length: append the word-count column. Pass ``False`` to get
            the embedding + speaker features only.

    Returns:
        A 2-D tensor on ``device`` with one row per sentence.
    """
    switcher = {
        "PM" : [1,0,0,0],
        "ME" : [0,1,0,0],
        "UI" : [0,0,1,0],
        "ID" : [0,0,0,1]
    }

    embedded = bert.encode(sentences, convert_to_tensor=True, show_progress_bar=True).to(device)
    one_hot = torch.Tensor([switcher[el] for el in speakers]).to(device)
    parts = [embedded, one_hot]
    if include_length:
        # Same length feature as in the training features: one column
        # holding the whitespace-token count of each sentence.
        n_words = torch.Tensor([[len(sentence.split())] for sentence in sentences]).to(device)
        parts.append(n_words)
    return torch.cat(parts, dim=1)

# Feature matrix for every id, built from that id's sentences and speakers.
data = {}
for key, group_sentences in test_sentences.items():
    data[key] = format_input(group_sentences, test_speakers[key])

Batches: 100%|██████████| 10/10 [00:00<00:00, 26.69it/s]
Batches: 100%|██████████| 24/24 [00:00<00:00, 40.25it/s]
Batches: 100%|██████████| 24/24 [00:01<00:00, 15.92it/s]
Batches: 100%|██████████| 29/29 [00:01<00:00, 16.31it/s]
Batches: 100%|██████████| 14/14 [00:00<00:00, 16.10it/s]
Batches: 100%|██████████| 25/25 [00:01<00:00, 17.10it/s]
Batches: 100%|██████████| 27/27 [00:01<00:00, 25.22it/s]
Batches: 100%|██████████| 34/34 [00:00<00:00, 39.07it/s]
Batches: 100%|██████████| 12/12 [00:00<00:00, 32.14it/s]
Batches: 100%|██████████| 20/20 [00:00<00:00, 29.49it/s]
Batches: 100%|██████████| 22/22 [00:00<00:00, 30.86it/s]
Batches: 100%|██████████| 24/24 [00:01<00:00, 15.18it/s]
Batches: 100%|██████████| 9/9 [00:00<00:00, 24.55it/s]
Batches: 100%|██████████| 24/24 [00:01<00:00, 14.08it/s]
Batches: 100%|██████████| 25/25 [00:01<00:00, 15.50it/s]
Batches: 100%|██████████| 35/35 [00:04<00:00,  7.47it/s]
Batches: 100%|██████████| 8/8 [00:01<00:00,  6.65it/s]
Batches: 100%|██████████| 19/19 [00

In [66]:
# One entry per selected trial: a dict mapping each id to that model's
# predicted class indices (numpy array).
all_models = []
for bid in best_ids:
    # train_MLP trains its own network and returns only a state dict, so a
    # network with the same architecture is built here to receive the weights.
    model = MLP(params[bid])
    state_dict = train_MLP(params[bid])
    model.load_state_dict(state_dict)

    # eval() disables dropout; no_grad() avoids building autograd graphs
    # that inference never uses.
    model.eval()
    model_predictions = {}
    with torch.no_grad():
        for key in test_sentences.keys():
            out = model(data[key])
            pred = out.argmax(dim=1)
            model_predictions[key] = pred.cpu().detach().numpy()
    all_models.append(model_predictions)

Iter: 100 | Train Loss: 0.40085580945014954 | Val Loss: 0.4385976016521454 | F1-score: 0.0
Iter: 200 | Train Loss: 0.35230720043182373 | Val Loss: 0.37750003238519036 | F1-score: 0.0
Iter: 300 | Train Loss: 0.33393925428390503 | Val Loss: 0.359857436021169 | F1-score: 0.0
Iter: 400 | Train Loss: 0.3215361535549164 | Val Loss: 0.36413959562778475 | F1-score: 0.0
Iter: 500 | Train Loss: 0.35456258058547974 | Val Loss: 0.35692150791486116 | F1-score: 0.5802803734938304
Iter: 600 | Train Loss: 0.3904114067554474 | Val Loss: 0.35572748879591637 | F1-score: 0.5726165334383646
Iter: 700 | Train Loss: 0.34016886353492737 | Val Loss: 0.3527257412672043 | F1-score: 0.5755985647439956
Iter: 800 | Train Loss: 0.3549371361732483 | Val Loss: 0.3567900826533635 | F1-score: 0.5829618016878764
Iter: 900 | Train Loss: 0.34820765256881714 | Val Loss: 0.3471076180537541 | F1-score: 0.5693604071935017
Iter: 1000 | Train Loss: 0.31409886479377747 | Val Loss: 0.33952231307824443 | F1-score: 0.571476105848948

In [67]:
# Average the class-index predictions of all models, id by id: every model
# contributes its prediction array divided by the number of models.
test_labels = {}

n = len(all_models)
for model_labels in all_models:
    for key, predicted in model_labels.items():
        share = predicted / n
        if key in test_labels:
            test_labels[key] += share
        else:
            test_labels[key] = share

In [68]:
# Turn the averaged predictions into hard labels: 1 where the average is at
# least 0.5, else 0, stored as plain Python ints.
# np.asarray makes the cell safe to re-run: after the first run the values are
# lists, and comparing a list with 0.5 would raise a TypeError.
for key in test_labels:
    averaged = np.asarray(test_labels[key])
    test_labels[key] = [int(flag) for flag in np.where(averaged >= 0.5, 1, 0)]

In [69]:
# Write the final labels; the context manager guarantees that the file is
# flushed and closed even if serialisation fails.
with open("test_labels.json", "w") as fh:
    json.dump(test_labels, fh)