In [240]:
import torch
import torch
import numpy as np

In [241]:
from gensim.models import KeyedVectors
import gensim.downloader as api

model_path = "tokenizer.model"
model_name = "glove-wiki-gigaword-300"

# load model (and save if necessary)
try:
    tokenizer = KeyedVectors.load(model_path)
except FileNotFoundError:
    tokenizer = api.load(model_name)
    tokenizer.save(model_path)

In [242]:
def embed_sentence(sentence, tokenizer):
    embeddings = []
    length = 0
    for word in sentence.split():
        word = word.lower()
        if not word in tokenizer: continue
        length += 1
        embeddings.append(tokenizer[word])
    
    if len(embeddings) == 0: # bug fix for padding function (we need to asssure at least one element)
        embeddings = np.zeros(shape=(1,tokenizer.vector_size))
        length = 1
    return torch.tensor(np.asarray(embeddings)), length

In [243]:
from pathlib import Path
import json

# read data
sentences = []
speakers = []
labels = []
label_data = json.load(open("training_labels.json", "r"))
for item in Path("training").iterdir():
    if not item.is_file(): continue 
    if not item.suffix == ".json": continue
    
    # load data
    dialog = json.load(open(item, "r"))
    sentences += [exchange["text"] for exchange in dialog]
    speakers += [exchange["speaker"] for exchange in dialog]
    labels += [val for val in label_data[item.stem]]

In [244]:
print(len(sentences), len(speakers), len(labels))

72623 72623 72623


In [245]:
switcher = {
        "PM" : [1,0,0,0],
        "ME" : [0,1,0,0],
        "UI" : [0,0,1,0],
        "ID" : [0,0,0,1]
}

In [246]:
test = torch.tensor([switcher[sp] for sp in speakers])

ht = torch.rand(size=(len(speakers),2))

torch.cat([ht, test], dim = 1)

tensor([[0.8264, 0.4666, 1.0000, 0.0000, 0.0000, 0.0000],
        [0.1422, 0.8018, 1.0000, 0.0000, 0.0000, 0.0000],
        [0.8732, 0.7705, 1.0000, 0.0000, 0.0000, 0.0000],
        ...,
        [0.8567, 0.3403, 1.0000, 0.0000, 0.0000, 0.0000],
        [0.5017, 0.3098, 1.0000, 0.0000, 0.0000, 0.0000],
        [0.7111, 0.6735, 1.0000, 0.0000, 0.0000, 0.0000]])

In [247]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size + len(switcher), output_size)

    def forward(self, sentences, speakers):
        # from sentences to sequences of vectors (embedding)
        embedded_sentences, lengths = list(zip(*[embed_sentence(sentence, tokenizer) for sentence in sentences]))

        # pack / pad sequences (save memory) 
        packed_sentences = pack_padded_sequence(pad_sequence(embedded_sentences, batch_first=True), lengths, batch_first=True, enforce_sorted=False)
        
        # hot encode speakers
        print(speakers)
        encoded_speakers = torch.tensor([switcher[speaker] for speaker in speakers])

        # lstm layer
        _, (ht,_) = self.lstm(packed_sentences) # it does accept packed sequences 

        # linear f.c. layer 
        output = self.fc(torch.cat([ht[-1], encoded_speakers], dim=1)) # use only output from the last hidden state
        
        return output


In [248]:
from sklearn.model_selection import StratifiedKFold
from torchmetrics.classification import F1Score

device = "cuda" if torch.cuda.is_available() else "cpu"
num_classes = 2
num_features = tokenizer.vector_size # vector size of word embedding

In [249]:
def train(sentences, speakers, labels, model, criterion, optimizer):
    assert(len(labels) == len(sentences))
    model.train() 
    optimizer.zero_grad()
    out = model(sentences, speakers)
    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    return loss

def validate(sentences, speakers, labels, model, criterion):
    assert(len(labels) == len(sentences))
    model.eval()
    out = model(sentences, speakers)
    loss = criterion(out, labels)
    pred_labels = out.argmax(dim=1)
    f1 = F1Score(task='binary', num_classes=num_classes)
    score = f1(labels, pred_labels)
    return loss, score

In [250]:
import optuna

def objective(trial):
    n_folds = 5
    n_epochs = 200
    patience = 10
    avg_score = 0
    

    skf = StratifiedKFold(n_splits=n_folds)

    for fold, (train_idx, valid_idx) in enumerate(skf.split(sentences, labels)):
        # split data
        train_labels = torch.tensor([labels[i] for i in train_idx])
        valid_labels = torch.tensor([labels[i] for i in valid_idx])
        
        train_sentences = [sentences[i] for i in train_idx]
        valid_sentences = [sentences[i] for i in valid_idx]

        train_speakers = [speakers[i] for i in train_idx]
        valid_speakers = [speakers[i] for i in valid_idx]
        
        # set model, criterion and optimizers
        # 1. parameters
        # hidden_size = trial.suggest_int(f'hidden_size', 16, 256)
        hidden_size = 64
        # lr = trial.suggest_float(f'lr', 1e-3, 1e-1)
        lr = 0.01
        # weight_decay = trial.suggest_float(f'weight_decay', 1e-5, 1e-2)
        weight_decay = 5e-4
        # 2. objects
        model = LSTMClassifier(num_features, hidden_size, num_classes).to(device)
        criterion = torch.nn.CrossEntropyLoss() # need to check input
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        best_weights = model.state_dict()

        # Epoch
        best_valid_loss = float('inf')
        score_at_best = -1
        current_patience = 0
        for epoch in range(n_epochs):
            train_loss = train(train_sentences, train_speakers, train_labels, model, criterion, optimizer)
            valid_loss, score = validate(valid_sentences, valid_speakers, valid_labels, model, criterion)
            
            # Stopping criteria            
            if valid_loss > best_valid_loss:
                current_patience += 1
            else:
                # best_weights = model.state_dict()
                best_valid_loss = valid_loss    
                score_at_best = score 
                current_patience = 0
            
            if current_patience == patience:
                break
            
            print(f'Fold: {fold}, Epoch: {epoch}, Train loss: {train_loss:.4f}, Valid loss: {valid_loss:.4f}, Score: {score:.4f}')
        
        avg_score += score_at_best/n_folds
        
        # save model and params 
        torch.save(best_weights, f"models/lstm_{trial.number}_{fold}.pt")
        json.dump(trial.params, open(f"models/params_{trial.number}.json", "w"))
    return avg_score

In [251]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1)

[I 2023-11-30 22:02:11,432] A new study created in memory with name: no-name-fc948210-7522-4322-a52d-458d24fd9e45
[W 2023-11-30 22:02:16,847] Trial 0 failed with parameters: {} because of the following error: KeyError('So').
Traceback (most recent call last):
  File "/Users/czartur/code/course/deep_learning/datachallenge/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/5h/t1mh5vx9383cnpvfpnf6p1x80000gn/T/ipykernel_2228/285212025.py", line 42, in objective
    train_loss = train(train_sentences, train_speakers, train_labels, model, criterion, optimizer)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/5h/t1mh5vx9383cnpvfpnf6p1x80000gn/T/ipykernel_2228/1996007288.py", line 5, in train
    out = model(sentences, speakers)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/czartur/code/course/d



KeyError: 'So'