In [49]:
import torch
import torch
import numpy as np

In [67]:
from gensim.models import KeyedVectors
import gensim.downloader as api

# Local cache file for the word vectors, and the gensim-data name they are downloaded under.
model_path = "tokenizer.model"
model_name = "glove-wiki-gigaword-300"

# Load the cached vectors; when the cache file does not exist yet, fetch them
# through gensim's downloader and save them to `model_path` for later runs.
# NOTE: despite its name, `tokenizer` is used as a word -> vector lookup
# (see `embed_sentence`), not to split text.
try:
    tokenizer = KeyedVectors.load(model_path)
except FileNotFoundError:
    tokenizer = api.load(model_name)
    tokenizer.save(model_path)

In [68]:
def embed_sentence(sentence, tokenizer):
    """Embed a sentence as a sequence of word vectors.

    The sentence is split on whitespace, every token is lower-cased, and tokens
    that `tokenizer` does not contain are dropped.

    Args:
        sentence: Raw sentence text.
        tokenizer: Word-vector lookup supporting `word in tokenizer`,
            `tokenizer[word]` and a `vector_size` attribute.

    Returns:
        A `(embeddings, length)` pair: a tensor with one row per kept token
        and the number of rows. A sentence without any known token yields a
        single all-zero float32 row (length 1), because the padding/packing
        step downstream needs at least one element per sequence.
    """
    vectors = []
    for token in sentence.split():
        token = token.lower()
        if token in tokenizer:
            vectors.append(tokenizer[token])

    if not vectors:
        # float32 on purpose: np.zeros defaults to float64, which would not
        # match float32 word vectors / default-precision model weights.
        vectors = np.zeros(shape=(1, tokenizer.vector_size), dtype=np.float32)

    embedded = torch.tensor(np.asarray(vectors))
    return embedded, embedded.shape[0]

In [69]:
from pathlib import Path
import json

# read data
def read_data(dialogs_folder, labels_file=None):
    """Read every dialog JSON file of a folder into flat, parallel lists.

    Args:
        dialogs_folder: Directory whose `*.json` files each hold one dialog,
            i.e. a list of exchanges with a "text" and a "speaker" entry.
            Sub-directories and files with another suffix are ignored.
        labels_file: Optional JSON file mapping a dialog's file stem to the
            list of labels of its exchanges. Leave it out (or pass a falsy
            value) for unlabeled data.

    Returns:
        `(sentences, speakers, labels)` when `labels_file` is given,
        otherwise `(sentences, speakers)`. Dialogs are visited in sorted
        path order so the result does not depend on the directory listing
        order of the operating system.
    """
    has_labels = bool(labels_file)
    label_data = None
    if has_labels:
        with open(labels_file, "r") as handle:
            label_data = json.load(handle)

    sentences = []
    speakers = []
    labels = []
    for item in sorted(Path(dialogs_folder).iterdir()):
        if not item.is_file() or item.suffix != ".json":
            continue

        # load data
        with open(item, "r") as handle:
            dialog = json.load(handle)
        sentences.extend(exchange["text"] for exchange in dialog)
        speakers.extend(exchange["speaker"] for exchange in dialog)
        if has_labels:
            labels.extend(label_data[item.stem])

    if has_labels:
        return sentences, speakers, labels
    return sentences, speakers

In [None]:
# Training data: parallel lists with one entry per dialog exchange
# (its text, its speaker and its label), as returned by `read_data`.
sentences, speakers, labels = read_data("training", "training_labels.json")

In [53]:
# Sanity check: the three lists are parallel, so their lengths should be equal.
print(len(sentences), len(speakers), len(labels))

72623 72623 72623


In [54]:
# One-hot code for each speaker role; the position of the 1 is the role's
# position in the tuple below.
switcher = {
    role: [int(position == hot) for position in range(4)]
    for hot, role in enumerate(("PM", "ME", "UI", "ID"))
}

In [55]:
# Scratch check: one-hot speaker rows can be concatenated column-wise to a
# float matrix (the same operation the classifier applies to its hidden state).
test = torch.tensor([switcher[sp] for sp in speakers])

# Random stand-in for a hidden state with two features per exchange.
ht = torch.rand(size=(len(speakers),2))

torch.cat([ht, test], dim = 1)

tensor([[0.3625, 0.4869, 1.0000, 0.0000, 0.0000, 0.0000],
        [0.1996, 0.5787, 1.0000, 0.0000, 0.0000, 0.0000],
        [0.9791, 0.3932, 1.0000, 0.0000, 0.0000, 0.0000],
        ...,
        [0.6686, 0.7506, 0.0000, 1.0000, 0.0000, 0.0000],
        [0.8596, 0.9174, 0.0000, 0.0000, 1.0000, 0.0000],
        [0.6687, 0.9737, 0.0000, 1.0000, 0.0000, 0.0000]], device='cuda:0')

In [56]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
# Use the GPU when one is available, otherwise the CPU, and create new tensors
# on that same device by default (previously "cuda" was forced regardless of
# availability, which fails on CPU-only machines).
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device(device)

class LSTMClassifier(nn.Module):
    """Sentence classifier: LSTM over word vectors plus a one-hot speaker code.

    Each sentence is embedded with the notebook-level `embed_sentence` /
    `tokenizer`, run through an LSTM, and the final hidden state is
    concatenated with the speaker's one-hot code from `switcher` before a
    linear layer produces the class logits.

    Args:
        input_size: Size of one word vector.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of logits produced per sentence.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size + len(switcher), output_size)

    def forward(self, sentences, speakers):
        """Return one row of logits per sentence.

        Args:
            sentences: Sequence of raw sentence strings.
            speakers: Sequence of speaker keys of `switcher`, parallel to
                `sentences`.
        """
        # Inputs are moved to wherever the model's own weights live, so the
        # module works no matter which default device is active.
        reference = self.fc.weight

        # from sentences to sequences of vectors (embedding)
        embedded_sentences, lengths = zip(*(embed_sentence(sentence, tokenizer) for sentence in sentences))

        # pad to a common length, then pack so the LSTM skips the padding.
        # `.to()` returns a new tensor (it is not in-place), so keep the result.
        padded_sentences = pad_sequence(embedded_sentences, batch_first=True).to(
            device=reference.device, dtype=reference.dtype
        )
        packed_sentences = pack_padded_sequence(
            padded_sentences, list(lengths), batch_first=True, enforce_sorted=False
        )

        # one-hot encode speakers, directly in the dtype/device of the weights
        encoded_speakers = torch.tensor(
            [switcher[speaker] for speaker in speakers],
            dtype=reference.dtype,
            device=reference.device,
        )

        # lstm layer (accepts packed sequences)
        _, (ht, _) = self.lstm(packed_sentences)

        # linear f.c. layer on the last layer's final hidden state + speaker code
        output = self.fc(torch.cat([ht[-1], encoded_speakers], dim=1))

        return output


In [57]:
from sklearn.model_selection import StratifiedKFold
from torchmetrics.classification import F1Score

# Number of logits the classifier outputs (the F1 metric below is configured as binary).
num_classes = 2
num_features = tokenizer.vector_size # size of one word vector = LSTM input size

In [58]:
def train(sentences, speakers, labels, model, criterion, optimizer):
    """Run a single optimisation step on the whole given batch.

    Args:
        sentences: Input sentences passed to `model`.
        speakers: Speakers passed to `model`, parallel to `sentences`.
        labels: Target tensor with one entry per sentence.
        model: Callable module taking `(sentences, speakers)`.
        criterion: Loss function applied to `(model output, labels)`.
        optimizer: Optimizer holding the model's parameters.

    Returns:
        The loss of this step as a detached tensor, so that keeping or
        logging it does not keep the autograd graph referenced.
    """
    assert len(labels) == len(sentences)
    model.train()
    optimizer.zero_grad()
    out = model(sentences, speakers)
    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    return loss.detach()

def validate(sentences, speakers, labels, model, criterion):
    """Evaluate the model on a held-out batch without tracking gradients.

    Args:
        sentences: Input sentences passed to `model`.
        speakers: Speakers passed to `model`, parallel to `sentences`.
        labels: Target tensor with one entry per sentence.
        model: Callable module taking `(sentences, speakers)`.
        criterion: Loss function applied to `(model output, labels)`.

    Returns:
        `(loss, score)`: the criterion value and the binary F1 score of the
        arg-max predictions.
    """
    assert len(labels) == len(sentences)
    model.eval()
    # No gradients are needed for evaluation; skipping them avoids building
    # an autograd graph over the whole validation set.
    with torch.no_grad():
        out = model(sentences, speakers)
        loss = criterion(out, labels)
    pred_labels = out.argmax(dim=1)
    f1 = F1Score(task='binary', num_classes=num_classes).to(out.device)
    # torchmetrics expects (preds, target) in this order.
    score = f1(pred_labels, labels)
    return loss, score

In [59]:
import optuna

def objective(trial):
    """Optuna objective: cross-validated F1 of the LSTM classifier.

    For every stratified fold a fresh model is trained with early stopping on
    the validation loss. The weights of the best-validation-loss epoch are
    saved to `models/lstm_<trial>_<fold>.pt`, and the trial's parameters to
    `models/params_<trial>.json` (written once, as soon as they are known).

    Args:
        trial: Optuna trial used to sample `hidden_size`.

    Returns:
        The mean over folds of the F1 score at each fold's best epoch, as a
        Python float.
    """
    n_folds = 5
    n_epochs = 200
    patience = 10
    avg_score = 0.0

    def snapshot(module):
        # state_dict() returns the live parameter tensors; clone them so the
        # snapshot is not overwritten by later optimizer steps.
        return {name: value.detach().clone() for name, value in module.state_dict().items()}

    # 1. parameters (sampled once per trial; they are the same for every fold)
    hidden_size = trial.suggest_int('hidden_size', 16, 256)
    # hidden_size = 64
    # lr = trial.suggest_float(f'lr', 1e-3, 1e-1)
    lr = 0.01
    # weight_decay = trial.suggest_float(f'weight_decay', 1e-5, 1e-2)
    weight_decay = 5e-4

    # Make sure the output folder exists before hours of training are spent.
    Path("models").mkdir(parents=True, exist_ok=True)
    with open(f"models/params_{trial.number}.json", "w") as params_file:
        json.dump(trial.params, params_file)

    skf = StratifiedKFold(n_splits=n_folds)

    for fold, (train_idx, valid_idx) in enumerate(skf.split(sentences, labels)):
        # split data
        train_labels = torch.tensor([labels[i] for i in train_idx], device=device)
        valid_labels = torch.tensor([labels[i] for i in valid_idx], device=device)

        train_sentences = [sentences[i] for i in train_idx]
        valid_sentences = [sentences[i] for i in valid_idx]

        train_speakers = [speakers[i] for i in train_idx]
        valid_speakers = [speakers[i] for i in valid_idx]

        # 2. objects
        model = LSTMClassifier(num_features, hidden_size, num_classes).to(device)
        criterion = torch.nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        best_weights = snapshot(model)

        # Epoch
        best_valid_loss = float('inf')
        score_at_best = -1
        current_patience = 0
        for epoch in range(n_epochs):
            train_loss = float(train(train_sentences, train_speakers, train_labels, model, criterion, optimizer))
            valid_loss, score = validate(valid_sentences, valid_speakers, valid_labels, model, criterion)
            valid_loss = float(valid_loss)
            score = float(score)

            # Stopping criteria
            if valid_loss > best_valid_loss:
                current_patience += 1
            else:
                best_weights = snapshot(model)
                best_valid_loss = valid_loss
                score_at_best = score
                current_patience = 0

            if current_patience == patience:
                break

            print(f'Fold: {fold}, Epoch: {epoch}, Train loss: {train_loss:.4f}, Valid loss: {valid_loss:.4f}, Score: {score:.4f}')

        avg_score += score_at_best / n_folds

        # save the best weights of this fold
        torch.save(best_weights, f"models/lstm_{trial.number}_{fold}.pt")
    return avg_score

In [60]:
# `objective` returns a cross-validated F1 score, hence the study maximises it.
study = optuna.create_study(direction='maximize')
# n_trials=1: a single hyper-parameter sample is evaluated.
study.optimize(objective, n_trials=1)

[I 2023-12-01 19:38:11,929] A new study created in memory with name: no-name-98711a11-c64f-4168-8b0d-a6ddbfa2d4f4


Fold: 0, Epoch: 0, Train loss: 0.6883, Valid loss: 0.4992, Score: 0.0007
Fold: 0, Epoch: 1, Train loss: 0.4995, Valid loss: 0.4934, Score: 0.0000
Fold: 0, Epoch: 2, Train loss: 0.4923, Valid loss: 0.4581, Score: 0.0007
Fold: 0, Epoch: 3, Train loss: 0.4562, Valid loss: 0.4442, Score: 0.0368
Fold: 0, Epoch: 4, Train loss: 0.4408, Valid loss: 0.4414, Score: 0.1007
Fold: 0, Epoch: 5, Train loss: 0.4364, Valid loss: 0.4214, Score: 0.1160
Fold: 0, Epoch: 6, Train loss: 0.4156, Valid loss: 0.3977, Score: 0.1075
Fold: 0, Epoch: 7, Train loss: 0.3921, Valid loss: 0.3840, Score: 0.1251
Fold: 0, Epoch: 8, Train loss: 0.3792, Valid loss: 0.3698, Score: 0.2274
Fold: 0, Epoch: 9, Train loss: 0.3658, Valid loss: 0.3631, Score: 0.4259
Fold: 0, Epoch: 10, Train loss: 0.3581, Valid loss: 0.3598, Score: 0.4577
Fold: 0, Epoch: 11, Train loss: 0.3551, Valid loss: 0.3516, Score: 0.2801
Fold: 0, Epoch: 12, Train loss: 0.3503, Valid loss: 0.3507, Score: 0.1310
Fold: 0, Epoch: 13, Train loss: 0.3502, Valid lo

[W 2023-12-01 20:52:55,545] Trial 0 failed with parameters: {'hidden_size': 78} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\astus\code\extractive-summarization\.venv\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\astus\AppData\Local\Temp\ipykernel_19696\2811287571.py", line 43, in objective
    valid_loss, score = validate(valid_sentences, valid_speakers, valid_labels, model, criterion)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\astus\AppData\Local\Temp\ipykernel_19696\1996007288.py", line 14, in validate
    out = model(sentences, speakers)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\astus\code\extractive-summarization\.venv\Lib\site-packages\torch\nn\modules\module.py", line 1518, in _wrapped_call_impl
    return self._call_impl(*args, *

KeyboardInterrupt: 

### Use best params

In [64]:
trial_number = 0
# Parameters that `objective` stored for this trial.
with open(f"models/params_{trial_number}.json", "r") as params_file:
    params = json.load(params_file)

# maybe load model from a certain fold ?
# fold = 0
# model.load_state_dict(torch.load(f"models/lstm_{trial_number}_{fold}.pt"))

<All keys matched successfully>

### Use entire dataset to train

In [74]:
# Fresh classifier for the final fit, placed on `device` like the models built in `objective`.
# NOTE(review): hidden size 64 is hard-coded here; the `params` loaded from the
# study are not used — confirm whether params["hidden_size"] was intended.
model = LSTMClassifier(num_features, 64, num_classes).to(device)
# CrossEntropyLoss is fed the raw logits returned by the classifier's linear layer.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)

In [78]:
import custom_utils
from sklearn.model_selection import train_test_split

sentences, speakers, labels = read_data("training", "training_labels.json")

# Hold out 20% of the exchanges to monitor the loss for early stopping.
# (These `test_*` names are a validation split; the real test set is loaded later.)
y = labels
X = list(zip(sentences, speakers))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

train_sentences, train_speakers = zip(*X_train)
test_sentences, test_speakers = zip(*X_test)

# Labels live on the same device as the model so the loss can be computed.
model_device = next(model.parameters()).device
train_labels = torch.tensor(y_train, device=model_device)
test_labels = torch.tensor(y_test, device=model_device)

n_epochs = 200
patience = 10
best_valid_loss = float('inf')
current_patience = 0
# state_dict() returns the live parameter tensors, so snapshots must be cloned.
best_weights = {name: value.detach().clone() for name, value in model.state_dict().items()}
for epoch in range(n_epochs):
    train_loss = train(train_sentences, train_speakers, train_labels, model, criterion, optimizer)
    valid_loss, score = validate(test_sentences, test_speakers, test_labels, model, criterion)

    # Stopping criteria
    if valid_loss > best_valid_loss:
        current_patience += 1
    else:
        best_weights = {name: value.detach().clone() for name, value in model.state_dict().items()}
        best_valid_loss = valid_loss
        score_at_best = score
        current_patience = 0

    if current_patience == patience:
        break

    print(f'Epoch: {epoch}, Train loss: {train_loss:.4f}, Valid loss: {valid_loss:.4f}, Score: {score:.4f}')

# Continue with the weights of the best validation epoch, not the last one.
model.load_state_dict(best_weights)


Epoch: 0, Train loss: 0.3152, Valid loss: 0.3296, Score: 0.4708
Epoch: 1, Train loss: 0.3156, Valid loss: 0.3293, Score: 0.5065
Epoch: 2, Train loss: 0.3150, Valid loss: 0.3297, Score: 0.5193
Epoch: 3, Train loss: 0.3153, Valid loss: 0.3293, Score: 0.4938
Epoch: 4, Train loss: 0.3148, Valid loss: 0.3295, Score: 0.4822
Epoch: 5, Train loss: 0.3150, Valid loss: 0.3295, Score: 0.5119
Epoch: 6, Train loss: 0.3147, Valid loss: 0.3295, Score: 0.5123
Epoch: 7, Train loss: 0.3147, Valid loss: 0.3293, Score: 0.4835
Epoch: 8, Train loss: 0.3146, Valid loss: 0.3292, Score: 0.4881
Epoch: 9, Train loss: 0.3144, Valid loss: 0.3293, Score: 0.5115
Epoch: 10, Train loss: 0.3144, Valid loss: 0.3291, Score: 0.5023
Epoch: 11, Train loss: 0.3141, Valid loss: 0.3292, Score: 0.4814
Epoch: 12, Train loss: 0.3142, Valid loss: 0.3292, Score: 0.5036
Epoch: 13, Train loss: 0.3139, Valid loss: 0.3293, Score: 0.5108
Epoch: 14, Train loss: 0.3139, Valid loss: 0.3292, Score: 0.4909
Epoch: 15, Train loss: 0.3138, Vali

### Produce test results

In [79]:
import custom_utils
# read test data (dictionary-like: the prediction loop indexes both objects by
# the same keys); the third return value is not used here
test_sentences, test_speakers, _  = custom_utils.gather_dataset("test", combine = False)

In [81]:
# predictions
model.eval()
test_labels = {}
# Inference only: no autograd graph is needed.
with torch.no_grad():
    for dialog_id in test_sentences.keys():
        out = model(test_sentences[dialog_id], test_speakers[dialog_id])
        pred = out.argmax(dim=1)
        # plain Python lists so the result can be written as JSON
        test_labels[dialog_id] = pred.tolist()

In [82]:
# Write the predictions; the context manager guarantees the file is flushed and closed.
with open("test_labels.json", "w") as labels_file:
    json.dump(test_labels, labels_file)