In [1]:
import torch
import json
import custom_utils
import numpy as np

### LSTM using pre-trained embedding

#### + hot-encoder for speaker on the last layer 

In [2]:
from gensim.models import KeyedVectors
import gensim.downloader as api

model_path = "tokenizer.model"
model_name = "glove-wiki-gigaword-300"

# load model (and save if necessary)
try:
    tokenizer = KeyedVectors.load(model_path)
except FileNotFoundError:
    tokenizer = api.load(model_name)
    tokenizer.save(model_path)

def embed_sentence(sentence, tokenizer):
    embeddings = []
    length = 0
    for word in sentence.split():
        word = word.lower()
        if not word in tokenizer: continue
        length += 1
        embeddings.append(tokenizer[word])

    if len(embeddings) == 0: # bug fix for padding function (we need to asssure at least one element)
        embeddings = np.zeros(shape=(1,tokenizer.vector_size))
        length = 1
    return torch.tensor(np.asarray(embeddings)), length

In [3]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device("cuda") 

# hot encode speakers
switcher = {
    "PM" : [1,0,0,0],
    "ME" : [0,1,0,0],
    "UI" : [0,0,1,0],
    "ID" : [0,0,0,1]
}

class LSTMClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size + len(switcher), output_size)

    def forward(self, sentences, speakers):
        # from sentences to sequences of vectors (embedding)
        embedded_sentences, lengths = list(zip(*[embed_sentence(sentence, tokenizer) for sentence in sentences]))

        # pack / pad sequences (save memory) 
        packed_sentences = pack_padded_sequence(pad_sequence(embedded_sentences, batch_first=True), lengths, batch_first=True, enforce_sorted=False)
        
        # hot encode speakers
        encoded_speakers = torch.tensor([switcher[speaker] for speaker in speakers])

        # send to device 
        packed_sentences.to(device)
        encoded_speakers.to(device)
        
        # lstm layer
        _, (ht,_) = self.lstm(packed_sentences) # it does accept packed sequences 

        # linear f.c. layer 
        output = self.fc(torch.cat([ht[-1], encoded_speakers], dim=1)) # use only output from the last hidden state
        
        return output


### LSTM + embedding from scratch

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# read data
sentences, speakers, labels = custom_utils.read_data("training", "training_labels.json")

df = pd.DataFrame({"sentences" : sentences, "speakers" : speakers, "labels" : labels})

train, test = train_test_split(df, test_size=0.5, random_state=69)

train, valid = train_test_split(train, test_size=0.3, random_state=69)


print(f"Train: {len(train)}\nTest: {len(test)}\nValid: {len(valid)}")
train.head()

Train: 25417
Test: 36312
Valid: 10894


Unnamed: 0,sentences,speakers,labels
10418,Uh . <vocalsound> I mean to certain cues .,UI,0
21576,another thought I <disfmarker>,UI,0
65989,"Mm , yeah .",UI,0
1872,maybe like yellow and white <disfmarker>,PM,0
41397,Okay . <vocalsound> Okay .,ID,0


In [5]:
# Build vocabulary
import torchtext
from torchtext.data import get_tokenizer
from collections import Counter

tokenizer = get_tokenizer("basic_english")

words=[]
num_words = 1000

for text in df["sentences"]:
    tokens=tokenizer(text)
    words.extend(tokens)

top_1k = dict(Counter(words).most_common(1000))
top_1k['<unk>']=num_words+1
top_1k['<pad>']=num_words+2

vocab = torchtext.vocab.vocab(top_1k, specials = ['<unk>', '<pad>'])
vocab.set_default_index(vocab['<unk>'])

In [6]:
# sentences to vectors
# elements of those vectors are ID's representing the words
# padding / capping are applied
max_len=80

def vectorize_sentences(reviews, max_len):
    vectors=[]
    for text in reviews:
        tokens=tokenizer(text)
        v=vocab.forward(tokens)
        if len(v) > max_len : v = v[:max_len]
        if len(v) < max_len : #padding
            tmp = np.full(max_len, vocab['<pad>'])
            tmp[0:len(v)]=v 
            v = tmp
        vectors.append(np.array(v))
    return np.array(vectors)

In [7]:
train_X = vectorize_sentences(train['sentences'], max_len)
test_X = vectorize_sentences(test['sentences'], max_len)
val_X = vectorize_sentences(valid['sentences'], max_len)

train_y = np.array(train['labels']).reshape(-1,1)
test_y = np.array(test['labels']).reshape(-1,1)
val_y = np.array(valid['labels']).reshape(-1,1)

In [8]:
from torch.utils.data import TensorDataset, DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.set_default_device("cuda")

# define batch size
batch_size = 64

# create tensor datasets
trainset = TensorDataset(torch.from_numpy(train_X).to(device), torch.from_numpy(train_y).float().to(device))
validset = TensorDataset(torch.from_numpy(val_X).to(device), torch.from_numpy(val_y).float().to(device))
testset = TensorDataset(torch.from_numpy(test_X).to(device), torch.from_numpy(test_y).float().to(device))

# create dataloaders
train_loader = DataLoader(trainset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
valid_loader = DataLoader(validset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))
test_loader = DataLoader(testset, shuffle=True, batch_size=batch_size, generator=torch.Generator(device=device))

In [9]:
class LSTMEmbedder(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class):
        """
        vocab_size: (int) size of the vocabulary - required by embeddings
        embed_dim: (int) size of embeddings
        hidden_dim: (int) number of hidden units
        num_class: (int) number of classes
        """
        super().__init__()
        self.hidden_dim=hidden_dim
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=1, bidirectional=False, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_class)
        self.dropout = nn.Dropout(0.8)
        

    def forward(self, text):
        r"""
        Arguments:
            text: 1-D tensor representing a bag of text tensors
        """
        text=self.embedding(text)
        output, (hidden, cell) = self.lstm(text)
        #for the sizes:
        #output:[batch_size, sent_len, hidden_dim*num directions]
        #hidden: [num_layers * num_directions, batch_size, hidden_dim]
        x = hidden.view(-1, self.hidden_dim)
        x=self.dropout(x)
        x=self.fc(x)
        out = torch.sigmoid(x)
        return out
    
    def get_embedding_for(self, w):
        idx = vocab.lookup_indices([w])
        return self.embedding(torch.Tensor(idx).int())

In [13]:
from sklearn.metrics import f1_score

def train_model(model, optimizer, loss_criterion):
    iter = 0
    flag = 0
    num_epochs = 10
    history_train_acc, history_val_acc, history_train_loss, history_val_loss = [], [], [], []
    best_accuracy = 0
    for epoch in range(num_epochs):
        for i, (samples, labels) in enumerate(train_loader):
            # Training mode
            model.train()

            # Load samples
            samples = samples.view(-1, max_len).to(device)
            labels = labels.view(-1, 1).to(device)

            # Clear gradients w.r.t. parameters
            optimizer.zero_grad()

            # Forward pass to get output/logits
            outputs = model(samples)
            
            # Calculate Loss: softmax --> cross entropy loss
            loss = loss_criterion(outputs, labels)
            
            # Getting gradients w.r.t. parameters
            loss.backward()

            # Updating parameters
            optimizer.step()

            iter += 1

            if iter % 100 == 0:
                # Get training statistics
                train_loss = loss.data.item()
            
                # Testing mode
                model.eval()
                # Calculate Accuracy         
                correct = 0
                total = 0
                
                true_labels = []
                predicted_labels = []
                # Iterate through test dataset
                for samples, labels in valid_loader:
                    # Load samples
                    samples = samples.view(-1, max_len).to(device)
                    labels = labels.view(-1).to(device)

                    # Forward pass only to get logits/output
                    outputs = model(samples)

                    # Val loss
                    val_loss = loss_criterion(outputs.view(-1, 1), labels.view(-1, 1))

                    predicted = outputs.ge(0.5).view(-1)

                    # Total number of labels
                    total += labels.size(0)

                    # Total correct predictions
                    correct += (predicted.type(torch.FloatTensor).cpu() == labels.type(torch.FloatTensor)).sum().item()
                    # correct = (predicted == labels.byte()).int().sum().item()

                    true_labels.extend(labels.cpu().numpy())
                    predicted_labels.extend(predicted.cpu().numpy())
                    score = f1_score(true_labels, predicted_labels)

                accuracy = 100. * correct / total

                # Print Loss
                print('Iter: {} | Train Loss: {} | Val Loss: {} | Val Accuracy: {} | F1-Score: {}'.format(iter, train_loss, val_loss.item(), round(accuracy, 2), score))

                # Append to history
                history_val_loss.append(val_loss.data.item())
                history_val_acc.append(round(accuracy, 2))
                history_train_loss.append(train_loss)

                # Save model when accuracy beats best accuracy
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    # We can load this best model on the validation set later
                    torch.save(model.state_dict(), 'best_model.pth')
    return (history_train_acc, history_val_acc, history_train_loss, history_val_loss)

In [11]:
# next --> Learn how to use the train loader

In [14]:
input_dim = num_words + 2 #add 2 for start and end sentence symbols
embedding_dim = 100
hidden_dim = 32
output_dim = 1

model = LSTMEmbedder(input_dim, embedding_dim, hidden_dim, output_dim)
# criterion = torch.nn.CrossEntropyLoss() 
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-3)

(train_acc, val_acc, train_loss, val_loss) = train_model(model, optimizer, criterion)

Iter: 100 | Train Loss: 0.5442842841148376 | Val Loss: 0.5196453332901001 | Val Accuracy: 81.74 | F1-Score: 0.0
Iter: 200 | Train Loss: 0.45501670241355896 | Val Loss: 0.34091874957084656 | Val Accuracy: 81.74 | F1-Score: 0.0
Iter: 300 | Train Loss: 0.39740949869155884 | Val Loss: 0.6293905377388 | Val Accuracy: 81.74 | F1-Score: 0.0
Iter: 400 | Train Loss: 0.4156252145767212 | Val Loss: 0.43384283781051636 | Val Accuracy: 81.74 | F1-Score: 0.0
Iter: 500 | Train Loss: 0.526523768901825 | Val Loss: 0.7034732699394226 | Val Accuracy: 81.74 | F1-Score: 0.0
Iter: 600 | Train Loss: 0.44506949186325073 | Val Loss: 0.2481091022491455 | Val Accuracy: 81.74 | F1-Score: 0.0
Iter: 700 | Train Loss: 0.599042534828186 | Val Loss: 0.32586905360221863 | Val Accuracy: 81.74 | F1-Score: 0.0
Iter: 800 | Train Loss: 0.4241156578063965 | Val Loss: 0.4211306869983673 | Val Accuracy: 81.74 | F1-Score: 0.0
Iter: 900 | Train Loss: 0.5314010381698608 | Val Loss: 0.4248012900352478 | Val Accuracy: 81.74 | F1-Sc

### Training with optuna


In [None]:
from sklearn.model_selection import StratifiedKFold
from torchmetrics.classification import F1Score

num_classes = 2
num_features = tokenizer.vector_size # vector size of word embeddings

# train data
sentences, speakers, labels = custom_utils.read_data("training", "training_labels.json")

: 

In [None]:
def train(sentences, speakers, labels, model, criterion, optimizer):
    assert(len(labels) == len(sentences))
    model.train() 
    optimizer.zero_grad()
    out = model(sentences, speakers)
    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    return loss

def validate(sentences, speakers, labels, model, criterion):
    assert(len(labels) == len(sentences))
    model.eval()
    out = model(sentences, speakers)
    loss = criterion(out, labels)
    pred_labels = out.argmax(dim=1)
    f1 = F1Score(task='binary', num_classes=num_classes)
    score = f1(labels, pred_labels)
    return loss, score

In [None]:
import optuna

def objective(trial):
    n_folds = 5
    n_epochs = 200
    patience = 10
    avg_score = 0
    

    skf = StratifiedKFold(n_splits=n_folds)

    for fold, (train_idx, valid_idx) in enumerate(skf.split(sentences, labels)):
        # split data
        train_labels = torch.tensor([labels[i] for i in train_idx])
        valid_labels = torch.tensor([labels[i] for i in valid_idx])
        
        train_sentences = [sentences[i] for i in train_idx]
        valid_sentences = [sentences[i] for i in valid_idx]

        train_speakers = [speakers[i] for i in train_idx]
        valid_speakers = [speakers[i] for i in valid_idx]
        
        # set model, criterion and optimizers
        # 1. parameters
        hidden_size = trial.suggest_int(f'hidden_size', 10, 100)
        # hidden_size = 64
        lr = trial.suggest_float(f'lr', 1e-3, 1e-2)
        # lr = 0.001
        # weight_decay = trial.suggest_float(f'weight_decay', 1e-5, 1e-2)
        weight_decay = 5e-4
        # 2. objects
        model = LSTMClassifier(num_features, hidden_size, num_classes).to(device)
        criterion = torch.nn.CrossEntropyLoss() # need to check input
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
        best_weights = model.state_dict()

        # Epoch
        best_valid_loss = float('inf')
        score_at_best = -1
        current_patience = 0
        for epoch in range(n_epochs):
            train_loss = train(train_sentences, train_speakers, train_labels, model, criterion, optimizer)
            valid_loss, score = validate(valid_sentences, valid_speakers, valid_labels, model, criterion)
            
            # Stopping criteria            
            if valid_loss > best_valid_loss:
                current_patience += 1
            else:
                best_weights = model.state_dict()
                best_valid_loss = valid_loss    
                score_at_best = score 
                current_patience = 0
            
            if current_patience == patience:
                break
            
            print(f'Fold: {fold}, Epoch: {epoch}, Train loss: {train_loss:.4f}, Valid loss: {valid_loss:.4f}, Score: {score:.4f}')
        
        avg_score += score_at_best/n_folds
        
        # save model and params 
        torch.save(best_weights, f"models/lstm_{trial.number}_{fold}.pt")
        json.dump(trial.params, open(f"models/params_{trial.number}.json", "w"))
    return avg_score

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

### Use best params

In [None]:
trial_number = 0
params = json.load(open(f"models/params_{trial_number}.json", "r"))

# maybe load model from a certain fold ?
# fold = 0
# model.load_state_dict(torch.load(f"models/lstm_{trial_number}_{fold}.pt"))

In [None]:
model = LSTMClassifier(num_features, 64, num_classes)
criterion = torch.nn.CrossEntropyLoss() # need to check input
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)

### Use entire dataset to train

In [None]:
import custom_utils
from sklearn.model_selection import train_test_split

train_sentences, train_speakers, _ = custom_utils.gather_dataset("training", combine = False)
sentences, speakers, labels = read_data("training", "training_labels.json") 

y = labels
X = list(zip(sentences, speakers))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

train_sentences, train_speakers = zip(*X_train)
test_sentences, test_speakers = zip(*X_test)

train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)

n_epochs = 200
patience = 10
best_valid_loss = float('inf')
for epoch in range(n_epochs):
    train_loss = train(train_sentences, train_speakers, train_labels, model, criterion, optimizer)
    valid_loss, score = validate(test_sentences, test_speakers, test_labels, model, criterion)
            
    # Stopping criteria            
    if valid_loss > best_valid_loss:
        current_patience += 1
    else:
        best_weights = model.state_dict()
        best_valid_loss = valid_loss    
        score_at_best = score 
        current_patience = 0
    
    if current_patience == patience:
        break
    
    print(f'Epoch: {epoch}, Train loss: {train_loss:.4f}, Valid loss: {valid_loss:.4f}, Score: {score:.4f}')


### Produce test results

In [None]:
# read test data (dicitonary-like)
test_sentences, test_speakers, _  = custom_utils.gather_dataset("test", combine = False)

In [None]:
# predictions
model.eval()
test_labels = {}
for id in test_sentences.keys():
    out = model(test_sentences[id], test_speakers[id])
    pred = out.argmax(dim=1)
    test_labels[id] = pred.tolist()

In [None]:
# json.dump(test_labels, open("test_labels.json", "w"))