<a href="https://colab.research.google.com/github/deckerkrogh/semeval-2024-10/blob/main/DualLSTMv2_semeval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install unzip
!wget https://nlp.stanford.edu/data/glove.6B.zip
!unzip /content/glove.6B.zip

Collecting unzip
  Downloading unzip-1.0.0.tar.gz (704 bytes)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: unzip
  Building wheel for unzip (setup.py) ... [?25l[?25hdone
  Created wheel for unzip: filename=unzip-1.0.0-py3-none-any.whl size=1280 sha256=79abb825a59394f4a5a95c3b2b98197cfddf1d25a9dea8b1ed6bd934ed39feba
  Stored in directory: /root/.cache/pip/wheels/80/dc/7a/f8af45bc239e7933509183f038ea8d46f3610aab82b35369f4
Successfully built unzip
Installing collected packages: unzip
Successfully installed unzip-1.0.0
--2023-12-07 02:48:58--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-12-07 02:48:58--  https://downloads.cs.stanford.edu/nlp/data/glove.6

In [None]:
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

import gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.metrics import accuracy_score


In [None]:
# Load the 100-dimensional GloVe vectors straight from the extracted text file.
# GloVe files lack the "<vocab_size> <dim>" header line of the word2vec text
# format; `no_header=True` makes gensim infer it, which replaces the deprecated
# glove2word2vec conversion step (and its temporary file).
glovefile = 'glove.6B.100d.txt'
word2vec_weights = gensim.models.KeyedVectors.load_word2vec_format(
    glovefile, binary=False, no_header=True
)

  _ = glove2word2vec(glovefile, tmp_file)


In [None]:
# Sanity check: show the embedding vector that was loaded for the word 'cat'.
print(word2vec_weights['cat'])

[ 0.23088    0.28283    0.6318    -0.59411   -0.58599    0.63255
  0.24402   -0.14108    0.060815  -0.7898    -0.29102    0.14287
  0.72274    0.20428    0.1407     0.98757    0.52533    0.097456
  0.8822     0.51221    0.40204    0.21169   -0.013109  -0.71616
  0.55387    1.1452    -0.88044   -0.50216   -0.22814    0.023885
  0.1072     0.083739   0.55015    0.58479    0.75816    0.45706
 -0.28001    0.25225    0.68965   -0.60972    0.19578    0.044209
 -0.31136   -0.68826   -0.22721    0.46185   -0.77162    0.10208
  0.55636    0.067417  -0.57207    0.23735    0.4717     0.82765
 -0.29263   -1.3422    -0.099277   0.28139    0.41604    0.10583
  0.62203    0.89496   -0.23446    0.51349    0.99379    1.1846
 -0.16364    0.20653    0.73854    0.24059   -0.96473    0.13481
 -0.0072484  0.33016   -0.12365    0.27191   -0.40951    0.021909
 -0.6069     0.40755    0.19566   -0.41802    0.18636   -0.032652
 -0.78571   -0.13847    0.044007  -0.084423   0.04911    0.24104
  0.45273   -0.18682 

In [None]:
#@title Load data

# Read the task-3 training JSON directly from GitHub into a DataFrame.
# Later cells read its 'utterances' and 'triggers' columns.
train_url = 'https://raw.githubusercontent.com/deckerkrogh/nlp243_data/main/datasets/task3_train.json'
train_df = pd.read_json(train_url)

In [None]:
#@title Sequencer

class PreTrainedSequencer(object):
    """Map space-tokenised text to vocabulary indices and pre-trained embeddings.

    The vocabulary is built from ``corpus``; indices 0-3 are the BOS, EOS, UNK
    and PAD tokens, in that order. ``pre_trained_embeddings`` is a
    ``(vocab_size, embedding_dim)`` float tensor whose row ``i`` is the
    unit-normalised vector of word ``i``; words with no pre-trained vector
    share the UNK vector.

    Args:
        corpus: Iterable of strings used to build the vocabulary.
        gensim_w2v: gensim ``KeyedVectors`` holding the pre-trained vectors.
            NOTE: it is mutated in place - the four special tokens are added
            to it with random vectors drawn from U(-1, 1).
        embedding_dim: Width of the vectors stored in ``gensim_w2v``.
        bos_token, eos_token, unk_token, pad_token: Special token strings.
    """
    # Source: Jesh
    def __init__(self, corpus, gensim_w2v, embedding_dim, bos_token='<s>', eos_token='</s>', unk_token='<unk>', pad_token='<pad>'):
        self.idx2word = {}
        self.word2idx = {}
        self.embedding_dim = embedding_dim
        self.w2v = gensim_w2v

        special_tokens = (bos_token, eos_token, unk_token, pad_token)

        # Give every special token a random vector in the pre-trained model so
        # it can be looked up like any other word below.
        for special in special_tokens:
            self.w2v.add_vectors([special], [np.random.uniform(low=-1, high=1.0, size=(self.embedding_dim,))])

        # Special tokens are registered first so they occupy indices 0..3.
        self.bos_index = self.add_token(bos_token)
        self.eos_index = self.add_token(eos_token)
        self.unk_index = self.add_token(unk_token)
        self.pad_index = self.add_token(pad_token)

        # Split on single spaces only: consecutive spaces yield '' tokens.
        self.tokenizer = lambda text: text.split(' ')

        for _sentence in corpus:
            for _token in self.tokenizer(_sentence):
                self.add_token(_token)

        self.pre_trained_embeddings = torch.zeros([len(self.idx2word), self.embedding_dim])

        # Look the fallback vector up once instead of once per unknown word.
        unk_vector = torch.tensor(self.w2v.get_vector(unk_token, norm=True))
        for idx, word in self.idx2word.items():
            if self.w2v.has_index_for(word):
                self.pre_trained_embeddings[idx] = torch.tensor(self.w2v.get_vector(word, norm=True))
            else:
                self.pre_trained_embeddings[idx] = unk_vector

    def add_token(self, token):
        """Return the index of ``token``, registering it first if it is new."""
        if token not in self.word2idx:
            self.word2idx[token] = new_index = len(self.word2idx)
            self.idx2word[new_index] = token
            return new_index
        else:
            return self.word2idx[token]

    def encode(self, text):
        """Turn ``text`` into a list of indices wrapped in BOS ... EOS.

        Tokens that are not in the vocabulary map to the UNK index.
        """
        body = [self.word2idx.get(token, self.unk_index) for token in self.tokenizer(text)]
        return [self.bos_index] + body + [self.eos_index]

    def create_padded_tensor(self, sequences):
        """Stack index sequences into one right-padded LongTensor.

        Args:
            sequences: Sequence of index lists (one per example).

        Returns:
            ``(tensor, lengths)`` where ``tensor`` has shape
            ``(len(sequences), max_len)`` filled with ``pad_index`` past each
            sequence's end, and ``lengths`` lists the unpadded lengths. An
            empty ``sequences`` yields a ``(0, 0)`` tensor and ``[]``.
        """
        lengths = [len(sequence) for sequence in sequences]
        if not lengths:
            # max() of an empty list would raise; return an empty batch instead.
            return torch.empty((0, 0), dtype=torch.long), lengths

        tensor = torch.full((len(sequences), max(lengths)), self.pad_index, dtype=torch.long)

        # Copy each sequence with one slice assignment per row rather than one
        # tensor write per token.
        for row, sequence in enumerate(sequences):
            if len(sequence) > 0:
                tensor[row, :len(sequence)] = torch.tensor(sequence, dtype=torch.long)

        return tensor, lengths

In [None]:
#@title Dataset class

class MELDDataset(Dataset):
    """Dataset of (target, candidate, trigger) examples built from conversations.

    For every row of ``df`` the last entry of its 'utterances' list is the
    target; it is paired with each utterance of that row (itself included)
    together with the matching entry of the row's 'triggers' list.
    """

    def __init__(self, df, sequencer):
        self.sequencer = sequencer

        # Flat list of [target, candidate, trigger] groupings over all rows.
        pairs = []
        for record in df.to_dict(orient='records'):
            utterances = record['utterances']
            target = utterances[-1]
            pairs.extend(
                [target, candidate, trigger]
                for candidate, trigger in zip(utterances, record['triggers'])
            )
        self.utt_pairs = pairs

    def __len__(self):
        return len(self.utt_pairs)

    def __getitem__(self, index):
        """Return (encoded target, encoded candidate, trigger) for one example."""
        target_text, candidate_text, trigger = self.utt_pairs[index]
        encode = self.sequencer.encode
        return encode(target_text), encode(candidate_text), trigger

In [None]:
#@title Model

import torch
import torch.nn as nn

class DualLSTM(nn.Module):
    """Two-encoder LSTM that scores a (target, candidate) utterance pair.

    Each utterance is embedded with the pre-trained table and encoded by its
    own unidirectional LSTM. The final hidden states of the top layer of both
    encoders are concatenated and passed through three linear layers to give
    one logit per pair.

    Args:
        pad_index: Vocabulary index of the padding token (stored only; no
            layer reads it).
        embedding_dim: Width of the embedding vectors / LSTM input size.
        batch_size: Stored for reference only; ``forward`` works with any
            batch size.
        pre_trained_embeddings: ``(vocab_size, embedding_dim)`` float tensor
            passed to ``nn.Embedding.from_pretrained``.
        num_layers: Number of stacked layers in each LSTM.
        hidden_size: Hidden state size of each LSTM.
        dropout_p: Dropout probability after the embedding (and between LSTM
            layers when ``num_layers > 1``).
    """

    def __init__(self, pad_index, embedding_dim, batch_size,
                 pre_trained_embeddings, num_layers=1, hidden_size=100,
                 dropout_p=0.1):
        super(DualLSTM, self).__init__()

        self.pad_index = pad_index
        self.hidden_size = hidden_size
        self.dropout_p = dropout_p
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size
        self.pre_trained_embeddings = pre_trained_embeddings

        # Layers
        self.embedding = nn.Embedding.from_pretrained(pre_trained_embeddings)
        self.dropout = nn.Dropout(dropout_p)  # Added dropout after embedding

        # nn.LSTM only applies inter-layer dropout, hence the num_layers guard.
        self.target_lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_p if num_layers > 1 else 0,
            bidirectional=False,
            batch_first=True
        )

        self.cand_lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout_p if num_layers > 1 else 0,
            bidirectional=False,
            batch_first=True
        )

        # Classifier head over the two concatenated encoder states.
        # NOTE(review): there is no non-linearity between fc1, fc2 and fc3, so
        # the head is equivalent to a single linear map - confirm intended.
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 10)
        self.fc3 = nn.Linear(10, 1)

    def forward(self, targets, target_l, candidates, cand_l):
        """Return one logit per (target, candidate) pair.

        Args:
            targets: ``(batch, max_target_len)`` LongTensor of token indices.
            target_l: Unpadded length of every target sequence.
            candidates: ``(batch, max_cand_len)`` LongTensor of token indices.
            cand_l: Unpadded length of every candidate sequence.

        Returns:
            Tensor of shape ``(batch,)`` with raw logits (no sigmoid applied).
        """
        embed_target = self.dropout(self.embedding(targets))  # Applying dropout
        embed_cand = self.dropout(self.embedding(candidates))  # Applying dropout

        # Packing makes the LSTMs stop at each sequence's true length so the
        # final hidden state is not computed over padding.
        packed_target_i = nn.utils.rnn.pack_padded_sequence(embed_target, target_l, batch_first=True, enforce_sorted=False)
        packed_cand_i = nn.utils.rnn.pack_padded_sequence(embed_cand, cand_l, batch_first=True, enforce_sorted=False)

        _, (target_h_n, _) = self.target_lstm(packed_target_i)
        _, (cand_h_n, _) = self.cand_lstm(packed_cand_i)

        # h_n is (num_layers, batch, hidden). Index the top layer explicitly:
        # a bare .squeeze() would also drop the batch axis for a batch of one
        # and would not select a layer when num_layers > 1.
        target_h_n = target_h_n[-1]
        cand_h_n = cand_h_n[-1]

        # Create context vector from both LSTM's by concatenating
        context_vector = torch.cat((target_h_n, cand_h_n), dim=1)

        fc1_o = self.fc1(context_vector)
        fc2_o = self.fc2(fc1_o)
        # Drop only the trailing size-1 feature axis so a batch of one still
        # yields shape (1,), matching its label tensor.
        logits = self.fc3(fc2_o).squeeze(-1)

        return logits  # Removed softmax, logits will be passed to BCEWithLogitsLoss


In [None]:
def prepare_batch(batch, sequencer):
    """Collate a list of dataset items into padded model inputs.

    Args:
        batch: Sequence of ``(target_indices, candidate_indices, trigger)`` items.
        sequencer: Object whose ``create_padded_tensor`` pads a group of index
            sequences and returns ``(tensor, lengths)``.

    Returns:
        ``(padded_targets, target_lengths, padded_candidates,
        candidate_lengths, triggers)`` where ``triggers`` is a tuple holding
        one entry per item.
    """
    # Transpose the list of items into three parallel tuples.
    target_seqs, candidate_seqs, labels = zip(*batch)

    padded_targets, n_target_tokens = sequencer.create_padded_tensor(target_seqs)
    padded_candidates, n_candidate_tokens = sequencer.create_padded_tensor(candidate_seqs)

    return (
        padded_targets,
        n_target_tokens,
        padded_candidates,
        n_candidate_tokens,
        labels,
    )

In [None]:
# Parameters
embedding_dim = 100  # must equal the width of the loaded GloVe vectors (glove.6B.100d)
batch_size = 32
hidden_size = 512  # hidden state size of each of the two LSTM encoders

learning_rate = 0.00001
loss_function = nn.BCEWithLogitsLoss()  # Consider using pos weighting

# Flatten every conversation into one list of utterance strings; this is the
# corpus the sequencer builds its vocabulary from.
train_utts = [utt for sublist in train_df['utterances'] for utt in sublist]
sequencer = PreTrainedSequencer(train_utts, word2vec_weights, embedding_dim)

model = DualLSTM(
    sequencer.pad_index,
    embedding_dim=embedding_dim,
    batch_size=batch_size,
    pre_trained_embeddings = sequencer.pre_trained_embeddings,
    hidden_size=hidden_size
)
#pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
#print(pytorch_total_params)  # ~3,000,000 right now

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

train_dataset = MELDDataset(train_df, sequencer)
# collate_fn pads every batch through prepare_batch using the shared sequencer.
# NOTE(review): no shuffle argument is passed, so batches follow the dataset
# order (pairs from the same conversation are adjacent) - confirm intended.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, collate_fn=lambda batch: prepare_batch(batch, sequencer))

TypeError: ignored

In [None]:
def train(model, optimizer, loss_function, loader):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Module called as ``model(targets, target_l, candidates, cand_l)``
            and returning one logit per example.
        optimizer: Optimiser over ``model``'s parameters.
        loss_function: Callable ``loss_function(logits, labels)`` returning a
            scalar tensor.
        loader: Sized iterable of batches shaped
            ``(targets, target_l, candidates, cand_l, triggers)``.

    Returns:
        The mean of the running-average loss recorded after every batch (not
        the plain mean of the batch losses); NaN when ``loader`` is empty.
    """
    model.train()  # Set the model to training mode
    running_loss_history = []
    running_loss = 0.
    print_step = 25

    for i, batch in enumerate(loader):
        targets, target_l, candidates, cand_l, triggers = batch
        should_log = i % print_step == 0

        if should_log:
            print(f"    {i} / {len(loader)}")

        optimizer.zero_grad()  # Zero the gradients

        logits = model(targets, target_l, candidates, cand_l)

        # as_tensor accepts a tuple/list of labels as well as a tensor without
        # the copy-construct warning that torch.tensor(tensor) emits.
        triggers = torch.as_tensor(triggers, dtype=torch.float32, device=logits.device)

        # Compute the loss and fold it into the incremental mean over batches.
        loss = loss_function(logits, triggers)
        running_loss += (loss.item() - running_loss) / (i + 1)
        running_loss_history.append(running_loss)

        if should_log:
            print(f'    running loss: {running_loss}')

        # Backward pass
        loss.backward()

        # Gradient Clipping
        nn.utils.clip_grad_norm_(model.parameters(), 3.0)

        # Optimization step
        optimizer.step()

    if not running_loss_history:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float('nan')

    return np.mean(running_loss_history)


In [None]:
def run_training(model, optimizer, loss_function, train_loader, valid_loader=None, n_epochs=10):
    """Train for ``n_epochs`` epochs and collect the loss reported per epoch.

    Args:
        model, optimizer, loss_function: Forwarded unchanged to ``train``.
        train_loader: Batches used for every epoch.
        valid_loader: Accepted but currently unused.
        n_epochs: Number of passes over ``train_loader``.

    Returns:
        List with the value returned by ``train`` for each epoch, in order.
    """
    def _one_epoch(epoch_idx):
        # Announce the epoch, then delegate the actual pass to train().
        print(f"EPOCH {epoch_idx}")
        return train(model, optimizer, loss_function, train_loader)

    return [_one_epoch(epoch_idx) for epoch_idx in range(n_epochs)]


In [None]:
# Train for a single epoch; the returned list holds one loss value per epoch.
run_training(model, optimizer, loss_function, train_loader, n_epochs=1)

EPOCH 0
    0 / 871
    running loss: 0.6515674591064453
    25 / 871
    running loss: 0.6343209239152762
    50 / 871
    running loss: 0.6256203838423188
    75 / 871
    running loss: 0.6183358470076008
    100 / 871
    running loss: 0.6112603846162851
    125 / 871
    running loss: 0.6037156581878661
    150 / 871
    running loss: 0.5926535774935158
    175 / 871
    running loss: 0.5815709521147335
    200 / 871
    running loss: 0.5700720364774635
    225 / 871
    running loss: 0.557276178909614
    250 / 871
    running loss: 0.5475377846524059
    275 / 871
    running loss: 0.5397769117700878
    300 / 871
    running loss: 0.5306451561245014
    325 / 871
    running loss: 0.5270089348035355
    350 / 871
    running loss: 0.5240806364945196
    375 / 871
    running loss: 0.517772857179033
    400 / 871
    running loss: 0.5130296660926279
    425 / 871
    running loss: 0.5094585521540173
    450 / 871
    running loss: 0.5053077166334223
    475 / 871
    running loss

[0.5287331507126807]

In [None]:
def evaluate(model, loss_function, loader, thresh=0.30, print_step=25):
    """Measure classification accuracy of ``model`` over ``loader``.

    Args:
        model: Module called as ``model(targets, target_l, candidates, cand_l)``
            and returning one logit per example.
        loss_function: Unused; kept so existing call sites keep working.
        loader: Iterable of batches shaped
            ``(targets, target_l, candidates, cand_l, triggers)`` with 0/1
            trigger labels.
        thresh: Sigmoid score above which an example is predicted as 1.
        print_step: Progress is printed every ``print_step`` batches.

    Returns:
        Fraction of correctly classified examples over the whole loader
        (NaN when the loader yields no examples). The value is also printed.
    """
    model.eval()

    n_correct = 0
    n_seen = 0

    def _accuracy():
        # Guarded so neither the first progress line nor an empty loader can
        # divide by zero.
        return n_correct / n_seen if n_seen else float('nan')

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for i, batch in enumerate(loader):
            targets, target_l, candidates, cand_l, triggers = batch

            logits = model(targets, target_l, candidates, cand_l)

            # reshape(-1) keeps a 1-D vector even if the model returns a 0-d
            # tensor for a single example.
            scores = torch.sigmoid(logits).reshape(-1)
            pred = (scores > thresh).long()
            labels = torch.as_tensor(triggers, dtype=torch.float32, device=scores.device).reshape(-1)

            # Count examples rather than averaging per-batch accuracies, so a
            # short final batch is not over-weighted.
            n_correct += int((pred == labels).sum())
            n_seen += labels.numel()

            if i % print_step == 0:
                print(pred.tolist())
                print(f'current accuracy: {_accuracy()}')

    avg_accuracy = _accuracy()
    print(avg_accuracy)
    return avg_accuracy

# NOTE: this scores the model on train_loader, i.e. the same data it was trained on.
evaluate(model, loss_function, train_loader)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
current accuracy: inf


  print(f'current accuracy: {sum(accuracy_list) / i}')


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
current accuracy: 0.88
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
current accuracy: 0.87125
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
current accuracy: 0.86375
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
current accuracy: 0.8575
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
current accuracy: 0.8515


KeyboardInterrupt: ignored