In [71]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from pathlib import Path
import spacy
from torch.utils.tensorboard import SummaryWriter


torch.manual_seed(1)

nlp = spacy.load("en_core_web_sm")

writer = SummaryWriter('runs/tweeteval')


In [62]:
from collections import Counter

word_to_ix = {}
tag_to_ix = {0: "anger", 1: "joy",2: "optimism", 3: "sadness"} 
def process_tweets(data_dir, is_train=True, word_to_ix={}):
    file_name = "train_text.txt" if is_train else "test_text.txt"
    label_file_name = "train_labels.txt" if is_train else "test_labels.txt"
    
    with open(data_dir / file_name, "r", encoding="utf-8") as f:
        texts = [line.strip() for line in f]
    
    with open(data_dir / label_file_name, "r", encoding="utf-8") as f:
        labels = [int(line.strip()) for line in f]
    tweets = zip(texts, labels)

    processed_tweets = []
    
    word_counter = Counter()
    for tweet, label in tweets:
        doc = nlp(tweet)
        tokens = []
        for token in doc:
            if not token.is_stop and not token.is_punct and not token.is_space:
                token_text = token.text.lower()
                tokens.append(token_text)
                word_counter[token_text] += 1
        processed_tweets.append((tokens, label))
    
    most_common_words = word_counter.most_common(3000)
    for sent, _ in processed_tweets:
        for word in sent:
            if word not in most_common_words:
                word_to_ix[word] = 3001
            elif word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    return processed_tweets

In [63]:
processed_tweets = process_tweets(Path("tweeteval/datasets/emotion"), is_train=True, word_to_ix=word_to_ix)
processed_tweets_test = process_tweets(Path("tweeteval/datasets/emotion"), is_train=False, word_to_ix=word_to_ix)


In [64]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


In [65]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        last_lstm_out = lstm_out[-1]  # Get the last output of the LSTM
        tag_space = self.hidden2tag(last_lstm_out.view(1, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [79]:
class GRUTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(GRUTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The GRU takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.gru = nn.GRU(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        gru_out, _ = self.gru(embeds.view(len(sentence), 1, -1))
        ast_gru_out = gru_out[-1]  # Get the last output of the LSTM
        tag_space = self.hidden2tag(ast_gru_out.view(1, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [66]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 256

In [73]:
def log_metrics_to_tensorboard(model_name, epoch, test_loss):
    writer.add_scalar(f'{model_name}/test_loss', test_loss, epoch)

In [None]:


def train_model(model, model_name: str, loss_function, optimizer, epochs: int):
    for epoch in range(epochs):
        total_loss = 0
        for sentence, label in processed_tweets:
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            target = torch.tensor([label], dtype=torch.long)

            # forward pass
            last_tag_score = model(sentence_in)

            # Compute the loss, gradients, and update the parameters
            loss = loss_function(last_tag_score, target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        log_metrics_to_tensorboard(model_name, epoch, total_loss)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")


def test_model(model, processed_tweets_test):
    correct = 0
    total = len(processed_tweets_test)
    with torch.no_grad():
        for sentence, label in processed_tweets_test:
            sentence_in = prepare_sequence(sentence, word_to_ix)
            tag_scores = model(sentence_in)
            predicted_tags = torch.argmax(tag_scores, dim=1)
            if predicted_tags.item() == label:
                correct += 1
    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")


In [None]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
train_model(model, "lstm", loss_function, optimizer, epochs=100)
test_model(model, processed_tweets_test)

Epoch 1/100, Loss: 1113246.3883
Epoch 2/100, Loss: 5790.8274
Epoch 3/100, Loss: 5790.3456
Epoch 4/100, Loss: 5789.8798
Epoch 5/100, Loss: 5789.4182
Epoch 6/100, Loss: 5788.9604
Epoch 7/100, Loss: 5788.5037
Epoch 8/100, Loss: 5788.0469
Epoch 9/100, Loss: 5787.5909
Epoch 10/100, Loss: 5787.1358
Epoch 11/100, Loss: 5786.6805
Epoch 12/100, Loss: 5786.2243
Epoch 13/100, Loss: 5785.7697
Epoch 14/100, Loss: 5785.3148
Epoch 15/100, Loss: 5784.8594
Epoch 16/100, Loss: 5784.4049
Epoch 17/100, Loss: 5783.9500
Epoch 18/100, Loss: 5783.4962
Epoch 19/100, Loss: 5783.0411
Epoch 20/100, Loss: 5782.5865
Epoch 21/100, Loss: 5782.1308
Epoch 22/100, Loss: 5781.6764
Epoch 23/100, Loss: 5781.2218
Epoch 24/100, Loss: 5780.7663
Epoch 25/100, Loss: 5780.3111
Epoch 26/100, Loss: 5779.8563
Epoch 27/100, Loss: 5779.4013
Epoch 28/100, Loss: 5778.9461
Epoch 29/100, Loss: 5778.4915
Epoch 30/100, Loss: 5778.0373
Epoch 31/100, Loss: 5777.5830
Epoch 32/100, Loss: 5777.1293
Epoch 33/100, Loss: 5776.6739
Epoch 34/100, Lo

In [91]:
gru_model = GRUTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(gru_model.parameters(), lr=0.1)
train_model(gru_model, "gru", loss_function, optimizer, epochs=100)
test_model(gru_model, processed_tweets_test)

Epoch 1/100, Loss: 4442.3897
Epoch 2/100, Loss: 4213.4029
Epoch 3/100, Loss: 4194.1765
Epoch 4/100, Loss: 4187.9751
Epoch 5/100, Loss: 4181.4128
Epoch 6/100, Loss: 4177.5328
Epoch 7/100, Loss: 4174.9855
Epoch 8/100, Loss: 4172.6767
Epoch 9/100, Loss: 4170.2457
Epoch 10/100, Loss: 4167.4800
Epoch 11/100, Loss: 4165.7044
Epoch 12/100, Loss: 4165.2013
Epoch 13/100, Loss: 4164.9363
Epoch 14/100, Loss: 4164.7352
Epoch 15/100, Loss: 4164.5601
Epoch 16/100, Loss: 4164.4009
Epoch 17/100, Loss: 4164.2524
Epoch 18/100, Loss: 4164.1114
Epoch 19/100, Loss: 4163.9753
Epoch 20/100, Loss: 4163.8426
Epoch 21/100, Loss: 4163.7123
Epoch 22/100, Loss: 4163.5832
Epoch 23/100, Loss: 4163.4544
Epoch 24/100, Loss: 4163.3248
Epoch 25/100, Loss: 4163.1933
Epoch 26/100, Loss: 4163.0584
Epoch 27/100, Loss: 4162.9184
Epoch 28/100, Loss: 4162.7714
Epoch 29/100, Loss: 4162.6152
Epoch 30/100, Loss: 4162.4468
Epoch 31/100, Loss: 4162.2633
Epoch 32/100, Loss: 4162.0609
Epoch 33/100, Loss: 4161.8351
Epoch 34/100, Loss:

In [93]:
%load_ext tensorboard
%tensorboard --logdir=runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 15443), started 0:00:04 ago. (Use '!kill 15443' to kill it.)