In [71]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from pathlib import Path
import spacy
from torch.utils.tensorboard import SummaryWriter


torch.manual_seed(1)

nlp = spacy.load("en_core_web_sm")

writer = SummaryWriter('runs/tweeteval')


In [62]:
from collections import Counter

word_to_ix = {}
tag_to_ix = {0: "anger", 1: "joy",2: "optimism", 3: "sadness"} 
def process_tweets(data_dir, is_train=True, word_to_ix={}):
    file_name = "train_text.txt" if is_train else "test_text.txt"
    label_file_name = "train_labels.txt" if is_train else "test_labels.txt"
    
    with open(data_dir / file_name, "r", encoding="utf-8") as f:
        texts = [line.strip() for line in f]
    
    with open(data_dir / label_file_name, "r", encoding="utf-8") as f:
        labels = [int(line.strip()) for line in f]
    tweets = zip(texts, labels)

    processed_tweets = []
    
    word_counter = Counter()
    for tweet, label in tweets:
        doc = nlp(tweet)
        tokens = []
        for token in doc:
            if not token.is_stop and not token.is_punct and not token.is_space:
                token_text = token.text.lower()
                tokens.append(token_text)
                word_counter[token_text] += 1
        processed_tweets.append((tokens, label))
    
    most_common_words = word_counter.most_common(3000)
    for sent, _ in processed_tweets:
        for word in sent:
            if word not in most_common_words:
                word_to_ix[word] = 3001
            elif word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)
    return processed_tweets

In [63]:
processed_tweets = process_tweets(Path("tweeteval/datasets/emotion"), is_train=True, word_to_ix=word_to_ix)
processed_tweets_test = process_tweets(Path("tweeteval/datasets/emotion"), is_train=False, word_to_ix=word_to_ix)


In [64]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


In [65]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        last_lstm_out = lstm_out[-1]  # Get the last output of the LSTM
        tag_space = self.hidden2tag(last_lstm_out.view(1, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [66]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 256

In [73]:
def log_metrics_to_tensorboard(model_name, epoch, test_loss):
    writer.add_scalar(f'{model_name}/test_loss', test_loss, epoch)

In [72]:
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

def train_model(model, loss_function, optimizer, epochs):
    for epoch in range(epochs):
        total_loss = 0
        for sentence, label in processed_tweets:
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix)
            target = torch.tensor([label], dtype=torch.long)

            # forward pass
            last_tag_score = model(sentence_in)

            # Compute the loss, gradients, and update the parameters
            loss = loss_function(last_tag_score, target)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        log_metrics_to_tensorboard("LSTMTagger", epoch, total_loss)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")


def test_model(model, processed_tweets_test):
    correct = 0
    total = len(processed_tweets_test)
    with torch.no_grad():
        for sentence, label in processed_tweets_test:
            sentence_in = prepare_sequence(sentence, word_to_ix)
            tag_scores = model(sentence_in)
            predicted_tags = torch.argmax(tag_scores, dim=1)
            if predicted_tags.item() == label:
                correct += 1
    accuracy = correct / total
    print(f"Accuracy: {accuracy:.4f}")


In [77]:
train_model(model, loss_function, optimizer, epochs=100)
test_model(model, processed_tweets_test)

Epoch 1/100, Loss: 4118.7536
Epoch 2/100, Loss: 4117.7466
Epoch 3/100, Loss: 4116.9765
Epoch 4/100, Loss: 4116.4035
Epoch 5/100, Loss: 4115.9769
Epoch 6/100, Loss: 4115.6591
Epoch 7/100, Loss: 4115.4205
Epoch 8/100, Loss: 4115.2379
Epoch 9/100, Loss: 4115.0942
Epoch 10/100, Loss: 4114.9775
Epoch 11/100, Loss: 4114.8801
Epoch 12/100, Loss: 4114.7968
Epoch 13/100, Loss: 4114.7243
Epoch 14/100, Loss: 4114.6606
Epoch 15/100, Loss: 4114.6041
Epoch 16/100, Loss: 4114.5539
Epoch 17/100, Loss: 4114.5090
Epoch 18/100, Loss: 4114.4687
Epoch 19/100, Loss: 4114.4323
Epoch 20/100, Loss: 4114.3995
Epoch 21/100, Loss: 4114.3696
Epoch 22/100, Loss: 4114.3422
Epoch 23/100, Loss: 4114.3166
Epoch 24/100, Loss: 4114.2925
Epoch 25/100, Loss: 4114.2694
Epoch 26/100, Loss: 4114.2470
Epoch 27/100, Loss: 4114.2251
Epoch 28/100, Loss: 4114.2034
Epoch 29/100, Loss: 4114.1818
Epoch 30/100, Loss: 4114.1601
Epoch 31/100, Loss: 4114.1382
Epoch 32/100, Loss: 4114.1161
Epoch 33/100, Loss: 4114.0935
Epoch 34/100, Loss:

In [78]:
%load_ext tensorboard
%tensorboard --logdir=runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 7104), started 0:12:36 ago. (Use '!kill 7104' to kill it.)