In [None]:
!pip install -Uqq datasets
!pip install -Uqq transformers

## Load dataset

In [None]:
from datasets import load_dataset

dataset = load_dataset("carblacac/twitter-sentiment-analysis")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'feeling'],
        num_rows: 119988
    })
    validation: Dataset({
        features: ['text', 'feeling'],
        num_rows: 29997
    })
    test: Dataset({
        features: ['text', 'feeling'],
        num_rows: 61998
    })
})


## Split dataset

In [None]:
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

# sample data point
sample = train_data[2]
print("Sample:", sample)

Sample: {'text': "@bradleyjp decidedly undecided. Depends on the situation. When I'm out with the people I'll be in Chicago with? Maybe.", 'feeling': 1}


In [None]:
x_train = [item['text'] for item in train_data]
y_train = [item['feeling'] for item in train_data]

x_val = [item['text'] for item in val_data]
y_val = [item['feeling'] for item in val_data]

x_test = [item['text'] for item in test_data]
y_test = [item['feeling'] for item in test_data]

## Preprocess data

In [None]:
import re

def preprocess_string(s):
    s = re.sub(r"[^\w\s]", '', s)
    s = re.sub(r"\s+", ' ', s)
    s = re.sub(r"\d", '', s)
    s = re.sub(r" {2,}", ' ', s)
    return s

# Example
text = "This is 500  example sentence.\n"
prs_text = preprocess_string(text)
print(prs_text)

This is example sentence 


## Tokenize

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

def tokenize(text, max_length):
    pre_text = preprocess_string(text)
    doc = nlp(pre_text)
    tokens = [token.text for token in doc]
    padded_tokens = tokens + ['<PAD>'] * (max_length - len(tokens))
    return padded_tokens[:max_length]

# Example
text = "This is 500  example sentence.\n"
tokens = tokenize(text, 10)
print(tokens)


['This', 'is', 'example', 'sentence', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [None]:
max_length = 20

# Tokenize and pad
x_train_tokenized = [tokenize(text, max_length) for text in x_train]
x_val_tokenized = [tokenize(text, max_length) for text in x_val]
x_test_tokenized = [tokenize(text, max_length) for text in x_test]

print(x_train_tokenized[0])

['faami', 'so', 'happy', 'that', 'salman', 'won', 'btw', 'the', 'sec', 'clip', 'is', 'truely', 'a', 'teaser', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>', '<PAD>']


In [None]:
all_tokenized_sentences = x_train_tokenized + x_val_tokenized + x_test_tokenized
vocab = set(word for sentence in all_tokenized_sentences for word in sentence)
word_to_idx = {word: idx for idx, word in enumerate(vocab)}

x_train_indices = [[word_to_idx[word] for word in sentence] for sentence in x_train_tokenized]
x_val_indices = [[word_to_idx[word] for word in sentence] for sentence in x_val_tokenized]
x_test_indices = [[word_to_idx[word] for word in sentence] for sentence in x_test_tokenized]

# PyTorch tensors
x_train_tensor = torch.tensor(x_train_indices)
y_train_tensor = torch.tensor(y_train)

x_val_tensor = torch.tensor(x_val_indices)
y_val_tensor = torch.tensor(y_val)

x_test_tensor = torch.tensor(x_test_indices)
y_test_tensor = torch.tensor(y_test)

In [None]:
x_train_tensor = x_train_tensor.to(torch.float32)
y_train_tensor = y_train_tensor.to(torch.float32)

x_val_tensor = x_val_tensor.to(torch.float32)
y_val_tensor = y_val_tensor.to(torch.float32)

x_test_tensor = x_test_tensor.to(torch.float32)
y_test_tensor = y_test_tensor.to(torch.float32)

print("x_train shape: ", x_train_tensor.shape)
print("y_train shape: ", y_train_tensor.shape)

x_train shape:  torch.Size([119988, 20])
y_train shape:  torch.Size([119988])


## Model definition and instantiation

In [None]:
class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, bidirectional=True, num_layers=2)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

In [None]:
input_dim = 20
hidden_dim = 64
output_dim = 1

model = BiLSTMModel(input_dim, hidden_dim, output_dim)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

## Training

In [None]:
for epoch in range(5):
    for i in range(len(x_train_tensor)):

        x = x_train_tensor[i].unsqueeze(0)
        y = y_train_tensor[i]

        outputs = model(x)
        loss = criterion(outputs, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if i % 20000 == 0:
            print('Epoch: {}, Iteration: {}, Loss: {}'.format(epoch, i, loss.item()))


  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 0, Iteration: 0, Loss: 0.00027785994461737573
Epoch: 0, Iteration: 20000, Loss: 0.25416967272758484
Epoch: 0, Iteration: 40000, Loss: 0.22865530848503113
Epoch: 0, Iteration: 60000, Loss: 0.2692406177520752
Epoch: 0, Iteration: 80000, Loss: 0.26451122760772705
Epoch: 0, Iteration: 100000, Loss: 0.24509358406066895
Epoch: 1, Iteration: 0, Loss: 0.2372581660747528
Epoch: 1, Iteration: 20000, Loss: 0.2535167336463928
Epoch: 1, Iteration: 40000, Loss: 0.2395762801170349
Epoch: 1, Iteration: 60000, Loss: 0.2693035304546356
Epoch: 1, Iteration: 80000, Loss: 0.2642890214920044
Epoch: 1, Iteration: 100000, Loss: 0.24509380757808685
Epoch: 2, Iteration: 0, Loss: 0.23725979030132294
Epoch: 2, Iteration: 20000, Loss: 0.2535351514816284
Epoch: 2, Iteration: 40000, Loss: 0.2388027310371399
Epoch: 2, Iteration: 60000, Loss: 0.26929134130477905
Epoch: 2, Iteration: 80000, Loss: 0.26429951190948486
Epoch: 2, Iteration: 100000, Loss: 0.24503910541534424
Epoch: 3, Iteration: 0, Loss: 0.2372926026

## Evaulation

In [1]:
with torch.no_grad():
    outputs = model(x_test_tensor)
    predictions = outputs.round()
    accuracy = (predictions == y_test_tensor).float().mean()
    print('Accuracy:', accuracy.item())

Accuracy: 73.3453
