# Sentiment Analysis - RNN

## 1. load data

In [1]:
import pandas as pd
import bz2

# file_path = './test.ft.txt.bz2'
# with bz2.open(file_path, 'rt', encoding='utf-8') as file:
#     for i in range(5):
#         print(next(file).strip())

def load_bz2_file(file_path):
    """extract labels and texts"""
    labels = []
    texts = []
    with bz2.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            label, text = line.split(' ', 1)
            labels.append(1 if label.strip() == "__label__2" else 0)
            texts.append(text.strip())
    return pd.DataFrame({'label': labels, 'text': texts})

test_file = './test.ft.txt.bz2'
train_file = './train.ft.txt.bz2'

test_df = load_bz2_file(test_file)
train_df = load_bz2_file(train_file)

### * stop words

In [None]:
from nltk.corpus import stopwords

# download stop words list
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    return ' '.join(word for word in text.split() if word not in stop_words)

train_df['text'] = train_df['text'].apply(remove_stop_words)
test_df['text'] = test_df['text'].apply(remove_stop_words)


In [2]:
print(f'Number of training examples: {len(train_df)}')
print(f'Number of testing examples: {len(test_df)}')


print("Test DataFrame:")
print(test_df.head())

print("\nTrain DataFrame:")
print(train_df.head())

Number of training examples: 3600000
Number of testing examples: 400000
Test DataFrame:
   label                                               text
0      1  Great CD: My lovely Pat has one of the GREAT v...
1      1  One of the best game music soundtracks - for a...
2      0  Batteries died within a year ...: I bought thi...
3      1  works fine, but Maha Energy is better: Check o...
4      1  Great for the non-audiophile: Reviewed quite a...

Train DataFrame:
   label                                               text
0      1  Stuning even for the non-gamer: This sound tra...
1      1  The best soundtrack ever to anything.: I'm rea...
2      1  Amazing!: This soundtrack is my favorite music...
3      1  Excellent Soundtrack: I truly like this soundt...
4      1  Remember, Pull Your Jaw Off The Floor After He...


## 2. preprocess

In [3]:
import torch

SEED = 42
torch.manual_seed(SEED)

from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(train_df, test_size=400000, random_state=SEED)

print(f'Number of training examples: {len(train_df)}')
print(f'Number of validation examples: {len(valid_df)}')
print(f'Number of testing examples: {len(test_df)}')

Number of training examples: 3200000
Number of validation examples: 400000
Number of testing examples: 400000


### * torch == 2.3.0; torchtext == 0.18.0

In [4]:
import torchtext
print(torchtext.__version__)

0.18.0+cpu


In [5]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer # type: ignore
from torchtext.vocab import build_vocab_from_iterator # type: ignore

class SentimentDataset(Dataset):
    def __init__(self, data, tokenizer, vocab):
        self.labels =  [int(label) for label in data['label']]
        self.texts = [torch.tensor([vocab[token] for token in tokenizer(text)]) for text in data['text']]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# create tokenizer and vocabulary dictionary
tokenizer = get_tokenizer("basic_english")
def yield_tokens(data_iter):
    for text in data_iter['text']:
        yield tokenizer(text)

# limit the number of words in vocabulary
vocab = build_vocab_from_iterator(
    yield_tokens(train_df),
    specials=["<unk>"],
    max_tokens=10000  # only keep the most frequent 10000 words
)
vocab.set_default_index(vocab["<unk>"])


# create data loader
def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence(texts, batch_first=True, padding_value=vocab["<unk>"])
    labels = torch.tensor(labels)
    return texts, labels

train_df = SentimentDataset(train_df, tokenizer, vocab)
test_df = SentimentDataset(test_df, tokenizer, vocab)
valid_df = SentimentDataset(valid_df, tokenizer, vocab)

val_loader = DataLoader(valid_df, batch_size=32, shuffle=False, collate_fn=collate_batch)
train_loader = DataLoader(train_df, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_df, batch_size=32, shuffle=False, collate_fn=collate_batch)



## 3. build RNN model

In [6]:
import torch.nn as nn
import torch.optim as optim

class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(SimpleRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden.squeeze(0))

# parameters
vocab_size = len(vocab)
embed_dim = 100
hidden_dim = 128
output_dim = 2
model = SimpleRNN(vocab_size, embed_dim, hidden_dim, output_dim)


In [7]:
from tqdm import tqdm

# loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def evaluate(model, val_loader, criterion, device):
    model.eval()
    total_loss, correct, total = 0, 0, 0
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            predicted_labels = outputs.argmax(dim=1)
            correct += (predicted_labels == labels).sum().item()
            total += labels.size(0)
    return total_loss / len(val_loader), correct / total

for epoch in range(5):
    model.train()
    total_loss = 0
    for texts, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}", unit="batch"):
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}, Training Loss: {total_loss:.4f}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.2%}")


Epoch 1: 100%|██████████| 100000/100000 [2:28:38<00:00, 11.21batch/s]     


Epoch 1, Training Loss: 69435.2183, Validation Loss: 0.6949, Validation Accuracy: 50.60%


Epoch 2: 100%|██████████| 100000/100000 [2:32:34<00:00, 10.92batch/s] 


Epoch 2, Training Loss: 64336.2877, Validation Loss: 0.6907, Validation Accuracy: 50.49%


Epoch 3: 100%|██████████| 100000/100000 [2:24:42<00:00, 11.52batch/s] 


Epoch 3, Training Loss: 61460.7827, Validation Loss: 0.6054, Validation Accuracy: 70.43%


Epoch 4: 100%|██████████| 100000/100000 [2:37:58<00:00, 10.55batch/s] 


Epoch 4, Training Loss: 66411.2595, Validation Loss: 0.6911, Validation Accuracy: 50.71%


Epoch 5: 100%|██████████| 100000/100000 [1:32:59<00:00, 17.92batch/s]


Epoch 5, Training Loss: 68958.6260, Validation Loss: 0.6929, Validation Accuracy: 50.23%


In [8]:
# evaluation
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        predictions = model(texts)
        predicted_labels = predictions.argmax(dim=1)
        correct += (predicted_labels == labels).sum().item()
        total += labels.size(0)

print(f"Test Accuracy: {correct / total:.2%}")


Test Accuracy: 50.28%
