# Sentiment Analysis - LSTM

## 1. load data

In [None]:
import pandas as pd
import bz2

# file_path = './test.ft.txt.bz2'
# with bz2.open(file_path, 'rt', encoding='utf-8') as file:
#     for i in range(5):
#         print(next(file).strip())

def load_bz2_file(file_path):
    """Read a bz2-compressed ``__label__N <text>`` file into a DataFrame.

    Every non-blank line is split at its first space into a label token and
    the review text. ``__label__2`` is mapped to 1; any other label to 0.

    Args:
        file_path: path of the ``.bz2`` archive, decoded as UTF-8 text.

    Returns:
        pandas.DataFrame with an integer ``label`` column and a ``text``
        column (surrounding whitespace stripped).
    """
    labels = []
    texts = []
    with bz2.open(file_path, 'rt', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines carry no sample; unpacking the result of
                # split(' ', 1) used to raise ValueError on them.
                continue
            # partition never raises: a label-only line yields an empty text.
            label, _, text = line.partition(' ')
            labels.append(1 if label == "__label__2" else 0)
            texts.append(text.strip())
    return pd.DataFrame({'label': labels, 'text': texts})

# Locations of the bz2-compressed test and train review files.
test_file, train_file = './test.ft.txt.bz2', './train.ft.txt.bz2'

# Parse each archive into a label/text DataFrame (test first, then train).
test_df = load_bz2_file(file_path=test_file)
train_df = load_bz2_file(file_path=train_file)

Test DataFrame:
   label                                               text
0      1  Great CD: My lovely Pat has one of the GREAT v...
1      1  One of the best game music soundtracks - for a...
2      0  Batteries died within a year ...: I bought thi...
3      1  works fine, but Maha Energy is better: Check o...
4      1  Great for the non-audiophile: Reviewed quite a...

Train DataFrame:
   label                                               text
0      1  Stuning even for the non-gamer: This sound tra...
1      1  The best soundtrack ever to anything.: I'm rea...
2      1  Amazing!: This soundtrack is my favorite music...
3      1  Excellent Soundtrack: I truly like this soundt...
4      1  Remember, Pull Your Jaw Off The Floor After He...


### Remove stop words

In [None]:
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus, then keep the English list as a set so
# the per-word membership test in remove_stop_words is a hash lookup.
import nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stop_words(text):
    """Return ``text`` with stop words removed, words re-joined by one space.

    Words are obtained by whitespace splitting and compared to the
    module-level ``stop_words`` set case-insensitively: the NLTK list is
    lowercase, so a case-sensitive test would let "The" or "This" through.
    Kept words retain their original casing. Punctuation is not detached, so
    a token such as "the," is still kept.
    """
    return ' '.join(
        word for word in text.split() if word.lower() not in stop_words
    )

# Strip stop words from the text column of both splits, in place.
for _frame in (train_df, test_df):
    _frame['text'] = _frame['text'].apply(remove_stop_words)

In [None]:
# Sanity check: report how many rows each split contains.
print(f'Number of training examples: {len(train_df)}')
print(f'Number of testing examples: {len(test_df)}')

# Preview the first rows of each split (text has had stop words removed
# by the previous cell).
print("Test DataFrame:")
print(test_df.head())

print("\nTrain DataFrame:")
print(train_df.head())

Number of training examples: 3600000
Number of testing examples: 400000


In [None]:
import torchtext
import torch
# Record the installed torchtext version; the tokenizer and vocabulary
# helpers used in the preprocessing step are imported from this package.
print(torchtext.__version__)

0.18.0+cpu


## 2. preprocess

In [None]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer # type: ignore
from torchtext.vocab import build_vocab_from_iterator # type: ignore

class SentimentDataset(Dataset):
    """Map-style dataset of reviews pre-converted to vocabulary indices.

    All texts are tokenised and indexed once, at construction time, so
    ``__getitem__`` is a plain list lookup.

    Args:
        data: object exposing ``data['label']`` and ``data['text']`` as
            iterables (e.g. a pandas DataFrame).
        tokenizer: callable mapping a string to an iterable of tokens.
        vocab: object supporting ``vocab[token]`` -> integer index.
    """

    def __init__(self, data, tokenizer, vocab):
        self.labels = [int(label) for label in data['label']]
        # Pin the dtype to int64: for a text with no tokens, torch.tensor([])
        # would otherwise default to float32 and yield a batch whose
        # sequences have mixed dtypes.
        self.texts = [
            torch.tensor([vocab[token] for token in tokenizer(text)],
                         dtype=torch.long)
            for text in data['text']
        ]

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(token_id_tensor, label)`` pair stored at ``idx``."""
        return self.texts[idx], self.labels[idx]

# create tokenizer and vocabulary dictionary
# torchtext's built-in "basic_english" tokenizer, shared by the vocabulary
# builder and the datasets.
tokenizer = get_tokenizer("basic_english")


def yield_tokens(data_iter):
    """Lazily yield the token list of every entry in ``data_iter['text']``."""
    yield from (tokenizer(entry) for entry in data_iter['text'])

# Build the vocabulary from the training split only.
# max_tokens caps the total vocabulary size, and the "<unk>" special counts
# towards that cap, so at most 9,999 distinct words are kept.
vocab = build_vocab_from_iterator(
    yield_tokens(train_df),
    specials=["<unk>"],
    max_tokens=10000  # cap on vocabulary size, specials included
)
# Tokens that are not in the vocabulary map to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

# create data loader
def collate_batch(batch):
    """Collate ``(token_ids, label)`` samples into one padded batch.

    Returns a ``(texts, labels)`` pair: ``texts`` is the batch-first padded
    tensor of token ids and ``labels`` is a tensor of the sample labels.
    """
    sequences, targets = map(list, zip(*batch))
    # NOTE(review): padding reuses the "<unk>" index, so padded positions are
    # indistinguishable from unknown words, and sequence lengths are not
    # returned -- confirm this matches what the model's forward() expects.
    padded = pad_sequence(sequences, batch_first=True,
                          padding_value=vocab["<unk>"])
    return padded, torch.tensor(targets)

# Wrap each split in a SentimentDataset. Note that this rebinds train_df and
# test_df from pandas DataFrames to Dataset objects.
train_df = SentimentDataset(train_df, tokenizer, vocab)
test_df = SentimentDataset(test_df, tokenizer, vocab)
# NOTE(review): valid_df is not defined in any cell visible above -- confirm a
# validation DataFrame is created before this line, otherwise it raises
# NameError.
valid_df = SentimentDataset(valid_df, tokenizer, vocab)

# Batches of 32 padded by collate_batch; only the training loader shuffles.
val_loader = DataLoader(valid_df, batch_size=32, shuffle=False, collate_fn=collate_batch)
train_loader = DataLoader(train_df, batch_size=32, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_df, batch_size=32, shuffle=False, collate_fn=collate_batch)



## 3. build LSTM model

In [None]:
import torch.nn as nn

class RNN(nn.Module):
    """LSTM classifier: embedding -> (bi)LSTM -> dropout -> linear head.

    The prediction is computed from the last layer's final hidden state(s),
    taken at each sequence's true length (padding is skipped via packing).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of outputs of the linear head.
        n_layers: number of stacked LSTM layers.
        bidirectional: whether the LSTM runs in both directions.
        dropout: dropout probability applied to the embeddings, between LSTM
            layers (only when ``n_layers > 1``) and to the final hidden state.
        pad_idx: vocabulary index of the padding token.
        batch_first: if True, ``forward`` expects ``text`` shaped
            ``[batch size, sent len]`` (the layout produced by
            ``pad_sequence(..., batch_first=True)``); the default False keeps
            the original ``[sent len, batch size]`` layout.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
                 bidirectional, dropout, pad_idx, batch_first=False):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)

        self.rnn = nn.LSTM(embedding_dim,
                           hidden_dim,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           # inter-layer dropout is meaningless (and warns)
                           # when there is a single layer
                           dropout=dropout if n_layers > 1 else 0.0,
                           batch_first=batch_first)

        # the head consumes one final hidden state per direction
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        """Return the head's output for a padded batch.

        Args:
            text: integer tensor of token ids, ``[sent len, batch size]``
                (or ``[batch size, sent len]`` when ``batch_first=True``).
            text_lengths: 1-D tensor with the unpadded length of each
                sequence, in any order. pack_padded_sequence rejects
                zero-length sequences.

        Returns:
            Tensor of shape ``[batch size, output_dim]``.
        """
        embedded = self.dropout(self.embedding(text))

        #embedded = [sent len, batch size, emb dim] (batch-first: first two dims swapped)

        #pack sequence
        # lengths need to be on CPU!
        # enforce_sorted=False: batches from a shuffling DataLoader are not
        # sorted by length, which the default (True) would reject.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded,
            text_lengths.to('cpu'),
            batch_first=self.rnn.batch_first,
            enforce_sorted=False,
        )

        # Only the final hidden state is needed, so the per-step output is
        # not unpacked.
        _, (hidden, _) = self.rnn(packed_embedded)

        #hidden = [num layers * num directions, batch size, hid dim]

        if self.rnn.bidirectional:
            #concat the final forward (hidden[-2,:,:]) and backward (hidden[-1,:,:]) hidden layers
            final_hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            #single direction: the last layer's final state is hidden[-1]
            final_hidden = hidden[-1, :, :]

        #final_hidden = [batch size, hid dim * num directions]

        return self.fc(self.dropout(final_hidden))