Process:
1. read the vocab file and build the vocab
2. split the data set
3. create the dataset in the required format
4. tokenize and pad the sequences
5. create the model, train it, and save the model
6. use for prediction

Batch Size: Choose based on memory capacity and training stability. Common values are 32 or 64.
Sequence Length: Reflects the typical length of the text input, typically 100-300 words for reviews.
Embedding Dimension: Affects the richness of word representations, with common values ranging from 50 to 300.

hidden_dim in LSTM: Controls the size of the hidden states and the cell states. It determines the model's capacity to learn and retain information from sequences.
Choosing hidden_dim: Balance between too small (which may underfit) and too large (which may overfit and require more computation).
Impact: Larger hidden_dim increases the model's ability to capture complex patterns but also increases training time and memory usage


In [None]:
#Step 1: Prepare the Data
# Here, text contains the review and label contains the sentiment (1 for positive, 0 for negative).
text,label
"I love this movie!",1
"This is terrible.",0
...


Load and Preprocess Data
Read the CSV File: Use pandas to read the file.
Tokenize: Convert text into numerical tokens.
Pad Sequences: Make all sequences the same length.
Convert to Tensors: Prepare the data for PyTorch.

In [None]:
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

In [None]:
# Load the labelled reviews; later cells read the 'text' and 'label' columns.
df = pd.read_csv(
    'sentiment_data.csv',
)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Build vocabulary
def build_vocab(sentences, max_vocab_size=25000):
    """Build a token-to-index mapping from an iterable of sentences.

    Each sentence is split with ``word_tokenize`` and token frequencies are
    tallied. The ``max_vocab_size`` most frequent tokens receive consecutive
    indices starting at 2, most frequent first. Index 0 is reserved for
    ``'<PAD>'`` and index 1 for ``'<UNK>'``.

    Args:
        sentences: Iterable of strings to tokenize.
        max_vocab_size: Maximum number of corpus tokens to keep.

    Returns:
        dict mapping every kept token, plus the two special tokens, to its
        integer index.
    """
    frequencies = Counter(
        token
        for sentence in sentences
        for token in word_tokenize(sentence)
    )

    vocab = {}
    ranked = frequencies.most_common(max_vocab_size)
    # Indices 0 and 1 are held back for the special tokens added below.
    for index, (token, _count) in enumerate(ranked, start=2):
        vocab[token] = index

    vocab.update({'<PAD>': 0, '<UNK>': 1})
    return vocab

In [None]:
# Tokenize and pad sequences
def tokenize_and_pad(sentence, vocab, max_length=100):
    """Encode a sentence as a fixed-length list of vocabulary indices.

    The sentence is split with ``word_tokenize``; tokens missing from
    ``vocab`` are mapped to the ``'<UNK>'`` index. Sequences shorter than
    ``max_length`` are right-padded with the ``'<PAD>'`` index, and longer
    ones are cut down to their first ``max_length`` entries.

    Args:
        sentence: Text to encode.
        vocab: Token-to-index mapping containing ``'<UNK>'`` and ``'<PAD>'``.
        max_length: Target length of the returned list.

    Returns:
        list of integer indices.
    """
    ids = []
    for token in word_tokenize(sentence):
        ids.append(vocab.get(token, vocab['<UNK>']))

    shortfall = max_length - len(ids)
    if shortfall > 0:
        # Too short: fill the tail with padding indices.
        return ids + [vocab['<PAD>']] * shortfall
    # Long enough already: keep only the leading max_length entries.
    return ids[:max_length]

In [None]:
# Convert dataset to PyTorch Dataset
class SentimentDataset(Dataset):
    """Dataset yielding ``(token_ids, label)`` tensor pairs for each row.

    Args:
        data: Table indexed positionally through ``.iloc`` whose rows provide
            ``'text'`` and ``'label'`` entries.
        vocab: Token-to-index mapping forwarded to ``tokenize_and_pad``.
        max_length: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, data, vocab, max_length=100):
        self.data, self.vocab, self.max_length = data, vocab, max_length

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded text (long tensor) and label (float tensor) of row ``idx``."""
        row = self.data.iloc[idx]
        text, label = row['text'], row['label']
        token_ids = tokenize_and_pad(text, self.vocab, self.max_length)
        features = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(label, dtype=torch.float)
        return features, target

In [None]:
# Build the vocabulary from the training data
vocab = build_vocab(train_data['text'].tolist())

# Create PyTorch datasets
train_dataset = SentimentDataset(train_data, vocab)
test_dataset = SentimentDataset(test_data, vocab)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [None]:
# Define RNN Model

class RNNModel(nn.Module):
    """Sequence classifier: embedding -> (optionally bidirectional) RNN -> linear.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the RNN hidden state per direction.
        output_dim: Number of output values produced per sequence.
        n_layers: Number of stacked RNN layers.
        bidirectional: Whether the RNN reads the sequence in both directions.
        dropout: Dropout probability applied to the embeddings, between RNN
            layers (only when ``n_layers > 1``) and to the final hidden state.
        pad_idx: Token index used for right-padding. It is excluded from the
            sequence lengths and its embedding is kept at zero.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.bidirectional = bidirectional
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.RNN(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            # nn.RNN only applies dropout between stacked layers; passing a
            # non-zero value with a single layer just emits a warning.
            dropout=dropout if n_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one ``output_dim``-sized row of raw scores per input sequence.

        Args:
            x: Long tensor of token indices, shape ``(batch, seq_len)``,
                right-padded with ``pad_idx``.

        Returns:
            Tensor of shape ``(batch, output_dim)``.
        """
        embedded = self.dropout(self.embedding(x))

        # True per-sequence lengths = number of non-padding tokens. Packing
        # rejects zero-length sequences, so an all-padding row is treated as
        # length 1. pack_padded_sequence requires the lengths on the CPU.
        lengths = (x != self.pad_idx).sum(dim=1).clamp(min=1).cpu()
        packed_embedded = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, hidden = self.rnn(packed_embedded)

        # hidden is (n_layers * num_directions, batch, hidden_dim). For a
        # bidirectional RNN the last two entries are the top layer's forward
        # and backward states; otherwise only the last entry is the top layer.
        if self.bidirectional:
            final_state = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        else:
            final_state = hidden[-1, :, :]
        return self.fc(self.dropout(final_state))

Explanation
Embedding Layer: Converts tokens into dense vectors.
RNN Layer: Processes the sequence of embeddings.
Fully Connected Layer: Produces the final output.
Training Loop: Updates the model parameters based on the loss.

In [None]:
# Training the RNN Model

# Hyperparameters for the RNN sentiment classifier
vocab_size = len(vocab)  # the embedding table gets one row per vocabulary entry
embedding_dim = 100
hidden_dim = 256
output_dim = 1  # a single raw score per review, fed to BCEWithLogitsLoss
n_layers = 2
bidirectional = True
dropout = 0.5

rnn_model = RNNModel(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    n_layers,
    bidirectional,
    dropout,
)

optimizer = optim.Adam(rnn_model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

# Train for a fixed number of passes over the training loader
num_epochs = 3
for epoch in range(num_epochs):
    rnn_model.train()  # training mode, so dropout is active
    epoch_loss = 0
    for text, label in train_loader:
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        # The model emits (batch, 1); drop the trailing dim to match the labels
        predictions = rnn_model(text).squeeze(1)
        loss = criterion(predictions, label)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Report the mean per-batch loss for this epoch
    print(f'Epoch {epoch+1}, Loss: {epoch_loss/len(train_loader)}')