In [25]:
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from torch import nn, optim
import torch
import pandas as pd
import numpy as np
import tiktoken

if torch.cuda.is_available():
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

In [26]:
data_path = '/content/twitter_training.csv'
dataset = pd.read_csv(data_path, header=None)

In [27]:
np.hstack((dataset.iloc[0, 0:2].values,dataset.iloc[0, 3:].values, dataset.iloc[0, 2]))

array([np.int64(2401), 'Borderlands',
       'im getting on borderlands and i will murder you all ,',
       'Positive'], dtype=object)

In [28]:
dataset.iloc[:, 2].unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [29]:
encoder = LabelEncoder()

encoded_df = encoder.fit_transform(dataset.iloc[: , 2])

dataset[2] = encoded_df

dataset.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,3,im getting on borderlands and i will murder yo...
1,2401,Borderlands,3,I am coming to the borders and I will kill you...
2,2401,Borderlands,3,im getting on borderlands and i will kill you ...
3,2401,Borderlands,3,im coming on borderlands and i will murder you...
4,2401,Borderlands,3,im getting on borderlands 2 and i will murder ...


In [30]:
dataset.iloc[0, 3]

'im getting on borderlands and i will murder you all ,'

In [31]:
tokenizer = tiktoken.get_encoding("gpt2")

np.array(tokenizer.encode(dataset.iloc[0, 3:].to_string()))

array([   18,   220,   220,   220,   545,  1972,   319,  4865,  4447,
         290,  1312,   481,  5123, 27406,   986])

In [32]:
import re

def clean_text(text):
  text = text.lower()
  text =  re.sub(r'[^a-z\s]', '', text)
  text = re.sub(r'\s+', ' ', text).strip()
  return text

In [33]:
class WordDataset(Dataset):
  def __init__(self, path, max_length, tokenizer):
      self.dataset = pd.read_csv(path, header=None)
      self.dataset = self.dataset.dropna(subset=[2,3])
      self.max_length = max_length
      self.tokenizer = tokenizer
      self.encoder = LabelEncoder()
      encoded_df = self.encoder.fit_transform(self.dataset.iloc[: , 2])
      self.dataset[3] = self.dataset[3].apply(clean_text)
      self.dataset[2] = encoded_df

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, index):
    text = self.dataset.iloc[index, 3]
    label = self.dataset.iloc[index, 2]
    encoded_token = self.tokenizer.encode(text)
    if len(encoded_token) < self.max_length:
        encoded_token = encoded_token + [0] * (self.max_length - len(encoded_token))
    else:
        encoded_token = encoded_token[:self.max_length]

    return torch.tensor(encoded_token), torch.tensor(label, dtype=torch.long)


In [34]:
def create_word_dataloader(
    path = data_path,
    batch_size = 4,
    shuffle = False,
    num_workers = 0,
):

  tokenizer = tiktoken.get_encoding("gpt2")
  dataset = WordDataset(path, max_length=128, tokenizer=tokenizer)
  dataloader = DataLoader(
      dataset,
      batch_size = batch_size,
      shuffle = shuffle,
      num_workers = num_workers,
  )

  return dataloader, dataset

In [35]:
data_loader, dataset = create_word_dataloader()

label = next(iter(data_loader))

print(label)

[tensor([[  320,  1972,   319,  4865,  4447,   290,  1312,   481,  5123,   345,
           477,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0, 

In [36]:
class SentimentLSTM(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim=4, num_layers=2, dropout = 0.3):
    super().__init__()
    self.num_layers = num_layers
    self.hidden_dim = hidden_dim
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    pretrained_embedding_weights = torch.randn((vocab_size, embedding_dim))
    self.embedding.weight.data.copy_(pretrained_embedding_weights)
    self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True)
    self.dropout = nn.Dropout(dropout)
    self.output_layer = nn.Linear(hidden_dim * 2, output_dim)

  def forward(self, X):
    hidden_states = torch.zeros(self.num_layers * 2, X.size(0), self.hidden_dim)
    cell_states = torch.zeros(self.num_layers * 2, X.size(0), self.hidden_dim)
    embedded = self.embedding(X)

    _, (hidden, _) = self.lstm(embedded, (hidden_states, cell_states))
    hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
    output = self.dropout(hidden)
    return self.output_layer(output)


In [37]:
embedding_dim = 128
hidden_dim = 128
num_epochs = 10
learning_rate = 1e-3
tokenizer = tiktoken.get_encoding("gpt2")
vocab_size = tokenizer.n_vocab


model = SentimentLSTM(vocab_size, hidden_dim, embedding_dim)
model

SentimentLSTM(
  (embedding): Embedding(50257, 128)
  (lstm): LSTM(128, 128, num_layers=2, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (output_layer): Linear(in_features=256, out_features=4, bias=True)
)

In [38]:
training_dataloader, training_dataset = create_word_dataloader(
    path = '/content/twitter_training.csv',
    batch_size=32,
    shuffle=False
)

testing_dataloader, testing_dataset = create_word_dataloader(
    path = '/content/twitter_validation.csv',
    batch_size=32,
    shuffle=False
)


In [39]:
criterion = nn.CrossEntropyLoss()

In [40]:
def train_model(num_epochs, model, train_dataloader, criterion):
  total_steps = len(train_dataloader)
  optimizer = optim.Adam(model.parameters(), lr=learning_rate)
  for epoch in range(num_epochs):
    model.train()
    for batch, (inputs, labels) in enumerate(train_dataloader):
      optimizer.zero_grad()
      outputs = model(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

      if (batch+1)%100==0:
        print(f"Epoch {epoch+1}/{num_epochs}, Step {batch+1}/{total_steps}, Loss: {loss.item():.4f}")


In [41]:
train_model(num_epochs, model, training_dataloader, criterion)

Epoch 1/10, Step 100/2313, Loss: 1.3508
Epoch 1/10, Step 200/2313, Loss: 0.8860
Epoch 1/10, Step 300/2313, Loss: 1.3792
Epoch 1/10, Step 400/2313, Loss: 1.3132
Epoch 1/10, Step 500/2313, Loss: 1.5206
Epoch 1/10, Step 600/2313, Loss: 1.0419
Epoch 1/10, Step 700/2313, Loss: 1.3524
Epoch 1/10, Step 800/2313, Loss: 1.5280
Epoch 1/10, Step 900/2313, Loss: 1.5878
Epoch 1/10, Step 1000/2313, Loss: 1.4360
Epoch 1/10, Step 1100/2313, Loss: 1.1971
Epoch 1/10, Step 1200/2313, Loss: 1.1113
Epoch 1/10, Step 1300/2313, Loss: 1.5290
Epoch 1/10, Step 1400/2313, Loss: 1.3612
Epoch 1/10, Step 1500/2313, Loss: 1.0741
Epoch 1/10, Step 1600/2313, Loss: 1.3307
Epoch 1/10, Step 1700/2313, Loss: 1.1780
Epoch 1/10, Step 1800/2313, Loss: 0.8610
Epoch 1/10, Step 1900/2313, Loss: 1.4422
Epoch 1/10, Step 2000/2313, Loss: 1.1028
Epoch 1/10, Step 2100/2313, Loss: 1.4406
Epoch 1/10, Step 2200/2313, Loss: 1.2066
Epoch 1/10, Step 2300/2313, Loss: 1.0870
Epoch 2/10, Step 100/2313, Loss: 1.5158
Epoch 2/10, Step 200/2313,

In [42]:
from sklearn.metrics import accuracy_score

def evaluate(model, dataloader, criterion):
    model.eval()  # set to eval mode (important: disables dropout, etc.)
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in dataloader:
            outputs = model(inputs)             # [batch_size, num_classes]
            loss = criterion(outputs, labels)   # compute loss
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_preds)
    return avg_loss, accuracy


In [43]:
evaluate(model, testing_dataloader,criterion)

(0.4225795868005662, 0.9)

In [44]:
# Save only the model weights (state_dict)
torch.save(model.state_dict(), "sentiment_lstm_weights.pt")
print("✅ Model weights saved as sentiment_lstm_weights.pt")


✅ Model weights saved as sentiment_lstm_weights.pt


In [45]:
# Save the full model (architecture + weights)
torch.save(model, "sentiment_lstm_full.pt")
print("✅ Full model saved as sentiment_lstm_full.pt")


✅ Full model saved as sentiment_lstm_full.pt


In [46]:
# --- Option A: Load only weights ---
model = SentimentLSTM(vocab_size, hidden_dim, embedding_dim)
model.load_state_dict(torch.load("sentiment_lstm_weights.pt"))
model.eval()
print("✅ Model reloaded from weights")

# --- Option B: Load full model ---
# model = torch.load("sentiment_lstm_full.pt")
# model.eval()
# print("✅ Full model reloaded")


✅ Model reloaded from weights


In [47]:
def predict_sentiment(model, text, tokenizer, max_length=50, device="cpu"):
    """
    Predict sentiment for a single phrase.
    Args:
        model: trained LSTM model
        text (str): phrase to analyze
        tokenizer: your tokenizer (e.g. tiktoken or custom)
        max_length (int): max sequence length
        device (str): "cpu" or "cuda"
    Returns:
        int: predicted class index
    """
    model.eval()
    text = clean_text(text)
    # Tokenize and convert to IDs
    token_ids = tokenizer.encode(text)

    # Pad or truncate
    if len(token_ids) < max_length:
        token_ids += [0] * (max_length - len(token_ids))
    else:
        token_ids = token_ids[:max_length]

    # Convert to tensor with batch size 1
    input_tensor = torch.tensor([token_ids], dtype=torch.long)

    with torch.no_grad():
        output = model(input_tensor)
        pred_class = torch.argmax(output, dim=1).item()

    return pred_class


In [48]:
text = "I really love this game! 😍"
response = predict_sentiment(model, text, tokenizer)
print(f"Message: {text} | Sentiment: {dataset.encoder.classes_[response]}")
text =  "This was the worst experience ever 😡"
response = predict_sentiment(model, text, tokenizer)
print(f"Message: {text} | Sentiment: {dataset.encoder.classes_[response]}")


Message: I really love this game! 😍 | Sentiment: Positive
Message: This was the worst experience ever 😡 | Sentiment: Negative
