# LSTM

In [1]:
import pandas as pd
import numpy as np

In [2]:
import torch

print("Number of GPU: ", torch.cuda.device_count())
print("GPU Name: ", torch.cuda.get_device_name())

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")

Number of GPU:  1
GPU Name:  NVIDIA GeForce GTX 1650
GPU: NVIDIA GeForce GTX 1650 is available.


In [3]:
#Configure device: Set the device configuration to GPU using the torch.device class in PyTorch:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Selected Device:", device)

Selected Device: cuda


In [4]:
bca = pd.read_csv("bca_preprocessed_data.csv", delimiter=',')
bca.head(5)

Unnamed: 0,content,score,sentiment
0,mohon donk kembalikan tanda biru hitam saat li...,1,0
1,kenapa sekarang untuk transfer ke blu bca ada ...,2,0
2,oke terimakasih,5,2
3,tolong pihak developer untuk update hp saya ga...,5,2
4,ok,5,2


In [5]:
bca.isnull().sum()

content      61
score         0
sentiment     0
dtype: int64

In [6]:
bca = bca.dropna()

In [7]:
bca.isnull().sum()

content      0
score        0
sentiment    0
dtype: int64

In [8]:
#Split features (content) and labels (sentiment)

X = bca['content']
y = bca['sentiment']

In [9]:
# Check dataset balance and apply oversampling for "Neutral" class if needed
print("Before balancing:")
print(y.value_counts())

Before balancing:
sentiment
0    7443
2    5429
1    1067
Name: count, dtype: int64


In [10]:
# Oversampling technique
from sklearn.utils import resample

bca_clean_again = pd.concat([X, y], axis=1)

#separating each class

class_0 = bca_clean_again[bca_clean_again['sentiment'] == 0]
class_1 = bca_clean_again[bca_clean_again['sentiment'] == 1]
class_2 = bca_clean_again[bca_clean_again['sentiment'] == 2]

#oversampling neutral class 1 with majority class 2

class_1_oversampled = resample(class_1, replace=True, n_samples=len(class_2), random_state=42)

#combine all class
bca_clean_again = pd.concat([class_0, class_1_oversampled, class_2])

#shuffling the data
bca_clean_again = bca_clean_again.sample(frac=1, random_state=42)

X = bca_clean_again['content']
y = bca_clean_again['sentiment']

In [11]:
# Check dataset balance and apply oversampling for "Neutral" class if needed
print("After balancing:")
print(y.value_counts())

After balancing:
sentiment
0    7443
1    5429
2    5429
Name: count, dtype: int64


In [12]:
#Tokenization
from nltk.tokenize import word_tokenize
from collections import Counter

X = [word_tokenize(text.lower()) for text in X]

In [13]:
#build vocabulary

word_counts = Counter(word for sentence in X for word in sentence)
vocab = {word: i+1 for i, (word, _)in enumerate(word_counts.most_common())}
vocab["PAD"] = 0

#Convert text to sequences
X = [[vocab[word]for word in sentence]for sentence in X]

# Padding
def pad_sequence(sequences, maxlen):
    return [seq[:maxlen] + [0] * (maxlen - len(seq)) if len(seq) < maxlen else seq[:maxlen] for seq in sequences]

maxlen = 128
X = pad_sequence(X, maxlen)

In [14]:
#Split dataset
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [15]:
#convert to tensors

X_train, y_train = torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train.values, dtype=torch.long)
X_val, y_val = torch.tensor(X_val, dtype=torch.long), torch.tensor(y_val.values, dtype=torch.long)
X_test, y_test = torch.tensor(X_test, dtype=torch.long), torch.tensor(y_test.values, dtype=torch.long)

In [16]:
from torch.utils.data import DataLoader, Dataset

class SentimentDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
train_loader = DataLoader(SentimentDataset(X_train, y_train), batch_size=128, shuffle=True)
val_loader = DataLoader(SentimentDataset(X_val, y_val), batch_size=128)
test_loader = DataLoader(SentimentDataset(X_test, y_test), batch_size=128)

In [17]:
import torch.nn as nn
import torch.optim as optim
# built LSTM Model

class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers, dropout):
        super(LSTMSentiment,self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        _, (hidden, _) = self.lstm(x)
        return self.fc(hidden[-1])

In [52]:
from tqdm import tqdm
#create model, and training function

model = LSTMSentiment(len(vocab), 256, 256, 3, 2, 0.3).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4, betas=(0.9, 0,999))

# training loop function
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=30):
    for epoch in range(epochs):
        model.train()
        train_loss, correct = 0,0
        train_progress = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs} [Train]')
        for i, (X_batch, y_batch) in enumerate(train_progress):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            batch_correct = (outputs.argmax(1) == y_batch).sum().item()
            correct += batch_correct
            
            # Update progress bar
            train_progress.set_postfix({
                'loss': train_loss/(i+1),
                'accuracy': f"{correct/len(X_train)*100:.2f}%"
            })

        model.eval()
        val_loss, val_correct = 0, 0
        val_progress = tqdm(val_loader, desc=f'Epoch {epoch+1}/{epochs} [Val]')
        with torch.no_grad():
            for i, (X_batch, y_batch) in enumerate(val_progress):
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)

                val_loss += loss.item()
                batch_correct = (outputs.argmax(1) == y_batch).sum().item()
                val_correct += batch_correct

                # Update progress bar
                val_progress.set_postfix({
                    'loss': val_loss/(i+1),
                    'accuracy':f"{val_correct/len(X_val)*100:.2f}%"
                })

        #Print epoch summary
        print(f"\nEpoch {epoch+1} Summary:")
        print(f"Train Loss: {train_loss/len(train_loader):.4f} Accuracy: {correct/len(X_train):.4f}")
        print(f"Val Loss : {val_loss/len(val_loader):.4f} Acc: {val_correct/len(X_val):.4f}\n")

train_model(model, train_loader, val_loader, criterion, optimizer)


Epoch 1/20 [Train]:   0%|          | 0/115 [00:00<?, ?it/s]

Epoch 1/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.08it/s, loss=1.09, accuracy=40.53%]
Epoch 1/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.57it/s, loss=1.09, accuracy=40.66%]



Epoch 1 Summary:
Train Loss: 1.0877 Accuracy: 0.4053
Val Loss : 1.0861 Acc: 0.4066



Epoch 2/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.00it/s, loss=1.09, accuracy=40.67%]
Epoch 2/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.57it/s, loss=1.09, accuracy=40.66%]



Epoch 2 Summary:
Train Loss: 1.0869 Accuracy: 0.4067
Val Loss : 1.0866 Acc: 0.4066



Epoch 3/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 10.95it/s, loss=1.09, accuracy=40.67%]
Epoch 3/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.19it/s, loss=1.09, accuracy=40.66%]



Epoch 3 Summary:
Train Loss: 1.0871 Accuracy: 0.4067
Val Loss : 1.0863 Acc: 0.4066



Epoch 4/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 10.95it/s, loss=1.09, accuracy=40.67%]
Epoch 4/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.40it/s, loss=1.09, accuracy=40.66%]



Epoch 4 Summary:
Train Loss: 1.0878 Accuracy: 0.4067
Val Loss : 1.0864 Acc: 0.4066



Epoch 5/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.12it/s, loss=1.09, accuracy=40.67%]
Epoch 5/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.95it/s, loss=1.09, accuracy=40.66%]



Epoch 5 Summary:
Train Loss: 1.0871 Accuracy: 0.4067
Val Loss : 1.0860 Acc: 0.4066



Epoch 6/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.14it/s, loss=1.09, accuracy=40.67%]
Epoch 6/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.90it/s, loss=1.09, accuracy=40.66%]



Epoch 6 Summary:
Train Loss: 1.0871 Accuracy: 0.4067
Val Loss : 1.0860 Acc: 0.4066



Epoch 7/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.12it/s, loss=1.09, accuracy=40.67%]
Epoch 7/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.95it/s, loss=1.09, accuracy=40.66%]



Epoch 7 Summary:
Train Loss: 1.0872 Accuracy: 0.4067
Val Loss : 1.0862 Acc: 0.4066



Epoch 8/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.12it/s, loss=1.09, accuracy=40.67%]
Epoch 8/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.78it/s, loss=1.09, accuracy=40.66%]



Epoch 8 Summary:
Train Loss: 1.0872 Accuracy: 0.4067
Val Loss : 1.0860 Acc: 0.4066



Epoch 9/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.11it/s, loss=1.09, accuracy=40.67%]
Epoch 9/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.95it/s, loss=1.09, accuracy=40.66%]



Epoch 9 Summary:
Train Loss: 1.0871 Accuracy: 0.4067
Val Loss : 1.0868 Acc: 0.4066



Epoch 10/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.13it/s, loss=1.09, accuracy=40.67%]
Epoch 10/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.73it/s, loss=1.09, accuracy=40.66%]



Epoch 10 Summary:
Train Loss: 1.0872 Accuracy: 0.4067
Val Loss : 1.0866 Acc: 0.4066



Epoch 11/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.11it/s, loss=1.09, accuracy=40.67%]
Epoch 11/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.67it/s, loss=1.09, accuracy=40.66%]



Epoch 11 Summary:
Train Loss: 1.0874 Accuracy: 0.4067
Val Loss : 1.0861 Acc: 0.4066



Epoch 12/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.09it/s, loss=1.09, accuracy=40.67%]
Epoch 12/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.57it/s, loss=1.09, accuracy=40.66%]



Epoch 12 Summary:
Train Loss: 1.0870 Accuracy: 0.4067
Val Loss : 1.0860 Acc: 0.4066



Epoch 13/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 10.81it/s, loss=1.09, accuracy=40.67%]
Epoch 13/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.24it/s, loss=1.09, accuracy=40.66%]



Epoch 13 Summary:
Train Loss: 1.0871 Accuracy: 0.4067
Val Loss : 1.0861 Acc: 0.4066



Epoch 14/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.04it/s, loss=1.09, accuracy=40.67%]
Epoch 14/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.84it/s, loss=1.09, accuracy=40.66%]



Epoch 14 Summary:
Train Loss: 1.0872 Accuracy: 0.4067
Val Loss : 1.0859 Acc: 0.4066



Epoch 15/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.10it/s, loss=1.09, accuracy=40.67%]
Epoch 15/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.84it/s, loss=1.09, accuracy=40.66%]



Epoch 15 Summary:
Train Loss: 1.0871 Accuracy: 0.4067
Val Loss : 1.0859 Acc: 0.4066



Epoch 16/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.10it/s, loss=1.09, accuracy=40.67%]
Epoch 16/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 29.01it/s, loss=1.09, accuracy=40.66%]



Epoch 16 Summary:
Train Loss: 1.0870 Accuracy: 0.4067
Val Loss : 1.0869 Acc: 0.4066



Epoch 17/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.09it/s, loss=1.09, accuracy=40.67%]
Epoch 17/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.35it/s, loss=1.09, accuracy=40.66%]



Epoch 17 Summary:
Train Loss: 1.0874 Accuracy: 0.4067
Val Loss : 1.0859 Acc: 0.4066



Epoch 18/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 10.79it/s, loss=1.09, accuracy=40.67%]
Epoch 18/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.14it/s, loss=1.09, accuracy=40.66%]



Epoch 18 Summary:
Train Loss: 1.0870 Accuracy: 0.4067
Val Loss : 1.0859 Acc: 0.4066



Epoch 19/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.02it/s, loss=1.09, accuracy=40.67%]
Epoch 19/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 28.73it/s, loss=1.09, accuracy=40.66%]



Epoch 19 Summary:
Train Loss: 1.0872 Accuracy: 0.4067
Val Loss : 1.0860 Acc: 0.4066



Epoch 20/20 [Train]: 100%|██████████| 115/115 [00:10<00:00, 11.10it/s, loss=1.09, accuracy=40.67%]
Epoch 20/20 [Val]: 100%|██████████| 15/15 [00:00<00:00, 29.01it/s, loss=1.09, accuracy=40.66%]


Epoch 20 Summary:
Train Loss: 1.0871 Accuracy: 0.4067
Val Loss : 1.0859 Acc: 0.4066




