# Sentiment Analysis of Indonesian SMS Data

## 1. Setup and Data Loading

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import get_linear_schedule_with_warmup

In [None]:
# Read the dataset from the CSV file (path is relative to the working
# directory) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='IDSMSA.csv')
df.head(n=5)

## 2. Data Preprocessing and Splitting

In [None]:
# Report how many values are missing in each column
missing_counts = df.isnull().sum()
print(missing_counts)

# Report how many rows carry each distinct sentiment value
sentiment_counts = df['Sentiment'].value_counts()
print(sentiment_counts)

In [None]:
# Clean the rows and map sentiment names to integer class ids.
# Rows without a sentence are dropped together with rows without a sentiment:
# a missing sentence would otherwise reach the models as NaN (or as the
# literal text 'nan' once it is passed through str()).
df.dropna(subset=['Sentence', 'Sentiment'], inplace=True)
df['Sentence'] = df['Sentence'].astype(str)
df['Sentiment'] = df['Sentiment'].str.lower().str.strip()
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
df['label'] = df['Sentiment'].map(label_map)

# Sentiments that are not keys of label_map map to NaN; report how many rows
# are about to be discarded instead of dropping them silently.
unmapped_rows = int(df['label'].isna().sum())
if unmapped_rows:
    print(f'Dropping {unmapped_rows} rows with unrecognised sentiment labels')
df.dropna(subset=['label'], inplace=True)
df['label'] = df['label'].astype(int)

# Split the data (80/20, stratified so both parts keep the class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    df['Sentence'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

print(f'Train set size: {len(X_train)}')
print(f'Test set size: {len(X_test)}')

## 3. Direct Evaluation (Without Fine-Tuning)

### 3.1. IndoBERT (indonesia-bert-sentiment-classification)

In [None]:
# NOTE(review): this identifier has no namespace prefix, so it presumably
# resolves to a local directory or a root-level Hub repo — confirm the
# intended checkpoint location.
bert_model_name = 'indonesia-bert-sentiment-classification'
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
bert_model = AutoModelForSequenceClassification.from_pretrained(bert_model_name)

bert_pipeline = pipeline('sentiment-analysis', model=bert_model, tokenizer=bert_tokenizer)

# The pipeline reports generic LABEL_n names; translate them to sentiment names.
# NOTE(review): the LABEL_n -> sentiment assignment is taken as given here —
# verify it against the checkpoint's configuration.
bert_label_map = {'LABEL_0': 'positive', 'LABEL_1': 'neutral', 'LABEL_2': 'negative'}
# Inverse of label_map (class id -> sentiment name); not used in this cell.
reverse_label_map = {v: k for k, v in label_map.items()}

def predict_bert(text):
    """Classify one text with the IndoBERT pipeline and return its class id.

    Args:
        text: The message to classify; coerced to ``str`` before inference.

    Returns:
        int: The id from ``label_map`` for the predicted sentiment.
    """
    # truncation=True is forwarded to the tokenizer so that inputs longer than
    # the model's maximum sequence length are cut instead of raising.
    result = bert_pipeline(str(text), truncation=True)[0]
    return label_map[bert_label_map[result['label']]]

bert_preds = [predict_bert(text) for text in tqdm(X_test)]

print("\nIndoBERT Classification Report:")
# Explicit labels keep every report row tied to the right class name even if a
# class is missing from the predictions or from the test labels.
print(classification_report(
    y_test,
    bert_preds,
    labels=list(label_map.values()),
    target_names=list(label_map.keys()),
))

### 3.2. IndoRoBERTa (indonesian-roberta-base-sentiment-classifier)

In [None]:
# NOTE(review): this identifier has no namespace prefix, so it presumably
# resolves to a local directory or a root-level Hub repo — confirm the
# intended checkpoint location.
roberta_model_name = 'indonesian-roberta-base-sentiment-classifier'
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
roberta_model = AutoModelForSequenceClassification.from_pretrained(roberta_model_name)

roberta_pipeline = pipeline('sentiment-analysis', model=roberta_model, tokenizer=roberta_tokenizer)

def predict_roberta(text):
    """Classify one text with the IndoRoBERTa pipeline and return its class id.

    Args:
        text: The message to classify; coerced to ``str`` before inference.

    Returns:
        int: The id from ``label_map`` for the predicted sentiment.
    """
    # truncation=True is forwarded to the tokenizer so that inputs longer than
    # the model's maximum sequence length are cut instead of raising.
    result = roberta_pipeline(str(text), truncation=True)[0]
    # The labels from this model are expected to be 'positive', 'neutral',
    # 'negative'; lower-casing keeps the lookup independent of their casing.
    return label_map[result['label'].lower()]

roberta_preds = [predict_roberta(text) for text in tqdm(X_test)]

print("\nIndoRoBERTa Classification Report:")
# Explicit labels keep every report row tied to the right class name even if a
# class is missing from the predictions or from the test labels.
print(classification_report(
    y_test,
    roberta_preds,
    labels=list(label_map.values()),
    target_names=list(label_map.keys()),
))

## 4. Fine-Tuning and Evaluation

### 4.1. Dataset and DataLoader

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Texts to encode. A pandas Series (indexed positionally through
            ``.iloc``) or any sequence supporting ``len`` and integer indexing.
        labels: Integer class ids aligned position-by-position with ``texts``.
        tokenizer: Callable tokenizer; it is invoked with the text plus the
            padding/truncation keyword arguments shown in ``__getitem__`` and
            must return a mapping with ``input_ids`` and ``attention_mask``
            tensors.
        max_len: Length every encoded sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _at(values, position):
        """Return the element at ``position``, via ``.iloc`` when available."""
        # A pandas Series may carry a non-contiguous index (e.g. after a
        # train/test split), so plain [] would look up labels, not positions.
        indexer = getattr(values, 'iloc', None)
        return values[position] if indexer is None else indexer[position]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self._at(self.texts, item))
        label = self._at(self.labels, item)

        # Calling the tokenizer directly replaces the deprecated encode_plus.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'text': text,
            # flatten() removes the leading batch dimension added by
            # return_tensors='pt' so the DataLoader can stack items itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

### 4.2. Fine-Tuning Loop

In [None]:
def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-d float64
        tensor (correct predictions / ``n_examples``) and ``mean_loss`` is the
        mean of the per-batch losses (NaN if the loader yields no batches).
    """
    model = model.train()
    losses = []
    # Tensor accumulator: .double() below must work even for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Clear gradients before the forward pass so leftovers from an earlier,
        # interrupted run cannot leak into this step.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        preds = outputs.logits.argmax(dim=1)
        correct_predictions += (preds == labels).sum()
        losses.append(loss.item())

        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    mean_loss = np.mean(losses) if losses else np.nan
    return correct_predictions.double() / n_examples, mean_loss

def eval_model(model, data_loader, device, n_examples):
    """Compute accuracy and mean loss over ``data_loader`` without training.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss`` and ``.logits``.
        data_loader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``labels``.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        tuple: ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-d float64
        tensor (correct predictions / ``n_examples``) and ``mean_loss`` is the
        mean of the per-batch losses (NaN if the loader yields no batches).
    """
    model = model.eval()
    losses = []
    # Tensor accumulator: .double() below must work even for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            preds = outputs.logits.argmax(dim=1)
            correct_predictions += (preds == labels).sum()
            losses.append(outputs.loss.item())

    mean_loss = np.mean(losses) if losses else np.nan
    return correct_predictions.double() / n_examples, mean_loss

def get_predictions(model, data_loader, device):
    """Collect predicted and true class ids for every example in ``data_loader``.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        data_loader: Iterable of batches keyed by ``input_ids``,
            ``attention_mask`` and ``labels``.
        device: Device the model inputs are moved to.

    Returns:
        tuple: ``(predictions, real_values)``, two 1-D CPU tensors holding one
        entry per example (both empty if the loader yields no batches).
    """
    model = model.eval()
    pred_batches = []
    label_batches = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Predicting"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Keep whole batches (not one 0-d tensor per sample) and move them
            # to the CPU right away; the labels never need to visit the device.
            pred_batches.append(outputs.logits.argmax(dim=1).cpu())
            label_batches.append(batch["labels"].cpu())

    if not pred_batches:
        # torch.cat rejects an empty list, so return empty results explicitly.
        return torch.empty(0, dtype=torch.long), torch.empty(0, dtype=torch.long)

    predictions = torch.cat(pred_batches)
    real_values = torch.cat(label_batches)
    return predictions, real_values

### 4.3. Fine-Tune IndoBERT

In [None]:
import numpy as np
# Train on the first GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
EPOCHS = 3
BATCH_SIZE = 16

# Load a fresh copy of the checkpoint so fine-tuning does not alter the model
# used for the direct evaluation above.
bert_ft_model = AutoModelForSequenceClassification.from_pretrained(bert_model_name)
bert_ft_model = bert_ft_model.to(device)

train_dataset = SentimentDataset(X_train, y_train, bert_tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

test_dataset = SentimentDataset(X_test, y_test, bert_tokenizer)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

optimizer = torch.optim.AdamW(bert_ft_model.parameters(), lr=2e-5)
# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_acc, train_loss = train_epoch(
        bert_ft_model, train_loader, optimizer, device, scheduler, len(X_train)
    )
    # float() turns the 0-d accuracy tensor into a plain number so the line
    # does not print a tensor repr.
    print(f'Train loss {train_loss} accuracy {float(train_acc)}')

y_pred, y_true = get_predictions(bert_ft_model, test_loader, device)
print("\nFine-Tuned IndoBERT Classification Report:")
# Pass NumPy arrays and explicit labels so each report row is tied to the right
# class name even if a class is missing from the predictions or the test labels.
print(classification_report(
    y_true.numpy(),
    y_pred.numpy(),
    labels=list(label_map.values()),
    target_names=list(label_map.keys()),
))

### 4.4. Fine-Tune IndoRoBERTa

In [None]:
# Load a fresh copy of the checkpoint so fine-tuning does not alter the model
# used for the direct evaluation above.
roberta_ft_model = AutoModelForSequenceClassification.from_pretrained(roberta_model_name)
roberta_ft_model = roberta_ft_model.to(device)

train_dataset_roberta = SentimentDataset(X_train, y_train, roberta_tokenizer)
train_loader_roberta = DataLoader(train_dataset_roberta, batch_size=BATCH_SIZE, shuffle=True)

test_dataset_roberta = SentimentDataset(X_test, y_test, roberta_tokenizer)
test_loader_roberta = DataLoader(test_dataset_roberta, batch_size=BATCH_SIZE)

optimizer_roberta = torch.optim.AdamW(roberta_ft_model.parameters(), lr=2e-5)
# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps_roberta = len(train_loader_roberta) * EPOCHS
scheduler_roberta = get_linear_schedule_with_warmup(
    optimizer_roberta,
    num_warmup_steps=0,
    num_training_steps=total_steps_roberta
)

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_acc, train_loss = train_epoch(
        roberta_ft_model, train_loader_roberta, optimizer_roberta, device, scheduler_roberta, len(X_train)
    )
    # float() turns the 0-d accuracy tensor into a plain number so the line
    # does not print a tensor repr.
    print(f'Train loss {train_loss} accuracy {float(train_acc)}')

y_pred_roberta, y_true_roberta = get_predictions(roberta_ft_model, test_loader_roberta, device)
print("\nFine-Tuned IndoRoBERTa Classification Report:")
# Pass NumPy arrays and explicit labels so each report row is tied to the right
# class name even if a class is missing from the predictions or the test labels.
print(classification_report(
    y_true_roberta.numpy(),
    y_pred_roberta.numpy(),
    labels=list(label_map.values()),
    target_names=list(label_map.keys()),
))

## 5. Conclusion

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the two off-the-shelf models on the held-out split
bert_accuracy, roberta_accuracy = (
    accuracy_score(y_test, preds) for preds in (bert_preds, roberta_preds)
)
# Accuracy of the two fine-tuned models, from the collected predictions
bert_ft_accuracy = accuracy_score(y_true, y_pred)
roberta_ft_accuracy = accuracy_score(y_true_roberta, y_pred_roberta)

# One row per approach, in the order they were evaluated
model_names = [
    'IndoBERT',
    'IndoRoBERTa',
    'IndoBERT (Fine-Tuned)',
    'IndoRoBERTa (Fine-Tuned)',
]
model_accuracies = [
    bert_accuracy,
    roberta_accuracy,
    bert_ft_accuracy,
    roberta_ft_accuracy,
]
conclusion_data = dict(Model=model_names, Accuracy=model_accuracies)

conclusion_df = pd.DataFrame(conclusion_data)
print(conclusion_df)

# Closing remarks, printed one message per call
for message in (
    "\n## Conclusion\n",
    "The table above summarizes the accuracy of the four different approaches. Based on the results, we can determine which model and approach works best for this dataset.",
    "\nAs a reminder, the classification reports are generated using the **test data**. This provides an unbiased evaluation of how the models perform on new, unseen data, which is the standard practice for assessing a model's true performance.",
):
    print(message)