# Sentiment Analysis of Indonesian SMS Data (Traditional ML)

## 1. Setup and Data Loading

In [1]:
# --- Standard & Data Handling ---
import re
import pandas as pd
import numpy as np

# --- Plotting & Visualization ---
import matplotlib.pyplot as plt
import seaborn as sns

# --- NLP & Text Processing ---
import nltk
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.tokenize import word_tokenize
import gensim.downloader as api

# --- Deep Learning (Original Imports) ---
import torch
from transformers import BertTokenizer, BertModel

# --- Machine Learning (Scikit-learn and LightGBM) ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV # Added GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix # Added confusion_matrix
import lightgbm as lgb # Added LightGBM

# --- Download required NLTK data ---
nltk.download('punkt')
nltk.download('stopwords')

print("All necessary libraries have been imported.")

All necessary libraries have been imported.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tertius\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tertius\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv('IDSMSA.csv')
df.head()

Unnamed: 0,Tweet Date,Sentence,Quote Count,Reply Count,Retweet Count,Favorite Count,Sentiment,English Translation
0,Thu Feb 29 11:21:27 +0000 2024,"Gk muluk muluk, 100,000 lot saham BBCA aja",0,0,0,0,Positive,"Not too ambitious, just 100,000 lots of BBCA s..."
1,Thu Feb 29 10:11:05 +0000 2024,BCA Expoversary 2024 menawarkan promo suku bun...,0,0,0,0,Neutral,BCA Expoversary 2024 offers special interest r...
2,Thu Feb 29 10:06:04 +0000 2024,[USERNAME] saham bca nya menyusul ya 🙂,0,0,0,0,Positive,[USERNAME] BCA shares will follow 🙂
3,Thu Feb 29 07:42:09 +0000 2024,PT Bank BCA Syariah (BCA Syariah) turut memeri...,0,0,0,0,Neutral,PT Bank BCA Syariah (BCA Syariah) also enliven...
4,Thu Feb 29 06:06:17 +0000 2024,[USERNAME] Begitu byk saham kamu memilih saham...,0,0,0,1,Positive,[USERNAME] So many stocks you choose those sto...


## 2. Data Preprocessing and Splitting

In [3]:
# Check for missing values
print(df.isnull().sum())

# Check class distribution
print(df['Sentiment'].value_counts())

Tweet Date             0
Sentence               0
Quote Count            0
Reply Count            0
Retweet Count          0
Favorite Count         0
Sentiment              0
English Translation    0
dtype: int64
Sentiment
Positive    1769
Negative     786
Neutral      733
Name: count, dtype: int64


In [4]:
# Clean and map sentiment labels
df.dropna(subset=['Sentiment'], inplace=True)
df['Sentiment'] = df['Sentiment'].str.lower().str.strip()
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
df['label'] = df['Sentiment'].map(label_map)
df.dropna(subset=['label'], inplace=True)
df['label'] = df['label'].astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    df['Sentence'], 
    df['label'], 
    test_size=0.2, 
    random_state=42, 
    stratify=df['label']
)

print(f'Train set size: {len(X_train)}')
print(f'Test set size: {len(X_test)}')

Train set size: 2630
Test set size: 658


## 3. Traditional ML Model Comparison

### 3.1. Indonesian Text Preprocessing

In [5]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

def preprocess_text(text, use_stopwords=True, use_stemming=True):
    text = text.lower()
    if use_stopwords:
        text = stopword_remover.remove(text)
    if use_stemming:
        text = stemmer.stem(text)
    return text

### 3.2. Advanced Indonesian Text Preprocessing

In [6]:
# More comprehensive stopword list
stopword_list = nltk.corpus.stopwords.words('indonesian')

def convert_emoticons(text):
    for emot in EMOTICONS_EMO:
        text = text.replace(emot, '_'.join(EMOTICONS_EMO[emot].replace(',', '').split()))
    return text

def preprocess_text_advanced(text, use_stopwords=True, use_stemming=True):
    text = convert_emoticons(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s_]', '', text) # Remove punctuation and special chars
    tokens = word_tokenize(text)
    if use_stopwords:
        tokens = [word for word in tokens if word not in stopword_list]
    text = ' '.join(tokens)
    if use_stemming:
        text = stemmer.stem(text)
    return text

### 3.3. TF-IDF Vectorization and Model Training

In [7]:
preprocessing_options = {
    'Stemming + Stopwords (Sastrawi)': lambda x: preprocess_text(x, use_stopwords=True, use_stemming=True),
    'Stemming Only (Sastrawi)': lambda x: preprocess_text(x, use_stopwords=False, use_stemming=True),
    'Stopwords Only (Sastrawi)': lambda x: preprocess_text(x, use_stopwords=True, use_stemming=False),
    'No Preprocessing': lambda x: x,
    'Advanced (Stem+Stop+Emoticon)': lambda x: preprocess_text_advanced(x, use_stopwords=True, use_stemming=True),
    'Advanced (Stop+Emoticon)': lambda x: preprocess_text_advanced(x, use_stopwords=True, use_stemming=False),
    'Advanced (Stem+Emoticon)': lambda x: preprocess_text_advanced(x, use_stopwords=False, use_stemming=True),
}

In [8]:
# --- Define Models with Hyperparameter Grids and Enhancements ---
models_and_params = [
    {
        'model': LogisticRegression(
            max_iter=2000, 
            class_weight='balanced' # Handles class imbalance
        ),
        'params': {
            'C': [0.1, 1, 10]
        }
    },
    {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.1, 0.5, 1.0]
        }
    },
    {
        'model': LinearSVC(
            max_iter=5000, 
            class_weight='balanced', # Handles class imbalance
            dual=True 
        ),
        'params': {
            'C': [0.1, 1, 10]
        }
    },
    {
        'model': lgb.LGBMClassifier(
            objective='multiclass', 
            random_state=42
        ),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'num_leaves': [20, 31]
        }
    }
]

# --- Run the Main Experiment Loop and Print Results ---
results = []

for pp_name, pp_func in preprocessing_options.items():
    print(f'--- Preprocessing: {pp_name} ---')
    print('='*80)
    
    # Apply the preprocessing with N-grams
    tfidf_vectorizer = TfidfVectorizer(
        preprocessor=pp_func,
        ngram_range=(1, 2) # Use both unigrams and bigrams
    )
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    
    for model_info in models_and_params:
        model = model_info['model']
        params = model_info['params']
        model_name = model.__class__.__name__

        # Set up and run GridSearchCV
        grid_search = GridSearchCV(model, params, cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(X_train_tfidf, y_train)
        
        # Get the best model and evaluate it
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test_tfidf)
        report = classification_report(y_test, y_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        
        # Store results
        results.append({
            'Model': model_name,
            'Preprocessing': pp_name,
            'Best Params': grid_search.best_params_,
            'Accuracy': report['accuracy'],
            'F1-Score (Weighted)': report['weighted avg']['f1-score']
        })
        
        # Print the intermediate report for each combination
        print(f'\n--- {model_name} with {pp_name} ---')
        print(f'Best Hyperparameters: {grid_search.best_params_}')
        print(classification_report(y_test, y_pred, target_names=label_map.keys(), zero_division=0))

# --- Display Final Comparative Results Table ---
results_df_tfidf = pd.DataFrame(results)
print("\n--- Final Results with Hyperparameter Tuning ---")
# Display relevant columns for clarity
display_cols = ['Model', 'Preprocessing', 'Best Params', 'Accuracy', 'F1-Score (Weighted)']
print(results_df_tfidf[display_cols].sort_values(by='Accuracy', ascending=False))

--- Preprocessing: Stemming + Stopwords (Sastrawi) ---
Fitting 5 folds for each of 3 candidates, totalling 15 fits

--- LogisticRegression with Stemming + Stopwords (Sastrawi) ---
Best Hyperparameters: {'C': 10}
              precision    recall  f1-score   support

    positive       0.81      0.85      0.83       354
     neutral       0.66      0.54      0.60       147
    negative       0.71      0.76      0.73       157

    accuracy                           0.76       658
   macro avg       0.73      0.72      0.72       658
weighted avg       0.75      0.76      0.75       658

Fitting 5 folds for each of 3 candidates, totalling 15 fits

--- MultinomialNB with Stemming + Stopwords (Sastrawi) ---
Best Hyperparameters: {'alpha': 0.1}
              precision    recall  f1-score   support

    positive       0.70      0.94      0.80       354
     neutral       0.86      0.37      0.51       147
    negative       0.76      0.57      0.65       157

    accuracy                    




--- LGBMClassifier with Stemming + Stopwords (Sastrawi) ---
Best Hyperparameters: {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 20}
              precision    recall  f1-score   support

    positive       0.76      0.88      0.81       354
     neutral       0.67      0.45      0.54       147
    negative       0.72      0.70      0.71       157

    accuracy                           0.74       658
   macro avg       0.72      0.68      0.69       658
weighted avg       0.73      0.74      0.73       658

--- Preprocessing: Stemming Only (Sastrawi) ---
Fitting 5 folds for each of 3 candidates, totalling 15 fits

--- LogisticRegression with Stemming Only (Sastrawi) ---
Best Hyperparameters: {'C': 10}
              precision    recall  f1-score   support

    positive       0.83      0.85      0.84       354
     neutral       0.69      0.63      0.65       147
    negative       0.73      0.74      0.73       157

    accuracy                           0.78       658
   




--- LGBMClassifier with Stemming Only (Sastrawi) ---
Best Hyperparameters: {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 20}
              precision    recall  f1-score   support

    positive       0.75      0.88      0.81       354
     neutral       0.67      0.52      0.58       147
    negative       0.73      0.61      0.66       157

    accuracy                           0.73       658
   macro avg       0.72      0.67      0.69       658
weighted avg       0.73      0.73      0.73       658

--- Preprocessing: Stopwords Only (Sastrawi) ---
Fitting 5 folds for each of 3 candidates, totalling 15 fits

--- LogisticRegression with Stopwords Only (Sastrawi) ---
Best Hyperparameters: {'C': 10}
              precision    recall  f1-score   support

    positive       0.80      0.84      0.82       354
     neutral       0.66      0.56      0.61       147
    negative       0.71      0.71      0.71       157

    accuracy                           0.75       658
   macro




--- LGBMClassifier with Stopwords Only (Sastrawi) ---
Best Hyperparameters: {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 20}
              precision    recall  f1-score   support

    positive       0.74      0.88      0.80       354
     neutral       0.66      0.39      0.49       147
    negative       0.67      0.63      0.65       157

    accuracy                           0.71       658
   macro avg       0.69      0.63      0.65       658
weighted avg       0.70      0.71      0.70       658

--- Preprocessing: No Preprocessing ---
Fitting 5 folds for each of 3 candidates, totalling 15 fits

--- LogisticRegression with No Preprocessing ---
Best Hyperparameters: {'C': 10}
              precision    recall  f1-score   support

    positive       0.81      0.84      0.82       354
     neutral       0.66      0.62      0.64       147
    negative       0.70      0.69      0.69       157

    accuracy                           0.75       658
   macro avg       0.72  




--- LGBMClassifier with No Preprocessing ---
Best Hyperparameters: {'learning_rate': 0.1, 'n_estimators': 100, 'num_leaves': 31}
              precision    recall  f1-score   support

    positive       0.71      0.86      0.78       354
     neutral       0.61      0.46      0.52       147
    negative       0.63      0.48      0.55       157

    accuracy                           0.68       658
   macro avg       0.65      0.60      0.61       658
weighted avg       0.67      0.68      0.66       658

--- Preprocessing: Advanced (Stem+Stop+Emoticon) ---
Fitting 5 folds for each of 3 candidates, totalling 15 fits

--- LogisticRegression with Advanced (Stem+Stop+Emoticon) ---
Best Hyperparameters: {'C': 10}
              precision    recall  f1-score   support

    positive       0.82      0.84      0.83       354
     neutral       0.64      0.57      0.60       147
    negative       0.72      0.76      0.74       157

    accuracy                           0.76       658
   macro 




--- LGBMClassifier with Advanced (Stem+Stop+Emoticon) ---
Best Hyperparameters: {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 20}
              precision    recall  f1-score   support

    positive       0.75      0.87      0.81       354
     neutral       0.65      0.46      0.54       147
    negative       0.70      0.64      0.67       157

    accuracy                           0.72       658
   macro avg       0.70      0.66      0.67       658
weighted avg       0.72      0.72      0.71       658

--- Preprocessing: Advanced (Stop+Emoticon) ---
Fitting 5 folds for each of 3 candidates, totalling 15 fits

--- LogisticRegression with Advanced (Stop+Emoticon) ---
Best Hyperparameters: {'C': 10}
              precision    recall  f1-score   support

    positive       0.81      0.84      0.83       354
     neutral       0.65      0.59      0.62       147
    negative       0.73      0.73      0.73       157

    accuracy                           0.76       658
   ma




--- LGBMClassifier with Advanced (Stop+Emoticon) ---
Best Hyperparameters: {'learning_rate': 0.05, 'n_estimators': 200, 'num_leaves': 20}
              precision    recall  f1-score   support

    positive       0.74      0.86      0.80       354
     neutral       0.57      0.41      0.48       147
    negative       0.68      0.60      0.64       157

    accuracy                           0.70       658
   macro avg       0.66      0.63      0.64       658
weighted avg       0.69      0.70      0.69       658

--- Preprocessing: Advanced (Stem+Emoticon) ---
Fitting 5 folds for each of 3 candidates, totalling 15 fits

--- LogisticRegression with Advanced (Stem+Emoticon) ---
Best Hyperparameters: {'C': 10}
              precision    recall  f1-score   support

    positive       0.83      0.85      0.84       354
     neutral       0.66      0.63      0.64       147
    negative       0.74      0.74      0.74       157

    accuracy                           0.77       658
   macro a

