# Sentiment Analysis of Indonesian SMS Data (Traditional ML)

## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO
import torch
from transformers import BertTokenizer, BertModel
import gensim.downloader as api
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

nltk.download('punkt')
nltk.download('stopwords')

In [None]:
df = pd.read_csv('IDSMSA.csv')
df.head()

## 2. Data Preprocessing and Splitting

In [None]:
# Check for missing values
print(df.isnull().sum())

# Check class distribution
print(df['Sentiment'].value_counts())

In [None]:
# Clean and map sentiment labels
df.dropna(subset=['Sentiment'], inplace=True)
df['Sentiment'] = df['Sentiment'].str.lower().str.strip()
label_map = {'positive': 0, 'neutral': 1, 'negative': 2}
df['label'] = df['Sentiment'].map(label_map)
df.dropna(subset=['label'], inplace=True)
df['label'] = df['label'].astype(int)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    df['Sentence'], 
    df['label'], 
    test_size=0.2, 
    random_state=42, 
    stratify=df['label']
)

print(f'Train set size: {len(X_train)}')
print(f'Test set size: {len(X_test)}')

## 3. Traditional ML Model Comparison

### 3.1. Indonesian Text Preprocessing

In [None]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stopword_factory = StopWordRemoverFactory()
stopword_remover = stopword_factory.create_stop_word_remover()

def preprocess_text(text, use_stopwords=True, use_stemming=True):
    text = text.lower()
    if use_stopwords:
        text = stopword_remover.remove(text)
    if use_stemming:
        text = stemmer.stem(text)
    return text

### 3.2. Advanced Indonesian Text Preprocessing

In [None]:
# More comprehensive stopword list
stopword_list = nltk.corpus.stopwords.words('indonesian')

def convert_emoticons(text):
    for emot in EMOTICONS_EMO:
        text = text.replace(emot, '_'.join(EMOTICONS_EMO[emot].replace(',', '').split()))
    return text

def preprocess_text_advanced(text, use_stopwords=True, use_stemming=True):
    text = convert_emoticons(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s_]', '', text) # Remove punctuation and special chars
    tokens = word_tokenize(text)
    if use_stopwords:
        tokens = [word for word in tokens if word not in stopword_list]
    text = ' '.join(tokens)
    if use_stemming:
        text = stemmer.stem(text)
    return text

### 3.3. TF-IDF Vectorization and Model Training

In [None]:
preprocessing_options = {
    'Stemming + Stopwords (Sastrawi)': lambda x: preprocess_text(x, use_stopwords=True, use_stemming=True),
    'Stemming Only (Sastrawi)': lambda x: preprocess_text(x, use_stopwords=False, use_stemming=True),
    'Stopwords Only (Sastrawi)': lambda x: preprocess_text(x, use_stopwords=True, use_stemming=False),
    'No Preprocessing': lambda x: x,
    'Advanced (Stem+Stop+Emoticon)': lambda x: preprocess_text_advanced(x, use_stopwords=True, use_stemming=True),
    'Advanced (Stop+Emoticon)': lambda x: preprocess_text_advanced(x, use_stopwords=True, use_stemming=False),
    'Advanced (Stem+Emoticon)': lambda x: preprocess_text_advanced(x, use_stopwords=False, use_stemming=True),
}

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'SVM': LinearSVC()
}

results = []

for pp_name, pp_func in preprocessing_options.items():
    print(f'--- Preprocessing: {pp_name} ---')
    tfidf_vectorizer = TfidfVectorizer(preprocessor=pp_func)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    
    for model_name, model in models.items():
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)
        report = classification_report(y_test, y_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        results.append({
            'Model': model_name,
            'Preprocessing': pp_name,
            'Accuracy': report['accuracy'],
            'F1-Score (Weighted)': report['weighted avg']['f1-score']
        })
        print(f'\n--- {model_name} with {pp_name} ---')
        print(classification_report(y_test, y_pred, target_names=label_map.keys(), zero_division=0))

results_df_tfidf = pd.DataFrame(results)
print(results_df_tfidf.sort_values(by='Accuracy', ascending=False))

## 4. Embedding-based Model Comparison

### 4.1. Load Embedding Models

In [22]:
# Load local BERT model
bert_tokenizer = BertTokenizer.from_pretrained('./indonesia-bert-sentiment-classification')
bert_model = BertModel.from_pretrained('./indonesia-bert-sentiment-classification')

In [21]:
# Load pre-trained FastText model
fasttext_model = api.load('fasttext-wiki-news-subwords-300')



### 4.2. Create Document Vectors

In [23]:
def get_bert_embedding(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

def get_fasttext_embedding(text, model):
    tokens = word_tokenize(text.lower())
    vectors = [model[word] for word in tokens if word in model]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

# Create embeddings for the datasets
X_train_bert = np.array([get_bert_embedding(text, bert_tokenizer, bert_model) for text in X_train])
X_test_bert = np.array([get_bert_embedding(text, bert_tokenizer, bert_model) for text in X_test])

X_train_ft = np.array([get_fasttext_embedding(text, fasttext_model) for text in X_train])
X_test_ft = np.array([get_fasttext_embedding(text, fasttext_model) for text in X_test])

### 4.3. Train and Evaluate Models on Embeddings

In [24]:
embedding_results = []
embedding_data = {
    'BERT': (X_train_bert, X_test_bert),
    'FastText': (X_train_ft, X_test_ft)
}

embedding_models = {
    'Logistic Regression': LogisticRegression(max_iter=2000),
    'SVM': LinearSVC(max_iter=2000)
}

for emb_name, (X_train_emb, X_test_emb) in embedding_data.items():
    for model_name, model in embedding_models.items():
        print(f'\n--- {model_name} with {emb_name} Embeddings ---')
        model.fit(X_train_emb, y_train)
        y_pred = model.predict(X_test_emb)
        report = classification_report(y_test, y_pred, target_names=label_map.keys(), output_dict=True, zero_division=0)
        embedding_results.append({
            'Model': model_name,
            'Preprocessing': f'{emb_name} Embeddings',
            'Accuracy': report['accuracy'],
            'F1-Score (Weighted)': report['weighted avg']['f1-score']
        })
        print(classification_report(y_test, y_pred, target_names=label_map.keys(), zero_division=0))

results_df_emb = pd.DataFrame(embedding_results)
print(results_df_emb.sort_values(by='Accuracy', ascending=False))


--- Logistic Regression with BERT Embeddings ---
              precision    recall  f1-score   support

    positive       0.76      0.82      0.79       354
     neutral       0.65      0.61      0.63       147
    negative       0.68      0.60      0.64       157

    accuracy                           0.72       658
   macro avg       0.70      0.68      0.68       658
weighted avg       0.72      0.72      0.72       658


--- SVM with BERT Embeddings ---
              precision    recall  f1-score   support

    positive       0.76      0.79      0.78       354
     neutral       0.64      0.62      0.63       147
    negative       0.63      0.59      0.61       157

    accuracy                           0.71       658
   macro avg       0.68      0.67      0.67       658
weighted avg       0.70      0.71      0.70       658


--- Logistic Regression with FastText Embeddings ---
              precision    recall  f1-score   support

    positive       0.56      0.97      0.71  

## 5. Final Results Comparison

In [25]:
final_results = pd.concat([results_df_tfidf, results_df_emb], ignore_index=True)
print("--- All Results ---")
print(final_results.sort_values(by='Accuracy', ascending=False))

--- All Results ---
                  Model                    Preprocessing  Accuracy  \
5                   SVM         Stemming Only (Sastrawi)  0.772036   
20                  SVM         Advanced (Stem+Emoticon)  0.762918   
14                  SVM    Advanced (Stem+Stop+Emoticon)  0.759878   
2                   SVM  Stemming + Stopwords (Sastrawi)  0.755319   
17                  SVM         Advanced (Stop+Emoticon)  0.752280   
8                   SVM        Stopwords Only (Sastrawi)  0.750760   
11                  SVM                 No Preprocessing  0.747720   
18  Logistic Regression         Advanced (Stem+Emoticon)  0.746201   
3   Logistic Regression         Stemming Only (Sastrawi)  0.735562   
15  Logistic Regression         Advanced (Stop+Emoticon)  0.735562   
12  Logistic Regression    Advanced (Stem+Stop+Emoticon)  0.734043   
0   Logistic Regression  Stemming + Stopwords (Sastrawi)  0.729483   
6   Logistic Regression        Stopwords Only (Sastrawi)  0.729483   
