In [16]:
import re

import nltk
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from nltk.corpus import stopwords
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Fetch the NLTK data packages requested by this notebook. The captured output
# shows both were already up to date locally, so nothing was re-downloaded.
# 'stopwords' backs the stopwords.words('indonesian') lookup in the preprocessing cell.
nltk.download('stopwords')
# NOTE(review): nothing in the visible cells appears to use the 'punkt' package
# (preprocess_text tokenizes with str.split) — confirm it is still needed.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Preprocessing

In [17]:
# Indonesian stop words from NLTK, kept as a set because preprocess_text
# checks membership for every token
indonesian_stopword_list = stopwords.words('indonesian')
stop_words = set(indonesian_stopword_list)

# Function to preprocess text
def preprocess_text(text, stopword_set=None):
    """Lower-case ``text``, split it on whitespace and drop stop words.

    Args:
        text: Raw document. Missing values (``None``/NaN, which pandas produces
            for empty cells) yield an empty string instead of raising; other
            non-string values are converted with ``str()`` first.
        stopword_set: Optional collection of lower-case words to remove.
            Defaults to the notebook-level ``stop_words`` set.

    Returns:
        The remaining tokens joined by single spaces (``''`` if none remain).
    """
    if not isinstance(text, str):
        # read_csv yields float NaN for empty cells; calling .lower() on it would fail
        if pd.isna(text):
            return ''
        text = str(text)

    if stopword_set is None:
        # Resolved at call time so the notebook-level set is only needed when
        # the caller does not supply their own
        stopword_set = stop_words

    tokens = text.lower().split()
    return ' '.join(word for word in tokens if word not in stopword_set)

# Read both tab-separated splits; the files have no header row, so the two
# columns are named explicitly
tsv_options = dict(sep='\t', header=None, names=['text', 'label'])
data_train = pd.read_csv("dataset/train_preprocess.tsv", **tsv_options)
data_test = pd.read_csv("dataset/test_preprocess.tsv", **tsv_options)

# Replace the raw text column of each split with its preprocessed version
for frame in (data_train, data_test):
    frame['text'] = frame['text'].apply(preprocess_text)

In [18]:
# Bag-of-words feature extractor (default CountVectorizer settings)
vectorizer = CountVectorizer()

# Hold out 20% of the training file for evaluation, with a fixed seed for
# reproducibility.
# NOTE(review): the split is not stratified and data_test is not used here —
# confirm this is intended.
texts, labels = data_train['text'], data_train['label']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Oversampler template used to balance the classes; each pipeline gets its own copy
oversampler = RandomOverSampler(random_state=42)


def build_pipeline(classifier):
    """Build a vectorizer -> oversampler -> classifier pipeline.

    The pipeline receives unfitted clones of the shared ``vectorizer`` and
    ``oversampler`` templates rather than the template objects themselves, so
    fitting one pipeline cannot overwrite the fitted state of another.

    Args:
        classifier: Estimator to use as the final ``'classifier'`` step.

    Returns:
        An unfitted ``ImbPipeline`` with the steps ``'vectorizer'``,
        ``'oversampler'`` and ``'classifier'``.
    """
    return ImbPipeline([
        ('vectorizer', clone(vectorizer)),
        ('oversampler', clone(oversampler)),
        ('classifier', classifier),
    ])


# Pipeline SVM with oversampling
pipeline_svm = build_pipeline(SVC())

# Pipeline Naive Bayes
pipeline_nb = build_pipeline(MultinomialNB())

# Pipeline Logistic Regression (max_iter raised to 1000)
pipeline_logreg = build_pipeline(LogisticRegression(max_iter=1000))

# Hyperparameter tuning for Logistic Regression: the grid covers the
# regularisation strength C and the solver of the 'classifier' step
logreg_param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__solver': ['liblinear', 'lbfgs']}

# 5-fold cross-validated search over the grid above
logreg_grid = GridSearchCV(pipeline_logreg, logreg_param_grid, cv=5)


## Training and Evaluating the Models

In [20]:
# Fit a model on the training split and print its metrics on the held-out split
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model``, predict on ``X_test`` and print an evaluation summary.

    The summary contains the model name, accuracy, weighted precision, recall
    and F1 score, followed by the per-class classification report. The
    function prints only; it returns nothing.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Metrics that share the same call signature and weighted averaging
    weighted_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")
    for label, metric in weighted_metrics:
        print(f"{label}: {metric(y_test, predictions, average='weighted')}")
    print(classification_report(y_test, predictions))

## Evaluating the Model with Naive Bayes

In [21]:
# Train the Naive Bayes pipeline and report its metrics on the held-out split
train_and_evaluate(
    model=pipeline_nb,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Naive Bayes",
)


Model: Naive Bayes
Accuracy: 0.8313636363636364
Precision: 0.8303951673296521
Recall: 0.8313636363636364
F1 Score: 0.8306650149195587
              precision    recall  f1-score   support

    negative       0.78      0.74      0.76       680
     neutral       0.74      0.76      0.75       239
    positive       0.87      0.89      0.88      1281

    accuracy                           0.83      2200
   macro avg       0.80      0.80      0.80      2200
weighted avg       0.83      0.83      0.83      2200



## Evaluating the Model with Logistic Regression

In [22]:
# Evaluate Logistic Regression after hyperparameter tuning with Grid Search.
# train_and_evaluate() fits the model it is given, so the cross-validated search
# runs inside it; fitting logreg_grid beforehand would run the whole search twice.
train_and_evaluate(logreg_grid, X_train, X_test, y_train, y_test, "Logistic Regression (with tuning)")


Model: Logistic Regression (with tuning)
Accuracy: 0.8418181818181818
Precision: 0.8466360088430017
Recall: 0.8418181818181818
F1 Score: 0.8434669697208951
              precision    recall  f1-score   support

    negative       0.77      0.81      0.79       680
     neutral       0.72      0.80      0.76       239
    positive       0.91      0.87      0.89      1281

    accuracy                           0.84      2200
   macro avg       0.80      0.83      0.81      2200
weighted avg       0.85      0.84      0.84      2200



## Evaluating the Model with SVM

In [23]:
# Train the SVM pipeline and report its metrics on the held-out split
train_and_evaluate(
    model=pipeline_svm,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="SVM",
)

Model: SVM
Accuracy: 0.8340909090909091
Precision: 0.8352122612486028
Recall: 0.8340909090909091
F1 Score: 0.8345824905037513
              precision    recall  f1-score   support

    negative       0.75      0.77      0.76       680
     neutral       0.78      0.77      0.77       239
    positive       0.89      0.88      0.89      1281

    accuracy                           0.83      2200
   macro avg       0.81      0.81      0.81      2200
weighted avg       0.84      0.83      0.83      2200

