In [46]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# The 'stopwords' corpus supplies the Indonesian stop-word list used in the
# preprocessing cell.
nltk.download('stopwords')
# NOTE(review): no visible cell uses the 'punkt' tokenizer models (tokens are
# produced with str.split) — confirm whether this download is still needed.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Preprocessing

In [47]:
# Indonesian stop words from the NLTK corpus, held in a set so that the
# membership test in preprocess_text does not scan a list.
stop_words = set()
stop_words.update(stopwords.words('indonesian'))

# Function to preprocess text
def preprocess_text(text, stopword_set=None):
    """Lower-case a document and remove stop words.

    Parameters
    ----------
    text : str or scalar
        Raw document. Missing values (``None``, ``NaN``, ``pd.NA``) are
        treated as an empty document; any other non-string scalar is
        converted with ``str`` before processing.
    stopword_set : collection of str, optional
        Words to drop. When omitted, the notebook-level ``stop_words`` set
        is used, which keeps ``Series.apply(preprocess_text)`` working
        unchanged.

    Returns
    -------
    str
        The remaining lower-cased tokens joined by single spaces.
    """
    if not isinstance(text, str):
        # Empty TSV fields are read by pandas as NaN, which has no .lower();
        # map them to an empty document instead of raising AttributeError.
        text = "" if pd.isna(text) else str(text)

    active_stopwords = stop_words if stopword_set is None else stopword_set

    # str.split() with no argument splits on any whitespace run, so repeated
    # spaces/tabs never yield empty tokens.
    kept_tokens = [
        token for token in text.lower().split()
        if token not in active_stopwords
    ]
    return ' '.join(kept_tokens)

# Read the three tab-separated splits. header=None makes pandas treat the
# first line as data, so columns are addressed by position.
data_train, data_valid, data_test = [
    pd.read_csv(tsv_path, sep='\t', header=None)
    for tsv_path in (
        "dataset/train_preprocess.tsv",
        "dataset/valid_preprocess.tsv",
        "dataset/test_preprocess.tsv",
    )
]

# Column 1 of every split is used as the target
y_train, y_valid, y_test = data_train[1], data_valid[1], data_test[1]

# Column 0 holds the raw text; keep its cleaned version alongside it
data_train['clean_data'] = data_train[0].apply(preprocess_text)
data_valid['clean_data'] = data_valid[0].apply(preprocess_text)
data_test['clean_data'] = data_test[0].apply(preprocess_text)

# The cleaned text is what the feature-extraction step consumes
X_train, X_valid, X_test = (
    data_train['clean_data'],
    data_valid['clean_data'],
    data_test['clean_data'],
)

## Feature Extraction

In [48]:
# Bag-of-words features: the vocabulary is learned from the training split only
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)

# Validation and test texts are only transformed with that fitted vocabulary.
# NOTE(review): X_valid_bow is not consumed by any cell visible here.
X_valid_bow, X_test_bow = (
    bow_vectorizer.transform(split_texts) for split_texts in (X_valid, X_test)
)

## Training and Evaluating Models

In [49]:
# Function to train and evaluate a model
def train_and_evaluate(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` and print its evaluation on the given test data.

    The model is fitted on ``X_train``/``y_train`` and used to predict
    ``X_test``. Accuracy, weighted precision, weighted recall, weighted F1
    and the per-class classification report are printed, in that order,
    under a ``Model: <model_name>`` heading. Nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Metrics that share the same call shape (class-support-weighted average)
    weighted_metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy_score(y_test, predictions)}")
    for label, scorer in weighted_metrics:
        print(f"{label}: {scorer(y_test, predictions, average='weighted')}")
    print(classification_report(y_test, predictions))

### With Logistic Regression

In [50]:
# max_iter is raised to 1000 for the logistic-regression solver
log_regression = LogisticRegression(max_iter=1000)
train_and_evaluate(
    model=log_regression,
    X_train=X_train_bow,
    X_test=X_test_bow,
    y_train=y_train,
    y_test=y_test,
    model_name="Logistic Regression",
)

Model: Logistic Regression
Accuracy: 0.752
Precision: 0.7540786834476674
Recall: 0.752
F1 Score: 0.7446503364301793
              precision    recall  f1-score   support

    negative       0.72      0.85      0.78       204
     neutral       0.74      0.45      0.56        88
    positive       0.79      0.78      0.79       208

    accuracy                           0.75       500
   macro avg       0.75      0.70      0.71       500
weighted avg       0.75      0.75      0.74       500



### With K-Nearest Neighbors (KNN)

In [51]:
# K-nearest-neighbours classifier with scikit-learn's default settings
KNN = KNeighborsClassifier()
train_and_evaluate(
    model=KNN,
    X_train=X_train_bow,
    X_test=X_test_bow,
    y_train=y_train,
    y_test=y_test,
    model_name="K-Nearest Neighbors",
)

Model: K-Nearest Neighbors
Accuracy: 0.458
Precision: 0.47430046082949306
Recall: 0.458
F1 Score: 0.4226230869001297
              precision    recall  f1-score   support

    negative       0.43      0.65      0.51       204
     neutral       0.50      0.05      0.08        88
    positive       0.51      0.45      0.48       208

    accuracy                           0.46       500
   macro avg       0.48      0.38      0.36       500
weighted avg       0.47      0.46      0.42       500



### With SVM

In [52]:
# Support-vector classifier with scikit-learn's default settings
SVM = SVC()
train_and_evaluate(
    model=SVM,
    X_train=X_train_bow,
    X_test=X_test_bow,
    y_train=y_train,
    y_test=y_test,
    model_name="SVM",
)

Model: SVM
Accuracy: 0.678
Precision: 0.7404846118964531
Recall: 0.678
F1 Score: 0.6547846932130185
              precision    recall  f1-score   support

    negative       0.58      0.91      0.71       204
     neutral       0.86      0.22      0.35        88
    positive       0.84      0.64      0.73       208

    accuracy                           0.68       500
   macro avg       0.76      0.59      0.60       500
weighted avg       0.74      0.68      0.65       500

