# Tugas Pemrograman Text Classification menggunakan ML
## 13521009 - Christophorus Dharma Winata

# 0. Notebook setup

In [33]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus (NLTK reports it as up-to-date when it is
# already present), then load the Indonesian stopword list.
nltk.download('stopwords')
stopword_indo = stopwords.words('indonesian')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christodharma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1. Memuat dataset

In [22]:
# All splits are read the same way: tab-separated, no header row, and the two
# columns are named 'text' and 'label'.
_TSV_READ_KWARGS = {'sep': '\t', 'header': None, 'names': ['text', 'label']}

train_preprocess = pd.read_csv('dataset/train_preprocess.tsv', **_TSV_READ_KWARGS)
valid_preprocess = pd.read_csv('dataset/valid_preprocess.tsv', **_TSV_READ_KWARGS)
test_preprocess = pd.read_csv('dataset/test_preprocess.tsv', **_TSV_READ_KWARGS)
test_preprocess_masked_label = pd.read_csv(
    'dataset/test_preprocess_masked_label.tsv', **_TSV_READ_KWARGS
)

# 2. Preprocessing

In [23]:
# Merge the train and validation splits into one training frame.
# ignore_index=True gives the result a unique 0..n-1 index instead of the two
# overlapping index ranges of the source frames.
data_df = pd.concat([train_preprocess, valid_preprocess], ignore_index=True)

# Work on a copy so the raw test frame is never mutated. With a plain alias
# (test_df = test_preprocess) the label column of test_preprocess is
# overwritten with integers, and re-running this cell then hands those
# integers to label_encoder.transform, which rejects labels it was not
# fitted on.
test_df = test_preprocess.copy()

# Lowercase all text
data_df['text'] = data_df['text'].str.lower()
test_df['text'] = test_df['text'].str.lower()

# Fit the encoder on the training labels only, then reuse the same mapping
# for the test labels.
label_encoder = LabelEncoder()
data_df['label'] = label_encoder.fit_transform(data_df['label'])
test_df['label'] = label_encoder.transform(test_df['label'])

# Split into X (features) and y (labels)
X_train = data_df['text']
y_train = data_df['label']
X_test = test_df['text']
y_test = test_df['label']

# 3. Feature extraction

In [25]:
# Bag-of-words features: token counts with the Indonesian stopwords removed
# and the vocabulary capped at 1000 terms. The vocabulary is learned from the
# training text only and then applied unchanged to the test text.
vectorizer = CountVectorizer(
    max_features=1000,
    stop_words=stopword_indo,
)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)



# 4. Model training

Tiga model yang digunakan:
1. Logistic Regression
2. Naive Bayes
3. Support Vector Machine (SVM)

In [31]:
# Each classifier is fitted on the same bag-of-words training matrix and then
# used to predict the test set. fit() returns the estimator itself, so the
# construction and fitting are chained.

# Logistic Regression (iteration limit raised to 1000)
logreg = LogisticRegression(max_iter=1000).fit(X_train_bow, y_train)
y_pred_logreg = logreg.predict(X_test_bow)

# Multinomial Naive Bayes
nb = MultinomialNB().fit(X_train_bow, y_train)
y_pred_nb = nb.predict(X_test_bow)

# Support Vector Machine with a linear kernel
svm = SVC(kernel='linear').fit(X_train_bow, y_train)
y_pred_svm = svm.predict(X_test_bow)

# 5. Model evaluation

In [32]:
def evaluate_model(y_true, y_pred):
    """Score a set of predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)`` where precision,
        recall and F1 are macro-averaged.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # The three class-wise metrics share the same call signature, so compute
    # them in one pass, keeping the precision -> recall -> F1 order.
    macro_precision, macro_recall, macro_f1 = (
        metric(y_true, y_pred, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    )
    return accuracy, macro_precision, macro_recall, macro_f1

# Evaluate every model's test-set predictions with the same metric set.
logreg_results = evaluate_model(y_test, y_pred_logreg)
nb_results = evaluate_model(y_test, y_pred_nb)
svm_results = evaluate_model(y_test, y_pred_svm)

# Report one line per model: (accuracy, precision, recall, f1).
for _heading, _scores in (
    ("Logistic Regression Results: ", logreg_results),
    ("Naive Bayes Results: ", nb_results),
    ("SVM Results: ", svm_results),
):
    print(_heading, _scores)

Logistic Regression Results:  (0.718, np.float64(0.709477620804852), np.float64(0.6615818821701175), np.float64(0.6728771705300874))
Naive Bayes Results:  (0.574, np.float64(0.5419683249140613), np.float64(0.5310257552904611), np.float64(0.5178924489739062))
SVM Results:  (0.718, np.float64(0.7354401154401154), np.float64(0.668106403400521), np.float64(0.6846018332616475))
