# Tugas 2

## Preprocessing

In [56]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



# Load the SMS dataset (read with latin-1 encoding), keep only the label and
# message columns, and give them descriptive names. The rename is chained so a
# new frame is produced instead of mutating a column subset in place, which
# avoids pandas' SettingWithCopyWarning and keeps the cell idempotent.
data = (
    pd.read_csv('data/spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'Labels', 'v2': 'SMS'})
)

# Show the first 5 rows, the shape, the column summary and basic statistics.
display(data.head())
display(data.shape)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" in the output.
data.info()
display(data.describe())

Unnamed: 0,Labels,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


(5572, 2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Labels  5572 non-null   object
 1   SMS     5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


None

Unnamed: 0,Labels,SMS
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [57]:
# Features are the raw message texts; the target is the class label column.
X, y = data['SMS'], data['Labels']

# Hold out 20% of the rows for testing. The split is stratified on the labels
# and uses a fixed random_state so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


## MODEL 1: CountVectorizer

In [None]:
# Bag-of-words features: token counts with scikit-learn's built-in English
# stop-word list removed. The vocabulary is fitted on the training split only;
# the test split is transformed with that same vocabulary.
cv = CountVectorizer(stop_words='english')
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
model_cv = MultinomialNB().fit(X_train_cv, y_train)
y_pred_cv = model_cv.predict(X_test_cv)

# Report accuracy, the confusion matrix and per-class metrics on the test split.
print("=== Hasil Evaluasi: CountVectorizer ===")
print("Akurasi:", accuracy_score(y_true=y_test, y_pred=y_pred_cv))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_cv))
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_cv),
)


=== Hasil Evaluasi: CountVectorizer ===
Akurasi: 0.9838565022421525

Confusion Matrix:
 [[960   6]
 [ 12 137]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99       966
        spam       0.96      0.92      0.94       149

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



## MODEL 2: TF-IDF Vectorizer

In [None]:
# TF-IDF features with scikit-learn's built-in English stop-word list removed.
# The vectorizer is fitted on the training split only; the test split is
# transformed with the statistics learned from training data.
tfidf = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
model_tfidf = MultinomialNB().fit(X_train_tfidf, y_train)
y_pred_tfidf = model_tfidf.predict(X_test_tfidf)

# Report accuracy, the confusion matrix and per-class metrics on the test split.
print("\n=== Hasil Evaluasi: TF-IDF ===")
print("Akurasi:", accuracy_score(y_true=y_test, y_pred=y_pred_tfidf))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_tfidf))
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_tfidf),
)



=== Hasil Evaluasi: TF-IDF ===
Akurasi: 0.968609865470852

Confusion Matrix:
 [[966   0]
 [ 35 114]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       966
        spam       1.00      0.77      0.87       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



## Membandingkan Hasil

In [60]:
# Compute test-set accuracy for both models in one pass; the generator yields
# the CountVectorizer score first, then the TF-IDF score.
acc_cv, acc_tfidf = (
    accuracy_score(y_test, predictions)
    for predictions in (y_pred_cv, y_pred_tfidf)
)

# Print the two scores side by side, rounded to four decimals.
print(
    "\n=== Perbandingan Akurasi ===",
    f"CountVectorizer: {acc_cv:.4f}",
    f"TF-IDF:          {acc_tfidf:.4f}",
    sep="\n",
)



=== Perbandingan Akurasi ===
CountVectorizer: 0.9839
TF-IDF:          0.9686


## Kesimpulan

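Berdasarkan hasil pengujian, model Multinomial Naive Bayes dengan **CountVectorizer** menghasilkan akurasi **0.9839**, sedangkan dengan **TF-IDF** hanya **0.9686**. Karena data tidak seimbang (966 *ham* berbanding 149 *spam* pada data uji), akurasi saja tidak cukup: *recall* kelas *spam* pada **CountVectorizer** adalah **0.92** (12 *spam* lolos), sedangkan pada **TF-IDF** hanya **0.77** (35 *spam* lolos), meskipun **TF-IDF** tidak menghasilkan *false positive* sama sekali (*precision* *spam* 1.00, berbanding 0.96 pada **CountVectorizer**). Hal ini menunjukkan bahwa **CountVectorizer** lebih efektif untuk mendeteksi pesan *spam* pada pengujian ini, kemungkinan karena frekuensi kemunculan kata-kata seperti “free” atau “win” lebih berpengaruh dalam membedakan *spam* dan *ham*. Dengan demikian, fitur **CountVectorizer** merupakan pilihan yang lebih baik untuk dataset `spam.csv` pada konfigurasi yang diuji, kecuali jika menghindari *false positive* lebih diutamakan daripada menangkap seluruh *spam*.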
