In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
import pickle
import re
from sklearn.preprocessing import LabelEncoder
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# --- 1. Load dataset ---
data = pd.read_csv('spam-2.csv')

# --- 2. Preprocessing ---
def preprocessing(text):
    # case folding
    text = text.lower()

    # replace numbers with token
    text = re.sub(r'\d+', ' <NUM> ', text)

    # remove punctuation (tapi jangan spasi)
    text = re.sub(r'[^\w\s]', '', text)

    # stopword removal (tapi jangan terlalu agresif)
    factory = StopWordRemoverFactory()
    stopwords = set(factory.get_stop_words())
    words = text.split()
    text = " ".join([word for word in words if word not in stopwords])

    return text

data['Pesan'] = data['Pesan'].astype(str).apply(preprocessing)

# --- 3. Encode label secara eksplisit ---
Le = LabelEncoder()
data['Kategori'] = Le.fit_transform(data['Kategori'])

print("Mapping label:", dict(zip(Le.classes_, Le.transform(Le.classes_))))
# contoh hasil: {'ham': 0, 'spam': 1}

# --- 4. TF-IDF dengan n-gram ---
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2)  # min_df=2 untuk buang kata langka
X = vectorizer.fit_transform(data["Pesan"])
y = data["Kategori"]

# --- 5. Split data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# --- 6. Train Naive Bayes ---
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# --- 7. Train SVM dengan balancing ---
svm_model = LinearSVC(class_weight="balanced", random_state=42)
svm_model.fit(X_train, y_train)

# --- 8. Evaluasi ---
print("Naive Bayes")
print(classification_report(y_test, nb_model.predict(X_test), target_names=Le.classes_))
print(confusion_matrix(y_test, nb_model.predict(X_test)))

print("\nSVM")
print(classification_report(y_test, svm_model.predict(X_test), target_names=Le.classes_))
print(confusion_matrix(y_test, svm_model.predict(X_test)))

# --- 9. Simpan model & vectorizer ---
pickle.dump(vectorizer, open("vectorizer_2.pkl", "wb"))
pickle.dump(nb_model, open("naive_bayes_2.pkl", "wb"))
pickle.dump(svm_model, open("svm_2.pkl", "wb"))
pickle.dump(Le, open("label_encoder_2.pkl", "wb"))


Mapping label: {'ham': np.int64(0), 'spam': np.int64(1)}
Naive Bayes
              precision    recall  f1-score   support

         ham       0.98      0.99      0.98       254
        spam       0.99      0.98      0.99       274

    accuracy                           0.98       528
   macro avg       0.98      0.98      0.98       528
weighted avg       0.98      0.98      0.98       528

[[251   3]
 [  5 269]]

SVM
              precision    recall  f1-score   support

         ham       0.99      0.98      0.99       254
        spam       0.99      0.99      0.99       274

    accuracy                           0.99       528
   macro avg       0.99      0.99      0.99       528
weighted avg       0.99      0.99      0.99       528

[[250   4]
 [  2 272]]


In [2]:
import pickle

# Load model dan vectorizer
vectorizer = pickle.load(open("vectorizer_2.pkl", "rb"))
nb_model = pickle.load(open("naive_bayes_2.pkl", "rb"))
svm_model = pickle.load(open("svm_2.pkl", "rb"))
label_encoder = pickle.load(open("label_encoder_2.pkl", "rb"))

def predict_text(message):
    # Preprocessing sama seperti waktu training (vectorizer sudah include)
    X = vectorizer.transform([message])

    # Prediksi dengan Naive Bayes
    pred_nb = nb_model.predict(X)[0]
    label_nb = label_encoder.inverse_transform([pred_nb])[0]

    # Prediksi dengan SVM
    pred_svm = svm_model.predict(X)[0]
    label_svm = label_encoder.inverse_transform([pred_svm])[0]

    return {
        "Naive Bayes": label_nb,
        "SVM": label_svm
    }

# --- Coba tes beberapa kalimat ---
test_texts = [
    "Segera klaim hadiah Anda sekarang, klik link ini untuk verifikasi!",
    "Halo, besok kita rapat jam 10 di kantor ya.",
    "Diskon besar-besaran 70% hanya hari ini, buruan beli sekarang!",
    "Apakah kamu sudah makan?"
]

for text in test_texts:
    hasil = predict_text(text)
    print(f"Teks: {text}")
    print(f"  Naive Bayes: {hasil['Naive Bayes']}")
    print(f"  SVM       : {hasil['SVM']}")
    print("-" * 50)


Teks: Segera klaim hadiah Anda sekarang, klik link ini untuk verifikasi!
  Naive Bayes: spam
  SVM       : spam
--------------------------------------------------
Teks: Halo, besok kita rapat jam 10 di kantor ya.
  Naive Bayes: ham
  SVM       : ham
--------------------------------------------------
Teks: Diskon besar-besaran 70% hanya hari ini, buruan beli sekarang!
  Naive Bayes: spam
  SVM       : spam
--------------------------------------------------
Teks: Apakah kamu sudah makan?
  Naive Bayes: ham
  SVM       : ham
--------------------------------------------------
