In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pickle
import re
import numpy as np
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [8]:
data = pd.read_csv('spam-2.csv')
data

def preprocessing(text):
    # case folding
    text = text.lower()

    # remove punctuation and non-alphabetic characters
    text = re.sub(r'[^\w\s]', '', text)

    # remove numbers
    text = re.sub(r'\d+', '', text)

    # stopword removal
    factory = StopWordRemoverFactory()
    stopwords = factory.get_stop_words()
    words = text.split()
    text = " ".join([word for word in words if word not in stopwords])

    return text
data

Unnamed: 0,Kategori,Pesan
0,spam,Secara alami tak tertahankan identitas perusah...
1,spam,Fanny Gunslinger Perdagangan Saham adalah Merr...
2,spam,Rumah -rumah baru yang luar biasa menjadi muda...
3,spam,4 Permintaan Khusus Pencetakan Warna Informasi...
4,spam,"Jangan punya uang, dapatkan CD perangkat lunak..."
...,...,...
2631,ham,Pengingat halo semuanya: Vince telah meminta s...
2632,ham,Re: Argentina Power & Gas Market Modeling Oke ...
2633,ham,"Re: Program Enron / Stanford Stinson, hebat! S..."
2634,ham,"Persetujuan untuk peninjau Roberts JR, Michael..."


In [9]:
data['Pesan'] = data['Pesan'].apply(preprocessing)
data

Unnamed: 0,Kategori,Pesan
0,spam,alami tak tertahankan identitas perusahaan san...
1,spam,fanny gunslinger perdagangan saham merrill muz...
2,spam,rumah rumah baru luar biasa menjadi mudah menu...
3,spam,permintaan khusus pencetakan warna informasi t...
4,spam,jangan punya uang dapatkan cd perangkat lunak ...
...,...,...
2631,ham,pengingat halo semuanya vince meminta mengirim...
2632,ham,re argentina power gas market modeling oke jul...
2633,ham,re program enron stanford stinson hebat menant...
2634,ham,persetujuan peninjau roberts jr michael a meny...


In [10]:
Le = LabelEncoder()
data['Kategori'] = Le.fit_transform(data['Kategori'])
data

Unnamed: 0,Kategori,Pesan
0,1,alami tak tertahankan identitas perusahaan san...
1,1,fanny gunslinger perdagangan saham merrill muz...
2,1,rumah rumah baru luar biasa menjadi mudah menu...
3,1,permintaan khusus pencetakan warna informasi t...
4,1,jangan punya uang dapatkan cd perangkat lunak ...
...,...,...
2631,0,pengingat halo semuanya vince meminta mengirim...
2632,0,re argentina power gas market modeling oke jul...
2633,0,re program enron stanford stinson hebat menant...
2634,0,persetujuan peninjau roberts jr michael a meny...


In [13]:
# Preprocessing TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data["Pesan"])
y = data["Kategori"]


In [14]:
# Split data (kalau dataset lebih besar)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Train SVM
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

# Evaluasi
print("Naive Bayes")
print(classification_report(y_test, nb_model.predict(X_test)))

print("SVM")
print(classification_report(y_test, svm_model.predict(X_test)))

Naive Bayes
              precision    recall  f1-score   support

           0       1.00      0.98      0.99       263
           1       0.98      1.00      0.99       265

    accuracy                           0.99       528
   macro avg       0.99      0.99      0.99       528
weighted avg       0.99      0.99      0.99       528

SVM
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       263
           1       0.97      1.00      0.99       265

    accuracy                           0.98       528
   macro avg       0.99      0.98      0.98       528
weighted avg       0.99      0.98      0.98       528



In [15]:
# Simpan model & vectorizer
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("naive_bayes.pkl", "wb") as f:
    pickle.dump(nb_model, f)

with open("svm.pkl", "wb") as f:
    pickle.dump(svm_model, f)