In [32]:
import pandas as pd
import re
import pickle
from sklearn.preprocessing import LabelEncoder
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from os import path

In [36]:
# Resolve the data and model directories relative to the parent of the current
# working directory. The trailing '' component makes path.join end each path
# with a separator, so file names can still be appended with plain `+`.
parent_dir = path.abspath('..')
dataset_path = path.join(parent_dir, 'datasets', '')
model_path = path.join(parent_dir, 'models', '')

# Load the raw dataset from the datasets directory.
data = pd.read_csv(path.join(dataset_path, 'spam.csv'))

In [21]:
# Built lazily on first use and then reused, so applying `preprocessing` to a
# whole column does not re-create the Sastrawi factory for every row.
_STOPWORD_CACHE = None


def _default_stopwords():
    """Return the stopword list from Sastrawi's StopWordRemoverFactory, built once."""
    global _STOPWORD_CACHE
    if _STOPWORD_CACHE is None:
        _STOPWORD_CACHE = frozenset(StopWordRemoverFactory().get_stop_words())
    return _STOPWORD_CACHE


def preprocessing(text, stopwords=None):
    """Normalise one message for vectorisation.

    Steps, in order: lowercase, replace every run of digits with a number
    token, strip punctuation, drop stopwords, and re-join the remaining words
    with single spaces.

    Note: the number token is inserted as ``<NUM>``, but the punctuation step
    that follows removes the angle brackets, so it appears as ``NUM`` in the
    returned text.

    Args:
        text: The message to clean; must be a ``str``.
        stopwords: Optional container of words to drop. When ``None``
            (the default), the Sastrawi stopword list is used.

    Returns:
        The cleaned message as a single space-separated string.
    """
    if stopwords is None:
        stopwords = _default_stopwords()

    # case folding
    text = text.lower()

    # replace numbers with a token (padded with spaces so it splits off from
    # any adjacent letters)
    text = re.sub(r'\d+', ' <NUM> ', text)

    # remove punctuation (but keep whitespace)
    text = re.sub(r'[^\w\s]', '', text)

    # stopword removal: keep every word that is not in the stopword list
    return " ".join(word for word in text.split() if word not in stopwords)

In [26]:
# Clean every message. Missing values are replaced with an empty string first,
# so they become empty documents instead of the literal text "nan" (or a
# non-string value that `preprocessing` cannot lowercase).
data['Pesan'] = data['Pesan'].fillna("").astype(str).apply(preprocessing)

In [27]:
# Learn the category labels, then overwrite the column with their integer codes.
Le = LabelEncoder()
Le.fit(data['Kategori'])
data['Kategori'] = Le.transform(data['Kategori'])

In [28]:
# Show which integer code the fitted encoder assigned to each original label.
print(
    "Mapping label:",
    {label: code for label, code in zip(Le.classes_, Le.transform(Le.classes_))},
)

Mapping label: {'ham': np.int64(0), 'spam': np.int64(1)}


In [29]:
# TF-IDF over unigrams and bigrams; min_df=2 discards rare terms.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
)

# Target: the encoded category column. Features: TF-IDF matrix of the cleaned messages.
y = data["Kategori"]
X = vectorizer.fit_transform(data["Pesan"])

In [30]:
# Persist the fitted vectorizer and label encoder. The `with` blocks make sure
# each file handle is flushed and closed once the dump finishes.
with open(model_path + "vectorizer_2.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

with open(model_path + "label_encoder_2.pkl", "wb") as encoder_file:
    pickle.dump(Le, encoder_file)

# Save the cleaned dataset next to the raw one (the DataFrame index is written
# as the first column, as before).
data.to_csv(dataset_path + "cleaned.csv")