In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import re, emoji
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


In [None]:
# Read the raw spam/ham e-mail dataset and preview its first rows.
df = pd.read_csv(filepath_or_buffer="../data/raw/email_spam_indo.csv")
df.head(5)


In [None]:
# Sastrawi's stop-word list, extended with extra tokens that should also be
# dropped during cleaning (names, mail-header words, domain suffixes, ...).
STOPWORDS = set(StopWordRemoverFactory().get_stop_words()) | {
    "hou", "kaminski", "vince", "enron", "corp", "edu", "cc", "re", "fw",
    "subject", "email", "houston", "pm", "am", "com", "net", "org",
    "ltd", "co", "inc", "ect",
}

# Anything that is not an ASCII letter or whitespace (digits, punctuation, ...).
NON_ALPHA = re.compile(r"[^a-zA-Z\s]")
# Whitespace-delimited tokens that contain an "@".
EMAIL_RE = re.compile(r"\S+@\S+")
# http(s):// links and bare "www." links, up to the next whitespace.
URL_RE = re.compile(r"https?://\S+|www\.\S+")

def strip_emoji(text):
    """Return ``text`` with every emoji replaced by a single space.

    This is a best-effort step: if ``emoji.replace_emoji`` fails for any reason
    (for example the installed ``emoji`` release does not provide it, or the
    input is not a string), ``text`` is returned unchanged so the rest of the
    cleaning pipeline can continue.

    Args:
        text: The text to strip emoji from.

    Returns:
        The text without emoji, or the unmodified input if stripping failed.
    """
    try:
        return emoji.replace_emoji(text, replace=" ")
    except Exception:
        # Deliberately broad, but not a bare ``except``: KeyboardInterrupt and
        # SystemExit must still be able to stop a long-running ``apply``.
        return text

def clean_text(t):
    """Normalise one raw message into a space-joined string of useful tokens.

    Steps, in order: lowercase, remove URLs, remove e-mail-like tokens, remove
    emoji, replace every character that is not an ASCII letter or whitespace
    with a space, then keep only tokens longer than two characters that are
    not in ``STOPWORDS``.

    Args:
        t: Raw message text. Non-string values (e.g. the NaN / None that a
            missing CSV cell turns into) are treated as an empty message.

    Returns:
        The cleaned text; ``""`` when nothing survives or ``t`` is not a string.
    """
    if not isinstance(t, str):
        # Missing values have no ``.lower()``; map them to an empty document
        # instead of aborting the whole ``Series.apply`` with AttributeError.
        return ""
    t = t.lower()
    t = URL_RE.sub(" ", t)
    t = EMAIL_RE.sub(" ", t)
    t = strip_emoji(t)
    t = NON_ALPHA.sub(" ", t)
    return " ".join(w for w in t.split() if len(w) > 2 and w not in STOPWORDS)


In [None]:
# Clean every message once and keep the result next to the raw text.
df["clean"] = df["Pesan"].apply(clean_text)

# Features are the cleaned messages, targets the category labels.
X, y = df["clean"], df["Kategori"]

# Encode the category labels as integers for the sparse cross-entropy loss.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_enc,
    stratify=y_enc,
    random_state=42,
    test_size=0.2,
)


# Experiment 1: TF-IDF + CNN

In [None]:
# Fit the vocabulary / IDF weights on the training split only, then reuse them
# for the test split; both matrices are densified for Keras.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

# Conv1D expects (samples, steps, channels): append a trailing channel axis.
X_train_cnn = X_train_tfidf[:, :, np.newaxis]
X_test_cnn = X_test_tfidf[:, :, np.newaxis]


In [None]:
from tensorflow.keras import layers, models

# TfidfVectorizer(max_features=5000) only caps the vocabulary size; a smaller
# corpus yields fewer columns. Read the real feature count from the prepared
# training tensor so the input layer always matches the data.
n_tfidf_features = X_train_cnn.shape[1]

# 1-D CNN over the TF-IDF vector, treated as a single-channel sequence.
model_tfidf = models.Sequential([
    layers.Conv1D(128, 5, activation='relu', input_shape=(n_tfidf_features, 1)),
    layers.MaxPooling1D(2),
    layers.Conv1D(64, 3, activation='relu'),
    layers.GlobalMaxPooling1D(),
    layers.Dense(64, activation='relu'),
    # One softmax unit per encoded class.
    layers.Dense(len(le.classes_), activation='softmax')
])

# Labels are integer-encoded, hence the sparse categorical loss.
model_tfidf.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model_tfidf.summary()


In [None]:
# Train the TF-IDF CNN, holding out 10% of the training data for validation.
history_tfidf = model_tfidf.fit(
    x=X_train_cnn,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
)


In [None]:
# Predicted class = index of the largest softmax probability per test sample.
pred_tfidf = np.argmax(model_tfidf.predict(X_test_cnn), axis=1)

print("TF-IDF Accuracy:", accuracy_score(y_true=y_test, y_pred=pred_tfidf))
print(classification_report(y_true=y_test, y_pred=pred_tfidf, target_names=le.classes_))


In [None]:
# Rows of the matrix are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, pred_tfidf)

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=le.classes_, yticklabels=le.classes_)
# Label both axes so the figure can be read on its own: without them it is
# ambiguous which axis holds the ground truth.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("TF-IDF + CNN Confusion Matrix")
plt.show()


# Experiment 2: Tokenizer + Embedding + CNN

In [None]:
# Vocabulary cap for the tokenizer and fixed sequence length for the network.
max_words, max_len = 5000, 300

# Build the word index from the training messages only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Convert each split to integer sequences and pad/truncate them to max_len.
X_train_seq, X_test_seq = (
    pad_sequences(tokenizer.texts_to_sequences(split), maxlen=max_len)
    for split in (X_train, X_test)
)


In [None]:
# Embedding -> Conv1D -> global max pool -> small dense head with dropout.
cnn_model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=32, activation='relu'),
    Dropout(rate=0.5),
    # One softmax unit per encoded class.
    Dense(units=len(le.classes_), activation='softmax'),
])

# Labels are integer-encoded, hence the sparse categorical loss.
cnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
cnn_model.summary()


In [None]:
# Train the embedding CNN, holding out 10% of the training data for validation.
history_tok = cnn_model.fit(
    x=X_train_seq,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
)


In [None]:
# Predicted class = index of the largest softmax probability per test sample.
pred_tok = np.argmax(cnn_model.predict(X_test_seq), axis=1)

print("Tokenizer CNN Accuracy:", accuracy_score(y_true=y_test, y_pred=pred_tok))
print(classification_report(y_true=y_test, y_pred=pred_tok, target_names=le.classes_))


In [None]:
# Rows of the matrix are the true classes, columns the predicted classes.
cm2 = confusion_matrix(y_test, pred_tok)

plt.figure(figsize=(6,4))
sns.heatmap(cm2, annot=True, fmt="d", cmap="Greens",
            xticklabels=le.classes_, yticklabels=le.classes_)
# Label both axes so the figure can be read on its own: without them it is
# ambiguous which axis holds the ground truth.
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Tokenizer + CNN Confusion Matrix")
plt.show()
