In [10]:
# Import Library
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [11]:
# Load dataset
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']]  # Hanya ambil kolom yang diperlukan
df.columns = ['label', 'text']  # Rename kolom

# Tampilkan beberapa baris dataset
print(df.head())

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [12]:
# Tokenisasi teks
tokenizer = Tokenizer(num_words=5000)  # Batasi vocabulary ke 5000 kata
tokenizer.fit_on_texts(df['text'])

# Ubah teks menjadi sequences
sequences = tokenizer.texts_to_sequences(df['text'])

# Padding sequences agar memiliki panjang yang sama
data = pad_sequences(sequences, maxlen=100)

# Ubah label menjadi binary (spam: 1, ham: 0)
labels = df['label'].apply(lambda x: 1 if x == 'spam' else 0).values

# Bagi dataset menjadi training dan testing
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

In [14]:
# Membangun model
model = Sequential()

# Embedding Layer (tanpa input_length)
model.add(Embedding(input_dim=5000, output_dim=64))  # Hapus input_length

# LSTM Layer
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.5))

# Dense Layer
model.add(Dense(1, activation='sigmoid'))

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
dummy_input = np.zeros((1, 100))
model(dummy_input)

# Ringkasan model
model.summary()

In [6]:
# Training model
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 67ms/step - accuracy: 0.8837 - loss: 0.3306 - val_accuracy: 0.9785 - val_loss: 0.0693
Epoch 2/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 64ms/step - accuracy: 0.9910 - loss: 0.0381 - val_accuracy: 0.9848 - val_loss: 0.0516
Epoch 3/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 67ms/step - accuracy: 0.9974 - loss: 0.0141 - val_accuracy: 0.9839 - val_loss: 0.0580
Epoch 4/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 56ms/step - accuracy: 0.9979 - loss: 0.0110 - val_accuracy: 0.9830 - val_loss: 0.0548
Epoch 5/5
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 64ms/step - accuracy: 0.9995 - loss: 0.0041 - val_accuracy: 0.9839 - val_loss: 0.0586


In [8]:
# Evaluasi model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Validation Accuracy: {accuracy* 100:.2f}%')

[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9880 - loss: 0.0472
Validation Accuracy: 98.39%


In [9]:
# Fungsi untuk memprediksi sentimen
def predict_sentiment(text):
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=100)
    prediction = model.predict(padded_sequence)
    return 'spam' if prediction > 0.5 else 'ham'

# Contoh prediksi
print(predict_sentiment("This movie was amazing, I loved it!"))  # Harusnya hasilnya ham
print(predict_sentiment("Congratulations! You've won a $1000 gift card. Click here to claim."))  # Harusnya hasilnya spam

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step
ham
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
spam
