In [6]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense

In [7]:
data = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/dl/spam.csv", encoding="latin-1")
messages = []
labels = []

for index, row in data.iterrows():
    messages.append(row['v2'])
    if row['v1'] == 'ham':
        labels.append(0)
    else:
        labels.append(1)

In [8]:
messages = np.asarray(messages)
labels = np.asarray(labels)

max_vocab = 10000
max_len = 500
tokenizer = Tokenizer(num_words=max_vocab)
tokenizer.fit_on_texts(messages)
sequences = tokenizer.texts_to_sequences(messages)

word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=max_len)



In [9]:
train_samples = int(len(messages) * 0.8)
messages_train = data[:train_samples]
labels_train = labels[:train_samples]
messages_test = data[train_samples:len(messages)-2]
labels_test = labels[train_samples:len(messages)-2]

In [10]:

embedding_mat_columns = 32
model = Sequential()
model.add(Embedding(input_dim=max_vocab, output_dim=embedding_mat_columns, input_length=max_len))
model.add(SimpleRNN(units=embedding_mat_columns))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

model.fit(messages_train, labels_train, epochs=10, batch_size=60, validation_split=0.2)

acc = model.evaluate(messages_test, labels_test)
print("Test loss is {0:.2f} accuracy is {1:.2f} ".format(acc[0], acc[1]))


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 32)           320000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                2080      
                                                                 
 dense (Dense)               (None, 1)                 33        
                                                                 
Total params: 322,113
Trainable params: 322,113
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss is 0.07 accuracy is 0.99 


In [15]:
def message_to_array(msg):
    msg = msg.lower().split(' ')
    test_seq = np.array([word_index.get(word, 0) for word in msg])
    test_seq = np.pad(test_seq, (500 - len(test_seq), 0), 'constant', constant_values=(0))
    test_seq = test_seq.reshape(1, 500)
    return test_seq


In [17]:
custom_msg = 'Congratulations ur awarded 500 of CD vouchers or 125gift guaranteed Free ent'
test_seq = message_to_array(custom_msg)
predict_x = model.predict(test_seq)
classes_x = np.argmax(predict_x, axis=1)
print(classes_x)


[0]
