In [5]:
import pandas as pd
import numpy as np

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

In [8]:
data = pd.read_csv('/content/spam.csv', encoding="ISO-8859-1")

In [9]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [10]:
data.shape

(5572, 5)

In [11]:
data.rename({"v2": "Text"}, axis=1, inplace=True)

In [12]:
data.head()

Unnamed: 0,v1,Text,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


The dataset I am using is spam related data, which is useful for classification tasks. But I'm using one column
from the dataset that is "**Test**" column.

In [13]:
data['Text'].isnull().sum()

0

**Cleaning the data**
We can see they are some unwanted words and characters in it, which are not useful to model and also decrease our model accuracy

In [14]:
data['Text'] = data['Text'].apply(lambda x: x.replace(u'\xa0',u' '))
data['Text'] = data['Text'].apply(lambda x: x.replace('\u200a',' '))

In [15]:
data = np.array(data['Text'])

In [16]:
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(data)
totalWords = len(tokenizer.word_index) + 1

print("words count: ", len(tokenizer.word_counts))
print('\n\nWord indexes:', len(tokenizer.word_index))

words count:  8920


Word indexes: 8921


In [17]:
sequence = tokenizer.texts_to_sequences(data)

In [18]:
input_seq = []
for i in sequence:
    for j in range(1, len(i)):
        N_gram = i[:j+1]
        input_seq.append(N_gram)

In [19]:
max_seq = max([len(i) for i in input_seq])

new_data = pad_sequences(input_seq, maxlen=max_seq, dtype=np.float32, padding='pre')

In [20]:
new_data[0]

array([  0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   

In [21]:
features, labels = new_data[:, :-1], new_data[:, -1]
labels = to_categorical(labels, num_classes=totalWords)

In [22]:
print(len(features), len(labels))

82439 82439


In [23]:
model = Sequential()
model.add(Embedding(totalWords, 150, input_length = max_seq-1))
model.add(Bidirectional(LSTM(200)))
model.add(Dense(totalWords, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [24]:
model.fit(features, labels, epochs=75, verbose=1)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<keras.callbacks.History at 0x7f4a446a0cd0>

In [47]:
seed_text = "I'm really not up"
next_words = 2
for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_seq-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted= np.argmax(predicted,axis=1)
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    seed_text += " " + output_word
print(seed_text)

I'm really not up to it


In [46]:
data[88]

"I'm really not up to it still tonight babe"