In [None]:
# Import the core libraries and load the dataset
import gensim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load the raw CSV, keep only the label/text columns (v1, v2), and give them
# descriptive names.
messages = (
    pd.read_csv('../input/spamcsv/spam.csv', encoding='latin1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)



In [None]:
# Encode the target as integers: 1 where the label is 'spam', 0 otherwise.
labels = np.where(messages['label'].eq('spam'), 1, 0)

In [None]:
# Train/test split: 80% train, 20% test.
# - random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
#   same split (and therefore comparable metrics) every time.
# - stratify=labels keeps the spam/ham proportion the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    messages['text'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)


In [None]:
#importing tensorflow and model building libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## Using Tokenizer

Tokenizer: lets you vectorize a text corpus by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or a vector whose coefficient for each token can be binary, based on word count, based on tf-idf, etc.

By default, all punctuation is removed, turning the texts into space-separated sequences of words (words may include the ' character). These sequences are then split into lists of tokens, which are then indexed or vectorized.

## Using Pad Sequences
This function transforms a list (of length num_samples) of sequences (lists of integers) into a 2D Numpy array of shape (num_samples, num_timesteps). num_timesteps is either the maxlen argument if provided, or the length of the longest sequence in the list.



In [None]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Create a tokenizer configured with "<OOV>" as its out-of-vocabulary token.
tokenizer=Tokenizer(oov_token="<OOV>")
# Fit the vocabulary on the training texts only; X_test is not used here.
tokenizer.fit_on_texts(X_train)

In [None]:
# Show the tokens stored at indices 1 and 2 of the fitted vocabulary, one per line.
# NOTE(review): index 1 is presumably the "<OOV>" token — confirm in the output.
print(tokenizer.index_word[1], tokenizer.index_word[2], sep='\n')

In [None]:
# Convert both splits to sequences of integer token ids using the vocabulary
# that was fitted on the training texts.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Inspect the first training example after conversion to token ids.
X_train_seq[0]

In [None]:
# Bring every sequence to a fixed length of 50, with padding added at the end
# ('post'), using identical settings for the train and test splits.
X_train_seq_padded, X_test_seq_padded = (
    pad_sequences(seqs, maxlen=50, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

In [None]:
# Inspect the first test example after padding to length 50.
X_test_seq_padded[0]

In [None]:
# Build the spam/ham classifier: Embedding -> LSTM -> Dense(relu) -> Dense(sigmoid).
model = keras.Sequential([
    # Embedding table sized to the fitted vocabulary plus one extra row
    # (presumably for index 0, which padding uses — confirm); each token id
    # maps to a 32-dimensional vector. output_dim is a tunable choice.
    layers.Embedding(input_dim=len(tokenizer.index_word) + 1, output_dim=32),
    # LSTM with 32 internal units; both dropout settings are disabled (0).
    layers.LSTM(32, dropout=0, recurrent_dropout=0),
    # Hidden Dense layer with 32 units and ReLU activation.
    layers.Dense(32, activation='relu'),
    # Final single-unit sigmoid layer: tells whether the message is spam or ham.
    layers.Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
import tensorflow.keras.backend as K
def recall_m(y_true, y_pred):
    """Recall over the given tensors: true positives / actual positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_true`` are
    clipped to [0, 1] and rounded before being summed, so each element counts
    as 0 or 1. ``K.epsilon()`` is added to the denominator to avoid division
    by zero when there are no positive labels.

    NOTE(review): when passed to ``model.compile`` this is presumably evaluated
    per batch and averaged, which can differ from recall over the whole
    dataset — confirm.

    Args:
        y_true: ground-truth labels.
        y_pred: model predictions (assumed to be in [0, 1] — TODO confirm).

    Returns:
        Scalar tensor with the recall value.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over the given tensors: true positives / predicted positives.

    Both the element-wise product ``y_true * y_pred`` and ``y_pred`` are
    clipped to [0, 1] and rounded before being summed, so each element counts
    as 0 or 1. ``K.epsilon()`` is added to the denominator to avoid division
    by zero when nothing is predicted positive.

    NOTE(review): when passed to ``model.compile`` this is presumably evaluated
    per batch and averaged, which can differ from precision over the whole
    dataset — confirm.

    Args:
        y_true: ground-truth labels.
        y_pred: model predictions (assumed to be in [0, 1] — TODO confirm).

    Returns:
        Scalar tensor with the precision value.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

# Compile the model: Adam optimizer, binary cross-entropy loss (the output is
# a single sigmoid unit), and accuracy plus the custom recall/precision
# functions as tracked metrics.
model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=['accuracy', recall_m, precision_m],
)


In [None]:
# Train for 10 epochs with mini-batches of 32 and keep the per-epoch history.
# NOTE: the test split is passed as validation data, so the "val_*" metrics
# are measured on the same examples as the held-out test set.
history = model.fit(
    x=X_train_seq_padded,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test_seq_padded, y_test),
)

In [None]:
# Plot the evaluation metrics by each epoch for the model to see if we are over or underfitting
import matplotlib.pyplot as plt

# One figure per tracked metric, comparing training and validation values per
# epoch. Legend and axis labels use the metric's own name so the precision and
# recall figures are not mislabelled as "Accuracy".
for i in ['accuracy', 'precision_m', 'recall_m']:
    train_values = history.history[i]
    val_values = history.history['val_{}'.format(i)]
    epochs = range(1, len(train_values) + 1)  # epochs are numbered from 1

    plt.figure()
    plt.plot(epochs, train_values, label='Training {}'.format(i))
    plt.plot(epochs, val_values, label='Validation {}'.format(i))
    plt.title('Results for {}'.format(i))
    plt.xlabel('Epoch')
    plt.ylabel(i)
    plt.legend()
    plt.show()