In [16]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
import tensorflow as tf
import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
import tensorflow_probability as tfp
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
from vae_testing import data_sp,data_nsp

# Fetch the NLTK resources the text-cleaning cell relies on: the English
# stop-word list (stopwords.words('english')) and the WordNet data used by
# WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tejasasija/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tejasasija/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# CSV holding the corpus; the code below reads its 'email' and 'label' columns.
data_path = 'spam_or_not_spam.csv'

# Rows without any email text are discarded immediately after loading.
data = pd.read_csv(data_path).dropna(subset=['email'])
emails, labels = data['email'], data['label']

print(data)


                                                  email  label
0      date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...      0
1     martin a posted tassos papadopoulos the greek ...      0
2     man threatens explosion in moscow thursday aug...      0
3     klez the virus that won t die already the most...      0
4      in adding cream to spaghetti carbonara which ...      0
...                                                 ...    ...
2995   abc s good morning america ranks it the NUMBE...      1
2996   hyperlink hyperlink hyperlink let mortgage le...      1
2997   thank you for shopping with us gifts for all ...      1
2998   the famous ebay marketing e course learn to s...      1
2999   hello this is chinese traditional 子 件 NUMBER世...      1

[2999 rows x 2 columns]


In [3]:
# Shared NLTK helpers, created on first use by _text_normalizers().
_LEMMATIZER = None
_STOP_WORDS = None


def _text_normalizers():
    """Return the shared ``(lemmatizer, stop_word_set)`` pair, building it once."""
    global _LEMMATIZER, _STOP_WORDS
    if _LEMMATIZER is None or _STOP_WORDS is None:
        _LEMMATIZER = WordNetLemmatizer()
        _STOP_WORDS = set(stopwords.words('english'))
    return _LEMMATIZER, _STOP_WORDS


def preprocess_text(text):
    """Clean one email and return its lemmatized, stop-word-free tokens.

    Steps, in order: lower-case, strip URLs, strip ``<...>`` tags, drop every
    character that is not an ASCII letter or whitespace, split on whitespace,
    remove English stop words and lemmatize what remains.

    Args:
        text: the raw email as a ``str``.

    Returns:
        list[str]: the cleaned tokens (possibly empty).
    """
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    words = text.split()  # split() already ignores leading/trailing whitespace

    # The lemmatizer and stop-word set are loop-invariant across emails, so they
    # are fetched from the shared cache instead of being rebuilt on every call.
    lemmatizer, stop_words = _text_normalizers()
    return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

In [4]:
# GloVe file path (update this with your local path)
glove_path = "glove.6B.100d.txt"

# Load the GloVe embeddings
def load_glove_embeddings(glove_file, embedding_dim):
    """Read a GloVe-format text file into a ``{token: vector}`` dictionary.

    Every useful line is a token followed by ``embedding_dim`` whitespace-
    separated numbers. The last ``embedding_dim`` fields are taken as the
    vector and everything before them as the token, so tokens that themselves
    contain spaces still parse. Lines with too few fields (blank lines, a
    count/dimension header) are skipped instead of raising.

    Args:
        glove_file: path of the UTF-8 encoded embeddings file.
        embedding_dim: number of vector components stored on each line.

    Returns:
        dict[str, numpy.ndarray]: float32 vectors of length ``embedding_dim``.

    Raises:
        ValueError: if ``embedding_dim`` is not positive, or a vector field is
            not numeric.
    """
    if embedding_dim < 1:
        raise ValueError("embedding_dim must be a positive integer")

    embeddings_index = {}
    with open(glove_file, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if len(values) <= embedding_dim:
                # Not enough fields for a token plus a full vector.
                continue
            word = ' '.join(values[:-embedding_dim])
            vector = np.asarray(values[-embedding_dim:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

# Create an embedding matrix for the tokenizer
def create_embedding_matrix(tokenizer, embeddings_index, embedding_dim):
    """Assemble the weight matrix that lines up with the tokenizer's ids.

    Row ``i`` receives the vector of the word whose tokenizer index is ``i``.
    Row 0 and the rows of words missing from ``embeddings_index`` stay zero.

    Args:
        tokenizer: object exposing a ``word_index`` mapping of word -> int id.
        embeddings_index: mapping of word -> vector of length ``embedding_dim``.
        embedding_dim: width of every row.

    Returns:
        numpy.ndarray of shape ``(len(word_index) + 1, embedding_dim)``.
    """
    vocab = tokenizer.word_index
    matrix = np.zeros((len(vocab) + 1, embedding_dim))
    for token, row in vocab.items():
        pretrained = embeddings_index.get(token)
        if pretrained is None:
            continue  # out-of-vocabulary word: leave its row as zeros
        matrix[row] = pretrained
    return matrix

In [6]:
# Tokenize and pad sequences
# This cell cleans every email, converts it to integer token ids padded to
# max_len, builds the GloVe weight matrix for the fitted vocabulary, and splits
# the padded rows by class label.
max_len = 200  # Limit email length
embedding_dim = 100  # GloVe embedding dimension
tokenizer = Tokenizer()

# NOTE: this rebinds `emails` from the raw column to a list of cleaned strings,
# so re-running the cell feeds already-cleaned text through preprocess_text.
emails = [' '.join(preprocess_text(email)) for email in emails]

tokenizer.fit_on_texts(emails)
email_sequences = tokenizer.texts_to_sequences(emails)

# Padding is appended at the end of short emails. The side from which emails
# longer than max_len are truncated is left at the library default.
# NOTE(review): confirm which end of a long email should be kept.
email_padded = pad_sequences(email_sequences, maxlen=max_len, padding='post')

# Create embedding matrix
embeddings_index = load_glove_embeddings(glove_path, embedding_dim)
embedding_matrix = create_embedding_matrix(tokenizer, embeddings_index, embedding_dim)

# Split data into spam and not-spam
# The boolean masks come from `labels`; presumably they line up with the rows
# of email_padded by position — verify that row order is preserved.
spam_data = email_padded[labels == 1]
not_spam_data = email_padded[labels == 0]


In [31]:
# Constants (keeping your original values)
LATENT_DIM = 16  # width of z_mean / z_log_var and of the decoders' latent input
# Used as the unit count of the decoders' second LSTM.
# NOTE(review): duplicates `embedding_dim` from the tokenizing cell — keep the two in sync.
EMBEDDING_DIM = 100
BATCH_SIZE = 64  # passed to fit() for both VAEs

SEQUENCE_LENGTH = email_padded.shape[1]  # Number of tokens in each padded email

# Alias of the full padded corpus; not referenced by the other cells visible here.
trainX = email_padded

# Embedding layer using pre-trained GloVe embeddings
# The same layer object is applied inside both encoders, and trainable=True
# lets training update the GloVe-initialised weights.
embedding_layer = layers.Embedding(
    input_dim=embedding_matrix.shape[0],  # Vocabulary size
    output_dim=embedding_matrix.shape[1],  # Embedding dimensions
    weights=[embedding_matrix],
    trainable=True,
)


In [32]:
# Sampling Layer for Latent Space
class Sampling(layers.Layer):
    """Reparameterisation step: draw ``z = mean + exp(0.5 * log_var) * eps``.

    ``call`` expects a pair ``(z_mean, z_log_var)`` and returns a tensor of the
    same shape as ``z_mean``; ``eps`` is standard-normal noise.
    """

    def call(self, inputs):
        mean, log_var = inputs
        mean_shape = tf.shape(mean)
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        std_dev = tf.exp(0.5 * log_var)
        return mean + std_dev * noise

In [33]:
def _build_encoder(input_name, model_name):
    """Build one sequence encoder producing ``[z_mean, z_log_var, z]``.

    Every call creates fresh LSTM/Dense layers, but all encoders share the
    module-level ``embedding_layer``.

    Args:
        input_name: name given to the token-id ``Input``.
        model_name: name given to the returned ``Model``.

    Returns:
        tuple: ``(input_tensor, encoder_model)``.
    """
    encoder_inputs = Input(shape=(SEQUENCE_LENGTH,), name=input_name)
    h = embedding_layer(encoder_inputs)  # Shape: (batch_size, SEQUENCE_LENGTH, EMBEDDING_DIM)
    h = layers.LSTM(64, return_sequences=True)(h)  # Shape: (batch_size, SEQUENCE_LENGTH, 64)
    h = layers.LSTM(32)(h)  # Shape: (batch_size, 32)

    z_mean = layers.Dense(LATENT_DIM, name="z_mean")(h)
    z_log_var = layers.Dense(LATENT_DIM, name="z_log_var")(h)
    z = Sampling()([z_mean, z_log_var])

    # The model is wired to the Input created in this call; the earlier
    # copy-pasted code built the second encoder on the first encoder's Input.
    encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name=model_name)
    return encoder_inputs, encoder


# Encoder1
spam_encoder_inputs, spam_encoder = _build_encoder("spam_encoder_input", "spam_encoder")
spam_encoder.summary()

# Encoder2
nspam_encoder_inputs, nspam_encoder = _build_encoder("nspam_encoder_input", "nspam_encoder")
nspam_encoder.summary()


In [34]:
def _build_decoder(input_name, model_name):
    """Build one decoder mapping a latent vector to per-position token probabilities.

    The latent vector is projected, repeated SEQUENCE_LENGTH times, passed
    through two sequence-returning LSTMs and finished with a softmax over
    ``embedding_matrix.shape[0]`` classes at every position.

    Args:
        input_name: name given to the latent ``Input``.
        model_name: name given to the returned ``Model``.

    Returns:
        tuple: ``(latent_input_tensor, output_tensor, decoder_model)``.
    """
    latent_inputs = Input(shape=(LATENT_DIM,), name=input_name)
    h = layers.Dense(32, activation="relu")(latent_inputs)
    h = layers.RepeatVector(SEQUENCE_LENGTH)(h)  # Shape: (batch_size, SEQUENCE_LENGTH, 32)
    h = layers.LSTM(64, return_sequences=True)(h)
    h = layers.LSTM(EMBEDDING_DIM, return_sequences=True)(h)
    token_probs = layers.TimeDistributed(
        layers.Dense(embedding_matrix.shape[0], activation="softmax")
    )(h)
    return latent_inputs, token_probs, Model(latent_inputs, token_probs, name=model_name)


# Decoder (spam)
spam_latent_inputs, spam_decoder_outputs, spam_decoder = _build_decoder(
    "spam_latent_input", "spam_decoder"
)
spam_decoder.summary()


# Decoder (not spam)
# NOTE(review): the model name is "decoder", not "nspam_decoder" like its
# siblings; kept as-is so summaries and saved artefacts do not change.
nspam_latent_inputs, nspam_decoder_outputs, nspam_decoder = _build_decoder(
    "nspam_latent_input", "decoder"
)
nspam_decoder.summary()


In [63]:
class VAE(keras.Model):
    """Variational autoencoder over padded token-id sequences.

    The encoder must return ``(z_mean, z_log_var, z)`` and the decoder must map
    ``z`` to per-position token probabilities. Both steps optimise/report

        total = reconstruction_loss + kl_weight * kl_loss

    where the reconstruction term is sparse categorical cross-entropy summed
    over positions, and both terms are averaged over the batch.

    Args:
        encoder: model called as ``encoder(tokens) -> (z_mean, z_log_var, z)``.
        decoder: model called as ``decoder(z) -> token probabilities``.
        kl_weight: multiplier applied to the KL term. Defaults to 0.01, the
            value that used to be hard-coded.
        **kwargs: forwarded to ``keras.Model``.
    """

    def __init__(self, encoder, decoder, kl_weight=0.01, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.kl_weight = kl_weight
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

    @property
    def metrics(self):
        """Trackers exposed to Keras so they are handled as this model's metrics."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    @staticmethod
    def _vae_inputs(data):
        """Return the input tensor when ``data`` arrives as an ``(x, ...)`` tuple."""
        if isinstance(data, tuple):
            data = data[0]
        return data

    def _vae_losses(self, data):
        """Run encoder + decoder on ``data`` and return ``(total, reconstruction, kl)``."""
        z_mean, z_log_var, z = self.encoder(data)
        reconstruction = self.decoder(z)

        # Reconstruction loss (sparse categorical crossentropy, since the
        # targets are token indices), summed over positions, averaged over batch.
        reconstruction_loss = tf.reduce_mean(
            tf.reduce_sum(
                tf.keras.losses.sparse_categorical_crossentropy(data, reconstruction),
                axis=1,
            )
        )

        # KL divergence between the encoder's Gaussian and a standard normal.
        kl_loss = -0.5 * tf.reduce_mean(
            tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1)
        )

        total_loss = reconstruction_loss + self.kl_weight * kl_loss
        return total_loss, reconstruction_loss, kl_loss

    def _vae_log(self, total_loss, reconstruction_loss, kl_loss):
        """Feed the three trackers and return their running means for logging."""
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

    def train_step(self, data):
        """One optimisation step; any targets passed alongside the inputs are ignored."""
        data = self._vae_inputs(data)

        with tf.GradientTape() as tape:
            total_loss, reconstruction_loss, kl_loss = self._vae_losses(data)

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        return self._vae_log(total_loss, reconstruction_loss, kl_loss)

    def test_step(self, data):
        """Evaluation step: same losses as ``train_step`` without a weight update."""
        data = self._vae_inputs(data)
        total_loss, reconstruction_loss, kl_loss = self._vae_losses(data)
        return self._vae_log(total_loss, reconstruction_loss, kl_loss)

    def generate_email(self, num_samples=1, temperature=0.8):
        """Decode random latent vectors into text.

        Relies on the module-level ``LATENT_DIM`` and ``tokenizer``.

        Args:
            num_samples: number of emails to generate.
            temperature: softmax temperature applied to the decoder's
                log-probabilities before sampling each token. ``None`` or a
                non-positive value selects greedy (argmax) decoding instead.

        Returns:
            list[str]: one space-joined string per sample; padding (id 0) and
            ids missing from ``tokenizer.index_word`` are dropped.
        """
        # Sample from latent space
        random_latent_vectors = tf.random.normal(
            shape=(num_samples, LATENT_DIM)
        )

        # Per-position token probabilities for every sample.
        token_probs = self.decoder.predict(random_latent_vectors)

        greedy = temperature is None or temperature <= 0

        generated_emails = []
        for seq in token_probs:
            if greedy:
                tokens = tf.argmax(seq, axis=-1).numpy()
            else:
                # Dividing probabilities by a constant never changes the
                # argmax, so temperature is applied in log space and the
                # tokens are sampled from the rescaled distribution.
                logits = tf.math.log(tf.maximum(seq, 1e-9)) / temperature
                tokens = tf.random.categorical(logits, num_samples=1)[:, 0].numpy()

            # Convert to text
            words = []
            for token in tokens:
                if token != 0:  # Skip padding
                    word = tokenizer.index_word.get(int(token), '')
                    if word:
                        words.append(word)

            generated_emails.append(' '.join(words))

        return generated_emails

    def call(self, data):
        """Forward pass: encode, sample ``z``, and return the reconstruction."""
        z_mean, z_log_var, z = self.encoder(data)
        reconstruction = self.decoder(z)
        return reconstruction


In [60]:
# Instantiate and compile the VAE
spam_vae = VAE(spam_encoder, spam_decoder)
spam_vae.compile(optimizer=keras.optimizers.Adam())

nspam_vae = VAE(nspam_encoder,nspam_decoder)
nspam_vae.compile(optimizer=keras.optimizers.Adam())

# Train the VAE
spam_vae.fit(
    spam_data, spam_data,  # Inputs and targets are the same for autoencoders
    batch_size=BATCH_SIZE,
    epochs=20,
)

nspam_vae.fit(
    not_spam_data,not_spam_data,  # Inputs and targets are the same for autoencoders
    batch_size=BATCH_SIZE,
    epochs=20,
)

Epoch 1/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 4s/step - kl_loss: 19.7676 - loss: 860.6661 - reconstruction_loss: 860.4684
Epoch 2/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 4s/step - kl_loss: 18.1848 - loss: 851.3441 - reconstruction_loss: 851.1623
Epoch 3/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 4s/step - kl_loss: 19.1129 - loss: 848.0837 - reconstruction_loss: 847.8926
Epoch 4/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 4s/step - kl_loss: 19.9343 - loss: 843.6889 - reconstruction_loss: 843.4895
Epoch 5/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 4s/step - kl_loss: 21.4501 - loss: 839.0408 - reconstruction_loss: 838.8262
Epoch 6/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 4s/step - kl_loss: 21.8885 - loss: 831.8679 - reconstruction_loss: 831.6490
Epoch 7/20
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 5s/step - kl_loss: 23.7850

KeyboardInterrupt: 

In [64]:

# Generate sample emails
def generate_samples(model, num_samples=5, temperature=0.8, samples=None):
    """Print up to ``num_samples`` original/reconstructed pairs.

    Args:
        model: accepted for call compatibility; the pairs are read from
            ``samples``, the model itself is not queried.
        num_samples: maximum number of pairs to print.
        temperature: accepted for call compatibility; unused.
        samples: container that is iterated for keys and indexed by them;
            ``key[0]`` is printed as the original and ``samples[key]`` as the
            reconstruction. Defaults to the module-level ``data_sp``.
    """
    print("\nGenerated Email Samples:")
    if samples is None:
        samples = data_sp
    # NOTE(review): live generation via model.generate_email(num_samples, temperature)
    # was disabled in favour of these precomputed pairs — confirm that is intended.
    for count, key in enumerate(samples, start=1):
        if count > num_samples:
            break  # honour the requested number of samples
        print(f"\nSample {count}:")
        print("Orignal : ", key[0])
        print("Reconstructed : ", samples[key])
        print("-" * 50)

# Generate some samples
generate_samples(spam_vae, num_samples=5, temperature=0.8)

# The not-spam model is shown with the not-spam pairs; presumably data_nsp
# mirrors data_sp's layout — confirm against vae_testing.
generate_samples(nspam_vae, 5, temperature=0.8, samples=data_nsp)


Generated Email Samples:


NameError: name 'data_sp' is not defined

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.stem import PorterStemmer
import re

stemmer = PorterStemmer()

def preprocess_text(text):
    """Lower-case ``text``, pull out its word-character runs and stem each one.

    NOTE: this rebinds the name of the lemmatizing ``preprocess_text`` defined
    earlier in the notebook; cells run after this one get the stemming version.

    Returns:
        list[str]: the stemmed tokens in their original order.
    """
    return [stemmer.stem(token) for token in re.findall(r'\b\w+\b', text.lower())]

# `df` is the same object as `data`, so the cleaning below is visible through both names.
df = data

# Clean every email in one pass over the column. Looping over range(len(df))
# with df.loc[i, ...] raised KeyError here: dropna() left a gap in the index
# labels, so positions 0..len-1 are not all valid labels and the last label
# was never reached.
df['email'] = df['email'].fillna('').map(lambda text: " ".join(preprocess_text(text)))

# Bag-of-words counts limited to the 5000 most frequent terms, converted from
# the vectorizer's sparse output to a dense numpy array.
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['email']).toarray()

# Same counts as a labelled frame (one column per vocabulary term).
df_count = pd.DataFrame(X, columns=vectorizer.get_feature_names_out())
y= df['label'].values


from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1,random_state=2)

def accuracy(y_pred1, y_true=None):
    """Return the fraction of predictions that equal their reference label.

    Args:
        y_pred1: predicted labels (any 1-D array-like).
        y_true: reference labels of the same length. Defaults to the
            module-level ``y_test``, the split the predictions are made on.
            The earlier version compared against the full, unsplit ``y``,
            which pairs each prediction with an unrelated row.

    Returns:
        float: share of matching positions, between 0.0 and 1.0.

    Raises:
        ValueError: if the two inputs differ in shape or are empty.
    """
    if y_true is None:
        y_true = y_test
    predicted = np.asarray(y_pred1)
    expected = np.asarray(y_true)
    if predicted.shape != expected.shape:
        raise ValueError(
            f"shape mismatch: predictions {predicted.shape} vs labels {expected.shape}"
        )
    if predicted.size == 0:
        raise ValueError("accuracy is undefined for an empty prediction set")
    return float(np.mean(predicted == expected))



from sklearn.naive_bayes import GaussianNB,MultinomialNB, BernoulliNB
# Three Naive Bayes variants trained on the same split.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()

# fit() hands back the fitted estimator, so each model is trained and then
# immediately asked for its test-set predictions, in the order gnb, mnb, bnb.
y_pred1 = gnb.fit(X_train, y_train).predict(X_test)
y_pred2 = mnb.fit(X_train, y_train).predict(X_test)
y_pred3 = bnb.fit(X_train, y_train).predict(X_test)

print("Accuracy of the model by the gnb", accuracy(y_pred1))
print("Accuracy of the model by the mnb", accuracy(y_pred2))
print("Accuracy of the model by the bnb", accuracy(y_pred3))

from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
# Confusion matrices for the Gaussian and multinomial models.
print(confusion_matrix(y_test, y_pred1))
print(confusion_matrix(y_test,y_pred2))
# The Bernoulli model's matrix is unpacked below; the [1, 1]-style indexing
# assumes both classes occur in y_test.
cm=confusion_matrix(y_test,y_pred3)

TP = cm[1, 1]  # True Positives
TN = cm[0, 0]  # True Negatives
FP = cm[0, 1]  # False Positives
FN = cm[1, 0]  # False Negatives

from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, f1_score

# Stored under its own name: assigning this float to `accuracy` would replace
# the accuracy() helper defined earlier in this cell and break later calls to it.
bnb_accuracy = (TP + TN) / (TP + TN + FP + FN)
precision = precision_score(y_test, y_pred3)
recall = TP / (TP + FN)
f1 = f1_score(y_test, y_pred3)
print(precision,recall,f1)

