# All Embedding Techniques

In [1]:
import pandas as pd
df = pd.read_csv('email_classification.csv')
df.head()

Unnamed: 0,email,label
0,Upgrade to our premium plan for exclusive acce...,ham
1,Happy holidays from our team! Wishing you joy ...,ham
2,We're hiring! Check out our career opportuniti...,ham
3,Your Amazon account has been locked. Click her...,spam
4,Your opinion matters! Take our survey and help...,ham


In [2]:
X = df.drop('label', axis =1)
y = df['label']

In [3]:
X.head()

Unnamed: 0,email
0,Upgrade to our premium plan for exclusive acce...
1,Happy holidays from our team! Wishing you joy ...
2,We're hiring! Check out our career opportuniti...
3,Your Amazon account has been locked. Click her...
4,Your opinion matters! Take our survey and help...


Copying X variable because I dont want to modify the X variable

In [4]:
Messages = X.copy()

In [5]:
Messages['email'][0]

'Upgrade to our premium plan for exclusive access to premium content and features.'

In [6]:
Messages.reset_index(inplace = True)

Removing Stop Words from the text data

In [7]:
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

Applying Stemming and saving the remaining text in a corpus

In [8]:
### Dataset Preprocessing
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
corpus = []
for i in range(0, len(Messages)):
    review = re.sub('[^a-zA-Z]', ' ', Messages['email'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
corpus

['upgrad premium plan exclus access premium content featur',
 'happi holiday team wish joy prosper season',
 'hire check career opportun join dynam team',
 'amazon account lock click verifi account inform',
 'opinion matter take survey help us enhanc experi',
 'payment receiv thank prompt transact',
 'email account storag full click upgrad account',
 'dear name thank subscrib newslett welcom gift',
 'account credit loyalti point redeem excit reward',
 'chosen free iphon click claim prize',
 'miss special offer sign get discount first purchas',
 'hire intern summer appli gain valuabl experi',
 'pre approv loan click appli',
 'thrill introduc new collect shop enjoy exclus discount',
 'excit announc upcom webinar seri regist reserv spot',
 'ad new featur app base feedback updat',
 'winner click claim exclus prize',
 'facebook account hack click secur account',
 'congratul reach new mileston mani achiev',
 'congratul select job interview click schedul interview',
 'account suspend due susp

One Hot Representations

In [10]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import one_hot

In [11]:
### Vocabulary size
voc_size=5000
onehot_repr=[one_hot(words,voc_size)for words in corpus]
onehot_repr

[[4437, 2869, 1074, 1158, 4647, 2869, 2168, 1329],
 [944, 2835, 3793, 2093, 1670, 3082, 2596],
 [235, 3504, 107, 2128, 2059, 3713, 3793],
 [2184, 2617, 3171, 4093, 2451, 2617, 3538],
 [2405, 3806, 1239, 1503, 3266, 563, 1044, 562],
 [551, 2549, 1532, 2583, 872],
 [2906, 2617, 4150, 3771, 4093, 4437, 2617],
 [596, 170, 1532, 234, 720, 370, 1942],
 [2617, 909, 2748, 1911, 2939, 3157, 3169],
 [2381, 376, 2231, 4093, 2911, 1946],
 [3925, 3044, 3439, 1396, 1168, 4611, 4639, 4804],
 [235, 387, 1325, 1289, 3042, 970, 562],
 [1747, 3669, 288, 4093, 1289],
 [4142, 724, 2446, 1665, 1809, 4962, 1158, 4611],
 [3157, 4852, 928, 4049, 4533, 430, 4382, 345],
 [1608, 2446, 1329, 907, 4013, 2569, 3258],
 [575, 4093, 2911, 1158, 1946],
 [2996, 2617, 2102, 4093, 2363, 2617],
 [4635, 2868, 2446, 1895, 694, 955],
 [4635, 4458, 3507, 4787, 4093, 725, 4787],
 [2617, 2810, 3525, 2119, 2677, 4093, 3913, 2617],
 [1168, 4465, 777, 4477, 1701, 268, 3439, 4918, 2430],
 [1532, 2569, 4076, 31, 2325, 715],
 [4064, 26

Embedding Representations

In [12]:
sent_length=20
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0, 4437, 2869, 1074, 1158, 4647, 2869, 2168, 1329], dtype=int32)

Now applying tf-idf

In [13]:
corpus

['upgrad premium plan exclus access premium content featur',
 'happi holiday team wish joy prosper season',
 'hire check career opportun join dynam team',
 'amazon account lock click verifi account inform',
 'opinion matter take survey help us enhanc experi',
 'payment receiv thank prompt transact',
 'email account storag full click upgrad account',
 'dear name thank subscrib newslett welcom gift',
 'account credit loyalti point redeem excit reward',
 'chosen free iphon click claim prize',
 'miss special offer sign get discount first purchas',
 'hire intern summer appli gain valuabl experi',
 'pre approv loan click appli',
 'thrill introduc new collect shop enjoy exclus discount',
 'excit announc upcom webinar seri regist reserv spot',
 'ad new featur app base feedback updat',
 'winner click claim exclus prize',
 'facebook account hack click secur account',
 'congratul reach new mileston mani achiev',
 'congratul select job interview click schedul interview',
 'account suspend due susp

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [15]:
tfidf_vectorizer = TfidfVectorizer(ngram_range= (3,3)) #I have used the ngrams range, You can use without it.
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
tfidf_array = tfidf_matrix.toarray()
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_array, columns=feature_names)
tfidf_df

Unnamed: 0,access premium content,access quick loan,access vip club,account balanc updat,account ban click,account compromis click,account credit bonu,account credit loyalti,account flag suspici,account hack click,...,win excit prize,winner click claim,winner daili giveaway,winner day click,winner holiday giveaway,wish joy prosper,within next day,work resolv issu,workshop present slide,year exclus benefit
0,0.413167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
175,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
176,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
177,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In TensorFlow 2.0 or above, you can easily apply TF-IDF (Term Frequency-Inverse Document Frequency) to a text corpus using the TfidfVectorizer from the scikit-learn library. TensorFlow itself doesn't provide a direct function for TF-IDF, but scikit-learn integrates well with TensorFlow
Incase, Remove the comments from code present in the below cell to Convert the TF-IDF matrix to TensorFlow tensor if needed

In [16]:
# tfidf_tensor = tf.convert_to_tensor(tfidf_array, dtype=tf.float32)
# tfidf_tensor

Bag of Words

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
bow_array = X.toarray()
feature_names = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(bow_array, columns=feature_names)
bow_df

Unnamed: 0,access,account,achiev,act,activ,ad,advanc,affili,alway,amazon,...,welcom,wide,win,winner,wish,within,work,workshop,xyz,year
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
175,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
176,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
177,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Word2Vec(CBOW)
I am using window size and embedding dimentions of 10 and 80 respectively because of my corpus, you can change it according to you corpus.
And ofcourse, the output was too large, so the output was truncated but its not an issue with the model and embedding layers. So dont worry about it.

In [24]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda, Flatten, Input
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

# Sample corpus


# Parameters
window_size = 10  # Size of the context window (words around the target word)
embedding_dim = 80  # Dimension of word embeddings
epochs = 100  # Number of training epochs

# Tokenize the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
reverse_word_index = {v: k for k, v in word_index.items()}
total_words = len(word_index) + 1  # +1 for padding

# Prepare the data (context and target pairs)
context_words = []
target_words = []

for sentence in corpus:
    words = sentence.split()
    for i, word in enumerate(words):
        # Define the context window for each word
        start = max(0, i - window_size)
        end = min(len(words), i + window_size + 1)

        # Context words (exclude the target word)
        context = [words[j] for j in range(start, end) if j != i]

        # For each context word, the target is the current word
        for c in context:
            context_words.append(c)
            target_words.append(word)

# Convert words to integer indices
context_indices = [word_index[word] for word in context_words]
target_indices = [word_index[word] for word in target_words]

# One-hot encode the target words
target_one_hot = to_categorical(target_indices, num_classes=total_words)

# Now, each context word should be treated as a separate input, so we need to adjust how we input data

# Reshape the input so that each context word is represented separately (this becomes a 2D array of context words)
context_indices = np.array(context_indices)
context_indices = context_indices.reshape(-1, 1)  # (num_samples, 1)

# Build the CBOW model
model = Sequential()

# Input layer: context words (each word in the context is represented as an integer)
model.add(Embedding(input_dim=total_words, output_dim=embedding_dim, input_length=1, name="embedding_layer"))

# Reshape output to be able to sum across the context words
model.add(Lambda(lambda x: K.sum(x, axis=1)))

# Output layer: predict the target word
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate = 0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(context_indices, target_one_hot, epochs=epochs, batch_size=256, verbose=1)

# Get the word embeddings from the trained model
word_embeddings = model.get_layer("embedding_layer").get_weights()[0]

# Print the word embeddings for each word
for word, i in word_index.items():
    print(f"Word: {word}, Embedding: {word_embeddings[i]}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  0.54216427 -0.3264384  -0.2500155   0.14513034  0.28885785 -0.04841416
 -0.05356415  0.2509211  -0.3957292  -0.07983512  0.31122234  0.22693704
  0.32368383 -0.30251583 -0.28720242 -0.00967515  0.39074692 -0.2559828
  0.42203486 -0.60702115  0.24473032 -0.676501    0.33084887 -0.27157745
 -0.19077158  0.26108655 -0.60716254  0.36790383 -0.34197888 -0.50524056
 -0.60372066 -0.23666276  0.02019728 -0.05828996 -0.6982411  -0.16758399
  0.14667797  0.7519674   0.09822267 -0.05801318  0.21046601 -0.47662932
 -0.69944715 -0.3780617 ]
Word: pleas, Embedding: [ 1.03162386e-01 -1.10118881e-01  1.24610268e-01 -1.43589810e-01
 -3.63979302e-02  6.78041220e-01 -3.71534973e-02  8.78192604e-01
 -2.28285626e-01 -6.37318492e-01 -1.48011073e-01  4.41535592e-01
 -3.93550187e-01 -2.34995648e-01  6.46245837e-01  1.26356766e-01
 -1.75898015e-01  7.17653632e-01  1.03253558e-01 -3.72213989e-01
  2.04903245e-01  3.46926063e-01 -3.03771317e-01 -

Average Word2Vec

In [27]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Lambda
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K
import nltk




# Parameters
window_size = 10  # Context window size
embedding_dim = 80  # Embedding dimension size
epochs = 100  # Number of epochs

# Tokenize the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
reverse_word_index = {v: k for k, v in word_index.items()}
total_words = len(word_index) + 1  # +1 for padding

# Prepare the data (context and target pairs)
context_words = []
target_words = []

for sentence in corpus:
    words = sentence.split()
    for i, word in enumerate(words):
        # Define the context window for each word
        start = max(0, i - window_size)
        end = min(len(words), i + window_size + 1)

        # Context words (exclude the target word)
        context = [words[j] for j in range(start, end) if j != i]

        # For each context word, the target is the current word
        for c in context:
            context_words.append(c)
            target_words.append(word)

# Convert words to integer indices
context_indices = [word_index[word] for word in context_words]
target_indices = [word_index[word] for word in target_words]

# One-hot encode the target words
target_one_hot = to_categorical(target_indices, num_classes=total_words)

# Reshape the input for the Embedding layer
context_indices = np.array(context_indices)
context_indices = context_indices.reshape(-1, 1)  # (num_samples, 1)

# Build the CBOW model
model = Sequential()

# Input layer: context words (each word in the context is represented as an integer)
model.add(Embedding(input_dim=total_words, output_dim=embedding_dim, input_length=1, name="embedding_layer"))

# Sum the embeddings of the context words (CBOW approach)
model.add(Lambda(lambda x: K.sum(x, axis=1)))

# Output layer: predict the target word
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(context_indices, target_one_hot, epochs=epochs, batch_size=256, verbose=1)

# Get the word embeddings from the trained model
word_embeddings = model.get_layer("embedding_layer").get_weights()[0]

# Now let's implement the Average Word2Vec method:
def average_word2vec(corpus, word_index, word_embeddings):
    sentence_embeddings = []

    # Iterate over each sentence in the corpus
    for sentence in corpus:
        words = sentence.split()
        word_vecs = []

        # For each word in the sentence, get the word embedding
        for word in words:
            if word in word_index:
                word_idx = word_index[word]
                word_vec = word_embeddings[word_idx]
                word_vecs.append(word_vec)

        # Compute the average of the word vectors for the sentence
        if word_vecs:
            sentence_embedding = np.mean(word_vecs, axis=0)
            sentence_embeddings.append(sentence_embedding)
        else:
            # If no word embeddings found (empty sentence), use a zero vector
            sentence_embeddings.append(np.zeros(embedding_dim))

    return np.array(sentence_embeddings)

# Get the average word2vec embeddings for the entire corpus
sentence_embeddings = average_word2vec(corpus, word_index, word_embeddings)

# Print the embedding for the first sentence
for i, embedding in enumerate(sentence_embeddings):
    print(f"Sentence {i + 1} embedding:")
    print(embedding)
    print()


Epoch 1/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0126 - loss: 5.9447
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0601 - loss: 5.9099
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0932 - loss: 5.8619
Epoch 4/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0980 - loss: 5.7806
Epoch 5/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1119 - loss: 5.6449
Epoch 6/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.1018 - loss: 5.4746
Epoch 7/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0993 - loss: 5.2584
Epoch 8/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0985 - loss: 5.0839
Epoch 9/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━