In [1]:
# now taking only those values from the 'sent_mails.csv' files that have values in the body
import pandas as pd

df = pd.read_csv('email_messages.csv')


In [2]:
df = df[df['message'].fillna('').str.strip() != '']

In [3]:
len(df)

1500

In [4]:
# importing necessary libaries 
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split




In [5]:
corpus = df['message'].head(1500).astype(str).tolist()

In [6]:
#initializing tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
vocab_size = len(tokenizer.word_index)+1

In [7]:
print(vocab_size)

441


In [8]:
tokenizer.word_index

{'the': 1,
 'i': 2,
 'for': 3,
 'you': 4,
 'to': 5,
 'on': 6,
 'my': 7,
 'in': 8,
 'our': 9,
 'at': 10,
 'this': 11,
 'your': 12,
 'me': 13,
 'of': 14,
 'is': 15,
 'a': 16,
 'have': 17,
 'please': 18,
 'about': 19,
 'and': 20,
 'data': 21,
 'it': 22,
 'am': 23,
 'attached': 24,
 'was': 25,
 'could': 26,
 'marketing': 27,
 'role': 28,
 'week': 29,
 'are': 30,
 'would': 31,
 'pm': 32,
 'graphic': 33,
 'designer': 34,
 'from': 35,
 'software': 36,
 'into': 37,
 'with': 38,
 'we': 39,
 'analyst': 40,
 '00': 41,
 'manager': 42,
 'an': 43,
 'next': 44,
 'found': 45,
 'believe': 46,
 'engineer': 47,
 'discussion': 48,
 'work': 49,
 'oct': 50,
 '5th': 51,
 'together': 52,
 'today': 53,
 'position': 54,
 'been': 55,
 'draft': 56,
 'like': 57,
 'last': 58,
 'let': 59,
 'thank': 60,
 'that': 61,
 'application': 62,
 'review': 63,
 'nov': 64,
 '12th': 65,
 'confirm': 66,
 'project': 67,
 'dec': 68,
 '1st': 69,
 'meeting': 70,
 'who': 71,
 'something': 72,
 'availability': 73,
 'near': 74,
 'great'

In [9]:
input_sequences=[]
for email_text in corpus:
    token_list = tokenizer.texts_to_sequences([email_text])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

In [10]:
input_sequences

[[218, 4],
 [218, 4, 17],
 [218, 4, 17, 219],
 [218, 4, 17, 219, 3],
 [218, 4, 17, 219, 3, 1],
 [218, 4, 17, 219, 3, 1, 415],
 [218, 4, 17, 219, 3, 1, 415, 39],
 [218, 4, 17, 219, 3, 1, 415, 39, 26],
 [218, 4, 17, 219, 3, 1, 415, 39, 26, 220],
 [218, 4, 17, 219, 3, 1, 415, 39, 26, 220, 72],
 [218, 4, 17, 219, 3, 1, 415, 39, 26, 220, 72, 221],
 [26, 4],
 [26, 4, 106],
 [26, 4, 106, 1],
 [26, 4, 106, 1, 208],
 [26, 4, 106, 1, 208, 209],
 [26, 4, 106, 1, 208, 209, 210],
 [26, 4, 106, 1, 208, 209, 210, 3],
 [26, 4, 106, 1, 208, 209, 210, 3, 1],
 [26, 4, 106, 1, 208, 209, 210, 3, 1, 21],
 [26, 4, 106, 1, 208, 209, 210, 3, 1, 21, 40],
 [26, 4, 106, 1, 208, 209, 210, 3, 1, 21, 40, 28],
 [2, 31],
 [2, 31, 57],
 [2, 31, 57, 5],
 [2, 31, 57, 5, 66],
 [2, 31, 57, 5, 66, 7],
 [2, 31, 57, 5, 66, 7, 73],
 [2, 31, 57, 5, 66, 7, 73, 3],
 [2, 31, 57, 5, 66, 7, 73, 3, 43],
 [2, 31, 57, 5, 66, 7, 73, 3, 43, 79],
 [2, 31, 57, 5, 66, 7, 73, 3, 43, 79, 10],
 [2, 31, 57, 5, 66, 7, 73, 3, 43, 79, 10, 92],
 [2

In [11]:
print(f"Total number of sequences generated: {len(input_sequences)}")
for seq in input_sequences[:5]:
    print(seq)

Total number of sequences generated: 18362
[218, 4]
[218, 4, 17]
[218, 4, 17, 219]
[218, 4, 17, 219, 3]
[218, 4, 17, 219, 3, 1]


In [12]:
max_seq_len = max([len(x) for x in input_sequences])
max_seq_len

20

In [13]:
input_sequences=np.array(pad_sequences(input_sequences,maxlen=max_seq_len,padding='pre'))
input_sequences

array([[  0,   0,   0, ...,   0, 218,   4],
       [  0,   0,   0, ..., 218,   4,  17],
       [  0,   0,   0, ...,   4,  17, 219],
       ...,
       [  0,   0,   0, ..., 191,  56, 140],
       [  0,   0,   0, ...,  56, 140,  64],
       [  0,   0,   0, ..., 140,  64,  65]])

In [14]:
# now creating the x,y split where we take the first n-1 values of each array at x and the last value of each at y (y will be label)

import tensorflow as tf
x,y=input_sequences[:,:-1],input_sequences[:,-1]

In [15]:
y

array([  4,  17, 219, ..., 140,  64,  65])

In [17]:
# converting y to categorial data
# y=tf.keras.utils.to_categorical(y,num_classes=vocab_size) # I am not using this because of memory constraints

In [16]:
# now spliting the x, y as x_test, x_train, y_test, y_train using test_train_split 

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [17]:
# defining the early stopping logic 
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [18]:
## LSTM RNN model 

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Params
vocab_size = 441
max_seq_len = 20   # your sequences are length 20
embedding_dim = 100

# Model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_dim, 
                    input_length=max_seq_len-1))  # 19
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(vocab_size, activation="softmax"))

# Build the model explicitly
model.build(input_shape=(None, max_seq_len-1))

# Compile
model.compile(loss="sparse_categorical_crossentropy", 
              optimizer="adam", 
              metrics=["accuracy"])

model.summary()




Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 19, 100)           44100     
                                                                 
 lstm (LSTM)                 (None, 19, 150)           150600    
                                                                 
 dropout (Dropout)           (None, 19, 150)           0         
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense (Dense)               (None, 441)               44541     
                                                                 
Total params: 339641 (1.30 MB)
Trainable params: 339641 (1.30 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [19]:
history=model.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test),verbose=1,callbacks=[early_stopping])

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50


In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU

In [21]:
model.save('model_2.h5')

  saving_api.save_model(


In [22]:
# now we will test our model 
# loading our model 
from tensorflow.keras.models import load_model

model = load_model('model_2.h5')


In [23]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):] 
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=1)
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            return word
    return None

In [24]:
def predict_next_words(model, tokenizer, text, max_sequence_len, max_words=5):
    """
    Generate up to max_words next tokens by chaining predictions.
    """
    result = []
    input_text = text

    for _ in range(max_words):
        next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
        if not next_word:  # if prediction failed
            break
        result.append(next_word)
        input_text += " " + next_word  # feed back into input

        # stop early if a sentence-ending token is predicted
        if next_word in [".", "!", "?"]:
            break
    
    return " ".join(result)


In [25]:
input_text="Congrats "
print(f"Input text:{input_text}")
max_sequence_len=model.input_shape[1]+1
next_word=predict_next_words(model,tokenizer,input_text,max_sequence_len,3)
print(f"Next Word Prediction:{next_word}")

Input text:Congrats 
Next Word Prediction:on your promotion


In [26]:
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)