## Team Members:

1- Abanoub Samir | ID:20190001

2- Aram Gamal    | ID:20190089

3- Fady Essam    | ID:20190370

## Import Libraries

In [1]:
import tarfile
import nltk
import pandas as pd
import gensim
from gensim.models import Word2Vec
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import re
import nltk

## Loading and Prepare Data

In [2]:
tar_path = 'enronsentv1.tar.gz'
with tarfile.open(tar_path, 'r:gz') as tar:
    tar.extractall()

In [3]:
train_list=['00','01','02','03','04','05','06','07','08','09','10']
train_data=[]
for i in train_list: 
    with open(f"enronsent//enronsent{i}", 'r') as file:
        file_contents = file.read()
        train_data.extend(file_contents.split('\n\n')) 
train_data=pd.DataFrame(train_data,columns=['paragraph'])

In [4]:
train_data

Unnamed: 0,paragraph
0,\nAttached are two files that illustrate the ...
1,"As prices rose, supply increased and demand de..."
2,Financial (6)\n West Desk (14)\nMid Market ...
3,"Share information about yourself, create your ..."
4,- utility.xls\n - utility.xls
...,...
119312,Those dates are fine for me. Copies of the do...
119313,Peter E Weidler\n12/20/2000 08:57 AM\nI have b...
119314,Think there are different definitions of succe...
119315,Cuiaba I is very much intertwined with the GTB...


In [5]:
test_data=[]
for i in range(11,16): 
    with open(f"enronsent//enronsent{i}", 'r') as file:
        file_contents = file.read()
        test_data.extend(file_contents.split('\n\n')) 
test_data=pd.DataFrame(test_data,columns=['paragraph'])

In [6]:
test_data

Unnamed: 0,paragraph
0,for how we are going to play these different o...
1,Please confirm your attendance -
2,Joe/Orlando - if I am missing a key participan...
3,Rob - can you get someone to make copy of the ...
4,Yvette - please get a large conference room fo...
...,...
61189,\tTana Jones\n\t04/16/2001 03:11 PM\n\t\t \n\t...
61190,Cargill Ferrous International is setup correct...
61191,"Also, the Global SAP team (Cheryl Johnson) wou..."
61192,Best Rgds.


## Data preprocessing 

In [7]:
def preprocess(document):
    # Convert to lowercase
    text = document.lower()
    # Remove URLs and email addresses and punct and non-alpha
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'http\S+|https\S+|\W+|\s{2,}', lambda m: ' ' if m.group().isspace() else '', text)
    return text

In [8]:
import copy
train_preprocessed_data = copy.deepcopy(train_data)
train_preprocessed_data.paragraph=train_preprocessed_data.paragraph.apply(preprocess)

In [9]:
train_data.paragraph[5]

'Enron-admin@FSDDataSvc.com on 09/06/2000 10:12:33 AM\nExecutive Impact & Influence Program\n* IMMEDIATE ACTION REQUIRED - Do Not Delete *'

In [10]:
train_preprocessed_data.paragraph[5]

'enronadminfsddatasvccom on am executive impact influence program immediate action required do not delete '

In [11]:
test_preprocessed_data = copy.deepcopy(test_data)
test_preprocessed_data.paragraph=test_preprocessed_data.paragraph.apply(preprocess)

In [12]:
test_preprocessed_data

Unnamed: 0,paragraph
0,for how we are going to play these different o...
1,please confirm your attendance
2,joeorlando if i am missing a key participant p...
3,rob can you get someone to make copy of the pp...
4,yvette please get a large conference room for ...
...,...
61189,tana jones pm samuel schott pm fyi
61190,cargill ferrous international is setup correct...
61191,also the global sap team cheryl johnson would ...
61192,best rgds


## Split Data to (samples,Timesteps)

In [13]:
def timeSteps(paragraph, timestep):
    sequences = []
    words = paragraph.split()
    for i in range(len(words) - timestep):
        seq = words[i:i + timestep]          
        sequences.append(seq)
    return sequences 

In [14]:
timestep = 15
sequences = []

for index, row in train_preprocessed_data[40000:46000].iterrows():
    paragraph = row['paragraph']
    seqs= timeSteps(paragraph, timestep)
    sequences.extend(seqs)

dtrain=np.array(sequences)
dtrain.shape

(89662, 15)

In [15]:
timestep = 15
sequences = []

for index, row in test_preprocessed_data[:2000].iterrows():
    paragraph = row['paragraph']
    seqs = timeSteps(paragraph, timestep)
    sequences.extend(seqs)

dtest = np.array(sequences)
dtest.shape

(20092, 15)

## Spliting the label from the data and apply Word2Vec 

In [16]:
train_input_sequences = []
train_labels = []

for row in dtrain:
    train_input_sequences.append(' '.join(row[:-1]))  
    train_labels.append(row[-1])  

test_input_sequences = []
test_labels = []

for row in dtest:
    test_input_sequences.append(' '.join(row[:-1]))  
    test_labels.append(row[-1]) 

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_input_sequences)
train_input_sequences = tokenizer.texts_to_sequences(train_input_sequences)
test_input_sequences = tokenizer.texts_to_sequences(test_input_sequences)

max_sequence_length = max(len(seq) for seq in train_input_sequences)
train_input_data = pad_sequences(train_input_sequences, maxlen=max_sequence_length)
test_input_data = pad_sequences(test_input_sequences, maxlen=max_sequence_length)

Word2Vec_model = Word2Vec(sentences=train_input_sequences, vector_size=300, window=5, min_count=1, workers=4)
vocabulary_size = len(tokenizer.word_index) + 1

# convert label data to onehot
train_label_data = np.array(train_labels)
test_label_data = np.array(test_labels)

label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(train_label_data)
train_label_sequences = label_tokenizer.texts_to_sequences(train_label_data)
test_label_sequences = label_tokenizer.texts_to_sequences(test_label_data)
num_classes = len(label_tokenizer.word_index) + 1

train_label_data = np.zeros((len(train_label_sequences), num_classes))
test_label_data = np.zeros((len(test_label_sequences), num_classes))

for i, seq in enumerate(train_label_sequences):
    train_label_data[i, seq] = 1

for i, seq in enumerate(test_label_sequences):
    test_label_data[i, seq] = 1

## LSTM Model

In [17]:
vocabulary_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocabulary_size, 300))
for word, i in tokenizer.word_index.items():
    if word in Word2Vec_model.wv:
        embedding_matrix[i] = Word2Vec_model.wv[word]

In [None]:
lstm_model = Sequential()
lstm_model.add(Embedding(vocabulary_size, 300, weights=[embedding_matrix], input_length=max_sequence_length))
lstm_model.add(LSTM(512))
lstm_model.add(Dense(num_classes, activation='softmax'))
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

lstm_model.fit(train_input_data, train_label_data, epochs=100, batch_size=512,validation_data=(test_input_data, test_label_data))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 14, 300)           2712900   
                                                                 
 lstm (LSTM)                 (None, 512)               1665024   
                                                                 
 dense (Dense)               (None, 7568)              3882384   
                                                                 
Total params: 8,260,308
Trainable params: 8,260,308
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 2

<keras.callbacks.History at 0x7f81b06d2b60>

In [None]:
lstm_model.save("lstm_model.h5")

## LSTM Evaluations

In [None]:
loss, accuracy = lstm_model.evaluate(test_input_data, test_label_data)

print("Test Loss:", loss)
print("Test Accuracy:", accuracy*100)

Test Loss: 11.019675254821777
Test Accuracy: 9.575950354337692


In [18]:
from keras.models import load_model
lstm_model=load_model('/content/lstm_model.h5')

## Test Case 1

In [35]:
sentence = ""
word = input("Enter the next word (type 'exit' to terminate): ")
sentence += " " + word

while True:
    # Convert the input sentence to a sequence
    input_sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length)

    # Predict the next word
    predicted_probabilities = lstm_model.predict(padded_sequence,verbose=False)[0]
    predicted_index = np.argmax(predicted_probabilities)
    predicted_word = label_tokenizer.index_word[predicted_index]

    print("Is your next word:", predicted_word)
    user_feedback = input("Yes/ No /(type 'exit' to terminate): ")
    if user_feedback.lower() == "exit":
        break
    elif user_feedback.lower() == "no":
        word = input("Sorry, Enter the Correct word: ")
        sentence += " " + word
    else:
      sentence += " " + predicted_word

    print("Your final Sentence is:", sentence.strip())
print("Your final Sentence is:", sentence.strip())

Enter the next word (type 'exit' to terminate): does
Is your next word: not
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: does not
Is your next word: hear
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: does not hear
Is your next word: to
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: does not hear to
Is your next word: times
Yes/ No /(type 'exit' to terminate): no
Sorry, Enter the Correct word: any
Your final Sentence is: does not hear to any
Is your next word: other
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: does not hear to any other
Is your next word: so
Yes/ No /(type 'exit' to terminate): no
Sorry, Enter the Correct word: bad
Your final Sentence is: does not hear to any other bad
Is your next word: but
Yes/ No /(type 'exit' to terminate): no
Sorry, Enter the Correct word: joke
Your final Sentence is: does not hear to any other bad joke
Is your next word: or
Yes/ No /(type 'exit' to terminate): yes
Your fi

## Test Case 2

In [40]:
sentence = ""
word = input("Enter the next word (type 'exit' to terminate): ")
sentence += " " + word

while True:
    # Convert the input sentence to a sequence
    input_sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length)

    # Predict the next word
    predicted_probabilities = lstm_model.predict(padded_sequence,verbose=False)[0]
    predicted_index = np.argmax(predicted_probabilities)
    predicted_word = label_tokenizer.index_word[predicted_index]

    print("Is your next word:", predicted_word)
    user_feedback = input("Yes/ No /(type 'exit' to terminate): ")
    if user_feedback.lower() == "exit":
        break
    elif user_feedback.lower() == "no":
        word = input("Sorry, Enter the Correct word: ")
        sentence += " " + word
    else:
      sentence += " " + predicted_word

    print("Your final Sentence is:", sentence.strip())
print("Your final Sentence is:", sentence.strip())

Enter the next word (type 'exit' to terminate): the
Is your next word: current
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: the current
Is your next word: agreements
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: the current agreements
Is your next word: for
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: the current agreements for
Is your next word: a
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: the current agreements for a
Is your next word: house
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: the current agreements for a house
Is your next word: a
Yes/ No /(type 'exit' to terminate): no
Sorry, Enter the Correct word: are
Your final Sentence is: the current agreements for a house are
Is your next word: is
Yes/ No /(type 'exit' to terminate): no
Sorry, Enter the Correct word: very
Your final Sentence is: the current agreements for a house are very
Is your next word: little
Yes/ No /(type 'exit

## Test Case 3

In [38]:
sentence = ""
word = input("Enter the next word (type 'exit' to terminate): ")
sentence += " " + word

while True:
    # Convert the input sentence to a sequence
    input_sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(input_sequence, maxlen=max_sequence_length)

    # Predict the next word
    predicted_probabilities = lstm_model.predict(padded_sequence,verbose=False)[0]
    predicted_index = np.argmax(predicted_probabilities)
    predicted_word = label_tokenizer.index_word[predicted_index]

    print("Is your next word:", predicted_word)
    user_feedback = input("Yes/ No /(type 'exit' to terminate): ")
    if user_feedback.lower() == "exit":
        break
    elif user_feedback.lower() == "no":
        word = input("Sorry, Enter the Correct word: ")
        sentence += " " + word
    else:
      sentence += " " + predicted_word

    print("Your final Sentence is:", sentence.strip())
print("Your final Sentence is:", sentence.strip())

Enter the next word (type 'exit' to terminate): I
Is your next word: recognize
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: I recognize
Is your next word: now
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: I recognize now
Is your next word: after
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: I recognize now after
Is your next word: a
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: I recognize now after a
Is your next word: single
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: I recognize now after a single
Is your next word: employer
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: I recognize now after a single employer
Is your next word: and
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: I recognize now after a single employer and
Is your next word: we
Yes/ No /(type 'exit' to terminate): yes
Your final Sentence is: I recognize now after a single employer and 