# **Imports**

In [None]:
import pandas as pd
import zipfile
import gensim
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import warnings
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from keras.utils import to_categorical
import re

# Use the filterwarnings() function to ignore warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset archive stored there can be opened by path
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Loading Data**

In [None]:
# Location of the enronsent.zip archive on the mounted Drive
zip_file_path = '/content/drive/MyDrive/enronsent.zip'

# Collect the name of every member stored in the archive; later cells slice
# this list to choose which members to read
with zipfile.ZipFile(zip_file_path) as archive:
    file_list = archive.namelist()

In [None]:
# Read the archive members named by file_list[0:11] (the first 11 entries of
# the listing) and join their text into a single training string
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Bytes that are not valid UTF-8 become U+FFFD instead of raising.
    # Joining once at the end avoids re-copying a growing multi-megabyte
    # string on every iteration.
    concatenated_content_training = ''.join(
        zip_ref.read(file_path).decode('utf-8', errors='replace')
        for file_path in file_list[0:11]
    )
len(concatenated_content_training)


23890105

In [None]:
# Read the archive members named by file_list[11:16] (the five entries that
# follow the training members) and join their text into a single validation string
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Bytes that are not valid UTF-8 become U+FFFD instead of raising.
    # Joining once at the end avoids re-copying a growing multi-megabyte
    # string on every iteration.
    concatenated_content_validation = ''.join(
        zip_ref.read(file_path).decode('utf-8', errors='replace')
        for file_path in file_list[11:16]
    )
len(concatenated_content_validation)

9490330

# **Preprocessing**

In [None]:
# Two-pass clean-up of the training text, evaluated inside-out:
#   1. inner call - delete every character that is neither an ASCII letter
#                   nor whitespace
#   2. outer call - delete what is left of URLs: any "http" or "www" together
#                   with the non-space characters that follow it
cleaned_content_training = re.sub(
    r'http\S+|www\S+',
    '',
    re.sub(r'[^a-zA-Z\s]', '', concatenated_content_training),
)



In [None]:
# Remove non-alphabetic characters
cleaned_content_validation = re.sub(r'[^a-zA-Z\s]', '', concatenated_content_validation)

# Remove URLs. The result has to be stored back under cleaned_content_validation,
# because that is the name the paragraph-splitting cell reads; a misspelled
# target would leave the URL remnants in the validation text.
cleaned_content_validation = re.sub(r'http\S+|www\S+', '', cleaned_content_validation)



In [None]:
# Split the cleaned training text into paragraphs (separated by a blank line)
paragraphs = cleaned_content_training.split("\n\n")

# Drop the empty strings the split leaves behind and collapse every run of
# whitespace (including single newlines) inside a paragraph to one space.
# The pattern is a raw string so that \s reaches the regex engine untouched.
paragraph_list_training = [
    re.sub(r'\s+', ' ', paragraph) for paragraph in paragraphs if paragraph
]
len(paragraph_list_training)

128184

In [None]:
# Split the cleaned validation text into paragraphs (separated by a blank line)
paragraphs = cleaned_content_validation.split("\n\n")

# Drop the empty strings the split leaves behind and collapse every run of
# whitespace (including single newlines) inside a paragraph to one space.
# The pattern is a raw string so that \s reaches the regex engine untouched.
paragraph_list_validation = [
    re.sub(r'\s+', ' ', paragraph) for paragraph in paragraphs if paragraph
]
len(paragraph_list_validation)

63276

In [None]:
# Peek at the first training paragraph to check the result of the clean-up
paragraph_list_training[0]

' Attached are two files that illustrate the following'

In [None]:
# Peek at the first validation paragraph to check the result of the clean-up
paragraph_list_validation[0]

'for how we are going to play these different opportunities off of one another '

In [None]:
def split_paragraph_into_time_steps(paragraph, time_steps):
    """Slide a fixed-size window over the words of a paragraph.

    Args:
        paragraph: Text to split; words are separated on whitespace.
        time_steps: Number of consecutive words in each window.

    Returns:
        A list of windows, one per starting position (stride 1), each a list
        of exactly ``time_steps`` words. Empty when the paragraph has fewer
        than ``time_steps`` words.
    """
    words = paragraph.split()
    # A paragraph of n words holds n - time_steps + 1 full windows; the "+ 1"
    # keeps the final window, the one that ends on the paragraph's last word.
    window_count = len(words) - time_steps + 1
    return [words[start:start + time_steps] for start in range(window_count)]

time_steps = 10  # Number of words in each window
num_training_paragraphs = 5000
num_validation_paragraphs = 1000

# Select paragraphs for training and flatten their windows into one list
training_paragraphs = paragraph_list_training[:num_training_paragraphs]
training_time_steps = [
    window
    for paragraph in training_paragraphs
    for window in split_paragraph_into_time_steps(paragraph, time_steps)
]

# Select paragraphs for validation. paragraph_list_validation is a separate
# list, so it is sliced from its own start; offsetting it by the training
# count would yield nothing for a validation list shorter than that offset.
validation_paragraphs = paragraph_list_validation[:num_validation_paragraphs]
validation_time_steps = [
    window
    for paragraph in validation_paragraphs
    for window in split_paragraph_into_time_steps(paragraph, time_steps)
]

print(len(training_time_steps))
print(len(validation_time_steps))


74737
13179


In [None]:
# Convert the lists of windows to NumPy arrays (one row per window); the
# tokenizer cell turns them back into lists with .tolist()
training_time_steps=np.array(training_time_steps)
validation_time_steps=np.array(validation_time_steps)

In [None]:
# Sanity check: both arrays should be 2-D with time_steps columns
print(training_time_steps.shape,validation_time_steps.shape)

(74737, 10) (13179, 10)


# **Word2Vec model**

In [None]:
from gensim.models import Word2Vec

# Word2Vec expects every sentence as a list of tokens. A plain string would be
# iterated character by character and the model would learn letter vectors,
# so each paragraph is split into words first.
tokenized_paragraphs_training = [paragraph.split() for paragraph in paragraph_list_training]

# Train Word2Vec model. It is kept under its own name because the LSTM cell
# binds `model` to the Keras network.
word2vec_model = Word2Vec(sentences=tokenized_paragraphs_training, vector_size=300, window=5, min_count=1, workers=4)

# Save the trained model
word2vec_model.save("word2vec.model")




In [None]:
import numpy as np
import keras

# Convert sentences to sequences of word indices.
# oov_token reserves an index for words the tokenizer has not seen. Without it
# texts_to_sequences drops such words, so a validation window can come back
# shorter than the others and, after pre-padding, its last column (used as
# the target below) holds an earlier word instead of the real next word.
tokenizer = keras.preprocessing.text.Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(training_time_steps.tolist())  # Convert to list before fitting

sequences = tokenizer.texts_to_sequences(training_time_steps.tolist())
max_sequence_length = max(len(seq) for seq in sequences)

# Pad sequences to a fixed length
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='pre')

# Split into input (X) and target (y): every column but the last is context,
# the last column is the word to predict
X = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

# Convert validation data to sequences with the vocabulary fitted on the training windows
validation_sequences = tokenizer.texts_to_sequences(validation_time_steps.tolist())
validation_padded_sequences = pad_sequences(validation_sequences, maxlen=max_sequence_length, padding='pre')

# Split into validation input (X_val) and target (y_val)
X_val = validation_padded_sequences[:, :-1]
y_val = validation_padded_sequences[:, -1]

# Define the LSTM model
# +1 because word indices start at 1; index 0 is the padding value
vocabulary_size = len(tokenizer.word_index) + 1
model = keras.Sequential()
model.add(keras.layers.Embedding(vocabulary_size, 100, input_length=max_sequence_length-1))
model.add(keras.layers.LSTM(100))
model.add(keras.layers.Dense(vocabulary_size, activation='softmax'))

# Targets are integer word indices, hence the sparse variant of the loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the LSTM model
model.fit(X, y, validation_data=(X_val, y_val), epochs=100, batch_size=32)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f46adcd72b0>

In [None]:
def predict_next_word(sentence, model, tokenizer):
    """Predict the word most likely to follow ``sentence``.

    Args:
        sentence: Text typed so far.
        model: Trained network; ``model.predict`` must return one score per
            vocabulary index for the padded input.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.

    Returns:
        The predicted word, or an empty string when the highest-scoring index
        has no entry in ``tokenizer.index_word`` (index 0, the padding value,
        never has one).

    Reads the notebook-level ``max_sequence_length`` so that the input is
    padded to the width the model was trained on.
    """
    sequence = tokenizer.texts_to_sequences([sentence])[0]
    # Left-pad the context to the training input width
    sequence = pad_sequences([sequence], maxlen=max_sequence_length-1, padding='pre')
    predicted_index = int(np.argmax(model.predict(sequence)))
    # .get instead of [] so a padding prediction does not abort the session
    return tokenizer.index_word.get(predicted_index, '')


In [None]:
def predict_sentence():
    """Interactively build a sentence with next-word suggestions.

    The user types a starting word and then answers each suggestion: "yes"
    (any capitalisation) accepts it, any other answer leads to a prompt for
    the next word. Entering -1 at a word prompt ends the session and prints
    the sentence.

    Uses the notebook-level ``model`` and ``tokenizer`` through
    ``predict_next_word``. Returns nothing; the result is printed.
    """
    words = []
    typed = input("Enter Next word (-1 to terminate): ")

    while typed != '-1':
        words.append(typed)

        # Keep suggesting for as long as the user accepts. The whole sentence
        # so far is the context, so a word typed after a rejected suggestion
        # extends the context instead of replacing it.
        while True:
            next_word = predict_next_word(' '.join(words), model, tokenizer)
            answer = input(f"Is your next word: \"{next_word}\"?")
            if answer.lower() != 'yes':
                break
            words.append(next_word)

        typed = input("Enter next word (-1 to terminate): ")

    # The -1 sentinel is never stored, so nothing has to be stripped back out
    final_sentence = ' '.join(words)
    print("Your final Sentence is:", final_sentence)

# Start an interactive session; it keeps prompting until -1 is entered
predict_sentence()

Enter Next word (-1 to terminate): my
Is your next word: "business"?yes
Is your next word: "in"?yes
Is your next word: "seguin"?no
Enter next word (-1 to terminate): cairo
Is your next word: "in"?no
Enter next word (-1 to terminate): -1
Your final Sentence is: my business in cairo


In [None]:
# predict_sentence is already defined in an earlier cell. Re-declaring an
# identical copy here only let the copies drift apart, so this cell just
# starts another interactive session.
predict_sentence()

Enter Next word (-1 to terminate): I
Is your next word: "am"?yes
Is your next word: "inviting"?yes
Is your next word: "less"?no
Enter next word (-1 to terminate): you
Is your next word: "in"?yes
Is your next word: "california"?yes
Is your next word: "power"?no
Enter next word (-1 to terminate): -1
Your final Sentence is: I am inviting you in california


In [None]:
# predict_sentence is already defined in an earlier cell. Re-declaring an
# identical copy here only let the copies drift apart, so this cell just
# starts another interactive session.
predict_sentence()

Enter Next word (-1 to terminate): hello
Is your next word: "estimated"?no
Enter next word (-1 to terminate): my
Is your next word: "business"?no
Enter next word (-1 to terminate): friends
Is your next word: "versus"?no
Enter next word (-1 to terminate): -1
Your final Sentence is: hello my friends
