In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Embedding, LSTM, Bidirectional, Dropout

import email
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu

In [None]:
#Read dataset (only the first 1000 rows of the CSV are loaded)
file = pd.read_csv('emails.csv', nrows=1000)
# Preview a few rows with the notebook's rich display instead of printing the whole frame.
file.head()

In [None]:
#Helper function for extracting email body from raw email
def get_text_from_email(msg):
    """Return the concatenated payloads of every ``text/plain`` part of ``msg``.

    The message is traversed with ``msg.walk()``; parts whose content type is
    anything other than ``text/plain`` are skipped. Payloads are joined with
    no separator, so a message without plain-text parts yields ``''``.
    """
    plain_payloads = [
        section.get_payload()
        for section in msg.walk()
        if section.get_content_type() == 'text/plain'
    ]
    return ''.join(plain_payloads)

#Helper function for splitting a comma-separated address field into a list of unique addresses
def split_email_addresses(line):
    """Split a comma-separated address string into unique, stripped addresses.

    Parameters
    ----------
    line : str or None
        Raw header value such as ``"a@x.com, b@x.com"``.

    Returns
    -------
    list of str or None
        ``None`` when ``line`` is empty or ``None``. Otherwise the distinct
        addresses in the order they first appear, with surrounding whitespace
        removed and blank entries (e.g. from a trailing comma) dropped.
    """
    if not line:
        return None
    stripped = (addr.strip() for addr in line.split(','))
    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # return the addresses in an arbitrary order that can differ between runs.
    return list(dict.fromkeys(addr for addr in stripped if addr))

def preprocessing_emails(dataframe):
    """Parse raw emails and return a new frame holding their plain-text bodies.

    Each string in ``dataframe['message']`` is parsed with
    ``email.message_from_string``. The header names of the FIRST message are
    expanded into columns, the plain-text body (via ``get_text_from_email``)
    is stored in ``'email_body'``, and then the raw and header columns listed
    below are dropped.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``'message'`` column of raw email strings.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``dataframe`` itself is left untouched, so the cell that
        calls this function can be re-run without reloading the CSV.
    """
    dropped_columns = [
        'file', 'message', 'Message-ID', 'Date', 'From', 'To', 'Subject',
        'Mime-Version', 'Content-Type', 'Content-Transfer-Encoding',
        'X-From', 'X-To', 'X-cc', 'X-bcc', 'X-Folder', 'X-Origin',
        'X-FileName',
    ]
    # Work on a copy so the caller's frame keeps its 'message' column.
    email_df = dataframe.copy()
    messages = [email.message_from_string(raw) for raw in email_df['message']]
    # Only the first message's headers define the columns; an empty input has none.
    header_names = messages[0].keys() if messages else []
    for key in header_names:
        email_df[key] = [doc[key] for doc in messages]
    email_df['email_body'] = [get_text_from_email(doc) for doc in messages]
    # errors='ignore': a header missing from the first message (or an absent
    # 'file' column) should not abort the preprocessing.
    return email_df.drop(columns=dropped_columns, errors='ignore')

# Parse the raw messages loaded above; later cells read the 'email_body' column of the result.
df = preprocessing_emails(file)

In [None]:
def tokenize_corpus(corpus, num_words=-1):
  """Build a Keras ``Tokenizer`` and fit it on ``corpus``.

  A ``num_words`` greater than -1 is forwarded to ``Tokenizer``; the default
  of -1 constructs the tokenizer without a ``num_words`` argument.
  Returns the fitted tokenizer.
  """
  options = {'num_words': num_words} if num_words > -1 else {}
  fitted = Tokenizer(**options)
  fitted.fit_on_texts(corpus)
  return fitted

def create_corpus(dataset, field):
  """Return the lowercased text of ``dataset[field]`` split on ``'.'``.

  The rows are lowercased, joined with a newline between them, and the
  resulting string is split at every period. The newline keeps the last
  word of one row from fusing with the first word of the next. Missing
  values are left out of the joined text.

  ``dataset`` is not modified; the lowercasing is done on a temporary Series.

  Returns a list of str (the fragments between periods).
  """
  lowered = dataset[field].str.lower()
  # Join rows with a newline so neighbouring emails stay separate tokens.
  message = lowered.str.cat(sep='\n')
  return message.split('.')

In [None]:
# Split the lowercased email bodies into '.'-delimited fragments.
corpus = create_corpus(df, 'email_body')
#print(len(corpus))
# Fit the tokenizer on every fragment.
tokens = tokenize_corpus(corpus)
# One more than the number of distinct words; used below as the embedding
# input size and as the number of output classes.
total_words = len(tokens.word_index) + 1
#print(total_words)
# Spot-check one fragment; raises IndexError if the corpus has fewer than three.
print(corpus[2])

In [None]:
def get_sequences(corpus, tokenizer=None):
    """Expand each text in ``corpus`` into its n-gram prefixes of token ids.

    Every text is converted to a list of token ids; for a list of length
    ``k`` the prefixes of length 2, 3, ..., ``k`` are emitted, so texts with
    fewer than two tokens contribute nothing.

    Parameters
    ----------
    corpus : iterable of str
        Texts to tokenize.
    tokenizer : object with ``texts_to_sequences``, optional
        Tokenizer to use. Defaults to the notebook-level ``tokens`` so that
        existing ``get_sequences(corpus)`` calls keep working.

    Returns
    -------
    list of list of int
        All prefixes, in corpus order.
    """
    if tokenizer is None:
        tokenizer = tokens
    sequences = []
    # Tokenize the whole corpus in one call rather than once per text.
    for token_list in tokenizer.texts_to_sequences(corpus):
        sequences.extend(
            token_list[:end] for end in range(2, len(token_list) + 1)
        )
    return sequences

In [None]:
# Expand every corpus fragment into its n-gram prefixes of token ids (see get_sequences).
sequences = get_sequences(corpus)

In [None]:
# The longest n-gram prefix decides the padded width.
max_sequence_len = max(len(seq) for seq in sequences)
# Pad on the LEFT ('pre') so the last column always holds each sequence's real
# final token. With 'post' padding the last column is the padding id 0 for
# every sequence shorter than the maximum, so nearly every label would be 0.
# pad_sequences already returns a NumPy array, so no extra np.array() wrap.
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')
#splitting input and output variables: all but the last token -> input, last token -> label
input_sequences, labels = padded_sequences[:,:-1], padded_sequences[:,-1]
#one hot encoding labels (one column per entry of the vocabulary)
labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)
input_sequences.shape, labels.shape

In [None]:
# Next-word classifier: embedding -> bidirectional LSTM -> dropout -> softmax
# over the vocabulary (total_words classes).
model = Sequential([
    Embedding(total_words, 64, input_length=max_sequence_len-1),
    Bidirectional(LSTM(100)),
    Dropout(0.2),
    Dense(total_words, activation='softmax'),
])
# categorical_crossentropy pairs with the one-hot labels built above.
model.compile(optimizer='adam', loss='categorical_crossentropy')
history = model.fit(input_sequences, labels, epochs=100, verbose=1)

In [None]:
import os.path
# Persist the trained model only when no file exists at the target path yet.
# NOTE(review): if a file from an earlier run is already there, the model
# trained above is NOT written, and the cell that later loads this path will
# get the older model — confirm this is intended.
if not os.path.isfile('models/text_gen_model_2.h5'):
    model.save('models/text_gen_model_2.h5')

In [None]:
# Greedy generation: repeatedly append the single most likely next word.
seed_text = 'suggest holding the business.'
next_words = 50

for _ in range(next_words):
  token_list = tokens.texts_to_sequences([seed_text])[0]
  # Left-pad ('pre') so the most recent words sit at the end of the input,
  # matching the padding used by the sampling-based generation cell.
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
  # predict() returns one row of probabilities; take the arg-max as a plain int
  # instead of comparing a 1-element array against vocabulary indices.
  predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
  # index_word is the tokenizer's id -> word mapping; ids without a word
  # (such as the padding id 0) fall back to an empty string.
  output_word = tokens.index_word.get(predicted, "")
  seed_text += " " + output_word
print(seed_text)

In [None]:
from tensorflow.keras.models import load_model
# Reload the model from disk.
# NOTE(review): this may be a file written by an earlier run rather than the
# model trained above — confirm it matches the current tokenizer.
new_model = load_model('models/text_gen_model_2.h5')
# Use this process for the full output generation
seed_text = "suggest holding the business."
next_words = 100

for _ in range(next_words):
  token_list = tokens.texts_to_sequences([seed_text])[0]
  token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
  # Softmax output is float32 and may not sum to 1 within the tolerance
  # np.random.choice enforces; promote to float64 and renormalise first.
  predicted_probs = np.asarray(new_model.predict(token_list, verbose=0)[0], dtype=np.float64)
  predicted_probs /= predicted_probs.sum()
  # Sample the next word id according to the predicted distribution.
  predicted = int(np.random.choice(len(predicted_probs), p=predicted_probs))
  # index_word is the tokenizer's id -> word mapping; ids without a word
  # (such as the padding id 0) fall back to an empty string.
  output_word = tokens.index_word.get(predicted, "")
  seed_text += " " + output_word
print(seed_text)