# Exercise 11.01: Generating Text 



In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
import tensorflow.keras.utils as ku 
from keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
from keras.callbacks import EarlyStopping
import string, os 
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)


In [None]:
# Directory scanned for the headline CSV files.
our_dir = '../../Lab08/Datasets/'
our_headlines = []

# Only the first directory entry whose name contains 'Articles' is read;
# the loop stops right after loading it.
for filename in os.listdir(our_dir):
    if 'Articles' not in filename:
        continue
    article_df = pd.read_csv(our_dir + filename)
    our_headlines += list(article_df.headline.values)
    break

# Discard entries that are exactly the literal string "Unknown".
our_headlines = [
    headline for headline in our_headlines if headline != "Unknown"
]
len(our_headlines)

In [None]:
def clean_text(txt):
    """Return *txt* without punctuation, lowercased, and reduced to ASCII.

    Characters listed in ``string.punctuation`` are removed first, the
    remainder is lowercased, and any character that cannot be represented
    in ASCII is dropped (not transliterated).
    """
    punctuation = string.punctuation
    kept_chars = [ch for ch in txt if ch not in punctuation]
    lowered = "".join(kept_chars).lower()
    # Round-trip through bytes so that non-ASCII characters are discarded.
    ascii_only = lowered.encode("utf8").decode("ascii", 'ignore')
    return ascii_only

# Normalize every headline with clean_text, then preview a slice of the result.
corpus = list(map(clean_text, our_headlines))
corpus[60:80]

In [None]:
# Module-level tokenizer shared across cells: get_seq_of_tokens fits it on the
# corpus and generate_text reuses it to encode seed text and look up words.
tokenizer = Tokenizer()

def get_seq_of_tokens(corpus):
    """Fit the shared tokenizer on *corpus* and build n-gram prefix sequences.

    Every line is converted to its list of token ids, and each prefix of
    that list containing at least two tokens becomes one output sequence.

    Returns:
        A tuple ``(input_seq, all_words)`` where ``input_seq`` is the list
        of prefix sequences and ``all_words`` is the size of the tokenizer's
        word index plus one.
    """
    # Fitting mutates the module-level tokenizer.
    tokenizer.fit_on_texts(corpus)
    vocab_size = 1 + len(tokenizer.word_index)

    # Encode lazily, one line at a time, then expand each line into prefixes.
    encoded_lines = (
        tokenizer.texts_to_sequences([line])[0] for line in corpus
    )
    prefixes = [
        tokens[:end]
        for tokens in encoded_lines
        for end in range(2, len(tokens) + 1)
    ]
    return prefixes, vocab_size

# Build the n-gram training sequences and record the vocabulary size
# (all_words is read later by the padding and model-building cells).
our_sequences, all_words = get_seq_of_tokens(corpus)
# Preview the first 20 sequences.
our_sequences[:20]

In [None]:
def generate_padded_sequences(input_seq, num_classes=None):
    """Pad the token sequences and split them into predictors and labels.

    All sequences are pre-padded (``padding='pre'``) to the length of the
    longest one. The last column of the padded matrix becomes the label and
    is one-hot encoded; the remaining columns are the predictors.

    Args:
        input_seq: Non-empty collection of token-id sequences.
        num_classes: Width of the one-hot label encoding. When omitted, the
            module-level ``all_words`` is used, which preserves the original
            single-argument behaviour.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``.

    Raises:
        ValueError: If ``input_seq`` is empty.
    """
    if len(input_seq) == 0:
        raise ValueError("input_seq must contain at least one sequence")
    if num_classes is None:
        # Backward-compatible fallback to the vocabulary size computed by
        # get_seq_of_tokens at module level.
        num_classes = all_words

    max_sequence_len = max(len(x) for x in input_seq)
    padded = np.array(
        pad_sequences(input_seq, maxlen=max_sequence_len, padding='pre')
    )

    # Last token of every row is the target; everything before it is input.
    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Pad the sequences and split them into model inputs and one-hot labels;
# max_sequence_len is reused when building the model and generating text.
predictors, label, max_sequence_len = generate_padded_sequences(our_sequences)

In [None]:
def create_model(max_sequence_len, all_words):
    """Build and compile the Embedding -> LSTM -> Dropout -> Dense model.

    Args:
        max_sequence_len: Length of the padded sequences; the model input
            length is one less than this value.
        all_words: Vocabulary size, used as the embedding input dimension
            and as the number of softmax output units.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer).
    """
    # One position of each padded sequence is not fed to the model.
    input_len = max_sequence_len - 1

    layers = [
        # Input embedding: token ids -> 10-dimensional vectors.
        Embedding(all_words, 10, input_length=input_len),
        # Hidden recurrent layer followed by light dropout.
        LSTM(100),
        Dropout(0.1),
        # Output: one softmax unit per vocabulary entry.
        Dense(all_words, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Instantiate the network for the current vocabulary and sequence length,
# then print its layer-by-layer summary.
model = create_model(max_sequence_len, all_words)
model.summary()

In [None]:
# Train on the full set of padded sequences for 200 epochs.
# NOTE(review): verbose=5 is outside the values Keras documents for `verbose`
# (0, 1, 2 or "auto") — confirm which logging level was intended.
model.fit(predictors, label, epochs=200, verbose=5)

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend *seed_text* by greedily predicting ``next_words`` more words.

    On each step the current text is encoded with the module-level
    tokenizer, pre-padded to ``max_sequence_len - 1`` tokens, and passed to
    ``model.predict``. The word whose index has the highest predicted score
    is appended to the text.

    Args:
        seed_text: Starting text.
        next_words: Number of prediction steps to run.
        model: Trained model exposing ``predict``.
        max_sequence_len: Padded sequence length used during training.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    # Reverse lookup (token id -> word), built once instead of scanning the
    # word index on every step.
    index_to_word = {
        index: word for word, index in tokenizer.word_index.items()
    }

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences(
            [token_list], maxlen=max_sequence_len - 1, padding='pre'
        )
        scores = model.predict(token_list, verbose=0)

        # Pick the highest-scoring class for the single input row. The
        # previous `index == predicted.any()` compared indices to a boolean,
        # so it matched word index 1 on every step.
        # Assumes `scores` has shape (1, vocabulary size), as produced by the
        # softmax output layer built in create_model — TODO confirm for
        # other models.
        predicted_index = int(np.argmax(scores, axis=-1)[0])

        # An index with no word (e.g. 0) yields an empty string, as before.
        output_word = index_to_word.get(predicted_index, "")
        seed_text += " " + output_word
    return seed_text.title()

In [None]:
# (seed text, number of words to generate) pairs, printed in this order.
prompts = [
    ("10 Ways", 11),
    ("europe looks to", 8),
    ("best way", 10),
    ("homeless in", 10),
    ("Unexpected results", 10),
    ("critics warn", 10),
]
for seed, word_count in prompts:
    print(generate_text(seed, word_count, model, max_sequence_len))