# Exercise 9.03: Building an RNN with LSTM Layer for Natural Language Processing

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.utils import np_utils as ku
import pandas as pd
import numpy as np
import string, os 
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Collect headlines from the first file in the datasets directory whose
# name contains 'Articles'.
curr_dir = '../Datasets'
all_headlines = []
for filename in os.listdir(curr_dir):
    if 'Articles' in filename:
        # Read the file that was actually matched, from the directory that
        # was listed, instead of a hard-coded name in the working directory.
        article_df = pd.read_csv(os.path.join(curr_dir, filename))
        all_headlines.extend(list(article_df.headline.values))
        # Only the first matching file is used.
        break

# Drop entries whose headline is the literal placeholder "Unknown".
all_headlines = [h for h in all_headlines if h != "Unknown"]
len(all_headlines)

In [None]:
def clean_text(txt):
    """Return *txt* lower-cased, without punctuation or non-ASCII characters.

    Characters listed in ``string.punctuation`` are removed first, the
    remainder is lower-cased, and finally anything that cannot be
    represented in ASCII is silently dropped.
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    # Round-trip through bytes so that non-ASCII characters are discarded.
    utf8_bytes = lowered.encode("utf8")
    return utf8_bytes.decode("ascii", 'ignore')

# Normalise every headline with clean_text, then preview the first ten.
corpus = list(map(clean_text, all_headlines))
corpus[:10]

In [None]:
# Module-level tokenizer: fitted inside get_seq_of_tokens and reused by
# other code in this file.
tokenizer = Tokenizer()

def get_seq_of_tokens(corpus):
    """Fit the shared tokenizer on *corpus* and build n-gram prefix sequences.

    Every line is converted to its token ids, and each prefix of length two
    or more of that id list becomes one training sequence.

    Returns a tuple ``(input_sequences, all_words)`` where ``all_words`` is
    the size of the tokenizer's ``word_index`` plus one.
    """
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for text in corpus:
        encoded = tokenizer.texts_to_sequences([text])[0]
        # Prefixes encoded[:2], encoded[:3], ..., encoded[:len(encoded)].
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes, vocab_size

# Fit the tokenizer on the cleaned corpus and build the n-gram prefix
# sequences together with the vocabulary size.
inp_sequences, all_words = get_seq_of_tokens(corpus)
# Preview the first ten sequences.
inp_sequences[:10]

In [None]:
def generate_padded_sequences(input_sequences):
    """Pad the sequences to a common length and split inputs from labels.

    All sequences are pre-padded to the length of the longest one. The last
    column of the padded array becomes the label, which is one-hot encoded
    using the module-level ``all_words`` as the number of classes; the
    remaining columns are the predictors.

    Returns ``(predictors, label, max_sequence_len)``.
    """
    longest = max(len(seq) for seq in input_sequences)
    padded = pad_sequences(input_sequences, maxlen=longest, padding='pre')
    matrix = np.array(padded)

    # Everything but the last token is the input; the last token is the target.
    features = matrix[:, :-1]
    targets = ku.to_categorical(matrix[:, -1], num_classes=all_words)
    return features, targets, longest

# Pad the n-gram sequences and split them into model inputs (predictors),
# one-hot labels, and the padded sequence length.
predictors, label, max_sequence_len = generate_padded_sequences\
                                      (inp_sequences)

In [None]:
def create_model(max_sequence_len, all_words):
    """Build and compile the embedding -> LSTM -> softmax model.

    The embedding layer takes inputs of length ``max_sequence_len - 1``
    and a vocabulary of ``all_words`` entries; the final dense layer emits
    one softmax score per vocabulary entry.

    Returns the compiled ``Sequential`` model.
    """
    model = Sequential()
    stack = (
        Embedding(all_words, 10, input_length=max_sequence_len - 1),
        LSTM(100),
        Dropout(0.1),
        Dense(all_words, activation='softmax'),
    )
    for layer in stack:
        model.add(layer)

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Build the model for the padded sequence length and vocabulary size,
# then print its layer summary.
model = create_model(max_sequence_len, all_words)
model.summary()

In [None]:
# Train on the padded predictors and one-hot labels for 100 epochs.
# NOTE(review): Keras documents verbose values 0, 1 and 2 (and 'auto');
# 5 is outside that set -- confirm the intended logging level.
model.fit(predictors, label, epochs=100, verbose=5)

In [None]:
def generate_text(seed_text, next_words, \
                  model, max_sequence_len):
    """Extend *seed_text* by *next_words* words predicted by *model*.

    On each step the current text is tokenized with the module-level
    ``tokenizer``, pre-padded to ``max_sequence_len - 1`` tokens and fed to
    ``model.predict``. The word whose index has the highest predicted score
    is appended to the text.

    Args:
        seed_text: Starting phrase.
        next_words: Number of words to append.
        model: Object exposing ``predict``; its output is assumed to hold
            one score per vocabulary index for the single input row.
        max_sequence_len: Padded sequence length used during training.

    Returns:
        The extended text, title-cased.
    """
    # Invert word -> index once so each step is a dictionary lookup rather
    # than a scan over the whole vocabulary.
    index_to_word = {index: word
                     for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len-1,
                                   padding='pre')
        predicted = model.predict(token_list, verbose=0)

        # Pick the highest-scoring vocabulary index. The previous
        # comparison against predicted.any() tested every index against a
        # boolean, so it always selected the word with index 1.
        predicted_index = int(np.argmax(predicted[0]))

        # An index with no word in the vocabulary contributes an empty
        # string, as in the original loop.
        output_word = index_to_word.get(predicted_index, "")
        seed_text += " "+output_word
    return seed_text.title()

In [None]:
# Sample generations: each entry is (seed phrase, number of words to append).
prompts = [
    ("the hottest new", 5),
    ("the stock market", 4),
    ("russia wants to", 3),
    ("french citizen", 4),
    ("the one thing", 15),
    ("the coronavirus", 5),
]
for seed, word_count in prompts:
    print(generate_text(seed, word_count, model, max_sequence_len))