In [9]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import models 
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input,LSTM,Embedding,Dense,Concatenate,Attention
import numpy as np
import pickle
import nltk
from nltk import word_tokenize
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [10]:
# ---- Inference setup ----
# Loads the trained model and its preprocessing artefacts, then rebuilds
# stand-alone encoder and decoder models from the trained model's layers so a
# summary can be generated one token at a time by decode_sequence.

# Width of the decoder state inputs declared below.
latent_dim=500
# Length that input sequences are padded to; also the time dimension of the
# encoder-output input of the decoder model.
max_in_len = 74
# Length limit checked by decode_sequence while generating a summary.
max_tr_len = 17


def _load_pickle(path):
    """Return the object unpickled from ``path``; the file is closed on exit."""
    # NOTE(review): unpickling can execute arbitrary code - only load files
    # that come from a trusted source.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


#load the model
model = models.load_model("summarizer_model.h5")
#load the tokenizers and the contraction mapping saved alongside the model
in_tokenizer = _load_pickle('in_tokenizer.pickle')
tr_tokenizer = _load_pickle('tr_tokenizer.pickle')
contractions = _load_pickle("contractions.pkl")['contractions']
#initialize stop words and LancasterStemmer
stop_words=set(stopwords.words('english'))
stemm=LancasterStemmer()

# NOTE(review): the layer indices used below (5, 6, 7, 8, 10) are hard-coded
# and depend on the layer order of the saved model - confirm with
# model.summary() whenever the model is retrained or restructured.

# encoder inference
#layer 6 is expected to be the last encoder LSTM; its output unpacks into the
#output sequence plus the hidden and cell states
en_outputs,state_h_enc,state_c_enc = model.layers[6].output
en_states=[state_h_enc,state_c_enc]
#encoder model: first model input -> [output sequence, hidden state, cell state]
en_model = Model(model.input[0],[en_outputs]+en_states)

# decoder inference
#inputs that carry the previous hidden and cell state into the decoder LSTM
dec_state_input_h = Input(shape=(latent_dim,))
dec_state_input_c = Input(shape=(latent_dim,))
#input that carries the encoder output sequence to the attention layer
dec_hidden_state_input = Input(shape=(max_in_len,latent_dim))

# Get the embeddings and input layer from the model
dec_inputs = model.input[1]
dec_emb_layer = model.layers[5]
dec_lstm = model.layers[7]
dec_embedding= dec_emb_layer(dec_inputs)

#run the decoder LSTM, initialised with the states supplied through the inputs
dec_outputs2, state_h2, state_c2 = dec_lstm(dec_embedding, initial_state=[dec_state_input_h,dec_state_input_c])

#Attention layer: decoder outputs are the query, encoder outputs the value
attention = model.layers[8]
attn_out2 = attention([dec_outputs2,dec_hidden_state_input])

#join decoder outputs and attention output along the last axis
merge2 = Concatenate(axis=-1)([dec_outputs2, attn_out2])

#Dense layer
dec_dense = model.layers[10]
dec_outputs2 = dec_dense(merge2)

# Finally define the Model Class
#inputs : [decoder tokens, encoder outputs, hidden state, cell state]
#outputs: [dense layer output, new hidden state, new cell state]
dec_model = Model(
    [dec_inputs] + [dec_hidden_state_input,dec_state_input_h,dec_state_input_c],
    [dec_outputs2] + [state_h2, state_c2])

#lookup tables between words and their integer indices
reverse_target_word_index = tr_tokenizer.index_word
reverse_source_word_index = in_tokenizer.index_word
target_word_index = tr_tokenizer.word_index
#map index 0 to a blank; this writes into tr_tokenizer.index_word itself
#because reverse_target_word_index is the same dict object
reverse_target_word_index[0]=' '

In [14]:
def decode_sequence(input_seq):
    """Greedily decode a summary for one encoded input sequence.

    The encoder model is run once on ``input_seq``. The decoder model is then
    run one token at a time, starting from the 'sos' token and feeding back
    the highest-scoring word index together with the updated states.

    Parameters
    ----------
    input_seq
        Passed unchanged to ``en_model.predict``; summary_text supplies an
        array reshaped to ``(1, max_in_len)``.

    Returns
    -------
    str
        The predicted words in order, each followed by a single space. The
        'eos' word is part of the string when the decoder produced it.
    """
    #get the encoder output and states by passing the input sequence
    en_out, en_h, en_c = en_model.predict(input_seq)

    #target sequence with initial word as 'sos'
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_word_index['sos']

    #every predicted word, in the order it was generated
    decoded_words = []
    while True:
        #get predicted output, hidden and cell state.
        output_words, dec_h, dec_c = dec_model.predict([target_seq] + [en_out, en_h, en_c])

        #pick the highest-scoring index and look up its word
        word_index = np.argmax(output_words[0, -1, :])
        text_word = reverse_target_word_index[word_index]
        decoded_words.append(text_word)

        # Exit condition: the 'eos' word was produced or max_tr_len words were
        # generated. The limit is counted in words, not in characters of the
        # joined string.
        if text_word == "eos" or len(decoded_words) >= max_tr_len:
            break

        #feed the predicted word and the new states into the next step
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = word_index
        en_h, en_c = dec_h, dec_c

    #return the decoded sentence
    return "".join(word + " " for word in decoded_words)

In [15]:
def clean(texts,src):
    """Turn raw review text into a list of normalised word tokens.

    Steps, in order:
      1. strip markup with BeautifulSoup and lower-case the remaining text;
      2. tokenize and keep only alphabetic tokens of at least three characters;
      3. replace tokens that are keys of ``contractions`` with their mapped
         value (the lookup runs after the alphabetic filter);
      4. drop English stop words;
      5. stem the remaining tokens, but only when ``src == "inputs"``.

    Parameters
    ----------
    texts : str
        Raw text, possibly containing HTML tags.
    src : str
        ``"inputs"`` enables stemming; any other value skips it.

    Returns
    -------
    list
        The cleaned tokens.
    """
    #remove the html tags
    plain_text = BeautifulSoup(texts, "html").text

    #tokenize, then discard tokens that are not purely alphabetic
    #or that are shorter than 3 characters
    tokens = [
        tok for tok in word_tokenize(plain_text.lower())
        if tok.isalpha() and len(tok) >= 3
    ]

    #contraction file to expand shortened words
    expanded = []
    for tok in tokens:
        expanded.append(contractions[tok] if tok in contractions else tok)

    #filter stop words
    kept = [tok for tok in expanded if tok not in stop_words]

    #stem the words to their root word for model inputs only
    if src == "inputs":
        return [stemm.stem(tok) for tok in kept]
    return kept

In [18]:
def summary_text(inp_review):
    """Print and return the predicted summary for one raw review.

    The review is cleaned with ``clean(..., "inputs")``, converted to a
    sequence with ``in_tokenizer``, padded at the end to ``max_in_len`` and
    decoded with ``decode_sequence``.

    Parameters
    ----------
    inp_review : str
        Raw review text.

    Returns
    -------
    str
        The decoded summary with every stand-alone 'eos' word blanked out.
        Spacing is left exactly as decode_sequence produced it.
    """
    print("Review :",inp_review)
    cleaned_review = ' '.join(clean(inp_review,"inputs"))
    inp_x = in_tokenizer.texts_to_sequences([cleaned_review])
    inp_x = pad_sequences(inp_x, maxlen=max_in_len, padding='post')

    summary = decode_sequence(inp_x.reshape(1,max_in_len))
    # Blank out 'eos' only where it is a whole word. A plain substring
    # replace would also cut it out of real words such as "videos".
    summary = ' '.join('' if word == 'eos' else word for word in summary.split(' '))
    print("\nPredicted summary:",summary);print("\n")
    return summary

    

In [20]:
# Read one review from the user.
data = input("Enter : ")

# summary_text prints the review and the predicted summary itself; printing
# its return value shows the summary a second time.
print(summary_text(inp_review=data))

Enter : i love the taste of tea
Review : i love the taste of tea

Predicted summary: great  


great  
