# Imports

In [29]:
#from __future__ import print_function

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.utils.data_utils import get_file


import numpy as np
import random
import sys
import os    
os.environ['THEANO_FLAGS'] = "device=gpu, floatX=float32"
import gensim
from functions.words_chars import vocabulary_from_json_corpus

json_corpus_path = "/home/ubuntu/summarization_query_oriented/data/wikipedia/json/td_qfs_rank_1/"

import time

# Loading LSTM

In [2]:
# building vocabulary of the corpus
words = vocabulary_from_json_corpus(json_corpus_path)
word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))
print("word_indices", type(word_indices), "length:",len(word_indices))
print("indices_words", type(indices_word), "length", len(indices_word))

maxlen = 10

#defining the lstm model
print('Build model...')
model = Sequential()
model.add(LSTM(400, return_sequences=True, input_shape=(maxlen, len(word_indices))))
model.add(Dropout(0.6))
model.add(LSTM(400, return_sequences=False))
model.add(Dropout(0.6))
model.add(Dense(len(words)))
#model.add(Dense(1000))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
print('Model built...')

# naming 
model_folder = "/home/ubuntu/summarization_query_oriented/nn_models/language_models/RNN/wider_30102016/"
model_name = "tdqfs_lstm_wider_corpus_last.hdf5"

if os.path.isfile('/home/ubuntu/summarization_query_oriented/nn_models/language_models/RNN/wider_30102016/tdqfs_lstm_wider_corpus_last.hdf5'):
    model.load_weights('/home/ubuntu/summarization_query_oriented/nn_models/language_models/RNN/wider_30102016/tdqfs_lstm_wider_corpus_last.hdf5')

('word_indices', <type 'dict'>, 'length:', 83161)
('indices_words', <type 'dict'>, 'length', 83161)
Build model...
Model built...


In [3]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lstm_1 (LSTM)                    (None, 10, 400)       133699200   lstm_input_1[0][0]               
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 10, 400)       0           lstm_1[0][0]                     
____________________________________________________________________________________________________
lstm_2 (LSTM)                    (None, 400)           1281600     dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 400)           0           lstm_2[0][0]                     
___________________________________________________________________________________________

In [6]:
print(model.layers[0].name)
print(model.layers[2].name)

lstm_1
lstm_2


# Build model to get the hidden layer

In [27]:
print('Build model...')
model2 = Sequential()
model2.add(LSTM(400, return_sequences=True, batch_input_shape=(1,maxlen, len(word_indices)),weights=model.layers[0].get_weights(),stateful=True))
model2.add(LSTM(400, return_sequences=False,weights=model.layers[2].get_weights(),stateful=True))
print('Model built...')

Build model...
Model built...


# predict vector from sentence

In [10]:
import gensim
import numpy as np

sentence_str = "Diabetes mellitus (DM), commonly referred to as diabetes, is a group of metabolic diseases in which there are high blood sugar levels over a prolonged period."
sentence = gensim.utils.simple_preprocess(sentence_str)
starting_idx = 0
#sentence = sentence[starting_idx :maxlen + starting_idx]

#start_index = random.randint(0, len(list_words) - maxlen - 1)

print()
print('----- Example:')
generated = ''
generated += ' '.join(sentence)
print('----- Generating with seed: "' , sentence , '"')
print()

for i in range(3):
    x = np.zeros((1, maxlen, len(words)))
    for t, word in enumerate(sentence):
        x[0, t, word_indices[word]] = 1.

    preds = model.predict(x, verbose=0)[0]

    next_index = np.argmax(preds)
    next_word = indices_word[next_index]
    generated += " " + next_word
    print(generated)
    del sentence[0]
    sentence.append(next_word)
    
    tic = time.time()
    preds = model2.predict(x, verbose=0)[0]
    toc = time.time()
    print("Time : ", toc-tic)
    print(preds.shape)
    print(preds[:5])
    

()
----- Example:
('----- Generating with seed: "', [u'diabetes', u'mellitus', u'dm', u'commonly', u'referred', u'to', u'as', u'diabetes', u'is', u'group', u'of', u'metabolic', u'diseases', u'in', u'which', u'there', u'are', u'high', u'blood', u'sugar', u'levels', u'over', u'prolonged', u'period'], '"')
()


Exception: Error when checking : expected lstm_input_1 to have shape (None, 10, 83161) but got array with shape (1, 24, 83161)

# Comparison

In [None]:
# Quadruplets 

# training file , I remove the concept of validation loss

# In this script we perform the training of the fully connected model

# Import 
import gensim

import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
import pyrouge
from pyrouge import Rouge155

from hard_coded import data_json_dir, data_txt_dir, lang_model_dir, model_dir, nn_summarizers_dir, summary_system_super_dir, tdqfs_folder
from hard_coded import non_selected_keys,tdqfs_themes
from functions.training_functions import *

# paths to folder 
data_json = data_json_dir
data_txt = data_txt_dir
lang_model_folder = lang_model_dir
nn_summarizers_folder = nn_summarizers_dir
summary_system_super_folder = summary_system_super_dir
themes = tdqfs_themes

#title_file = "/home/ubuntu/summarization_query_oriented/data/DUC/duc2005_topics.sgml"
#titles_folder = "/home/ubuntu/summarization_query_oriented/data/DUC/duc2005_docs/"


# training parameters

patience_limit = 25

## loading a d2vmodel (to be a shifted LSTM next ...)

# parameters of doc2vec
dm = 0
min_count = 5
window = 10
size = 400
sample = 1e-4
negative = 5
workers = 4
epoch = 100

# Initialize the model ( IMPORTANT )
d2v_model = gensim.models.doc2vec.Doc2Vec(dm=dm,min_count=min_count, window=window, size=size, sample=sample, negative=negative, workers=workers,iter = epoch)

# load model
model_name ="dm_"+str(dm)+"_mc_"+str(min_count)+"_w_"+str(window)+"_size_"+str(size)+"_neg_"+str(negative)+"_ep_"+str(epoch)+"_wosw"
try :
    d2v_model = d2v_model.load(lang_model_folder+model_name+".d2v")
except :
    print "try a model in : ", os.listdir(lang_model_folder)
print("model loaded")


In [None]:
import time
txt = "they have a common name diabetes mellitus and diabetes insipidus"
prep = gensim.utils.simple_preprocess(txt, deacc=True)
tic = time.time()
vector = d2v_model.infer_vector(txt)
toc = time.time()
print("time : ",toc-tic)
print(vector.shape)

# Batch

In [37]:
import gensim
import numpy as np

sentence_str = "Diabetes mellitus (DM), commonly referred to as diabetes, is a group of metabolic diseases in which there are high blood sugar levels over a prolonged period."
sentence_X = gensim.utils.simple_preprocess(sentence_str)
starting_idx = 0
sentence = sentence_X[starting_idx :maxlen + starting_idx]

#start_index = random.randint(0, len(list_words) - maxlen - 1)

print()
print('----- Example:')
generated = ''
generated += ' '.join(sentence)
print('----- Generating with seed: "' , sentence , '"')
print()

for i in range(3):
    x = np.zeros((1, maxlen, len(words)))
    for t, word in enumerate(sentence):
        x[0, t, word_indices[word]] = 1.

    preds = model.predict(x, verbose=0)[0]

    del sentence[0]
    sentence.append(sentence_X[maxlen+starting_idx+i])
    
    tic = time.time()
    preds = model2.predict(x, verbose=0)[0]
    toc = time.time()
    print("Time : ", toc-tic)
    print(preds.shape)
    print(preds[:5])
    

()
----- Example:
('----- Generating with seed: "', [u'diabetes', u'mellitus', u'dm', u'commonly', u'referred', u'to', u'as', u'diabetes', u'is', u'group'], '"')
()
('Time : ', 0.2493610382080078)
(1, 400)
[[  0.00000000e+00  -1.60113745e-03   0.00000000e+00   5.92325255e-02
   -2.90173152e-03  -1.00483885e-02   0.00000000e+00  -1.47134922e-02
   -0.00000000e+00  -1.10678934e-02  -3.17822257e-03  -0.00000000e+00
    0.00000000e+00   0.00000000e+00   1.06023783e-02  -0.00000000e+00
    0.00000000e+00  -0.00000000e+00   5.68611221e-03  -0.00000000e+00
   -2.05752694e-05   0.00000000e+00  -0.00000000e+00  -0.00000000e+00
    0.00000000e+00   0.00000000e+00   1.21447882e-02   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   1.05356807e-02   4.54170287e-01  -0.00000000e+00
    0.00000000e+00  -0.00000000e+00   0.00000000e+00   0.00000000e+00
    4.03374881e-02  -0.00000000e+00   4.50232401e-02   0.00000000e+00
    0.00000000e+00  -2.1

In [35]:
#[ 0.00528679  0.00453413  0.00248272  0.01435476 -0.01339659]
#[ 0.02352135  0.03000085  0.01512095 -0.02398492 -0.01188593]
model2.reset_states() 

In [45]:
try : 
    word_indices["pépé"]
except KeyError:
    pass
print("hello")

KeyError: 'p\xc3\xa9p\xc3\xa9'

In [51]:
def remove_unknown_words(txt_prep, word_indices):
    txt_wo_uw = []
    for word in txt_prep:
        if word in word_indices.keys():
            txt_wo_uw.append(word)
    return txt_wo_uw

def lstm_infer_vector(lstm_model, txt, stopwords,word_indices, maxlen=10) :
    """
    d2v.infer_vector equivalent for a lstm language model
    """
    
    txt_prep = gensim.utils.simple_preprocess(txt, deacc=True)
    txt_wo_uw = remove_unknown_words(txt_prep, word_indices)
    txt_wo_ws = remove_stopwords(txt_wo_uw, stopwords)
    
    if len(txt_wo_ws)<maxlen :
        #cas du texte trop court
        sentence = txt_wo_ws
        X = np.zeros((1, maxlen, len(word_indices)), dtype=np.bool)
        y = np.zeros((1, len(word_indices)), dtype=np.bool)
        for t, word in enumerate(sentence):
            X[1, t, word_indices[word]] = 1
        preds = model2.predict(X, verbose=0)[0]
    else :
        
        for current_part in range(len(txt_wo_ws)/maxlen):
            sentence = txt_wo_ws[current_part*maxlen:(current_part+1)*maxlen]
            X = np.zeros((1, maxlen, len(word_indices)), dtype=np.bool)
            y = np.zeros((1, len(word_indices)), dtype=np.bool)
            for t, word in enumerate(sentence):
                X[0, t, word_indices[word]] = 1
            preds = model2.predict(X, verbose=0)[0]
            

    return preds

def create_triplets_lstm(lstm_model, article_names, article_weights,stopwords,word_indices, nb_triplets=20, triplets_per_file=5, neg_ratio=0.5, str_mode = False, with_txt_vect = False) :
    """
    inputs :    
        - lstm_model : lstm language model 
        - article_names : ndarray containing the names of the json files (absolute path !)
        - article_weights: ndarray normalized of the weight of each files 
        - nb_triplets : nb of triplets to generate
        - triplets_per_file : number of triplet built for each selected file
        - neg_ratio : ratio of positives / negative examples. Negative examples are taken inside the article !
        
    output : 
        - triplets : nd_array of triplets of shape (nb_triplets+ , embed_dim)
        - labels : nd_array of labels of shape (nb_triplets+ ,)

    """
    triplets = []
    labels = []
    
    assert nb_triplets>=triplets_per_file, "you should have nb_triplets > triplets_per_file"
    
    # nb of pos / neg triplets per file
    neg_per_file = np.floor(triplets_per_file*neg_ratio) #number of negative triplets to generate given(query + partial summary)
    assert neg_per_file >= 1, "you have to increase your neg_ratio"
    
    nb_files = nb_triplets / triplets_per_file
    selected_files_array = np.random.choice(article_names, size=nb_files, p=article_weights, replace = False)
    
    for full_name in selected_files_array :
        with open(full_name) as f :
            file_as_dict = json.load(f)
        
        counter = 0
        while counter < triplets_per_file :
            
            # select a key for positive examples
            key_pos = select_key(file_as_dict)
            
            triplet = build_triplet_lstm(lstm_model, file_as_dict, key_pos, positive = True, str_mode = str_mode, with_txt_vect=with_txt_vect)
            label = 1
            
            triplets.append(triplet)
            labels.append(label)
            counter += 1 
            
            if neg_ratio < 1 : 
                
                if np.random.rand() < neg_ratio :
                    
                    triplet = build_triplet_lstm(lstm_model, file_as_dict, key_pos, positive = False, str_mode = str_mode, with_txt_vect=with_txt_vect)
                    label = 0
                    
                    triplets.append(triplet)
                    labels.append(label)
                    counter += 1 

            else :
                
                for n in range(int(np.floor(neg_ratio))):
                    
                    triplet = build_triplet_lstm(lstm_model, file_as_dict, key_pos, positive = False, str_mode = str_mode,with_txt_vect=with_txt_vect)
                    label = 0
                    
                    triplets.append(triplet)
                    labels.append(label)
                    counter += 1 

            
    triplets = np.asarray(triplets)[:nb_triplets]
    labels = np.asarray(labels)[:nb_triplets]
    
    return triplets, labels

def build_triplet_lstm(lstm_model, file_as_dict, key_pos,stopwords,word_indices, positive = True, str_mode = False, remove_stop_words=True, with_txt_vect = False):
    if remove_stop_words : 
        stopwords = stop_words()
    else :
        stopwords = []
        
    if with_txt_vect :
        text_str = ""
        for key in file_as_dict.keys():
            if key not in non_selected_keys :
                text_str += file_as_dict[key]

        text_vector = lstm_infer_vector(lstm_model,text_str,stopwords,word_indices)
            
    query_str = key_pos
    query_vector = lstm_infer_vector(lstm_model,query_str,stopwords,word_indices)
    
    summary_str = file_as_dict[key_pos]
    sentences = summary_str.split(".")
    
    partial_summary = []
    candidates = []
    
    size_partial_summary = np.random.rand()
    
    for sentence in sentences: 
        if np.random.rand() < size_partial_summary :
            partial_summary.append(sentence)
        else :
            candidates.append(sentence)
    
    candidate = ""
    counter_candidate = 0
    while (candidate == "" or partial_summary == "") and counter_candidate < 10:
        counter_candidate += 1
        
        if positive : 
            if len(candidates) > 0:
                random_candidate_index = np.random.randint(0,len(candidates))
                candidate = candidates[random_candidate_index]
            else :
                random_candidate_index = np.random.randint(0,len(partial_summary))
                candidate = partial_summary[random_candidate_index]
                partial_summary[random_candidate_index] = ""


            candidate_prep = gensim.utils.simple_preprocess(candidate, deacc=True)
            candidate_vector = lstm_infer_vector(lstm_model,candidate,stopwords,word_indices)

        else :

            key_neg = select_key(file_as_dict)
            counter = 0

            while key_neg == key_pos and counter<10 : # the counter is for the preproduction code 
                counter += 1
                key_neg = select_key(file_as_dict)

            summary_str = file_as_dict[key_neg]

            sentences = summary_str.split('.')
            random_candidate_index = np.random.randint(0,len(sentences))
            candidate = sentences[random_candidate_index]
            candidate_vector = lstm_infer_vector(lstm_model,candidate,stopwords,word_indices)
        
        partial_summary_str = "".join(partial_summary)
        partial_summary_vector = lstm_infer_vector(lstm_model, partial_summary_str,stopwords,word_indices)
    
    if str_mode :
        return query_str, partial_summary_str, candidate
    elif with_txt_vect:
        return np.hstack( [query_vector, partial_summary_vector, candidate_vector, text_vector])
    else :
        return np.hstack( [query_vector, partial_summary_vector, candidate_vector] )


In [53]:
from functions.training_functions import remove_stopwords, stop_words
stopwords= stop_words()
tic = time.time()
txt = "Diabetes mellitus (DM), commonly referred to as diabetes, is a group of metabolic diseases in which there are high blood sugar levels over a prolonged period.[2] Symptoms of high blood sugar include frequent urination, increased thirst, and increased hunger. If left untreated, diabetes can cause many complications.[3] Acute complications can include diabetic ketoacidosis, nonketotic hyperosmolar coma, or death.[4] Serious long-term complications include heart disease, stroke, chronic kidney failure, foot ulcers, and damage to the eyes."
lstm_infer_vector(model2, txt, stopwords,word_indices=word_indices)
toc = time.time()
print("TIme",toc-tic)

('TIme', 1.5339951515197754)


In [None]:

def summarize_lstm(text, query, lstm_model,  nn_model, stopwords, word_indices, limit = 250, remove_stop_words = True,with_txt_vect=False):
    """
    Perform summarization on text given query,
    """
    if remove_stop_words : 
        stopwords = stop_words()
    else :
        stopwords = []
    
    if with_txt_vect :
        text_vector = lstm_infer_vector(lstm_model, text, stopwords,word_indices)
        
    query_vector = lstm_infer_vector(lstm_model, query, stopwords,word_indices)
    
    summary  = ""
    summary_vector = lstm_infer_vector(lstm_model, [""], stopwords,word_indices)
    summary_idx = []
    
    sentences = text.split('.')
    sentences = np.asarray(sentences)
    
    remaining_sentences = copy.copy(sentences)
    
    size = 0
    counter = 0
    while size < limit and len(remaining_sentences)>0 :
        counter = counter+1
        scores = []
        for sentence in remaining_sentences :
            sentence_vector = lstm_infer_vector(lstm_model, sentence, stopwords,word_indices)
            if with_txt_vect :
                nn_input = np.hstack([query_vector, summary_vector, sentence_vector, text_vector])
            else:
                nn_input = np.hstack([query_vector, summary_vector, sentence_vector])
            nn_input = np.asarray([nn_input]) # weird but it is important to do it
            score = nn_model.predict(nn_input) 
            scores.append(score)
        #print(scores)
        max_idx_rem = int(np.argmax(scores))
        idx_selected_sentence = np.arange(len(sentences))[sentences == remaining_sentences[max_idx_rem]]
        idx_selected_sentence = int(idx_selected_sentence[0])
        size += len(remaining_sentences[max_idx_rem].split())
        
        remaining_sentences = list(remaining_sentences)
        del remaining_sentences[max_idx_rem]
        bisect.insort_left(summary_idx,idx_selected_sentence)

        summary  = ""

        for idx in summary_idx:
            summary = summary + " " + sentences[idx]

        summary_vector = lstm_infer_vector(lstm_model, summary, stopwords,word_indices)

    return summary

In [55]:
range(50)[-10:]

[40, 41, 42, 43, 44, 45, 46, 47, 48, 49]