# Summarization query oriented

 <hr style="border-color:#1d539d"> 

## Initialization

In [69]:
import collections
import copy
import gensim
import json
import keras
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import random
import time


In [53]:
# paths to folders

data_json = "/home/ubuntu/summarization_query_oriented/data/json/patch_0/"
data_txt = "/home/ubuntu/summarization_query_oriented/data/txt/"
model_folder = "/home/ubuntu/summarization_query_oriented/models/"
nn_models_folder = "/home/ubuntu/summarization_query_oriented/nn_models/"

# file names + extensions


## Doc2vec

In [4]:
# parameters of doc2vec
dm = 0
min_count = 5
window = 10
size = 400
sample = 1e-4
negative = 5
workers = 4
epoch = 20

# Initialize the model ( IMPORTANT )
d2v_model = gensim.models.doc2vec.Doc2Vec(dm=dm,min_count=min_count, window=window, size=size, sample=sample, negative=negative, workers=workers,iter = epoch)

# load model
model_name ="dm_"+str(dm)+"_mc_"+str(min_count)+"_w_"+str(window)+"_size_"+str(size)+"_neg_"+str(negative)+"_ep_"+str(epoch)
d2v_model = d2v_model.load(model_folder+model_name+".d2v")
print("model loaded")

model loaded


In [6]:
# Verifiing the model can infer

v1 = d2v_model.infer_vector(['my', 'name', 'is', 'charles', 'sutton'])
v2 = d2v_model.infer_vector(['my', 'name', 'is', 'charles', 'bitton'])
v3 = d2v_model.infer_vector(['my', 'name', 'is', 'charles', 'martin'])

np.hstack([v1,v2,v3]).shape

(1200,)

## Models 

### Model 1 : fully connected model


* __Architecture__ : Fully connected model


* __Input__ : a vector that is the concatenation of [ query , partial summary, candidate ]
    * *query* : here the subtitle of a wikipedia page
    * *partial summary* : here a part (eventually void) of the summary attached to this subtitle
    * *candidate* : a random sentence


* __Output__ : a score describing how much the candidate sentence is completing the partial summary w.r.t the query 


* __Training mode__ : we sample triplet from wikipedia data to build the training set, we label 1 if the candidate sentence is a sentence of the correct subsection that is not in the partial summary (by building), we label 0 otherwise


* __Testing mode__ : Given a document and a query. The partial summary is initialized as the query, then we choose the sentence of the document that is not in the partial summary with the highest score and delete it from the document. We repeat it until we reach the length limit

#### Data Processing 

Here we build functions to perform end—to-end data preparation

In [7]:
# functions

non_selected_keys = ["title", "external links","further reading","references","see also"]

def has_at_least_one_relevant_key(file_as_dict):
    
    for key in file_as_dict.keys():
        b = True
        for unwanted_key in non_selected_keys:
            if unwanted_key in key.lower() :
                b = False    
        if b :
            return True
    return False
        
def has_irrelevant_content(file_as_dict):
    # remove articles with mathematics of chemics
    for key in file_as_dict.keys():
        if "{\\" in file_as_dict[key]:
            return True        

    # check that there is at least one interesting key
    if not has_at_least_one_relevant_key(file_as_dict):
        return True

    return False


def relevant_articles(article_folder_path, min_size = 0.0002) : 
    """
    inputs :
        - absolute path of the folder containing all the json articles
        - min_size : retaining only file with at least size = min_size*10^-4 ko
    output : 
        - article_names: nd array of the names of the relevant articles (absolute paths)
        - article_weights : nd array normalized of the weights of each files
    """
    all_names =  [f for f in listdir(article_folder_path)]
    article_names = []
    article_weights = []
    for name in all_names:
        article_weight = os.path.getsize(article_folder_path+name)
        if article_weight > min_size:
            # the size of the article meets the requirement
            
            with open(article_folder_path+name) as f :
                file_as_dict = json.load(f) # get article as dict
            
            if not has_irrelevant_content(file_as_dict):
                article_names.append(article_folder_path+name)
                article_weights.append(article_weight)
    
    article_names = np.asarray(article_names)
    article_weights = (np.asarray(article_weights) + 0.0) / np.sum(article_weights)
        
    return article_names, article_weights
            
def select_key(file_as_dict):
    assert has_at_least_one_relevant_key(file_as_dict), "the file has no relevant key"

    keys = file_as_dict.keys()
    rand_idx = np.random.randint(0,len(keys))
    selected_key = keys[rand_idx]
    
    for unwanted_key in non_selected_keys :
        if unwanted_key in selected_key.lower() :
            return select_key(file_as_dict)
        
    return selected_key

def create_triplets(d2v_model, article_names, article_weights, nb_triplets=20, triplets_per_file=5, neg_ratio=0.5, str_mode = False) :
    """
    inputs :    
        - d2v_model : paragraph vector model 
        - article_names : ndarray containing the names of the json files (absolute path !)
        - article_weights: ndarray normalized of the weight of each files 
        - nb_triplets : nb of triplets to generate
        - triplets_per_file : number of triplet built for each selected file
        - neg_ratio : ratio of positives / negative examples. Negative examples are taken inside the article !
        
    output : 
        - triplets : nd_array of triplets of shape (nb_triplets+ , embed_dim)
        - labels : nd_array of labels of shape (nb_triplets+ ,)

    """
    triplets = []
    labels = []
    
    assert nb_triplets>=triplets_per_file, "you should have nb_triplets > triplets_per_file"
    
    # nb of pos / neg triplets per file
    neg_per_file = np.floor(triplets_per_file*neg_ratio) #number of negative triplets to generate given(query + partial summary)
    assert neg_per_file >= 1, "you have to increase your neg_ratio"
    
    nb_files = nb_triplets / triplets_per_file
    selected_files_array = np.random.choice(article_names, size=nb_files, p=article_weights, replace = False)
    
    for full_name in selected_files_array :
        with open(full_name) as f :
            file_as_dict = json.load(f)
        
        counter = 0
        while counter < triplets_per_file :
            
            # select a key for positive examples
            key_pos = select_key(file_as_dict)
            
            triplet = build_triplet(d2v_model, file_as_dict, key_pos, positive = True, str_mode = str_mode)
            label = 1
            
            triplets.append(triplet)
            labels.append(label)
            counter += 1 
            
            if neg_ratio < 1 : 
                
                if np.random.rand() < neg_ratio :
                    
                    triplet = build_triplet(d2v_model, file_as_dict, key_pos, positive = False, str_mode = str_mode)
                    label = 0
                    
                    triplets.append(triplet)
                    labels.append(label)
                    counter += 1 

            else :
                
                for n in range(int(np.floor(neg_ratio))):
                    
                    triplet = build_triplet(d2v_model, file_as_dict, key_pos, positive = False, str_mode = str_mode)
                    label = 0
                    
                    triplets.append(triplet)
                    labels.append(label)
                    counter += 1 

            
    triplets = np.asarray(triplets)[:nb_triplets]
    labels = np.asarray(labels)[:nb_triplets]
    
    return triplets, labels

def build_triplet(d2v_model, file_as_dict, key_pos, positive = True, str_mode = False):

    query_str = key_pos
    query_prep = gensim.utils.simple_preprocess(query_str, deacc=True)
    query_vector = d2v_model.infer_vector(query_prep)
    
    summary_str = file_as_dict[key_pos]
    sentences = summary_str.split(".")
    
    partial_summary = []
    candidates = []
    
    size_partial_summary = np.random.rand()
    
    for sentence in sentences: 
        if np.random.rand() < size_partial_summary :
            partial_summary.append(sentence)
        else :
            candidates.append(sentence)
    
    candidate = ""
    counter_candidate = 0
    while (candidate == "" or partial_summary == "") and counter_candidate < 10:
        counter_candidate += 1
        
        if positive : 
            if len(candidates) > 0:
                random_candidate_index = np.random.randint(0,len(candidates))
                candidate = candidates[random_candidate_index]
            else :
                random_candidate_index = np.random.randint(0,len(partial_summary))
                candidate = partial_summary[random_candidate_index]
                partial_summary[random_candidate_index] = ""


            candidate_prep = gensim.utils.simple_preprocess(candidate, deacc=True)
            candidate_vector = d2v_model.infer_vector(candidate_prep)

        else :

            key_neg = select_key(file_as_dict)
            counter = 0

            while key_neg == key_pos and counter<10 : # the counter is for the preproduction code 
                counter += 1
                key_neg = select_key(file_as_dict)

            summary_str = file_as_dict[key_neg]

            sentences = summary_str.split('.')
            random_candidate_index = np.random.randint(0,len(sentences))
            candidate = sentences[random_candidate_index]
            candidate_prep = gensim.utils.simple_preprocess(candidate, deacc=True)
            candidate_vector = d2v_model.infer_vector(candidate_prep)
        
        partial_summary_str = "".join(partial_summary)
        partial_summary_prep = gensim.utils.simple_preprocess(partial_summary_str, deacc=True)
        partial_summary_vector = d2v_model.infer_vector(partial_summary_prep)
    
    if str_mode :
        return query_str, partial_summary_str, candidate
    else :
        return np.hstack( [query_vector, partial_summary_vector, candidate_vector] )



In [8]:
# data processing
article_names, article_weights = relevant_articles(data_json)

#### Here you can play with the triplet maker and see what gives triplet labelisation 

In [16]:
triplets, labels = create_triplets(d2v_model, article_names, article_weights, nb_triplets=25, triplets_per_file=8, neg_ratio=1, str_mode = True)

for i in range(len(labels)):
    print 50*'-'
    print "label = ", labels[i]
    print "\nquery :", triplets[i][0]
    print "\npartial summary :", triplets[i][1]
    print "\ncandidate :", triplets[i][2]

--------------------------------------------------
label =  1

query : Georg Wilhelm Friedrich Hegel Legacy Left and Right Hegelianism

partial summary : Some historians have spoken of Hegel's influence as represented by two opposing camps No Hegelians of the period ever referred to themselves as "Right Hegelians"; that was a term of insult originated by David Strauss, a self-styled Left Hegelian The Italian Fascist Giovanni Gentile, according to Benedetto Croce, " Walter Jaeschke and Otto Pöggeler in Germany, as well as Peter Hodgson and Howard Kainz in America are notable for their recent contributions to post-USSR thinking about Hegel

candidate : 
The Left Hegelians also spawned Marxism, which inspired global movements, encompassing the Russian Revolution, the Chinese Revolution, and myriad revolutionary practices up until the present moment
--------------------------------------------------
label =  0

query : Georg Wilhelm Friedrich Hegel Legacy Left and Right Hegelianism

partia

#### Define model

In [93]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import Adam


fc_model_name = nn_models_folder + time.strftime("%Y_%m_%d_") +'_fc_model.h5' 

fc_model = Sequential()

fc_model.add(Dense(120, input_dim=1200))
fc_model.add(Activation('sigmoid'))
fc_model.add(Dropout(0.5))

fc_model.add(Dense(12))
fc_model.add(Activation('sigmoid'))
fc_model.add(Dropout(0.5))

fc_model.add(Dense(1))
fc_model.add(Activation('sigmoid'))

In [34]:
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
fc_model.compile(loss="binary_crossentropy", optimizer=adam)

#### Training (we use training per batch)

In [86]:
batch_size = 128
for i in range(200):
    if i%10 == 0 : 
        print(i)
    triplets, labels = create_triplets(d2v_model, article_names, article_weights, nb_triplets=batch_size, triplets_per_file=16, neg_ratio=1, str_mode = False)
    fc_model.train_on_batch(triplets, labels)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190


In [87]:
triplets_tests , labels_tests = create_triplets(d2v_model, article_names, article_weights, nb_triplets=128, triplets_per_file=16, neg_ratio=1, str_mode = False)
labels_predicted = fc_model.predict(triplets_tests , batch_size=batch_size, verbose=1)



In [84]:
triplets_tests[0].shape

(1200,)

In [47]:
loss_and_metrics = fc_model.evaluate(triplets_tests, labels_tests, batch_size=batch_size)
print(loss_and_metrics)

0.693373680115


In [49]:
labels_tests

array([1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0])

#### Saving

In [57]:
from keras.models import load_model


fc_model.save(nn_models_folder + 'fc_model.h5')  # creates a HDF5 file 'my_model.h5'
del fc_model  # deletes the existing model

# returns a compiled model
# identical to the previous one
fc_model = load_model(nn_models_folder +'fc_model.h5')

#### General info on the model

In [65]:
fc_model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
dense_9 (Dense)                  (None, 120)           144120      dense_input_8[0][0]              
____________________________________________________________________________________________________
activation_9 (Activation)        (None, 120)           0           dense_9[0][0]                    
____________________________________________________________________________________________________
dropout_3 (Dropout)              (None, 120)           0           activation_9[0][0]               
____________________________________________________________________________________________________
dense_10 (Dense)                 (None, 12)            1452        dropout_3[0][0]                  
___________________________________________________________________________________________

In [64]:
fc_model.get_config()

[{'class_name': 'Dense',
  'config': {'W_constraint': None,
   'W_regularizer': None,
   'activation': 'linear',
   'activity_regularizer': None,
   'b_constraint': None,
   'b_regularizer': None,
   'batch_input_shape': (None, 1200),
   'bias': True,
   'init': 'glorot_uniform',
   'input_dim': 1200,
   'input_dtype': u'float32',
   'name': u'dense_9',
   'output_dim': 120,
   'trainable': True}},
 {'class_name': 'Activation',
  'config': {'activation': 'sigmoid',
   'name': u'activation_9',
   'trainable': True}},
 {'class_name': 'Dropout',
  'config': {'name': u'dropout_3', 'p': 0.5, 'trainable': True}},
 {'class_name': 'Dense',
  'config': {'W_constraint': None,
   'W_regularizer': None,
   'activation': 'linear',
   'activity_regularizer': None,
   'b_constraint': None,
   'b_regularizer': None,
   'bias': True,
   'init': 'glorot_uniform',
   'input_dim': None,
   'name': u'dense_10',
   'output_dim': 12,
   'trainable': True}},
 {'class_name': 'Activation',
  'config': {'activat

#### Testing the model

In [94]:
def summarize(text, query, d2v_model, nn_model, limit = 2000):

    query_prep = gensim.utils.simple_preprocess(query, deacc=True)
    query_vector = d2v_model.infer_vector(query_prep)
    
    summary  = ""
    summary_vector = d2v_model.infer_vector([""])
    summary_idx = []
    
    sentences = text.split('.')
    sentences = np.asarray(sentences)
    
    remaining_sentences = copy.copy(sentences)
    
    size = 0
    
    while size < limit and len(remaining_sentences)>0 :
        
        for sentence in remaining_sentences :
            scores = []
            sentence_prep = gensim.utils.simple_preprocess(sentence, deacc=True)
            sentence_vector = d2v_model.infer_vector(sentence_prep)
            print query_vector.shape, summary_vector.shape, sentence_vector.shape
            nn_input = np.hstack([query_vector, summary_vector, sentence_vector])
            
            scores.append(nn_model.predict(nn_input, batch_size=1))
            
            max_idx_rem = np.argmax(scores)
            
            idx_selected_sentence = np.arange(len(sentences))[sentences == remaining_sentences[max_idx_rem]]
            
            size += len(remaining_sentences[max_idx_rem])

            del remaining_sentences[max_idx_rem]
            
            summary_idx.append(idx_selected_sentence)
            summary_idx.sort()
            
            summary  = ""
            for idx in summary_idx:
                summary = summary + " " + sentences[summary_idx]
            
            summary_prep = gensim.utils.simple_preprocess(summary, deacc=True)
            summary_vector = d2v_model.infer_vector(summary_prep)
            
    return summary

In [95]:
wikipedia_title = "History of Israel"
with open(data_json+wikipedia_title+".json", 'r') as f:
    wiki_as_json = json.load(f)

text = ""
for key in wiki_as_json.keys():
    if key not in non_selected_keys:
        text += " " + wiki_as_json[key]
        
random_idx = np.random.randint(0,len(wiki_as_json.keys()))
query = wiki_as_json.keys()[random_idx]
summary_true = wiki_as_json[query]
limit_size = len(wiki_as_json[query])

print 50*"*"
print 'query', query
print 50*"*"
print "real summary\n\n", summary_true
print 50*"*"
print "nn summary\n\n", summarize(text,query,d2v_model,fc_model, limit = limit_size)

**************************************************
query History of Israel State of Israel (1948–present) 1974–1977: Rabin I
**************************************************
real summary

Following Meir's resignation, Yitzhak Rabin (Chief of Staff during the Six Day War) became prime minister. Modern Orthodox Jews (Religious Zionist followers of the teachings of Rabbi Kook), formed the Gush Emunim movement, and began an organized drive to settle the West Bank and Gaza Strip. In November 1975 the United Nations General Assembly, under the guidance of Austrian Secretary General Kurt Waldheim, adopted Resolution 3379, which asserted Zionism to be a form of racism. The General Assembly rescinded this resolution in December 1991 with Resolution 46/86. In March 1976 there was a massive strike by Israeli-Arabs in protest at a government plan to expropriate land in the Galilee.
In July 1976, an Air France plane carrying 260 people was hijacked by Palestinian and German terrorists and flown t

Exception: Error when checking : expected dense_input_9 to have shape (None, 1200) but got array with shape (1200, 1)

[{'class_name': 'Dense',
  'config': {'W_constraint': None,
   'W_regularizer': None,
   'activation': 'linear',
   'activity_regularizer': None,
   'b_constraint': None,
   'b_regularizer': None,
   'batch_input_shape': (None, 1200),
   'bias': True,
   'init': 'glorot_uniform',
   'input_dim': 1200,
   'input_dtype': u'float32',
   'name': u'dense_9',
   'output_dim': 120,
   'trainable': True}},
 {'class_name': 'Activation',
  'config': {'activation': 'sigmoid',
   'name': u'activation_9',
   'trainable': True}},
 {'class_name': 'Dropout',
  'config': {'name': u'dropout_3', 'p': 0.5, 'trainable': True}},
 {'class_name': 'Dense',
  'config': {'W_constraint': None,
   'W_regularizer': None,
   'activation': 'linear',
   'activity_regularizer': None,
   'b_constraint': None,
   'b_regularizer': None,
   'bias': True,
   'init': 'glorot_uniform',
   'input_dim': None,
   'name': u'dense_10',
   'output_dim': 12,
   'trainable': True}},
 {'class_name': 'Activation',
  'config': {'activat

### Model 2 : LSTM

#### data processing 

#### define model

#### training

#### testing

#### saving

In [68]:
import copy
list1 = [1,2,3,4]
list2 = copy.copy(list1)
del list1[0]
print (list1)
print (list2)

[2, 3, 4]
[1, 2, 3, 4]
