# Summarization query oriented

 <hr style="border-color:#1d539d"> 

## Initialization

In [18]:
import collections
import gensim
import json
import keras
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import random

In [19]:
# paths to folders
# The project root is written once, so moving the project only requires editing one line.

project_root = "/home/ubuntu/summarization_query_oriented/"

data_json = project_root + "data/json/patch_0/"    # json articles
data_txt = project_root + "data/txt/"              # txt data
model_folder = project_root + "models/"            # saved models

# file names + extensions


## Doc2vec

In [20]:
# parameters of doc2vec
# dm, min_count, window, size, negative and epoch are encoded in the file name of
# the saved model, so they select which model file gets loaded.
dm = 0
min_count = 5
window = 10
size = 400
sample = 1e-4
negative = 5
workers = 4
epoch = 20

# load model
# `load` is a classmethod that returns a new model, so there is no need to build
# (and immediately discard) an untrained Doc2Vec instance before calling it.
model_name = "dm_{}_mc_{}_w_{}_size_{}_neg_{}_ep_{}".format(dm, min_count, window, size, negative, epoch)
model = gensim.models.doc2vec.Doc2Vec.load(os.path.join(model_folder, model_name + ".d2v"))
print("model loaded")

model loaded


In [161]:
# Verifying that the loaded model can infer vectors

v1, v2, v3 = (
    model.infer_vector(['my', 'name', 'is', 'charles', surname])
    for surname in ('sutton', 'bitton', 'martin')
)

# shape of the concatenation of the three inferred vectors
np.hstack((v1, v2, v3)).shape

(1200,)

## Models 

### Model 1 : fully connected model


* __Architecture__ : Fully connected model


* __Input__ : a vector that is the concatenation of [ query , partial summary, candidate ]
    * *query* : here the subtitle of a wikipedia page
    * *partial summary* : here a part (possibly empty) of the summary attached to this subtitle
    * *candidate* : a random sentence


* __Output__ : a score describing how well the candidate sentence completes the partial summary w.r.t. the query


* __Training mode__ : we sample triplets from Wikipedia data to build the training set. A triplet is labeled 1 if the candidate sentence belongs to the correct subsection and is not in the partial summary (guaranteed by construction), and 0 otherwise


* __Testing mode__ : given a document and a query, the partial summary is initialized as the query. We then choose the sentence of the document that is not in the partial summary and has the highest score, and delete it from the document. We repeat this until we reach the length limit

#### Data Processing 

Here we build functions to perform end-to-end data preparation.

In [183]:
# functions

# Section titles that are never used as a query. They are matched as substrings
# of the lowercased key, so e.g. any key containing "title" is excluded.
non_selected_keys = ["title", "external links", "further reading", "references", "see also"]

def has_at_least_one_relevant_key(file_as_dict):
    """
    inputs :
        - file_as_dict : article loaded as a dict

    output :
        - True if at least one key contains none of the entries of non_selected_keys
          (case-insensitive substring match), False otherwise (including for an empty dict)
    """
    def is_relevant(key):
        lowered = key.lower()
        return not any(unwanted in lowered for unwanted in non_selected_keys)

    return any(is_relevant(key) for key in file_as_dict)
        
def has_irrelevant_content(file_as_dict):
    """
    inputs :
        - file_as_dict : article loaded as a dict (key -> text)

    output :
        - True if the article must be discarded, False otherwise
    """
    # remove articles about mathematics or chemistry: they are detected by the
    # presence of a "{\" sequence in one of the texts
    if any("{\\" in text for text in file_as_dict.values()):
        return True

    # an article without any usable key is irrelevant as well
    return not has_at_least_one_relevant_key(file_as_dict)


def relevant_articles(article_folder_path, min_size = 0.0002) : 
    """
    inputs :
        - article_folder_path : absolute path of the folder containing all the json articles
          (with or without a trailing separator)
        - min_size : only files whose size is strictly greater than min_size are retained.
          The size is the one returned by os.path.getsize, i.e. a number of bytes, so the
          default value only filters out empty files.
    output : 
        - article_names: nd array of the names of the relevant articles (absolute paths)
        - article_weights : nd array of the file sizes, normalized to sum to 1
          (empty when no article is retained)
    """
    article_names = []
    article_weights = []
    for name in listdir(article_folder_path):
        # os.path.join works whether or not the folder path ends with a separator
        full_path = os.path.join(article_folder_path, name)

        # sub-directories cannot be parsed as articles, skip them
        if not os.path.isfile(full_path):
            continue

        article_weight = os.path.getsize(full_path)
        if article_weight <= min_size:
            # the size of the article does not meet the requirement
            continue

        with open(full_path) as f :
            file_as_dict = json.load(f) # get article as dict

        if not has_irrelevant_content(file_as_dict):
            article_names.append(full_path)
            article_weights.append(article_weight)

    article_names = np.asarray(article_names)
    article_weights = np.asarray(article_weights, dtype=float)

    # normalize the weights so they can be used as sampling probabilities
    total_weight = article_weights.sum()
    if total_weight > 0:
        article_weights = article_weights / total_weight

    return article_names, article_weights
            
def select_key(file_as_dict):
    """
    inputs :
        - file_as_dict : article loaded as a dict

    output :
        - a key drawn uniformly at random among the keys that contain none of the
          entries of non_selected_keys (case-insensitive substring match)

    raises :
        - AssertionError if the article has no relevant key
    """
    # Filter first, then draw once. This gives the same uniform distribution over the
    # relevant keys as redrawing until a relevant key comes out, without recursion and
    # without indexing a dict view (which is not possible in Python 3).
    relevant_keys = [
        key for key in file_as_dict
        if not any(unwanted_key in key.lower() for unwanted_key in non_selected_keys)
    ]

    if not relevant_keys:
        # explicit raise: the check stays active even when assertions are disabled
        raise AssertionError("the file has no relevant key")

    return relevant_keys[np.random.randint(0, len(relevant_keys))]

def create_triplets(d2v_model, article_names, article_weights, nb_triplets=20, triplets_per_file=5, neg_ratio=0.5, str_mode = False) :
    """
    inputs :    
        - d2v_model : paragraph vector model 
        - article_names : ndarray containing the names of the json files (absolute path !)
        - article_weights: ndarray normalized of the weight of each files 
        - nb_triplets : maximum nb of triplets to return
        - triplets_per_file : minimum number of triplets built for each selected file
        - neg_ratio : number of negative examples per positive example. 
          If neg_ratio < 1, each positive example is followed by one negative example 
          with probability neg_ratio; otherwise it is followed by floor(neg_ratio) 
          negative examples. Negative examples are taken inside the article !
        - str_mode : if True the triplets are returned as strings 
          (query, partial summary, candidate) instead of vectors
        
    output : 
        - triplets : nd_array of at most nb_triplets triplets, as returned by build_triplet
        - labels : nd_array of the matching labels (1 = positive, 0 = negative)

    raises :
        - AssertionError if nb_triplets < triplets_per_file 
          or if floor(triplets_per_file*neg_ratio) < 1

    note :
        nb_triplets // triplets_per_file files are drawn without replacement, so 
        article_names must contain at least that many files.
    """
    # explicit raises: the checks stay active even when assertions are disabled
    if nb_triplets < triplets_per_file:
        raise AssertionError("you should have nb_triplets >= triplets_per_file")
    if np.floor(triplets_per_file*neg_ratio) < 1:
        raise AssertionError("you have to increase your neg_ratio")
    
    # floor division: np.random.choice needs an integer size (a plain "/" gives a float in Python 3)
    nb_files = int(nb_triplets // triplets_per_file)
    selected_files_array = np.random.choice(article_names, size=nb_files, p=article_weights, replace = False)
    
    triplets = []
    labels = []
    
    for full_name in selected_files_array :
        with open(full_name) as f :
            file_as_dict = json.load(f)
        
        counter = 0
        while counter < triplets_per_file :
            
            # select a key for the positive example; its negatives share the same query
            key_pos = select_key(file_as_dict)
            
            triplets.append(build_triplet(d2v_model, file_as_dict, key_pos, positive = True, str_mode = str_mode))
            labels.append(1)
            counter += 1 
            
            # number of negative examples attached to this positive example
            if neg_ratio < 1 : 
                nb_negatives = 1 if np.random.rand() < neg_ratio else 0
            else :
                nb_negatives = int(np.floor(neg_ratio))
            
            for _ in range(nb_negatives):
                triplets.append(build_triplet(d2v_model, file_as_dict, key_pos, positive = False, str_mode = str_mode))
                labels.append(0)
                counter += 1 
    
    # the last iteration of a file may overshoot triplets_per_file, hence the truncation
    triplets = np.asarray(triplets)[:nb_triplets]
    labels = np.asarray(labels)[:nb_triplets]
    
    return triplets, labels

def _embed_text(d2v_model, text):
    # tokenize the text (accents removed) and infer its paragraph vector
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    return d2v_model.infer_vector(tokens)


def build_triplet(d2v_model, file_as_dict, key_pos, positive = True, str_mode = False):
    """
    inputs :
        - d2v_model : paragraph vector model, only used when str_mode is False
        - file_as_dict : article loaded as a dict (key -> text)
        - key_pos : key of the article used as query; its text provides the partial summary
        - positive : if True the candidate is a sentence of the text of key_pos,
          otherwise it is a sentence of the text of another key of the article
        - str_mode : if True the triplet is returned as strings

    output :
        - if str_mode : tuple (query, partial summary, candidate) of strings
        - otherwise : nd_array, concatenation of the vectors of the query, 
          of the partial summary and of the candidate (in this order)

    notes :
        - sentences are obtained by splitting the text on "."
        - the candidate may be an empty string if none was found after 10 attempts
        - for a negative triplet, the candidate may still come from key_pos if no 
          other key was drawn after 10 attempts
    """
    query_str = key_pos
    sentences = file_as_dict[key_pos].split(".")
    
    # each sentence goes to the partial summary with probability size_partial_summary,
    # the remaining ones are the possible positive candidates
    partial_summary = []
    candidates = []
    size_partial_summary = np.random.rand()
    
    for sentence in sentences: 
        if np.random.rand() < size_partial_summary :
            partial_summary.append(sentence)
        else :
            candidates.append(sentence)
    
    # draw a candidate again as long as it is empty (at most 10 attempts).
    # An empty partial summary is allowed, so it is not a reason to draw again.
    candidate = ""
    counter_candidate = 0
    while candidate == "" and counter_candidate < 10:
        counter_candidate += 1
        
        if positive : 
            if candidates:
                candidate = candidates[np.random.randint(0, len(candidates))]
            else :
                # every sentence went to the partial summary: move one of them out of it
                random_candidate_index = np.random.randint(0, len(partial_summary))
                candidate = partial_summary[random_candidate_index]
                partial_summary[random_candidate_index] = ""
        else :
            key_neg = select_key(file_as_dict)
            counter = 0

            while key_neg == key_pos and counter<10 : # the counter is for the preproduction code 
                counter += 1
                key_neg = select_key(file_as_dict)

            negative_sentences = file_as_dict[key_neg].split('.')
            candidate = negative_sentences[np.random.randint(0, len(negative_sentences))]
    
    partial_summary_str = "".join(partial_summary)
    
    if str_mode :
        return query_str, partial_summary_str, candidate
    
    # vectors are inferred once, for the final triplet only
    query_vector = _embed_text(d2v_model, query_str)
    candidate_vector = _embed_text(d2v_model, candidate)
    partial_summary_vector = _embed_text(d2v_model, partial_summary_str)
    
    return np.hstack( [query_vector, partial_summary_vector, candidate_vector] )



In [127]:
# data processing: list the usable json articles and their sampling weights
article_names, article_weights = relevant_articles(article_folder_path=data_json)

#### Here you can play with the triplet maker and see how the triplets get labeled

In [189]:
triplets, labels = create_triplets(model, article_names, article_weights, nb_triplets=3, triplets_per_file=3, neg_ratio=2, str_mode = True)

# print is always called with a single pre-formatted string, so the output is the
# same whether print is a statement (Python 2) or a function (Python 3)
for (query, partial_summary, candidate), label in zip(triplets, labels):
    print(50*'-')
    print("label =  %s" % (label,))
    print("\nquery : %s" % (query,))
    print("\npartial summary : %s" % (partial_summary,))
    print("\ncandidate : %s" % (candidate,))

--------------------------------------------------
label =  1

query : Arena (software) Commercial software editions

partial summary :  Systems, regardless of complexity, can be represented and custom performance metrics may be measured and tracked
Standard Edition – This mid-tier package has the versatility to solve simulation problems encountered in an array of industries and systems This edition includes Basic Process, Advanced Transfer, and Advanced Process Arena templates
OptQuest – OptQuest provides optimization functionality within Arena

candidate : Professional Edition – The flagship product, provides the ultimate in functionality and flexibility to meet the needs of any simulation problem
--------------------------------------------------
label =  0

query : Arena (software) Commercial software editions

partial summary :  Systems, regardless of complexity, can be represented and custom performance metrics may be measured and tracked
Standard Edition – This mid-tier package 

In [185]:
# build the training triplets as vectors (str_mode off)
triplets, labels = create_triplets(
    d2v_model=model,
    article_names=article_names,
    article_weights=article_weights,
    nb_triplets=1000,
    triplets_per_file=3,
    neg_ratio=2,
    str_mode=False,
)

#### define model

#### training

#### testing

#### saving

### Model 2 : LSTM

#### data processing 

#### define model

#### training

#### testing

#### saving