# Imports

In [14]:
import bisect
import collections
import copy
import gensim
import json
import keras
from keras.layers import Dense, Activation, Dropout
from keras.models import Sequential, load_model
from keras.optimizers import Adam
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib inline
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
import pyrouge
from pyrouge import Rouge155
import random
import re
import time

In [15]:
# Absolute locations of the trained models and of the TD-QFS data.
lang_model_folder = "/home/ubuntu/summarization_query_oriented/nn_models/language_models/d2v/"
nn_summarizers_folder = "/home/ubuntu/summarization_query_oriented/nn_models/nn_summarizer/"
model_dir = "/home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_model"
tdqfs_folder = "/home/ubuntu/summarization_query_oriented/data/TD-QFS/"

# Every run writes its summaries into a folder stamped to the minute.
current_time_str = time.strftime("%Y_%m_%d_%H_%M_")
system_folder = "/home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/" + current_time_str + "test/"

# os.mkdir raised when the cell was re-run within the same minute (folder
# already there) or when a parent folder was missing; create it only when
# needed, parents included.
if not os.path.isdir(system_folder):
    os.makedirs(system_folder)


In [16]:
# Corpus themes; each one is used as a sub-folder name under tdqfs_folder.
themes = [
    "alz",
    "asthma",
    "cancer",
    "obese",
]

# A key of an article dict is ignored when its lower-cased form contains
# one of these fragments.
non_selected_keys = [
    "title",
    "external links",
    "further reading",
    "references",
    "see also",
]

# Useful functions

In [17]:

def has_at_least_one_relevant_key(file_as_dict):
    """Return True when at least one key of the article is usable.

    A key is usable when none of the fragments of ``non_selected_keys``
    occurs in its lower-cased form.
    """
    for section_title in file_as_dict:
        is_unwanted = any(
            fragment in section_title.lower() for fragment in non_selected_keys
        )
        if not is_unwanted:
            return True
    return False
        
def has_irrelevant_content(file_as_dict):
    """Return True when the article must be discarded.

    An article is discarded when one of its values contains an opening
    brace followed by a backslash (used here to spot mathematics or
    chemistry markup), or when it has no relevant key at all.
    """
    # markup check first, exactly one pass over the values
    for section_text in file_as_dict.values():
        if "{\\" in section_text:
            return True

    # otherwise the article is irrelevant only if no key is usable
    return not has_at_least_one_relevant_key(file_as_dict)


def relevant_articles(article_folder_path, min_size = 10000) :
    """
    List the json articles of a folder that are usable for training.

    inputs :
        - article_folder_path : path of the folder containing all the json
          articles (with or without a trailing separator)
        - min_size : only files whose size, as reported by os.path.getsize,
          is strictly greater than min_size are retained
    output :
        - article_names : nd array of the paths of the relevant articles
          (article_folder_path joined with the file name)
        - article_weights : nd array of the file sizes divided by their sum,
          in the same order as article_names
    """
    article_names = []
    article_weights = []
    for name in listdir(article_folder_path):
        # os.path.join instead of "+" so a missing trailing "/" does not
        # produce a wrong path
        full_path = join(article_folder_path, name)

        # sub-folders and other non-files cannot be parsed as json
        if not isfile(full_path):
            continue

        article_weight = os.path.getsize(full_path)
        if article_weight <= min_size:
            continue

        # the size of the article meets the requirement
        with open(full_path) as f :
            file_as_dict = json.load(f) # get article as dict

        if not has_irrelevant_content(file_as_dict):
            article_names.append(full_path)
            article_weights.append(article_weight)

    article_names = np.asarray(article_names)
    article_weights = np.asarray(article_weights, dtype=float)

    # normalize only when there is something to normalize (avoids 0/0)
    total_weight = article_weights.sum()
    if total_weight > 0:
        article_weights = article_weights / total_weight

    return article_names, article_weights
            
def select_key(file_as_dict, patience = 10):
    """
    Draw a random key of the article, preferring a usable one.

    A drawn key is accepted when its text splits into more than two pieces
    on "." and when none of the fragments of ``non_selected_keys`` occurs in
    its lower-cased form. At most ``patience`` draws are checked that way;
    once patience is exhausted one more key is drawn and returned unchecked.

    inputs :
        - file_as_dict : article as a dict (key -> text)
        - patience : number of checked draws before giving up
    output :
        - the selected key
    raises :
        - AssertionError when patience > 0 and the article has no relevant key
    """
    # materialize the keys: a Python 3 dict view cannot be indexed
    keys = list(file_as_dict.keys())

    if patience > 0:
        assert has_at_least_one_relevant_key(file_as_dict), "the file has no relevant key"

    remaining_patience = patience
    while remaining_patience > 0:
        remaining_patience -= 1
        selected_key = keys[np.random.randint(0, len(keys))]

        # too short to be split into a partial summary and a candidate
        if len(file_as_dict[selected_key].split(".")) <= 2:
            continue

        lowered_key = selected_key.lower()
        if any(unwanted_key in lowered_key for unwanted_key in non_selected_keys):
            continue

        return selected_key

    # patience exhausted: settle for any key, without checking it
    return keys[np.random.randint(0, len(keys))]

def create_triplets(d2v_model, article_names, article_weights, nb_triplets=20, triplets_per_file=5, neg_ratio=0.5, str_mode = False) :
    """
    Build labelled (query, partial summary, candidate) triplets.

    inputs :
        - d2v_model : paragraph vector model
        - article_names : ndarray containing the names of the json files (absolute path !)
        - article_weights: ndarray normalized of the weight of each files,
          used as sampling probabilities
        - nb_triplets : nb of triplets to generate
        - triplets_per_file : number of triplet built for each selected file
        - neg_ratio : controls the negatives built after each positive.
          Below 1 it is the probability that one negative follows the
          positive; otherwise floor(neg_ratio) negatives follow each
          positive. Negative examples are taken inside the article !
        - str_mode : forwarded to build_triplet (strings instead of vectors)

    output :
        - triplets : nd_array with one row per triplet, at most nb_triplets rows
        - labels : nd_array of labels (1 positive, 0 negative), same length

    The number of files drawn is nb_triplets // triplets_per_file, so the
    output can hold fewer than nb_triplets rows when nb_triplets is not a
    multiple of triplets_per_file.
    """
    assert nb_triplets>=triplets_per_file, "you should have nb_triplets > triplets_per_file"

    # sanity check on the expected number of negatives per file
    neg_per_file = np.floor(triplets_per_file*neg_ratio)
    assert neg_per_file >= 1, "you have to increase your neg_ratio"

    # floor division: "/" yields a float under true division, which
    # np.random.choice rejects as a size
    nb_files = int(nb_triplets // triplets_per_file)
    selected_files_array = np.random.choice(article_names, size=nb_files, p=article_weights, replace = False)

    triplets = []
    labels = []

    for full_name in selected_files_array :
        with open(full_name) as f :
            file_as_dict = json.load(f)

        counter = 0
        while counter < triplets_per_file :

            # select a key for positive examples
            key_pos = select_key(file_as_dict)

            triplets.append(build_triplet(d2v_model, file_as_dict, key_pos, positive = True, str_mode = str_mode))
            labels.append(1)
            counter += 1

            # number of negatives attached to this positive
            if neg_ratio < 1 :
                nb_negatives = 1 if np.random.rand() < neg_ratio else 0
            else :
                nb_negatives = int(np.floor(neg_ratio))

            for _ in range(nb_negatives):
                triplets.append(build_triplet(d2v_model, file_as_dict, key_pos, positive = False, str_mode = str_mode))
                labels.append(0)
                counter += 1

    # the last positive/negative group may overshoot: trim to nb_triplets
    triplets = np.asarray(triplets)[:nb_triplets]
    labels = np.asarray(labels)[:nb_triplets]

    return triplets, labels

def build_triplet(d2v_model, file_as_dict, key_pos, positive = True, str_mode = False):
    """
    Build one (query, partial summary, candidate sentence) triplet.

    The query is the key ``key_pos`` itself. The text stored under that key
    is split on "." and each piece is randomly assigned either to the
    partial summary or to the pool of candidates.

    inputs :
        - d2v_model : model whose infer_vector is used on the preprocessed text
        - file_as_dict : article as a dict (key -> text)
        - key_pos : key used as query and as source of the partial summary
        - positive : if True the candidate comes from the text of key_pos,
          otherwise from the text of another key drawn with select_key
        - str_mode : if True return the raw strings instead of vectors

    output :
        - str_mode : tuple (query_str, partial_summary_str, candidate)
        - otherwise : np.hstack of the query, partial summary and candidate vectors
    """

    # the query is the key itself
    query_str = key_pos
    query_prep = gensim.utils.simple_preprocess(query_str, deacc=True)
    query_vector = d2v_model.infer_vector(query_prep)
    
    summary_str = file_as_dict[key_pos]
    sentences = summary_str.split(".")
    
    partial_summary = []
    candidates = []
    
    # probability, drawn once per triplet, that a sentence lands in the partial summary
    size_partial_summary = np.random.rand()
    
    for sentence in sentences: 
        if np.random.rand() < size_partial_summary :
            partial_summary.append(sentence)
        else :
            candidates.append(sentence)
    
    candidate = ""
    counter_candidate = 0
    # Retry (at most 10 times) while the drawn candidate is the empty string.
    # NOTE(review): partial_summary is a list, so `partial_summary == ""` is
    # always False and has no effect on the loop - confirm what was intended.
    while (candidate == "" or partial_summary == "") and counter_candidate < 10:
        counter_candidate += 1
        
        if positive : 
            if len(candidates) > 0:
                random_candidate_index = np.random.randint(0,len(candidates))
                candidate = candidates[random_candidate_index]
            else :
                # no candidate left aside: take one sentence out of the partial
                # summary and blank it there so it is not on both sides
                random_candidate_index = np.random.randint(0,len(partial_summary))
                candidate = partial_summary[random_candidate_index]
                partial_summary[random_candidate_index] = ""


            candidate_prep = gensim.utils.simple_preprocess(candidate, deacc=True)
            candidate_vector = d2v_model.infer_vector(candidate_prep)

        else :

            # negative example: the candidate is a sentence of another key
            key_neg = select_key(file_as_dict)
            counter = 0

            # retry cap: after 10 extra draws key_neg may still be equal to key_pos
            while key_neg == key_pos and counter<10 : # the counter is for the preproduction code 
                counter += 1
                key_neg = select_key(file_as_dict)

            summary_str = file_as_dict[key_neg]

            sentences = summary_str.split('.')
            random_candidate_index = np.random.randint(0,len(sentences))
            candidate = sentences[random_candidate_index]
            candidate_prep = gensim.utils.simple_preprocess(candidate, deacc=True)
            candidate_vector = d2v_model.infer_vector(candidate_prep)
        
        # recomputed on every attempt because the positive branch may have
        # blanked a sentence; the pieces are joined without any separator
        partial_summary_str = "".join(partial_summary)
        partial_summary_prep = gensim.utils.simple_preprocess(partial_summary_str, deacc=True)
        partial_summary_vector = d2v_model.infer_vector(partial_summary_prep)
    
    if str_mode :
        return query_str, partial_summary_str, candidate
    else :
        return np.hstack( [query_vector, partial_summary_vector, candidate_vector] )


def doc_title_table(title_file):
    """
    Map the content of each <num> element to the content of the <title>
    element found at the same rank in ``title_file``.

    One character is dropped right after each opening tag and right before
    each closing tag, whatever that character is.
    """
    with open(title_file, 'r') as handle:
        content = handle.read()

    # positions just after the opening tags / just before the closing tags
    num_starts = [match.end(0) for match in re.finditer(r"<num>", content)]
    num_stops = [match.start(0) for match in re.finditer(r"</num>", content)]
    title_starts = [match.end(0) for match in re.finditer(r"<title>", content)]
    title_stops = [match.start(0) for match in re.finditer(r"</title>", content)]

    table = {}
    for rank, num_start in enumerate(num_starts):
        title = content[title_starts[rank] + 1:title_stops[rank] - 1]
        doc_id = content[num_start + 1:num_stops[rank] - 1]
        table[doc_id] = title
    return table

def merge_articles(docs_folder):
    """
    for DUC corpus

    Concatenate the headline and text paragraphs of every document found in
    ``docs_folder`` (with or without a trailing separator).

    For each document the first <HEADLINE> and the first <TEXT> elements are
    used. Each headline <P> paragraph is appended without its last two
    characters and followed by "."; each text <P> paragraph is appended
    without its last character.

    Entries that cannot be read or that lack the expected markup are
    skipped (best effort); what was already appended for such an entry
    before the failure is kept.
    """
    pieces = []

    for doc in os.listdir(docs_folder):
        try:
            # os.path.join: a folder given without trailing "/" used to make
            # every open() fail and the function silently return ""
            with open(os.path.join(docs_folder, doc), 'r') as f:
                raw_doc = f.read()

            left_idx_headline = [ m.end(0) for m in re.finditer(r"<HEADLINE>",raw_doc)]
            right_idx_headline = [ m.start(0) for m in re.finditer(r"</HEADLINE>",raw_doc)]

            left_idx_text = [ m.end(0) for m in re.finditer(r"<TEXT>",raw_doc)]
            right_idx_text = [ m.start(0) for m in re.finditer(r"</TEXT>",raw_doc)]

            raw_headline = raw_doc[left_idx_headline[0]:right_idx_headline[0]]
            raw_text = raw_doc[left_idx_text[0]:right_idx_text[0]]

            left_idx_paragraph_headline = [ m.end(0) for m in re.finditer(r"<P>",raw_headline)]
            right_idx_paragraph_headline = [ m.start(0) for m in re.finditer(r"</P>",raw_headline)]

            left_idx_paragraph_text = [ m.end(0) for m in re.finditer(r"<P>",raw_text)]
            right_idx_paragraph_text = [ m.start(0) for m in re.finditer(r"</P>",raw_text)]

            for i in range(len(left_idx_paragraph_headline)):
                pieces.append(raw_headline[left_idx_paragraph_headline[i]:right_idx_paragraph_headline[i]-2] + ".")

            for i in range(len(left_idx_paragraph_text)):
                pieces.append(raw_text[left_idx_paragraph_text[i]:right_idx_paragraph_text[i]-1])
        except (IOError, OSError, IndexError, UnicodeError):
            # unreadable entry (e.g. a sub-folder), undecodable file or
            # missing tags: skip it. Unlike a bare except this lets
            # KeyboardInterrupt and programming errors through.
            continue

    return "".join(pieces)

def summarize(text, query, d2v_model, nn_model, limit = 250):
    """
    Greedy extractive summary of ``text`` for ``query``.

    ``text`` is split on "." into sentences. At each step every sentence not
    yet selected is scored by ``nn_model`` on the concatenation of the query
    vector, the vector of the current summary and the vector of the
    sentence; the best one is added. Selected sentences are kept in their
    order of appearance in ``text``. The loop stops once the selected
    sentences total at least ``limit`` whitespace-separated words, or when
    no sentence is left.

    inputs :
        - text : string to summarize
        - query : query string
        - d2v_model : model whose infer_vector is used on the preprocessed text
        - nn_model : model whose predict scores one input row
        - limit : word budget of the summary (may be exceeded by the last
          selected sentence)
    output :
        - summary : selected sentences, each one preceded by a space
    """
    query_prep = gensim.utils.simple_preprocess(query, deacc=True)
    query_vector = d2v_model.infer_vector(query_prep)

    summary = ""
    summary_vector = d2v_model.infer_vector([""])

    sentences = text.split('.')

    # Work on positions rather than on sentence strings: looking a selected
    # sentence up by equality mapped repeated sentences (e.g. empty ones) to
    # their first occurrence, so that occurrence was inserted several times.
    remaining_idx = list(range(len(sentences)))
    summary_idx = []  # positions already selected, kept sorted

    size = 0
    while size < limit and remaining_idx :
        scores = []
        for idx in remaining_idx :
            sentence_prep = gensim.utils.simple_preprocess(sentences[idx], deacc=True)
            sentence_vector = d2v_model.infer_vector(sentence_prep)

            nn_input = np.hstack([query_vector, summary_vector, sentence_vector])
            # predict is given a batch holding this single row
            score = nn_model.predict(np.asarray([nn_input]))
            scores.append(score)

        best_idx = remaining_idx.pop(int(np.argmax(scores)))
        size += len(sentences[best_idx].split())
        bisect.insort_left(summary_idx, best_idx)

        # rebuild the summary in document order and refresh its vector
        summary = "".join(" " + sentences[idx] for idx in summary_idx)
        summary_prep = gensim.utils.simple_preprocess(summary, deacc=True)
        summary_vector = d2v_model.infer_vector(summary_prep)

    return summary

def merge_articles_tqdfs(theme_doc_folder):
    """
    for tqdfs corpus

    Concatenate every document found one level below ``theme_doc_folder``
    (layout: theme_doc_folder/<source>/<doc>), each one followed by a space.
    ``theme_doc_folder`` may be given with or without a trailing separator.

    A source that cannot be listed (e.g. a plain file) is skipped; when a
    document of a source cannot be read, the remaining documents of that
    source are skipped too (best effort).
    """
    pieces = []
    for source in os.listdir(theme_doc_folder):
        # os.path.join: a folder given without trailing "/" used to make
        # every listing fail and the function silently return ""
        source_folder = os.path.join(theme_doc_folder, source)
        try :
            for doc in os.listdir(source_folder):
                with open(os.path.join(source_folder, doc) ,'r') as f:
                    pieces.append(f.read())
                pieces.append(" ")
        except (IOError, OSError, UnicodeError):
            # Unlike a bare except this lets KeyboardInterrupt and
            # programming errors through.
            continue
    return "".join(pieces)

In [18]:
def get_queries(query_txt_file):
    """Return the lines of ``query_txt_file`` as a list, line endings kept."""
    with open(query_txt_file, 'r') as handle:
        return list(handle)

# Load model

## Load language model

In [19]:
## loading a d2vmodel (to be a shifted LSTM next ...)

# parameters of doc2vec; dm, min_count, window, size, negative and epoch are
# also encoded in the file name of the saved model (see model_name below)
dm = 0
min_count = 5
window = 10
size = 400
sample = 1e-4
negative = 5
workers = 4
epoch = 20

# Initialize the model ( IMPORTANT )
d2v_model = gensim.models.doc2vec.Doc2Vec(dm=dm,min_count=min_count, window=window, size=size, sample=sample, negative=negative, workers=workers,iter = epoch)

# load model
model_name ="dm_"+str(dm)+"_mc_"+str(min_count)+"_w_"+str(window)+"_size_"+str(size)+"_neg_"+str(negative)+"_ep_"+str(epoch)
try :
    d2v_model = d2v_model.load(lang_model_folder+model_name+".d2v")
except (IOError, OSError):
    # The saved model could not be read: list what is available, then stop.
    # Previously the error was swallowed, "model loaded" was printed anyway
    # and the notebook went on with the untrained model created above.
    print("try a model in :  %s" % (os.listdir(lang_model_folder),))
    raise
else :
    print("model loaded")

model loaded


## Load summarizer

### Initializing 

In [20]:
# Fully connected scorer: 1200 inputs -> 120 -> 12 -> 1 output, a sigmoid
# after every Dense layer and a dropout of 0.5 after each hidden block.
# The layers are created and added in exactly this order.
fc_model = Sequential()
for layer in (
    Dense(120, input_dim=1200),
    Activation('sigmoid'),
    Dropout(0.5),
    Dense(12),
    Activation('sigmoid'),
    Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
):
    fc_model.add(layer)

### load weights

In [21]:
# Weights of the summarizer network; alternative checkpoint kept for reference:
#   "fc_model_batch_21k_R2_0.09297_SU4_0.08988.hdf5"
fc_model.load_weights(
    os.path.join(nn_summarizers_folder, "fc_model_batch_103k_R2_0.06574_SU4_0.08636.hdf5")
)

# Write summaries

In [22]:
# For every theme, summarize the merged documents of the theme once per
# query and write each summary to <system_folder>/<theme>.<query number>.txt
for theme in themes :
    theme_folder = tdqfs_folder + theme + "/"
    theme_doc_folder = theme_folder + theme + "/"
    queries = get_queries(theme_folder+"queries.txt")
    text = merge_articles_tqdfs(theme_doc_folder)

    # query numbers start at 1 in the file names
    for query_number, query in enumerate(queries, 1):
        summary = summarize(text,query,d2v_model, fc_model, limit = 250)
        # hard cut at 250 words: the last selected sentence may overshoot
        summary = " ".join(summary.split()[:250])

        # Keep ASCII characters only. Going through bytes makes this work
        # for byte strings (Python 2 str) as well as for text strings,
        # which have no decode method on Python 3.
        if not isinstance(summary, bytes):
            summary = summary.encode("utf8")
        summary_bytes = summary.decode('ascii',"ignore").encode("utf8", "replace")

        summary_name = theme + "." + str(query_number) + ".txt"
        with open(system_folder + summary_name,'wb') as f :
            f.write(summary_bytes)
        print('writing in '+ system_folder + summary_name)


writing in /home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_25_05_38_test/alz.1.txt
writing in /home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_25_05_38_test/alz.2.txt
writing in /home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_25_05_38_test/alz.3.txt
writing in /home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_25_05_38_test/alz.4.txt
writing in /home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_25_05_38_test/alz.5.txt
writing in /home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_25_05_38_test/alz.6.txt
writing in /home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_25_05_38_test/alz.7.txt
writing in /home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_25_05_38_test/alz.8.txt
writing in /home/ubuntu/summarization_query_oriented/dat

In [23]:
#type(summary)

str

# Compute ROUGE

In [24]:
# perform rouge
r = Rouge155()
r.system_dir = "/home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_21_15_13_test"
r.model_dir = model_dir
r.model_filename_pattern = '#ID#.u[0-9]q[0-9].txt'
r.system_filename_pattern = '([a-z]+.[0-9]+).txt'

#options = "-n 4 -m -2 4 -u -c 95 -r 1000 -f A -p 0.5 -t 0 -a -x" #porter stemmer pose pb

#options =  '-a -d -e ' + r._data_dir + ' -m -n 2 -s -2 4 -u -x -f A'

options = "-e " + r._data_dir + " -n 4 -2 4 -u -c 95 -r 1000 -f A -p 0.5 -t 0 -a -x"

In [25]:
# Run the evaluation with the arguments built above; `output` is the textual
# ROUGE report and `output_dict` the same scores parsed into a dict.
output = r.convert_and_evaluate(rouge_args=options)
#print(output)
output_dict = r.output_to_dict(output)

2016-09-25 05:43:45,235 [MainThread  ] [INFO ]  Writing summaries.
INFO:global:Writing summaries.
2016-09-25 05:43:45,236 [MainThread  ] [INFO ]  Processing summaries. Saving system files to /tmp/tmpdq29DW/system and model files to /tmp/tmpdq29DW/model.
INFO:global:Processing summaries. Saving system files to /tmp/tmpdq29DW/system and model files to /tmp/tmpdq29DW/model.
2016-09-25 05:43:45,237 [MainThread  ] [INFO ]  Processing files in /home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_21_15_13_test.
INFO:global:Processing files in /home/ubuntu/summarization_query_oriented/data/TD-QFS/tdqfs_summary_system/2016_09_21_15_13_test.
2016-09-25 05:43:45,237 [MainThread  ] [INFO ]  Processing cancer.9.txt.
INFO:global:Processing cancer.9.txt.
2016-09-25 05:43:45,238 [MainThread  ] [INFO ]  Processing asthma.5.txt.
INFO:global:Processing asthma.5.txt.
2016-09-25 05:43:45,239 [MainThread  ] [INFO ]  Processing obese.6.txt.
INFO:global:Processing obese.6.txt.
20

In [28]:
# textual ROUGE report
print(output)

---------------------------------------------
1 ROUGE-1 Average_R: 0.29328 (95%-conf.int. 0.25166 - 0.33412)
1 ROUGE-1 Average_P: 0.53727 (95%-conf.int. 0.44934 - 0.62920)
1 ROUGE-1 Average_F: 0.30753 (95%-conf.int. 0.27508 - 0.33867)
---------------------------------------------
1 ROUGE-2 Average_R: 0.10244 (95%-conf.int. 0.09430 - 0.11142)
1 ROUGE-2 Average_P: 0.34029 (95%-conf.int. 0.22014 - 0.45677)
1 ROUGE-2 Average_F: 0.11803 (95%-conf.int. 0.11066 - 0.12572)
---------------------------------------------
1 ROUGE-3 Average_R: 0.07776 (95%-conf.int. 0.07225 - 0.08445)
1 ROUGE-3 Average_P: 0.30413 (95%-conf.int. 0.18552 - 0.41737)
1 ROUGE-3 Average_F: 0.09209 (95%-conf.int. 0.08382 - 0.10050)
---------------------------------------------
1 ROUGE-4 Average_R: 0.06825 (95%-conf.int. 0.06314 - 0.07454)
1 ROUGE-4 Average_P: 0.28168 (95%-conf.int. 0.16932 - 0.38925)
1 ROUGE-4 Average_F: 0.08118 (95%-conf.int. 0.07327 - 0.08926)
---------------------------------------------
1 ROUGE-SU4 Av

In [29]:
# ROUGE scores parsed into a dict
print(output_dict)

{u'rouge_su4_precision': 0.35971, u'rouge_3_f_score_cb': 0.08382, u'rouge_3_f_score_ce': 0.1005, u'rouge_1_precision': 0.53727, u'rouge_su4_f_score': 0.14374, u'rouge_3_recall': 0.07776, u'rouge_3_precision_ce': 0.41737, u'rouge_2_precision_ce': 0.45677, u'rouge_2_precision_cb': 0.22014, u'rouge_2_recall': 0.10244, u'rouge_3_precision_cb': 0.18552, u'rouge_4_f_score_ce': 0.08926, u'rouge_2_precision': 0.34029, u'rouge_1_recall_cb': 0.25166, u'rouge_1_recall_ce': 0.33412, u'rouge_4_f_score_cb': 0.07327, u'rouge_2_recall_cb': 0.0943, u'rouge_su4_f_score_ce': 0.15218, u'rouge_su4_f_score_cb': 0.13589, u'rouge_2_recall_ce': 0.11142, u'rouge_4_precision_cb': 0.16932, u'rouge_4_precision_ce': 0.38925, u'rouge_1_f_score': 0.30753, u'rouge_4_recall_ce': 0.07454, u'rouge_su4_precision_ce': 0.4672, u'rouge_1_recall': 0.29328, u'rouge_4_recall_cb': 0.06314, u'rouge_4_recall': 0.06825, u'rouge_3_recall_cb': 0.07225, u'rouge_3_recall_ce': 0.08445, u'rouge_4_precision': 0.28168, u'rouge_4_f_score': 