In [194]:
import glob
import os
import sys
import pickle
import struct
import pandas as pd
from collections import Counter
from nltk.tokenize import sent_tokenize
from tensorflow.core.example import example_pb2

sys.path.append('../src')
import data_io, params, SIF_embedding

pd.options.display.max_rows = 125

In [178]:
def load_embed(weightpara=1e-3, param=None, rmpc=0,
               wordfile = '/home/francisco/GitHub/SIF/data/glove.840B.300d.txt', 
               weightfile='/home/francisco/GitHub/SIF/auxiliary_data/enwiki_vocab_min200.txt'):
    '''
    wordfile:   : location of embedding data (e.g., glove embedings, can be downloaded from GloVe website)
    weightfile: : location of TF data for words, each line is a word and its frequency
    weightpara: : the parameter in the SIF weighting scheme, usually in range [3e-5, 3e-3]
    rmpc:       : number of principal components to remove in SIF weighting scheme
    '''
    # load word vectors
    (words, Weights) = data_io.getWordmap(wordfile)

    # load word weights
    word2weight = data_io.getWordWeight(weightfile, weightpara) # word2weight['str'] is the weight for the word 'str'
    weight4ind = data_io.getWeight(words, word2weight) # weight4ind[i] is the weight for the i-th word

    # set parameters
    param.rmpc = rmpc

    return Weights, words, word2weight, weight4ind

def return_sif(sentences, words, weight4ind, param, Weights):
    # x is the array of word indices, m is the binary mask indicating whether there is a word in that location
    x, m = data_io.sentences2idx(sentences, words)
    w = data_io.seq2weight(x, m, weight4ind) # get word weights
    # get SIF embedding
    embeddings = SIF_embedding.SIF_embedding(Weights, x, w, param) # embedding[i,:] is the embedding for sentence i
    return embeddings


def embed_sentences(inputpath, wordfile, weightfile, weightpara, param, rmpc, file_list):
    Weights, words, word2weight, weight4ind = load_embed(wordfile, weightfile, weightpara, param, rmpc)

    print('embeddings loaded...')
    for file_i in file_list:
        input_file = open(os.path.join(inputpath, file_i), 'rb')
        c = 0
        while input_file:
            try:
                clean_abstract, clean_article = return_bytes(input_file)
            except:
                input_file = None

            print('article cleaned...')
            embeddings = return_sif(clean_article, words, weight4ind, param, Weights)

            sdf = pd.DataFrame(clean_article, columns=['sentence'])
            sdf['clean_sentence'] = [' '.join([s for s in x if s.isalnum()]) for x in sdf['sentence'].str.split(" ")]
            sdf['summary'] = clean_abstract
            sdf.ix[1:, 'summary'] = ''

            embcols = ['emb_%i'%i for i in range(embeddings.shape[1])]
            emb = pd.DataFrame(embeddings, columns = embcols)

            sdf = pd.concat([sdf, emb], axis=1)
            sdf = sdf[['summary', 'sentence', 'clean_sentence'] + sdf.columns[3:].tolist()]
            newfile = file_i.replace(".bin", "").split("/")[-1]
            #sdf.to_csv("/home/francisco/GitHub/DQN-Event-Summarization/data/sif/%s_%i.csv" % (
            #         newfile, c
            #         )
            #    )
            sdf.to_csv("/home/francisco/GitHub/DQN-Event-Summarization/data/testsif/%s_%i.csv" % (
                     newfile, c
                     )
                )
            if (c % 100) == 0:
                 print("Data exported to %s_%i.csv" % (newfile, c))
            c+= 1
            
def embedCNNQuery(sdf, params, Weights, words, word2weight, weight4ind):
    # sdf = idf[idf['query_id'] == queryid].reset_index(drop=True)
    for i in range(sdf.shape[0]):
        tempsentence = sdf['sentence'][i]
        ab = sent_tokenize(tempsentence)
        clean_sentence = '. '.join([' '.join(s for s in x.split() if s.isalnum()) for x in ''.join(ab).replace("<s>","").split("</s>")]).strip()
        sembedding = return_sif([clean_sentence], words, weight4ind, params, Weights)
        if i == 0:
            tempsentence = sdf['query'][0]
            true_summary = sdf['true_summary'][0]
            ab = sent_tokenize(tempsentence)
            clean_sentence = '. '.join([' '.join(s for s in x.split() if s.isalnum()) for x in ''.join(ab).replace("<s>","").split("</s>")]).strip()
            qembedding = return_sif([clean_sentence], words, weight4ind, params, Weights)
            qedf = pd.DataFrame(qembedding)
            sedf = pd.DataFrame(sembedding)
        else:
            sedf = pd.concat([sedf, pd.DataFrame(sembedding)], axis=0)

    sedf.columns = ['embedding_%i' % i for i in range(sedf.shape[1])]
    qedf.columns =  ['embedding_%i' % i for i in range(qedf.shape[1])]
    
    return {'query_embeddings': qedf, 'sentence_embeddings': sedf, 'true_summary': true_summary}

def rougeScores(genSummary, refSummary):
    genTotal, refTotal, intersection = 0., 0., 0.
    for token in list(set(list(refSummary.keys()) + list(genSummary.keys()) )):
        intersection += min(refSummary[token], genSummary[token])
        refTotal += refSummary[token]
        genTotal += genSummary[token]

    recall = intersection / refTotal if refTotal > 0. else 0.
    prec   = intersection / genTotal if genTotal > 0. else 0.
    f1 = (2. * recall * prec) / (recall + prec) if (recall + prec) > 0. else 0.
    
    return recall, prec, f1

# Initializing data and SIF parameters

In [202]:
inputfile = "/home/francisco/GitHub/DQN-Event-Summarization/data/cnn_tokenized/cnn_data_corpus.csv"
inputdict = "/home/francisco/GitHub/DQN-Event-Summarization/data/cnn_tokenized/cnn_total_corpus_smry.csv"

qdf = pd.read_csv(inputfile)
qdict = pd.read_csv(inputdict)
corpus_dict = dict(zip(qdict['id'], qdict['token']))

df1 = pd.read_csv('/home/francisco/GitHub/DQN-Event-Summarization/data/1-output/cnn_trainingstreams.csv')
mainpath = '/home/francisco/GitHub/DQN-Event-Summarization/'

myparams = params.params()
wp = 1e-3
rp = 0
query_id = 0
outfile = '/home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_%i.pkl' % query_id

  interactivity=interactivity, compiler=compiler, result=result)


In [107]:
Weights, words, word2weight, weight4ind = load_embed(wp, mNot everyone was up in arms with the approach Haddish took in responding to the loaded question as many took to Twitter to voice their support for the comedian for her honest response.

yparams, rp)

In [179]:
tdf = df1[df1['query_id'] == query_id].reset_index(drop=True)

results = embedCNNQuery(tdf, myparams, Weights, words, word2weight, weight4ind)

In [205]:
simMatrix = pd.DataFrame(results['query_embeddings'].values.dot(results['sentence_embeddings'].values.T)).T

best_sentences = [i for i, x in enumerate(simMatrix.values) if x > 5]
sentences = [tdf['sentence'][x] for x in best_sentences]

In [206]:
tdf['query'].values[1:] = ''
tdf['true_summary'].values[1:] = ''
tdf = tdf[['query_id', 'sentence_idx', 'label', 'query', 'true_summary', 'sentence']]
final = {'embeddings': results, 'data': tdf}

In [207]:
curr_summary = ' '.join(sentences)
true_summary = tdf['true_summary'][0]
lead3 = ' '.join(tdf['sentence'][0:3])

ts_tokenized = Counter(true_summary.split(" "))
ps_tokenized = Counter(curr_summary.split(" "))
l3_tokenized = Counter(lead3.split(" "))

finalsummary = rougeScores(ts_tokenized, ps_tokenized)
baseline = rougeScores(ts_tokenized, l3_tokenized)

In [208]:
print("lead-3  recall = %.3f; precision = %.3f; f1-score = %.3f " % (baseline[0], baseline[1], baseline[2]))

print("learned recall = %.3f; precision = %.3f; f1-score = %.3f " % (finalsummary[0], finalsummary[1], finalsummary[2]))

lead-3  recall = 1.000; precision = 0.148; f1-score = 0.258 
learned recall = 0.604; precision = 0.116; f1-score = 0.194 


In [203]:
pickle.dump(final, open(outfile, 'wb'))

# Dumping a set of 10

In [213]:
for query_id in range(10):
    myparams = params.params()
    myparams.rmpc = 0
    outfile = '/home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_%i.pkl' % query_id
    tdf = df1[df1['query_id'] == query_id].reset_index(drop=True)
    tdf['query'].values[1:] = ''
    tdf['true_summary'].values[1:] = ''
    
    tdf = tdf[['query_id', 'sentence_idx', 'label', 'query', 'true_summary', 'sentence']]
    results = embedCNNQuery(tdf, myparams, Weights, words, word2weight, weight4ind)
    final = {'embeddings': results, 'data': tdf}    
    pickle.dump(final, open(outfile, 'wb'))
    print('query %i written to %s' % (query_id, outfile))

query 0 written to /home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_0.pkl
query 1 written to /home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_1.pkl
query 2 written to /home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_2.pkl
query 3 written to /home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_3.pkl
query 4 written to /home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_4.pkl
query 5 written to /home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_5.pkl
query 6 written to /home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_6.pkl
query 7 written to /home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_7.pkl
query 8 written to /home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_8.pkl
query 9 written to /home/francisco/GitHub/DQN-Event-Summarization/data/testsif/sifquerydict_9.pkl


# END