### Each file contains four parts separated by ‘\n\n’. They are
    1. url of the original article;
    2. sentences in the article and their labels (for sentence-based extractive summarization);
    3. extractable highlights (for word extraction-based abstractive summarization);
    4. named entity mapping.

### Sentence labels. There are three labels for the sentences: 1, 2 and 0. 

    - 1: sentence should extracted; 
    - 2: sentence might be extracted; 
    - 0: sentence shouldn't be extracted.

### Extractable highlights

The extractable highlights are created by examining if a word (or its morphological transformation) in the highlight appears in the article or a general purpose stop-word list, which together constitute the output space (i.e., the allowed vocabulary during summary generation).

In [1]:
import os
import urllib2
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [43]:
def buildEntityDictionary(input_path, filenames):
    # Swapping in the entity names
    entitykey, entityname = [], []
    for filename in filenames:
        f = open(os.path.join(input_path, filename))
        data = f.read()
        entities = data.split("\n\n")[3]

        for entity in entities.split("\n"):
            entitykey.append( entity.split(":")[0] )
            entityname.append( entity.split(":")[1] )

    edictionary = dict(zip(entitykey, entityname))
    return edictionary    

def cleandata(input_path, files, edict):
    f = open(os.path.join(input_path, files))
    data = f.read()

    url  = data.split("\n\n")[0]
    article = data.split("\n\n")[1]
    nuggets = data.split("\n\n")[2]
    entities = data.split("\n\n")[3]

    # Parsing the sentences and substituting
    sentencelist, sentencelabel = [], []
    for sentence in article.split("\n"):
        # Swapping in the entity names
        sentencelabel.append(int(sentence.split("\t\t\t")[1]))
        sentence = sentence.split("\t\t\t")[0]
        newsentence = ' '.join([edict[word] if word in edict else word for word in sentence.split(" ")])
        sentencelist.append(newsentence)

    # Collecting the sentences in a list
    df = pd.DataFrame(sentencelist, columns=['Sentence'])
    df['Label'] = sentencelabel

    # Extracting the nuggets
    highlight = []
    for nugget in nuggets.split("\n"):
        newnugget = ' '.join([edict[word] if word in edict else word for word in nugget.split(" ")])
        highlight.append(newnugget)

    nuggets = pd.DataFrame(highlight, columns=['Nugget'])
    # Getting the title/query
#     response = urllib2.urlopen(url)
    html = requests.get(url).text
#     html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    try:
        title = soup.findAll("title")[0].text
    except:
        title = 'MISSING'
    return title, nuggets, df, df.shape[0]

In [23]:
inputpath = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data/neuralsum/cnn/training/'
outputpath = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/0-output'
datafiles = os.listdir(inputpath)

In [4]:
edict = buildEntityDictionary(inputpath, datafiles)

In [61]:
filelist = os.listdir(outputpath)
finished = [int(x.replace("q",'').replace("_stream.csv", '')) for x in filelist if 'stream.csv' in x]
finishedval = max(finished)
# outdf = pd.DataFrame(columns=['query_id','query','streamSize','query_filename', 'outfile_name', 'nuggetfilename'])

for i, datafile in enumerate(datafiles):
    if i > finishedval:
        query, nuggets, stream, streamSize = cleandata(inputpath, datafile, edict)
        outfilename = 'q%i_stream.csv' % i
        nuggetfilename = 'q%i_nuggets.csv' % i
        tmpdf = pd.DataFrame( [i, query, streamSize, datafile, outfilename, nuggetfilename] ).T
        tmpdf.columns = ['query_id','query','streamSize','query_filename', 'outfile_name', 'nuggetfilename']
        outdf = pd.concat([outdf, tmpdf], axis=0)
        stream.to_csv(os.path.join(outputpath, outfilename), index=False)
        nuggets.to_csv(os.path.join(outputpath, nuggetfilename), index=False)

In [75]:
outdf.shape[0], outdf['streamSize'].sum()

(85198, 2541554)

In [76]:
outdf.head()

Unnamed: 0,query_id,query,streamSize,query_filename,outfile_name,nuggetfilename
0,0,Mistaken for your child's grandmother - CNN.com,61,000223f0c9a759b9cdd2f86ac8c2899747937263.summary,q0_stream.csv,q0_nuggets.csv
0,1,"Arraignment for 8 current, former Bell, Califo...",36,00030b9744b6d8c21d3a9fcb35460b06d3a71e2e.summary,q1_stream.csv,q1_nuggets.csv
0,2,U.S. and Russia can end the suffering - CNN.com,49,00036c4d44d9af8d34280086a887f3b7847cdae1.summary,q2_stream.csv,q2_nuggets.csv
0,3,U.N. Security Council to hold emergency meetin...,31,0003e88107432daa7a852f9fc9f26915122f218b.summary,q3_stream.csv,q3_nuggets.csv
0,4,Residents evacuated as firefighters battle bla...,30,00052899fc9012ce1798f7b70710304913befd28.summary,q4_stream.csv,q4_nuggets.csv


In [73]:
outdf.to_csv("/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/cnn_trainingqueries.csv", 
             index=False, encoding='utf-8')

In [92]:
finalinputdir = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/0-output/'
finaloutputdir = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/'

In [97]:
outdf['query_id'].head()

0    0
0    1
0    2
0    3
0    4
Name: query_id, dtype: object

In [93]:
finalinputfiles = os.listdir(finalinputdir)

In [106]:
streams = [x for x in finalinputfiles if '_stream.csv' in x]
nuggets = [x for x in finalinputfiles if '_nuggets.csv' in x]

In [100]:
finalpairs = [(q, s) for (q, s) in  finalinputfiles ]

ValueError: need more than 1 value to unpack

In [94]:
len(finalinputfiles)

167136