### Each file contains four parts separated by ‘\n\n’. They are
    1. url of the original article;
    2. sentences in the article and their labels (for sentence-based extractive summarization);
    3. extractable highlights (for word extraction-based abstractive summarization);
    4. named entity mapping.

### Sentence labels. There are three labels for the sentences: 1, 2 and 0. 

    - 1: sentence should extracted; 
    - 2: sentence might be extracted; 
    - 0: sentence shouldn't be extracted.

### Extractable highlights

The extractable highlights are created by examining if a word (or its morphological transformation) in the highlight appears in the article or a general purpose stop-word list, which together constitute the output space (i.e., the allowed vocabulary during summary generation).

In [1]:
import os
import urllib2
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
pth = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data/neuralsum/cnn/training/'

In [3]:
datafiles = os.listdir(pth)

In [4]:
len(datafiles), datafiles[0]

(83568, '000223f0c9a759b9cdd2f86ac8c2899747937263.summary')

In [5]:
def cleandata(input_path, files):
    f = open(os.path.join(pth, files))
    data = f.read()

    url  = data.split("\n\n")[0]
    article = data.split("\n\n")[1]
    nuggets = data.split("\n\n")[2]
    entities = data.split("\n\n")[3]

    # Swapping in the entity names
    entitykey, entityname = [], []
    for entity in entities.split("\n"):
        entitykey.append( entity.split(":")[0] )
        entityname.append( entity.split(":")[1] )

    edict = dict(zip(entitykey, entityname))

    # Parsing the sentences and substituting
    sentencelist, sentencelabel = [], []
    for sentence in article.split("\n"):
        newsentence = ' '.join([edict[word] if word in edict.keys() else word for word in sentence.split(" ")])
        sentencelist.append(newsentence.split("\t\t\t")[0])
        sentencelabel.append(int(newsentence.split("\t\t\t")[1]))

    # Collecting the sentences in a list
    df = pd.DataFrame(sentencelist, columns=['Sentence'])
    df['Label'] = sentencelabel

    # Extracting the nuggets
    highlight = []
    for nugget in nuggets.split("\n"):
        newnugget = ' '.join([edict[word] if word in edict.keys() else word for word in nugget.split(" ")])
        highlight.append(newnugget)
        
    # Getting the title/query
    response = urllib2.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.findAll("title")[0].text
    return title, highlight, df, df.shape[0]

In [None]:
outpath = './data2/0-output'

In [9]:
outdf = pd.DataFrame(columns=['query_id','query','streamSize','query_filename', 'outfile_name', 'nuggetfilename'])

for i, datafile in enumerate(datafiles):
    query, nuggets, stream, streamSize = cleandata(pth, datafile)
    outfilename = 'q%i_stream.csv' % i
    nuggetfilename = 'q%i_nuggets.csv' % i
    tmpdf = pd.DataFrame( [i, query, streamSize, datafile, outfilename, nuggetfilename] ).T
    tmpdf.columns = ['query_id','query','streamSize','query_filename', 'outfile_name', 'nuggetfilename']
    outdf = pd.concat([outdf, tmpdf], axis=0)
    stream.to_csv(outpath + )
    if i == 2:
        break

In [8]:
nuggets

['Kyrgyzstan violence has claimed many lives , forced many from their homes',
 'Scott Horton , Baktybek Abdrisaev say U.S. , Russia have done little in response',
 'they say the two nations have *responsibility* to help ease the suffering',
 'coordinated action could end the humanitarian crisis , they say']

In [10]:
outdf

Unnamed: 0,query_id,query,streamSize,query_filename,outfile_name,nuggetfilename
0,0,Mistaken for your child's grandmother - CNN.com,61,000223f0c9a759b9cdd2f86ac8c2899747937263.summary,q0_stream.csv,q0_nuggets.csv
0,1,"Arraignment for 8 current, former Bell, Califo...",36,00030b9744b6d8c21d3a9fcb35460b06d3a71e2e.summary,q1_stream.csv,q1_nuggets.csv
0,2,U.S. and Russia can end the suffering - CNN.com,49,00036c4d44d9af8d34280086a887f3b7847cdae1.summary,q2_stream.csv,q2_nuggets.csv
