### Each file contains four parts separated by ‘\n\n’. They are:
    1. url of the original article;
    2. sentences in the article and their labels (for sentence-based extractive summarization);
    3. extractable highlights (for word extraction-based abstractive summarization);
    4. named entity mapping.

### Sentence labels. There are three labels for the sentences: 1, 2 and 0. 

    - 1: sentence should be extracted; 
    - 2: sentence might be extracted; 
    - 0: sentence shouldn't be extracted.

### Extractable highlights

The extractable highlights are created by examining if a word (or its morphological transformation) in the highlight appears in the article or a general purpose stop-word list, which together constitute the output space (i.e., the allowed vocabulary during summary generation).

In [1]:
import os
import urllib2
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [43]:
def buildEntityDictionary(input_path, filenames):
    """Build a key-to-name lookup from the entity section of each file.

    Each file is read whole and split on blank lines (a double newline); the
    fourth part is treated as the named entity mapping, one ``key:name`` pair
    per line.

    Args:
        input_path: Directory containing the data files.
        filenames: Names of the files inside ``input_path`` to scan.

    Returns:
        dict mapping each entity key to its entity name.  When the same key
        occurs more than once, the last occurrence wins.

    Raises:
        IndexError: If a file has fewer than four blank-line-separated parts.
    """
    edictionary = {}
    for filename in filenames:
        # Context manager so the handle is closed even if parsing fails.
        with open(os.path.join(input_path, filename)) as f:
            data = f.read()
        entities = data.split("\n\n")[3]

        for entity in entities.split("\n"):
            # Split on the first colon only, so a name that itself contains
            # ":" is kept whole instead of being cut at its first colon.
            key, sep, name = entity.partition(":")
            if not sep:
                # Blank or colon-less line (e.g. after a trailing newline).
                continue
            edictionary[key] = name

    return edictionary

def _swap_entities(text, edict):
    """Replace each space-separated token of ``text`` that is a key of ``edict``."""
    return ' '.join([edict.get(word, word) for word in text.split(" ")])


def cleandata(input_path, files, edict):
    """Parse one data file into a title, a nugget table and a sentence table.

    The file is split on blank lines (a double newline).  The first three
    parts are used: the article url, the labelled sentences and the
    highlights.  The entity section of the file is not read here; entity
    names come from ``edict``.

    Args:
        input_path: Directory containing the file.
        files: Name of the single file to process.
        edict: Mapping from entity key to entity name; tokens equal to a key
            are replaced by the mapped name in sentences and highlights.

    Returns:
        A 4-tuple ``(title, nuggets, df, n)`` where ``title`` is the text of
        the first ``<title>`` tag of the page fetched from the url (or
        ``'MISSING'`` when the page has none), ``nuggets`` is a DataFrame with
        one ``Nugget`` column, ``df`` is a DataFrame with ``Sentence`` and
        ``Label`` columns, and ``n`` is the number of rows of ``df``.

    Raises:
        IndexError: If the file has fewer than three parts, or a sentence line
            has no triple-tab separated label.
        ValueError: If a sentence label is not an integer.

    Any exception raised by ``requests.get`` propagates to the caller.
    """
    with open(os.path.join(input_path, files)) as f:
        data = f.read()

    # Split once instead of re-splitting the whole file for every section.
    parts = data.split("\n\n")
    url, article, nuggets = parts[0], parts[1], parts[2]

    # Each article line is "<sentence>\t\t\t<label>".
    sentencelist, sentencelabel = [], []
    for sentence in article.split("\n"):
        fields = sentence.split("\t\t\t")
        sentencelabel.append(int(fields[1]))
        sentencelist.append(_swap_entities(fields[0], edict))

    # Collecting the sentences and their labels in one table
    df = pd.DataFrame(sentencelist, columns=['Sentence'])
    df['Label'] = sentencelabel

    # Extracting the nuggets, one highlight per line
    highlight = [_swap_entities(nugget, edict) for nugget in nuggets.split("\n")]
    nuggets = pd.DataFrame(highlight, columns=['Nugget'])

    # Getting the title/query from the live page.
    # NOTE(review): no timeout is passed, so a stalled server blocks this
    # call indefinitely -- consider adding one.
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    try:
        title = soup.findAll("title")[0].text
    except (IndexError, AttributeError):
        # The page has no usable <title> element.
        title = 'MISSING'
    return title, nuggets, df, df.shape[0]

In [23]:
# Directory holding the raw training files, and the directory that receives
# the per-article stream/nugget CSVs.
inputpath = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data/neuralsum/cnn/training/'
outputpath = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/0-output'
# os.listdir() returns names in arbitrary order.  The position of a file in
# this list is used as its query id and to resume an interrupted run, so the
# listing is sorted to make that position the same on every run.
datafiles = sorted(os.listdir(inputpath))

In [4]:
# Entity lookup gathered from the entity section of every training file.
edict = buildEntityDictionary(input_path=inputpath, filenames=datafiles)

In [61]:
# Resume support: an article counts as done when its q<N>_stream.csv already
# exists in outputpath; only files positioned after the highest such N are
# processed.
filelist = os.listdir(outputpath)
finished = [int(x.replace("q",'').replace("_stream.csv", '')) for x in filelist if 'stream.csv' in x]
# -1 on a fresh output directory, so that the first file (i == 0) is included
# instead of max() failing on an empty list.
finishedval = max(finished) if finished else -1

querycolumns = ['query_id','query','streamSize','query_filename', 'outfile_name', 'nuggetfilename']
# Create the accumulator only when the session does not already hold one, so
# rows collected earlier in the same session are kept when this cell is re-run.
# NOTE(review): rows collected in a previous session are not reloaded here.
if 'outdf' not in globals():
    outdf = pd.DataFrame(columns=querycolumns)

for i, datafile in enumerate(datafiles):
    if i <= finishedval:
        continue
    query, nuggets, stream, streamSize = cleandata(inputpath, datafile, edict)
    outfilename = 'q%i_stream.csv' % i
    nuggetfilename = 'q%i_nuggets.csv' % i
    # Write the stream file last: its presence is what marks article i as
    # finished on the next run, so the nugget file must already exist by then.
    nuggets.to_csv(os.path.join(outputpath, nuggetfilename), index=False)
    stream.to_csv(os.path.join(outputpath, outfilename), index=False)
    # Record the row only after both files were written successfully.
    tmpdf = pd.DataFrame( [i, query, streamSize, datafile, outfilename, nuggetfilename] ).T
    tmpdf.columns = querycolumns
    outdf = pd.concat([outdf, tmpdf], axis=0)

In [73]:
# Save the per-article query table collected by the processing loop.
queriescsv = "/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/cnn_trainingqueries.csv"
outdf.to_csv(queriescsv, index=False, encoding='utf-8')

In [93]:
# Destination for the combined CSV exports.
finaloutputdir = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/'

# Source directory of the per-article stream/nugget CSVs, and its listing.
finalinputdir = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/0-output/'
finalinputfiles = os.listdir(finalinputdir)

# Validating the summaries

In [194]:
# Collect (id, filename) pairs for the stream and nugget files; the id is the
# number between the leading "q" and the first underscore of the file name.
streams, nuggets = [], []
for fname in finalinputfiles:
    if '_stream.csv' in fname:
        streams.append((int(fname.split("_")[0].replace("q",'')), fname))
    if '_nuggets.csv' in fname:
        nuggets.append((int(fname.split("_")[0].replace("q",'')), fname))

streamsummary = pd.DataFrame(streams, columns=['query_id','streamname'])
nuggetsummary = pd.DataFrame(nuggets, columns=['query_id','nuggetname'])

# Ids that have both a stream file and a nugget file.
fulldf = pd.merge(streamsummary, nuggetsummary, how='inner', on='query_id')

# Count how often each query_id remains once exact duplicate rows are dropped;
# ids that still occur more than once are flagged.
idcounts = outdf.drop_duplicates()['query_id'].value_counts()
dupes = idcounts.reset_index()
dupes.columns = ['query_id', 'count']
dupes = dupes[dupes['count'] > 1]

# Boolean mask over outdf that excludes every row carrying a flagged id.
dedupefilter = ~outdf['query_id'].isin(dupes['query_id'])

In [193]:
# Exporting the query table restricted to rows that pass the dedupe filter
dedupedqueries = outdf[dedupefilter]
dedupedqueries.to_csv(
    "/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/cnn_trainingqueries.csv",
    index=False,
    encoding='utf-8',
)

In [198]:
# (rows in the full query table, total streamSize over the rows kept by the filter)
len(outdf), outdf[dedupefilter]['streamSize'].sum()

(85198, 2541426)

In [277]:
# Load the stream CSV of every query kept by the dedupe filter and tag each of
# its rows with the query text (minus the " - CNN.com" suffix) and the query id.
streamframes = []
for index, row in outdf[dedupefilter].iterrows():
    tmp = pd.read_csv(os.path.join(finalinputdir, row['outfile_name']))
    tmp['query'] = row['query'].replace(" - CNN.com", "")
    tmp['query_id'] = row['query_id']
    streamframes.append(tmp)

# Concatenate once at the end: concatenating inside the loop copies the
# growing table on every iteration.
if streamframes:
    cleanedstreams = pd.concat(streamframes)
else:
    # Nothing passed the filter; keep the name defined with the usual columns.
    cleanedstreams = pd.DataFrame(columns=['Sentence', 'Label', 'query', 'query_id'])

In [278]:
# Preview the first five rows of the combined stream table.
cleanedstreams.head(n=5)

Unnamed: 0,Sentence,Label,query,query_id
0,"-- i 'm 45 , and my son is 7",1,Mistaken for your child's grandmother,0
1,"once in a while , i still get carded when i tr...",1,Mistaken for your child's grandmother,0
2,i was 38 when Dominican Republic Emergency Ope...,1,Mistaken for your child's grandmother,0
3,both incidents took place after i moved from A...,1,Mistaken for your child's grandmother,0
4,i thought about the incidents when i read a re...,2,Mistaken for your child's grandmother,0


In [280]:
# (number of rows, number of columns) of the combined stream table.
len(cleanedstreams.index), len(cleanedstreams.columns)

(2541426, 4)

In [281]:
# Save the combined stream table as a single CSV.
streamscsv = "/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/cnn_trainingstreams.csv"
cleanedstreams.to_csv(streamscsv, index=False, encoding='utf-8')