### Each file contains four parts separated by ‘\n\n’. They are
    1. url of the original article;
    2. sentences in the article and their labels (for sentence-based extractive summarization);
    3. extractable highlights (for word extraction-based abstractive summarization);
    4. named entity mapping.

### Sentence labels. There are three labels for the sentences: 1, 2 and 0. 

    - 1: sentence should be extracted; 
    - 2: sentence might be extracted; 
    - 0: sentence shouldn't be extracted.

### Extractable highlights

The extractable highlights are created by examining if a word (or its morphological transformation) in the highlight appears in the article or a general purpose stop-word list, which together constitute the output space (i.e., the allowed vocabulary during summary generation).

In [None]:
import os
# import urllib2
from urllib.request import urlopen
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
def buildEntityDictionary(input_path, filenames):
    """Collect the entity-key -> entity-name mapping from a set of data files.

    Each file is split on blank lines and its fourth part is read as one
    ``key:name`` pair per line.

    Parameters
    ----------
    input_path : str
        Directory that contains the data files.
    filenames : iterable of str
        File names, relative to ``input_path``.

    Returns
    -------
    dict
        Maps each entity key to its name.  When a key occurs more than once
        (within a file or across files), the last occurrence wins.

    Raises
    ------
    IndexError
        If a file has fewer than four blank-line separated parts.
    """
    edictionary = {}
    for filename in filenames:
        # Context manager so the handle is closed even when parsing fails
        with open(os.path.join(input_path, filename)) as f:
            data = f.read()
        entities = data.split("\n\n")[3]

        for entity in entities.split("\n"):
            # Cut at the first colon only, so names that contain ':' stay whole
            key, separator, name = entity.partition(":")
            if not separator:
                # Blank or malformed line (e.g. after a trailing newline)
                continue
            edictionary[key] = name

    return edictionary

def cleandata(input_path, files, edict):
    """Parse one data file into a title, a nugget table and a sentence table.

    The file is split on blank lines: the first part is used as the article
    URL, the second as the labelled sentences (text and integer label
    separated by three tabs) and the third as the highlights.  Every
    space-separated token that is a key of ``edict`` is replaced by its
    mapped value.  The URL is then fetched to read the page ``<title>``.

    Parameters
    ----------
    input_path : str
        Directory that contains the data file.
    files : str
        Name of the single data file to parse, relative to ``input_path``.
    edict : dict
        Token -> replacement mapping (see ``buildEntityDictionary``).

    Returns
    -------
    tuple
        ``(title, nuggets, df, n_sentences)``: the page title (``'MISSING'``
        when the page has no ``<title>`` tag), a DataFrame with a ``Nugget``
        column, a DataFrame with ``Sentence`` and ``Label`` columns, and the
        number of rows of the latter.

    Notes
    -----
    Errors raised while fetching the URL are not caught here and propagate
    to the caller.
    """
    with open(os.path.join(input_path, files)) as f:
        data = f.read()

    # Split once instead of re-splitting the whole file for every part
    parts = data.split("\n\n")
    url, article, nuggets = parts[0], parts[1], parts[2]

    def swap_entities(text):
        # Swapping in the entity names, token by token
        return ' '.join([edict.get(word, word) for word in text.split(" ")])

    # Parsing the sentences and substituting
    sentencelist, sentencelabel = [], []
    for sentence in article.split("\n"):
        fields = sentence.split("\t\t\t")
        sentencelabel.append(int(fields[1]))
        sentencelist.append(swap_entities(fields[0]))

    # Collecting the sentences in a data frame
    df = pd.DataFrame(sentencelist, columns=['Sentence'])
    df['Label'] = sentencelabel

    # Extracting the nuggets
    highlight = [swap_entities(nugget) for nugget in nuggets.split("\n")]
    nuggets = pd.DataFrame(highlight, columns=['Nugget'])

    # Getting the title/query; the response is closed as soon as it is read
    with urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    try:
        title = soup.findAll("title")[0].text
    except IndexError:
        # No <title> tag in the page
        title = 'MISSING'
    return title, nuggets, df, df.shape[0]

In [7]:
# Source directory of the raw training files and target directory for the exported CSVs.
# inputpath and datafiles are read by the cells below, so they have to be defined here.
inputpath = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data/neuralsum/cnn/training/'
outputpath = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/0-output'
datafiles = os.listdir(inputpath)

In [4]:
# Build the entity key -> name lookup from the entity section of every data file
edict = buildEntityDictionary(inputpath, datafiles)

In [61]:
# Resume support: find the highest query index that already has a stream CSV
# in the output directory and only process the files that come after it.
filelist = os.listdir(outputpath)
finished = [int(x.replace("q",'').replace("_stream.csv", '')) for x in filelist if 'stream.csv' in x]
# default=-1 so that an output directory without stream files starts at index 0
# instead of failing on max() of an empty list
finishedval = max(finished, default=-1)

# outdf collects one summary row per exported query.  Keep the frame left behind
# by an earlier (interrupted) run of this cell; create it only when it is missing.
if 'outdf' not in globals():
    outdf = pd.DataFrame(columns=['query_id','query','streamSize','query_filename', 'outfile_name', 'nuggetfilename'])

for i, datafile in enumerate(datafiles):
    if i <= finishedval:
        # Already exported by a previous run
        continue
    query, nuggets, stream, streamSize = cleandata(inputpath, datafile, edict)
    outfilename = 'q%i_stream.csv' % i
    nuggetfilename = 'q%i_nuggets.csv' % i
    # One-row frame describing this query
    tmpdf = pd.DataFrame( [i, query, streamSize, datafile, outfilename, nuggetfilename] ).T
    tmpdf.columns = ['query_id','query','streamSize','query_filename', 'outfile_name', 'nuggetfilename']
    outdf = pd.concat([outdf, tmpdf], axis=0)
    stream.to_csv(os.path.join(outputpath, outfilename), index=False)
    nuggets.to_csv(os.path.join(outputpath, nuggetfilename), index=False)

In [73]:
# Write the accumulated query summary (outdf) to CSV, without the index column
outdf.to_csv("/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/cnn_trainingqueries.csv", 
             index=False, encoding='utf-8')

In [93]:
# Input directory (data2/0-output) and output directory (data2/1-output)
finalinputdir = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/0-output/'
finaloutputdir = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/'

# Names of all entries in the input directory
finalinputfiles = os.listdir(finalinputdir)

# Validating the summaries

In [194]:
# (query id, file name) pairs, with the id parsed from the "q<id>_" file-name prefix
streams = [
    (int(fname.split("_")[0].replace("q", '')), fname)
    for fname in finalinputfiles
    if '_stream.csv' in fname
]
nuggets = [
    (int(fname.split("_")[0].replace("q", '')), fname)
    for fname in finalinputfiles
    if '_nuggets.csv' in fname
]

streamsummary = pd.DataFrame(streams, columns=['query_id', 'streamname'])
nuggetsummary = pd.DataFrame(nuggets, columns=['query_id', 'nuggetname'])

# Keep only the query ids that have both a stream file and a nugget file
fulldf = streamsummary.merge(nuggetsummary, how='inner', on='query_id')

# Query ids that still occur more than once after exact duplicate rows are dropped
dupes = outdf.drop_duplicates()['query_id'].value_counts().reset_index()
dupes.columns = ['query_id', 'count']
dupes = dupes.loc[dupes['count'] > 1]

# True for the rows of outdf whose query id is not among those repeated ids
dedupefilter = ~outdf['query_id'].isin(dupes['query_id'])

In [193]:
# Exporting the rows of outdf selected by dedupefilter, without the index column
outdf[dedupefilter].to_csv(
    "/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/cnn_trainingqueries.csv", 
             index=False, encoding='utf-8')

In [198]:
# Row count of the full query table, and total streamSize over the rows kept by dedupefilter
outdf.shape[0], outdf[dedupefilter]['streamSize'].sum()

(85198, 2541426)

In [282]:
# Total streamSize of the rows kept by dedupefilter divided by the row count of the full table.
# NOTE(review): the numerator is filtered but the denominator is not -- confirm this is the intended average.
outdf[dedupefilter]['streamSize'].sum() / float(outdf.shape[0])

29.829643888354187

# Reading data back in 

In [1]:
import os
import urllib2
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Input directory (data2/0-output) and output directory (data2/1-output)
finalinputdir = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/0-output/'
finaloutputdir = '/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/'

# Names of all entries in the input directory
finalinputfiles = os.listdir(finalinputdir)

In [3]:
# Reload the query summary table from the exported CSV
outdf = pd.read_csv("/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/cnn_trainingqueries.csv")

In [4]:
def extract_data(inputdir, odf_row):
    """Load one exported stream CSV and attach its query-level columns.

    Parameters
    ----------
    inputdir : str
        Directory that holds the stream CSV; a trailing separator is optional.
    odf_row : mapping
        Must provide the keys ``'outfile_name'``, ``'query'`` and
        ``'query_id'`` (for example a row yielded by ``DataFrame.iterrows``).

    Returns
    -------
    pandas.DataFrame
        The rows of the CSV plus four columns: ``query`` (with the
        ``" - CNN.com"`` suffix removed), ``query_id``, ``true_summary`` (all
        sentences whose ``Label`` equals 1, joined by single spaces and
        repeated on every row) and ``sentence_idx`` (the row index).
    """
    # os.path.join works whether or not inputdir ends with a separator
    cleanedstreams = pd.read_csv(os.path.join(inputdir, odf_row['outfile_name']))

    query = odf_row['query']
    # Only strings can be trimmed; other values (e.g. NaN from an empty CSV
    # field) are passed through unchanged instead of raising AttributeError
    if isinstance(query, str):
        query = query.replace(" - CNN.com", "")
    cleanedstreams['query'] = query
    cleanedstreams['query_id'] = odf_row['query_id']
    cleanedstreams['true_summary'] = ' '.join(cleanedstreams[cleanedstreams['Label']==1].Sentence)
    cleanedstreams['sentence_idx'] = cleanedstreams.index
    return cleanedstreams

In [5]:
from joblib import Parallel, delayed

# Run extract_data once per row of outdf through joblib (n_jobs=-1) and
# collect the resulting data frames in a list
streams = Parallel(n_jobs=-1)(
    delayed(extract_data)(finalinputdir, record)
    for _, record in outdf.iterrows()
)

In [6]:
# Stack the per-query frames and keep the columns in a fixed order
cleanedstreams = pd.concat(streams).loc[
    :, ['query_id', 'sentence_idx', 'Label', 'query', 'Sentence', 'true_summary']
]
# Lower-case every column name
cleanedstreams.columns = cleanedstreams.columns.str.lower()

In [7]:
# Preview the first rows of the combined stream table
cleanedstreams.head()

Unnamed: 0,query_id,sentence_idx,label,query,sentence,true_summary
0,0,0,1,Mistaken for your child's grandmother,"-- i 'm 45 , and my son is 7","-- i 'm 45 , and my son is 7 once in a while ,..."
1,0,1,1,Mistaken for your child's grandmother,"once in a while , i still get carded when i tr...","-- i 'm 45 , and my son is 7 once in a while ,..."
2,0,2,1,Mistaken for your child's grandmother,i was 38 when Dominican Republic Emergency Ope...,"-- i 'm 45 , and my son is 7 once in a while ,..."
3,0,3,1,Mistaken for your child's grandmother,both incidents took place after i moved from A...,"-- i 'm 45 , and my son is 7 once in a while ,..."
4,0,4,2,Mistaken for your child's grandmother,i thought about the incidents when i read a re...,"-- i 'm 45 , and my son is 7 once in a while ,..."


In [8]:
# Write the combined stream table to CSV, without the index column
cleanedstreams.to_csv(
    "/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/cnn_trainingstreams.csv", 
             index=False, encoding='utf-8'
)

In [10]:
# Preview the first rows of the combined stream table
cleanedstreams.head()

Unnamed: 0,query_id,sentence_idx,label,query,sentence,true_summary
0,0,0,1,Mistaken for your child's grandmother,"-- i 'm 45 , and my son is 7","-- i 'm 45 , and my son is 7 once in a while ,..."
1,0,1,1,Mistaken for your child's grandmother,"once in a while , i still get carded when i tr...","-- i 'm 45 , and my son is 7 once in a while ,..."
2,0,2,1,Mistaken for your child's grandmother,i was 38 when Dominican Republic Emergency Ope...,"-- i 'm 45 , and my son is 7 once in a while ,..."
3,0,3,1,Mistaken for your child's grandmother,both incidents took place after i moved from A...,"-- i 'm 45 , and my son is 7 once in a while ,..."
4,0,4,2,Mistaken for your child's grandmother,i thought about the incidents when i read a re...,"-- i 'm 45 , and my son is 7 once in a while ,..."


# Tokenize data

In [1]:
import os
import re
import sys
import pickle
import csv
import gzip
import numpy as np
from itertools import chain
from bs4 import BeautifulSoup
import pandas as pd
from gensim import corpora
from gensim.parsing.preprocessing import STOPWORDS
from collections import defaultdict

Couldn't import dot_parser, loading of dot files will not be possible.


Using gpu device 0: GeForce GT 750M (CNMeM is disabled, cuDNN 5004)


In [2]:
# Load the combined stream table exported to cnn_trainingstreams.csv
df = pd.read_csv("/Users/franciscojavierarceo/GitHub/DeepNLPQLearning/data2/1-output/cnn_trainingstreams.csv")

In [4]:
# Preview the first rows of the loaded table
df.head()

Unnamed: 0,query_id,sentence_idx,label,query,sentence,true_summary
0,0,0,1,Mistaken for your child's grandmother,"-- i 'm 45 , and my son is 7","-- i 'm 45 , and my son is 7 once in a while ,..."
1,0,1,1,Mistaken for your child's grandmother,"once in a while , i still get carded when i tr...","-- i 'm 45 , and my son is 7 once in a while ,..."
2,0,2,1,Mistaken for your child's grandmother,i was 38 when Dominican Republic Emergency Ope...,"-- i 'm 45 , and my son is 7 once in a while ,..."
3,0,3,1,Mistaken for your child's grandmother,both incidents took place after i moved from A...,"-- i 'm 45 , and my son is 7 once in a while ,..."
4,0,4,2,Mistaken for your child's grandmother,i thought about the incidents when i read a re...,"-- i 'm 45 , and my son is 7 once in a while ,..."


In [6]:
# Accumulators used by the tokenisation code further down
all_tokens = []
ntexts, qtexts = [], []
frequency = defaultdict(int)

# Collapse every run of non-alphanumeric characters into one space, then trim
# and lower-case.  regex=True is passed explicitly: since pandas 2.0 the default
# of Series.str.replace is a literal match, which would leave the text unchanged.
df['true_summary'] = df['true_summary'].str.replace(r'[^A-Za-z0-9]+', ' ', regex=True).str.strip().str.lower()

In [None]:
def str_split(string):
    """Return the pieces of ``string`` obtained by cutting at every single space."""
    pieces = string.split(" ")
    return pieces
from joblib import Parallel, delayed

# Split the true_summary of every row of df into tokens through joblib (n_jobs=-1)
summary_texts = Parallel(n_jobs=-1)(
    delayed(str_split)(record['true_summary'])
    for _, record in df.iterrows()
)

In [None]:
all_tokens = []
ntexts, qtexts = [], []
frequency = defaultdict(int)

# Collapse runs of non-alphanumeric characters into one space, trim, lower-case.
# regex=True is explicit because pandas 2.0 made literal matching the default.
df['true_summary'] = df['true_summary'].str.replace(r'[^A-Za-z0-9]+', ' ', regex=True).str.strip().str.lower()
texts = [t.split(" ") for t in df['true_summary'] ]

# NOTE(review): infilename, qfilenames, loadQuery and inputdir are not defined
# anywhere in the visible notebook -- presumably they come from the script this
# cell was adapted from; confirm before running.
if 'nuggets' in infilename:
    df = pd.read_csv(infilename)
    df['nugget_text'] = df['nugget_text'].str.replace(r'[^A-Za-z0-9]+', ' ', regex=True).str.strip().str.lower()
    texts = [t.split(" ") for t in df['nugget_text'] ]
    ntexts.append(texts)

if infilename in qfilenames:
    texts = loadQuery(infilename)
    qtexts.append(texts)

# Raw occurrence count of every token in the current texts
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [ [token for token in text] for text in texts]
# Collecting all the list of tokens
all_tokens.append(texts)

# Flatten one nesting level of each accumulator
texts  = sum(all_tokens, [])
qtexts = sum(qtexts, [])
ntexts = sum(ntexts, [])

# Getting the dictionary with token info
dictionary = corpora.Dictionary(texts)

# Mapping to numeric list -- adding plus one to tokens
dictionary.token2id = {k: v+1 for k,v in dictionary.token2id.items()}
word2idx = dictionary.token2id

dictionary.id2token = {v:k for k,v in dictionary.token2id.items()}
idx2word = dictionary.id2token

# Exporting the dictionaries
print("Exporting word to index and dictionary to word indices")
# 'wb' instead of 'ab+': appending would stack a new pickle behind the old one on
# every run, and a single pickle.load() would keep returning the first, stale object
with open(os.path.join(inputdir,'0-output/LSTMDQN_Dic_token2id.pkl'), 'wb') as output:
    pickle.dump(word2idx, output)

with open(os.path.join(inputdir,'0-output/LSTMDQN_Dic_id2token.pkl'), 'wb') as output:
    pickle.dump(idx2word, output)

# Merging the dictionaries to a pandas data frame with summary info
odf0 = pd.DataFrame.from_dict(dictionary.dfs, orient='index').reset_index()
odf1 = pd.DataFrame.from_dict(word2idx, orient='index').reset_index()

odf0.columns = ['id', 'frequency']
odf1.columns = ['token', 'id']
# dictionary.dfs is still keyed by the ids assigned at construction, while word2idx
# was shifted by one above; apply the same shift so each token meets its own count
odf0['id'] = odf0['id'] + 1
# Merge by token id
odf = pd.merge(left=odf0, right=odf1, on='id')
odf = odf[['id','token', 'frequency']]
# Exporting data
odf.to_csv(os.path.join(inputdir, '0-output/total_corpus_smry.csv'), index=False)

In [5]:
# Preview the first rows of the combined stream table
cleanedstreams.head()

Unnamed: 0,query_id,sentence_idx,label,query,sentence,true_summary
0,0,0,1,Mistaken for your child's grandmother,"-- i 'm 45 , and my son is 7","-- i 'm 45 , and my son is 7 once in a while ,..."
1,0,1,1,Mistaken for your child's grandmother,"once in a while , i still get carded when i tr...","-- i 'm 45 , and my son is 7 once in a while ,..."
2,0,2,1,Mistaken for your child's grandmother,i was 38 when Dominican Republic Emergency Ope...,"-- i 'm 45 , and my son is 7 once in a while ,..."
3,0,3,1,Mistaken for your child's grandmother,both incidents took place after i moved from A...,"-- i 'm 45 , and my son is 7 once in a while ,..."
4,0,4,2,Mistaken for your child's grandmother,i thought about the incidents when i read a re...,"-- i 'm 45 , and my son is 7 once in a while ,..."


In [None]:
from joblib import Parallel, delayed

# Run extract_data once per row of outdf through joblib (n_jobs=-1) and
# collect the resulting data frames in a list
streams = Parallel(n_jobs=-1)(
    delayed(extract_data)(finalinputdir, record)
    for _, record in outdf.iterrows()
)