# Loading CNN Data and reducing large vocab to a smaller sized vocab

## Loading CNN data

This code is adapted from https://machinelearningmastery.com/prepare-news-articles-text-summarization/ for the purpose of loading data from CNN dataset.

You can download CNN data from https://cs.nyu.edu/~kcho/DMQA/

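Note the directory that holds the downloaded story files, for example
`directory = 'data/cnn/stories/'`
(the loading cell below sets `directory` to its own path; change it to match where you extracted the data).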

A later cell in this notebook (the one that writes `train.in`, `train.out`, etc.) converts the CNN dataset into a format that can be consumed by the NMT model at the URL below.

https://github.com/tensorflow/nmt

In [1]:
from os import listdir
import string

# load doc into memory
def load_doc(filename):
	"""Read a whole text file and return its contents as one string.

	Args:
		filename: path of the file to read; the content is decoded as UTF-8.

	Returns:
		The complete text of the file.
	"""
	# the context manager closes the handle even when read() raises
	with open(filename, encoding='utf-8') as file:
		return file.read()

# split a document into news story and highlights
def split_story(doc):
	"""Split a raw document into its story text and its list of highlights.

	Everything before the first '@highlight' marker is the story; the text
	between the markers is the highlights.

	Args:
		doc: full text of one document.

	Returns:
		A tuple (story, highlights): story is a string, highlights is a list
		of stripped, non-empty strings (empty when the document has no marker).
	"""
	marker = '@highlight'
	# find first highlight
	index = doc.find(marker)
	if index < 0:
		# no marker: slicing with -1 would cut the last character off the
		# story and report that character as a highlight
		return doc, []
	story = doc[:index]
	# strip white space around each highlight and drop the ones left empty
	stripped = (h.strip() for h in doc[index:].split(marker))
	highlights = [h for h in stripped if h]
	return story, highlights

# load all stories in a directory
def load_stories(directory):
	"""Load every file in a directory as a story with highlights.

	Args:
		directory: path of the directory holding one document per file.

	Returns:
		A list of dicts, one per file, each with the keys 'story' (text before
		the first highlight marker) and 'highlights' (list of highlight strings).
	"""
	stories = list()
	# listdir() returns names in arbitrary order; sort them so that the list,
	# and any split later derived from positions in it, is reproducible
	for name in sorted(listdir(directory)):
		filename = directory + '/' + name
		# load document and split it into story and highlights
		story, highlights = split_story(load_doc(filename))
		# store
		stories.append({'story':story, 'highlights':highlights})
	return stories

# clean a list of lines
def clean_lines(lines):
	"""Normalise a list of text lines to lower-case, letters-only words.

	For each line: drop everything up to and including a leading '(CNN)'
	source tag when the line contains '(CNN) -- ', split on white space,
	lower-case, delete ASCII punctuation, and keep only purely alphabetic
	tokens. Lines with no tokens left are dropped.

	Args:
		lines: iterable of strings.

	Returns:
		List of cleaned, non-empty strings with tokens separated by one space.
	"""
	# translation table that deletes every ASCII punctuation character
	strip_punct = str.maketrans('', '', string.punctuation)
	result = []
	for raw in lines:
		pos = raw.find('(CNN) -- ')
		if pos != -1:
			# only '(CNN)' is skipped here; the ' -- ' that remains is
			# removed by the punctuation and isalpha filters below
			raw = raw[pos + len('(CNN)'):]
		words = (tok.lower().translate(strip_punct) for tok in raw.split())
		# tokens containing digits (or nothing at all) fail isalpha()
		text = ' '.join(w for w in words if w.isalpha())
		if len(text) > 0:
			result.append(text)
	return result

# read every story file from disk
directory = '/tmp/mnt_fork/tldr_data/cnn/stories'
stories = load_stories(directory)
print('Loaded Stories %d' % len(stories))

# clean in place: 'story' goes from one string to a list of cleaned lines,
# 'highlights' from a list of raw strings to a list of cleaned strings
for example in stories:
	story_lines = example['story'].split('\n')
	example['story'] = clean_lines(story_lines)
	example['highlights'] = clean_lines(example['highlights'])

Loaded Stories 92579


In [4]:
## Saving data to a pickle file for later use

# save to file
from pickle import dump

# the context manager flushes and closes the file as soon as the dump is done
with open('cnn_dataset.pkl', 'wb') as pickle_out:
    dump(stories, pickle_out)

In [5]:
#### Loading CNN dataset

import pickle

# NOTE: pickle.load can run arbitrary code from the file; only load a pickle
# that this notebook (or another trusted source) produced.
# A freshly opened file is already at offset 0, so no seek is needed, and the
# context manager closes the handle once the data is loaded.
with open('cnn_dataset.pkl', 'rb') as pFile:
    stories = pickle.load(pFile)

print('Loaded Stories %d' % len(stories))

Loaded Stories 92579


In [None]:
# Create the train/dev/test splits from the loaded stories

In [111]:
def getData(stories,train_pct,dev_pct,test_pct,all_highlights=False):
    
    '''
    Flatten the stories into parallel input/output lists and split them,
    in their existing order, into train, dev and test portions.
    
    Input:
    
    stories  : list of stories. Each element is a dictionary with two keys:
               "highlights" is the list of highlight (summary) lines of the story
               "story" is the list of lines of the news story
    train_pct: number. Percentage of the examples to dedicate to training
    dev_pct  : number. Percentage of the examples to dedicate to the dev set
    test_pct : number. Not used in the computation: the test split is
               whatever remains after the train and dev portions
    all_highlights: when False (the default) only the first highlight is used
               as the summary; when True all highlights are joined with "."
    
    Output:
    
    train_in, train_out, dev_in, dev_out, test_in, test_out: 
                lists of strings. The *_in lists hold the stories and the
                *_out lists the matching summaries; every story is squashed
                to a single line by joining its lines with "."
    '''
    
    data_in = []
    data_out = []
    
    for story in stories:
        
        highlights = story["highlights"]
        if all_highlights:
            summary = ".".join(highlights)
        else:
            # a story can have no highlights at all; emit an empty summary
            # instead of raising IndexError so both lists stay the same length
            summary = highlights[0] if highlights else ""
        details = ".".join(story["story"])
        
        data_in.append(details)
        data_out.append(summary)

    # slice boundaries; each end index is exclusive and is the next begin
    total      = len(data_in)
    train_end  = int(total * abs(float(train_pct)/100))
    dev_end    = int(total * abs(float(train_pct+dev_pct)/100))
    
    train_in  = data_in[:train_end]
    train_out = data_out[:train_end]
    
    dev_in  = data_in[train_end:dev_end]
    dev_out = data_out[train_end:dev_end]
    
    # everything after the dev portion
    test_in  = data_in[dev_end:]
    test_out = data_out[dev_end:]
    
    return train_in, train_out, dev_in, dev_out, test_in, test_out

In [8]:
# 90% of the stories for training, 5% for dev; getData puts whatever remains into the test split
train_in, train_out, dev_in, dev_out, test_in, test_out = getData(stories,90,5,5)

In [9]:
# sanity check: print the size of every split (each *_in list should be as long as its *_out list)
print("train_in:%d, train_out:%d, dev_in:%d, dev_out:%d, test_in:%d, test_out:%d" % (len(train_in), len(train_out), len(dev_in), len(dev_out),len(test_in), len(test_out)))

train_in:83321, train_out:83321, dev_in:4629, dev_out:4629, test_in:4629, test_out:4629


In [10]:
# write data out to file

dest_dir = "/tmp/mnt_fork/tldr_data"

# file name -> lines to write; each file gets one example per line
split_files = {
    "train.in": train_in,
    "train.out": train_out,
    "dev.in": dev_in,
    "dev.out": dev_out,
    "test.in": test_in,
    "test.out": test_out,
}

for file_name, split_lines in split_files.items():
    # the stories were read as UTF-8, so write UTF-8 explicitly rather than
    # whatever the platform default encoding happens to be
    with open(dest_dir+"/"+file_name,"w",encoding="utf-8") as outfile:
        outfile.write('\n'.join(split_lines))

# Extract a reduced sized vocabulary from data



In [11]:
import numpy as np

In [112]:
import nltk
import re

# list of cleaned lines for every story
stories_dataset = [item["story"] for item in stories]
# first highlight of every story; "" when a story has no highlights, so that
# indexing an empty list cannot raise IndexError
highlights_dataset = [item["highlights"][0] if item["highlights"] else "" for item in stories]

# squash every story into a single string, its lines joined with "."
stories_dataset_final = [".".join(story_lines) for story_lines in stories_dataset]

## Create stories vocabulary

In [114]:
from collections import Counter
import nltk

# write these 20k into vocab
vocabSize = 20000

# count tokens story by story instead of first collecting every token of the
# corpus in one huge list
counts = Counter()

for story in stories_dataset_final:
    
    counts.update(nltk.word_tokenize(story))

# get most common words; this holds fewer than vocabSize entries when the
# corpus has fewer distinct tokens
common_words = counts.most_common(vocabSize)

# write to output file, one word per line; iterating over the list itself
# (rather than range(vocabSize)) cannot run past its end
with open("vocab_20k.in","w",encoding="utf-8") as vocabFile:
    for word, frequency in common_words:
        vocabFile.write(word)
        vocabFile.write("\n")

## Create highlights vocabulary

In [115]:
# number of highlight strings (highlights_dataset holds one entry per story)
print(len(highlights_dataset))

92579


In [122]:
from collections import Counter
import nltk

# write these 20k into vocab
vocabSize = 20000

# count tokens highlight by highlight instead of first collecting every token
# in one big list
counts = Counter()

for highlight in highlights_dataset:
    
    counts.update(nltk.word_tokenize(highlight))

# get most common words; this holds fewer than vocabSize entries when the
# highlights contain fewer distinct tokens
common_words = counts.most_common(vocabSize)

# write to output file, one word per line; iterating over the list itself
# (rather than range(vocabSize)) cannot run past its end
with open("vocab_20k.out","w",encoding="utf-8") as vocabFile:
    for word, frequency in common_words:
        vocabFile.write(word)
        vocabFile.write("\n")

## Reducing VOCAB size

This notebook looks at the CNN dataset vocab files available at the below link
http://cs.nyu.edu/~kcho/DMQA/

the vocab file is in the format

    word1 frequency1
    word2 frequency2
    word3 frequency3
    word4 frequency4
    ...

We keep only the words whose frequency is greater than `frequencyCutOff` and write them, one per line, to the output file.

In [None]:
# input vocab file, expected to hold one "word frequency" pair per line
inFile = "vocab"
# output file that receives the kept words
outFile = "vocab.out"
# a word is kept only when its frequency is strictly greater than this value
frequencyCutOff = 20

In [None]:
# words that pass the frequency cut-off, in file order
outVocab = []

with open(inFile) as readFile:
    for line in readFile:
        tokens = line.split()
        # only lines of the form "<word> <frequency>" are considered
        if len(tokens) != 2:
            continue
        if int(tokens[1]) > frequencyCutOff:
            outVocab.append(tokens[0])

# one kept word per line (no newline after the last word)
with open(outFile,"w") as writeFile:
    writeFile.write("\n".join(outVocab))