## Loading CNN data

This code is adapted from https://machinelearningmastery.com/prepare-news-articles-text-summarization/ to load data from the CNN dataset.

In [3]:
from os import listdir
import string

# load doc into memory
def load_doc(filename):
	"""Read a UTF-8 encoded text file and return its full contents.

	Args:
		filename: path of the file to read.

	Returns:
		The entire file content as a single string.

	Raises:
		OSError: if the file cannot be opened.
		UnicodeDecodeError: if the content is not valid UTF-8.
	"""
	# the context manager closes the handle even when read() raises
	with open(filename, encoding='utf-8') as file:
		return file.read()

# split a document into news story and highlights
def split_story(doc):
	"""Split a raw document into its story text and its list of highlights.

	Everything before the first '@highlight' marker is the story; the
	remainder is cut on every '@highlight' marker and each piece is
	stripped of surrounding white space.

	Args:
		doc: full document text.

	Returns:
		A (story, highlights) tuple where story is a string and highlights
		is a list of non-empty, stripped strings. When the document contains
		no '@highlight' marker the whole document is returned as the story
		and the highlights list is empty.
	"""
	marker = '@highlight'
	# find first highlight
	index = doc.find(marker)
	if index == -1:
		# find() returns -1 when the marker is absent; slicing with it would
		# chop the last character off the story and report it as a highlight
		return doc, []
	story = doc[:index]
	# strip first, then filter, so white-space-only fragments are dropped
	# instead of surviving as empty strings
	stripped = (h.strip() for h in doc[index:].split(marker))
	highlights = [h for h in stripped if h]
	return story, highlights

# load all stories in a directory
def load_stories(directory):
	"""Load every file in a directory as a story/highlights record.

	Args:
		directory: path of the directory holding the story files.

	Returns:
		A list of dicts with keys 'story' and 'highlights', one per
		directory entry, in the order listdir() yields the entries.
	"""
	def _as_record(name):
		# read the file, then separate the article body from its highlights
		story, highlights = split_story(load_doc(directory + '/' + name))
		return {'story': story, 'highlights': highlights}

	return [_as_record(name) for name in listdir(directory)]

# clean a list of lines
def clean_lines(lines):
	"""Normalise a list of text lines.

	For each line: drop everything up to and including a '(CNN)' source
	tag when the line contains '(CNN) -- ', split on white space, lower-case
	each token, remove string.punctuation characters, and keep only tokens
	for which str.isalpha() is true. Lines that end up empty are discarded.

	Args:
		lines: iterable of strings.

	Returns:
		A list of cleaned, non-empty strings with tokens joined by one space.
	"""
	strip_punct = str.maketrans('', '', string.punctuation)
	result = []
	for raw in lines:
		pos = raw.find('(CNN) -- ')
		if pos != -1:
			# only '(CNN)' is skipped here; the remaining ' -- ' becomes a
			# '--' token that is emptied by the punctuation pass and then
			# rejected by the isalpha() filter below
			raw = raw[pos + len('(CNN)'):]
		tokens = (tok.lower().translate(strip_punct) for tok in raw.split())
		text = ' '.join(tok for tok in tokens if tok.isalpha())
		if text:
			result.append(text)
	return result

# read every raw story file from the dataset directory
directory = 'data/cnn/stories/'
stories = load_stories(directory)
print('Loaded Stories %d' % len(stories))

# normalise each record in place: the story text is cut into lines before
# cleaning, the highlights are already a list of strings
# (`example` remains bound to the last story once the loop finishes)
for example in stories:
	example.update(
		story=clean_lines(example['story'].split('\n')),
		highlights=clean_lines(example['highlights']),
	)

Loaded Stories 92579


In [11]:
# examining one of the stories: the last one in the list
# (indexing `stories` directly instead of relying on the loop variable
# `example` that leaked out of the cleaning loop)
stories[-1]

{'highlights': ['new religious freedom is the underlying issue texas attorney general says',
  'the issue has become a political flashpoint in a presidential election year',
  'the lawsuit accuses the government of violating the first amendment',
  'the obama administration adjusted its policy in an attempt at compromise'],
 'story': ['seven states on thursday filed a lawsuit against the federal government requirement that religious employers offer health insurance coverage that includes contraceptives and other birth control services',
  'the issue has become a political flashpoint in a presidential election year and the lawsuit by attorneys general from nebraska michigan ohio oklahoma south carolina florida and texas was certain to keep it prominent',
  'private plaintiffs joining the seven states included pius x catholic high school catholic social services catholic mutual relief society of america and private citizens stacy molai and sister mary catherine',
  'the lawsuit named the

In [14]:
# first story in the list; load_stories appends records in listdir() order
stories[0]

{'highlights': ['shutdown of greek broadcaster ert sparks protests',
  'sophia ignatidou says shutdown was met with mixed feelings',
  'democracy is being questioned in the country once deemed its cradle',
  'ignatidou switching off state broadcaster will only exacerbate problems'],
 'story': ['the irony was not lost on most viewers of the greek prime ministers statements as he replied to the public outcry over the hasty shutdown of ert the countrys national broadcaster with the immediate layoff of over employees like a wolf in sheeps clothing antonis samaras was chastising a system that his own new democracy party had helped to ingrain in greek society and political life',
  'speaking at an award ceremony organized by the athens chamber of commerce and industry evea he labeled ert a symbol of corruption and waste and claimed it controlled the flow of information without any accountability constantly repeating righteous keywords such as transparency he tried to justify his decision to 

In [15]:
## Saving data to a pickle file for later use

# save to file
from pickle import dump
# the context manager flushes and closes the file as soon as the dump is
# done, instead of leaving the handle open until garbage collection
with open('cnn_dataset.pkl', 'wb') as f:
	dump(stories, f)