<a href="https://colab.research.google.com/github/clarencechen/cnn-dailymail-summary/blob/master/Summarizing_CNN_Dailymail_Articles_with_Cloud_TPUs_and_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Copyright 2018 The TensorFlow Authors.

In [0]:
#@title Licensed under the Apache License, Version 2.0 (the "License"); { display-mode: "form" }
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Summarizing CNN/Dailymail Articles with Cloud TPUs and BERT


<table class="tfo-notebook-buttons" align="left">
 <td>
    <a target="_blank" href="https://colab.research.google.com/github/clarencechen/cnn-dailymail-summary/blob/master/Summarizing_CNN_Dailymail_Articles_with_Cloud_TPUs_and_BERT.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/clarencechen/cnn-dailymail-summary/blob/master/Summarizing_CNN_Dailymail_Articles_with_Cloud_TPUs_and_BERT.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

## Set up the TPU runtime

Connect to the Colab TPU, authenticate with Google Cloud, and upload the credentials to the TPU so that later cells can read from Cloud Storage. The CNN/Daily Mail stories themselves are downloaded further below, in the *Dataset download* section.

In [0]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf
import numpy as np
import time

# Fail fast when the Colab runtime has no TPU attached: the TPU address is
# only exported through the COLAB_TPU_ADDR environment variable.
assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

# Interactive Google authentication for this Colab session.
from google.colab import auth
auth.authenticate_user()

with tf.Session(TPU_ADDRESS) as session:
    # Upload credentials to TPU.
    # NOTE(review): /content/adc.json is presumably written by
    # auth.authenticate_user() above -- confirm.
    with open('/content/adc.json', 'r') as f:
        auth_info = json.load(f)
    tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
    # Now credentials are set for all future sessions on this TPU.
    print('TPU devices:')
    print(session.list_devices())

# This address identifies the TPU we'll use when configuring TensorFlow.
# It holds the same value as TPU_ADDRESS above.
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']

# Local directory used by the evaluation cell for its results file.
OUTPUT_DIR = '/content/model/'
tf.logging.set_verbosity(tf.logging.INFO)

TPU address is grpc://10.89.239.2:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 11582478840878583954), _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 9952103659584718622), _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_GPU:0, XLA_GPU, 17179869184, 18033498473482104438), _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 14319539187879693053), _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 9675666561288967995), _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 8125228641297061490), _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 8601934407541013928), _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 12754932271768098734), _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 1748102443965

## Build the model

The model is a pretrained BERT encoder rather than a network trained from scratch. The cell below:

1. Clones the [BERT reference implementation](https://github.com/google-research/bert) and adds it to the Python path so that its `modeling` and `tokenization` modules can be imported.
2. Selects one of the pretrained checkpoints published under `gs://cloud-tpu-checkpoints/bert/` and lists its files.

In [0]:
import sys

# Clone the BERT reference implementation once and make its modules
# (`modeling`, `tokenization`) importable from later cells.
!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
    sys.path += ['bert_repo']

# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT base model
BERT_MODEL = 'cased_L-12_H-768_A-12' #@param {type:"string"}
# Cloud Storage directory holding the checkpoint, config and vocab files.
BERT_PRETRAINED_DIR = 'gs://cloud-tpu-checkpoints/bert/' + BERT_MODEL
print('***** BERT pretrained directory: {} *****'.format(BERT_PRETRAINED_DIR))
!gsutil ls $BERT_PRETRAINED_DIR

***** BERT pretrained directory: gs://cloud-tpu-checkpoints/bert/cased_L-12_H-768_A-12 *****
gs://cloud-tpu-checkpoints/bert/cased_L-12_H-768_A-12/bert_config.json
gs://cloud-tpu-checkpoints/bert/cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001
gs://cloud-tpu-checkpoints/bert/cased_L-12_H-768_A-12/bert_model.ckpt.index
gs://cloud-tpu-checkpoints/bert/cased_L-12_H-768_A-12/bert_model.ckpt.meta
gs://cloud-tpu-checkpoints/bert/cased_L-12_H-768_A-12/checkpoint
gs://cloud-tpu-checkpoints/bert/cased_L-12_H-768_A-12/vocab.txt


## Dataset download

### Download the CNN and Daily Mail Stories and URL Lists

In [0]:
# Create the local story directories if they do not exist yet.
!test -d cnn || mkdir cnn/
!test -d dailymail || mkdir dailymail/
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download the story archives using their Drive file IDs.
CNN_DRIVE_ID, DAILYMAIL_DRIVE_ID = '0BwmD_VLjROrfTHk4NFg2SndKcjQ', '0BwmD_VLjROrfM1BxdkxVaTY2bWs'
CNN_DIR, DAILYMAIL_DIR, URL_DIR = '/content/cnn/', '/content/dailymail/', '/content/url_lists/'
TGZ_NAME = 'stories.tgz'
CNN_TGZ, DAILYMAIL_TGZ = CNN_DIR + TGZ_NAME, DAILYMAIL_DIR + TGZ_NAME

!test -d $CNN_DIR || mkdir $CNN_DIR
!test -d $URL_DIR || mkdir $URL_DIR
!test -d $DAILYMAIL_DIR || mkdir $DAILYMAIL_DIR

cnn, dailymail = drive.CreateFile({'id': CNN_DRIVE_ID}), drive.CreateFile({'id': DAILYMAIL_DRIVE_ID})
cnn.GetContentFile(CNN_TGZ)
dailymail.GetContentFile(DAILYMAIL_TGZ)

# Extract into the current working directory.
# NOTE(review): later cells read CNN_DIR + 'stories/' and
# DAILYMAIL_DIR + 'stories/', so this presumably relies on the archives
# containing cnn/stories/ and dailymail/stories/ and on the working
# directory being /content -- confirm.
!tar -xzf $CNN_TGZ
!tar -xzf $DAILYMAIL_TGZ

# The archives are no longer needed once extracted.
!rm $CNN_TGZ
!rm $DAILYMAIL_TGZ

# Download URL lists of stories for each set using wget
!wget --secure-protocol=auto https://raw.githubusercontent.com/becxer/cnn-dailymail/master/url_lists/all_val.txt
!wget --secure-protocol=auto https://raw.githubusercontent.com/becxer/cnn-dailymail/master/url_lists/all_train.txt
!wget --secure-protocol=auto https://raw.githubusercontent.com/becxer/cnn-dailymail/master/url_lists/all_test.txt
!mv all_val.txt all_train.txt all_test.txt $URL_DIR

## Dataset Preprocessing
Now that all the news stories have been downloaded, they need to be tokenized and batched in order for BERT to compute their embeddings per sentence.  We will only extract the activations from the last layer. 

In [0]:
# Define directory to store TFRecord data and final results (must be a Google Cloud Storage bucket or BERT cannot run)
FINISHED_DIR = "finished_files" #@param {type:"string"}
# Sub-directory that receives the fixed-size chunks of each output file.
CHUNKED_DIR = os.path.join(FINISHED_DIR, "chunked")

In [0]:
import codecs
import collections
import json
import re
import hashlib

import modeling
import tokenization

class InputExample(object):
	"""Plain container for one news story.

	Attributes:
		story_id: identifier supplied by the caller for this story.
		article: the article body text.
		highlights: the highlight sentences belonging to the story.
	"""

	def __init__(self, story_id, article, highlights):
		self.story_id, self.article, self.highlights = story_id, article, highlights

class InputFeatures(object):
	"""One model-ready window of a story plus the bookkeeping needed to map results back.

	Attributes:
		unique_id: identifier of this feature, used as the key when results are
			matched back to features.
		tokens: the tokens of this window.
		example_id: index of the story this window was cut from.
		doc_span_index: index of this window within its story.
		token_is_max_context: per-token flag produced by the windowing code.
		input_ids / input_mask / input_type_ids: the padded model inputs.
		is_real: False for placeholder features that only pad out a batch.
	"""

	def __init__(self, unique_id, example_id, doc_span_index, tokens, token_is_max_context, input_ids, input_mask, input_type_ids, is_real=True):
		self.unique_id = unique_id
		self.tokens = tokens
		# These three assignments previously ended with a stray comma, which
		# silently wrapped each value in a 1-tuple.
		self.example_id = example_id
		self.doc_span_index = doc_span_index
		self.token_is_max_context = token_is_max_context
		self.input_ids = input_ids
		self.input_mask = input_mask
		self.input_type_ids = input_type_ids
		self.is_real = is_real

def file_based_input_fn_builder(input_file, seq_length, num_cores):
	"""Builds the `input_fn` closure that feeds a TFRecord file to TPUEstimator.

	Args:
		input_file: path of the TFRecord file to read.
		seq_length: length of the `input_ids`, `input_mask` and `input_type_ids`
			vectors stored in every record.
		num_cores: passed to `map_and_batch` as `num_parallel_batches`.

	Returns:
		A function taking the estimator `params` dict (it reads
		`params["batch_size"]`) and returning the batched, prefetched dataset.
	"""

	# Parsing spec: a scalar id followed by three fixed-length int64 vectors.
	name_to_features = {"unique_id": tf.FixedLenFeature([], tf.int64)}
	for vector_key in ("input_ids", "input_mask", "input_type_ids"):
		name_to_features[vector_key] = tf.FixedLenFeature([seq_length], tf.int64)

	def _decode_record(record, name_to_features):
		"""Parses one serialized example and casts its int64 tensors to int32."""
		parsed = tf.parse_single_example(record, name_to_features)

		# tf.Example only supports tf.int64, but the TPU only supports tf.int32.
		for key in list(parsed.keys()):
			tensor = parsed[key]
			parsed[key] = tf.to_int32(tensor) if tensor.dtype == tf.int64 else tensor

		return parsed

	def input_fn(params):
		"""Reads `input_file` and yields batches of `params["batch_size"]` records."""

		def _parse(record):
			return _decode_record(record, name_to_features)

		records = tf.data.TFRecordDataset(input_file, buffer_size=16*1024*1024)
		batching = tf.contrib.data.map_and_batch(
				_parse,
				batch_size=params["batch_size"],
				num_parallel_batches=num_cores,
				drop_remainder=False)
		# Prefetch so that reading overlaps with computation.
		return records.apply(batching).prefetch(tf.contrib.data.AUTOTUNE)

	return input_fn

def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu=True, use_one_hot_embeddings=True):
	"""Returns `model_fn` closure for TPUEstimator.

	Args:
		bert_config: configuration object passed to `modeling.BertModel`.
		init_checkpoint: checkpoint path the trainable variables are initialised from.
		layer_indexes: indexes into the list of encoder layers; one prediction
			entry is emitted per index.
		use_tpu: when True the checkpoint initialisation is wrapped in a
			`scaffold_fn`; otherwise it is performed directly.
		use_one_hot_embeddings: forwarded to `modeling.BertModel`.

	Returns:
		A `model_fn(features, labels, mode, params)` that only supports PREDICT mode.
	"""

	def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
		"""The `model_fn` for TPUEstimator.

		Reads `unique_id`, `input_ids`, `input_mask` and `input_type_ids` from
		`features` and returns a TPUEstimatorSpec whose predictions hold
		`unique_id` plus one `layer_output_<i>` tensor per entry of `layer_indexes`.

		Raises:
			ValueError: if `mode` is not PREDICT.
		"""

		tf.logging.info("*** Features ***")
		for name in sorted(features.keys()):
			tf.logging.info("  name = {}, shape = {}".format(name, features[name].shape))

		unique_id = features["unique_id"]
		input_ids = features["input_ids"]
		input_mask = features["input_mask"]
		input_type_ids = features["input_type_ids"]

		# Inference only: the model is always built with is_training=False.
		model = modeling.BertModel(
				config=bert_config,
				is_training=False,
				input_ids=input_ids,
				input_mask=input_mask,
				token_type_ids=input_type_ids,
				use_one_hot_embeddings=use_one_hot_embeddings)

		# Note: the mode is validated only after the model graph has been built.
		if mode != tf.estimator.ModeKeys.PREDICT:
			raise ValueError("Only PREDICT modes are supported: %s" % (mode))

		tvars = tf.trainable_variables()
		scaffold_fn = None
		# Map the trainable variables onto the tensors stored in the checkpoint.
		(assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
				tvars, init_checkpoint)

		if use_tpu:
			# On TPU the initialisation is deferred into a scaffold function
			# that is handed to the TPUEstimatorSpec below.
			def tpu_scaffold():
				tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
				return tf.train.Scaffold()
			scaffold_fn = tpu_scaffold
		else:
			tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

		tf.logging.info("**** Trainable Variables ****")
		for var in tvars:
			init_string = ""
			if var.name in initialized_variable_names:
				init_string = ", *INIT_FROM_CKPT*"
			tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape, init_string)

		all_layers = model.get_all_encoder_layers()

		predictions = {
				"unique_id": unique_id,
		}

		# The prediction key carries the position within `layer_indexes` (i),
		# not the layer index itself.
		for (i, layer_index) in enumerate(layer_indexes):
			predictions['layer_output_{:d}'.format(i)] = all_layers[layer_index]

		output_spec = tf.contrib.tpu.TPUEstimatorSpec(
				mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
		return output_spec

	return model_fn


In [0]:
# Typographic closing quotes that appear in the story text.
dm_single_close_quote = u'\u2019' # unicode
dm_double_close_quote = u'\u201d'
# Acceptable ways to end a sentence.
# Note: fix_missing_period compares only the final character of a line against
# this list, so the three-character entry '...' never matches on its own
# (a line ending in '...' is still accepted because it ends in '.').
END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', dm_single_close_quote, dm_double_close_quote, ")"] # acceptable ways to end a sentence

# These are the number of .story files we expect there to be in cnn_stories_dir and dm_stories_dir
num_expected_cnn_stories = 92579
num_expected_dm_stories = 219506


def read_text_file(text_file):
	"""Reads a text file and returns its lines with surrounding whitespace stripped.

	The file is decoded as UTF-8 explicitly instead of relying on the locale's
	default encoding, because the story text contains non-ASCII characters
	(for example typographic quotes).

	Args:
		text_file: path of the file to read (str or bytes).

	Returns:
		A list with one stripped string per line; blank lines become ''.
	"""
	with open(text_file, "r", encoding="utf-8") as f:
		return [line.strip() for line in f]

def check_num_stories(stories_dir, num_expected):
	"""Verifies that `stories_dir` holds exactly `num_expected` directory entries.

	Raises:
		Exception: when the number of entries differs from `num_expected`.
	"""
	num_found = len(os.listdir(stories_dir))
	if num_found == num_expected:
		return
	message = "stories directory {} contains {:d} files but should contain {:d}".format(stories_dir, num_found, num_expected)
	raise Exception(message)

def hashhex(s):
    """Return the SHA-1 digest of the string `s` as a hexadecimal string."""
    return hashlib.sha1(s.encode()).hexdigest()

def get_url_hashes(url_list):
	"""Returns the `hashhex` digest of every URL in `url_list`, in the same order."""
	hashes = []
	for url in url_list:
		hashes.append(hashhex(url))
	return hashes

def fix_missing_period(line):
	"""Appends a period to `line` unless it already ends in a sentence terminator.

	Args:
		line: a single line of story text.

	Returns:
		`line` unchanged when it is empty or its last character is one of
		END_TOKENS; otherwise `line` followed by ".".
	"""
	# An empty line has no last character to inspect; indexing it would raise
	# IndexError, so hand it back untouched.
	if not line:
		return line
	if line[-1] in END_TOKENS:
		return line
	return line + "."

def get_art_abs(story_file, story_idx):
	"""Splits one .story file into its article text and highlight lines.

	Empty lines are ignored. A line containing both '@' and 'highlight' is a
	marker; once a marker has been seen, every later non-empty, non-marker line
	is collected as a highlight. Lines before the first marker form the article:
	each is passed through `fix_missing_period` and followed by a single space.

	Args:
		story_file: path of the .story file.
		story_idx: identifier stored on the returned example as `story_id`.

	Returns:
		An `InputExample` holding the article string and the list of highlights.
	"""
	highlights = []
	article_parts = []
	seen_highlight_marker = False
	for line in read_text_file(story_file):
		if line == '':
			continue
		if '@' in line and 'highlight' in line:
			seen_highlight_marker = True
			continue
		if seen_highlight_marker:
			highlights.append(line)
			continue
		# Many image captions lack a final period; without the fix they would
		# run into the next sentence of the article body.
		article_parts.append(fix_missing_period(line) + ' ')

	return InputExample(story_id=story_idx, article=''.join(article_parts), highlights=highlights)

In [0]:
class FeatureWriter(object):
	"""Serialises InputFeatures objects into a TFRecord file as tf.train.Example records.

	Attributes:
		filename: path of the TFRecord file being written.
		num_features: number of features written so far.
	"""

	def __init__(self, filename):
		self.filename = filename
		self.num_features = 0
		self._writer = tf.python_io.TFRecordWriter(filename)

	def process_feature(self, feature):
		"""Writes one feature to the TFRecord file and bumps `num_features`.

		The record holds the int64 lists `unique_id`, `input_ids`, `input_mask`
		and `input_type_ids`, read from the attributes of the same names.
		"""
		self.num_features += 1

		def create_int_feature(values):
			return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

		features = collections.OrderedDict()
		# The key must be "unique_id" (singular): that is the name the input
		# function parses and the model function reads. Writing "unique_ids"
		# made every record unparseable.
		features["unique_id"] = create_int_feature([feature.unique_id])
		features["input_ids"] = create_int_feature(feature.input_ids)
		features["input_mask"] = create_int_feature(feature.input_mask)
		features["input_type_ids"] = create_int_feature(feature.input_type_ids)

		tf_example = tf.train.Example(features=tf.train.Features(feature=features))
		self._writer.write(tf_example.SerializeToString())

	def close(self):
		"""Closes the underlying TFRecord writer."""
		self._writer.close()

def _check_is_max_context(doc_spans, cur_span_index, position):
	"""Check if this is the 'max context' doc span for the token."""

	# Because of the sliding window approach taken to scoring documents, a single
	# token can appear in multiple documents. E.g.
	# 
	# Doc: the man went to the store and bought a gallon of milk
	# Span A: the man went to the
	# Span B: to the store and bought
	# Span C: and bought a gallon of
	# ...
	#
	# Now the word 'bought' will have two scores from spans B and C. We only
	# want to consider the score with "maximum context", which we define as
	# the *minimum* of its left and right context (the *sum* of left and
	# right context will always be the same, of course).
	#
	# In the example the maximum context for 'bought' would be span C since
	# it has 1 left context and 3 right context, while span B has 4 left context
	# and 0 right context.
	best_score = None
	best_span_index = None
	for (span_index, doc_span) in enumerate(doc_spans):
		end = doc_span.start + doc_span.length - 1
		if position < doc_span.start:
			continue
		if position > end:
			continue
		num_left_context = position - doc_span.start
		num_right_context = end - position
		score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
		if best_score is None or score > best_score:
			best_score = score
			best_span_index = span_index

	return cur_span_index == best_span_index

# A contiguous window of article tokens: first token index and window length.
_DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
		"DocSpan", ["start", "length"])


def _compute_doc_spans(num_tokens, max_tokens_for_doc, doc_stride):
	"""Returns the sliding windows (as _DocSpan) that cover `num_tokens` tokens."""
	doc_spans = []
	start_offset = 0
	while start_offset < num_tokens:
		length = min(num_tokens - start_offset, max_tokens_for_doc)
		doc_spans.append(_DocSpan(start=start_offset, length=length))
		if start_offset + length == num_tokens:
			break
		step = int(min(length, doc_stride))
		if step < 1:
			# Without this check the window would never advance and the loop
			# would run forever.
			raise ValueError(
					"Cannot advance the sliding window: max_seq_length must be at least 2 "
					"and doc_stride at least 1 (got window length {} and doc_stride {}).".format(length, doc_stride))
		start_offset += step
	return doc_spans


def convert_examples_to_features(examples, max_seq_length, batch_size, tokenizer, doc_stride, features_file):
	"""Tokenizes stories, cuts them into windows and writes one feature per window.

	Each article is tokenized and covered by overlapping windows of at most
	`max_seq_length - 1` tokens (one slot is reserved for [CLS]), advancing by
	`doc_stride` tokens. Every window is padded and written to `features_file`.
	Afterwards placeholder features are appended until the number of written
	features is a multiple of `batch_size`, because the TPU needs full batches.

	Args:
		examples: sequence of objects with an `article` attribute.
		max_seq_length: length of every written feature, including [CLS].
		batch_size: the feature count is padded up to a multiple of this value.
		tokenizer: object providing `tokenize` (and whatever
			`pad_and_create_feature` requires).
		doc_stride: number of tokens the window advances by.
		features_file: path of the TFRecord file to write.

	Returns:
		A dict mapping each unique id (starting at 1000000000) to its feature,
		placeholders included.

	Raises:
		ValueError: if the window cannot advance (see `_compute_doc_spans`).
	"""
	eval_writer = FeatureWriter(filename=features_file)
	id_to_features = {}
	unique_id = 1000000000

	# The -1 accounts for [CLS]
	max_tokens_for_doc = max_seq_length - 1

	# Close the record writer even when tokenization or feature creation fails.
	try:
		for (example_index, example) in enumerate(examples):

			if example_index % 1000 == 0:
				tf.logging.info("Writing features for example {0:d} of {1:d}; {2:.2f} percent done.".format(example_index, len(examples), example_index*100.0/len(examples)))

			article_tokens = tokenizer.tokenize(example.article)

			# We can have documents that are longer than the maximum sequence length.
			# To deal with this we do a sliding window approach, where we take chunks
			# of the up to our max length with a stride of `doc_stride`.
			doc_spans = _compute_doc_spans(len(article_tokens), max_tokens_for_doc, doc_stride)

			for (article_part_index, doc_span) in enumerate(doc_spans):
				tokens = ["[CLS]"]
				segment_ids = [0]
				token_is_max_context = {}

				# Add all article tokens in doc_span's range
				for split_token_index in range(doc_span.start, doc_span.start + doc_span.length):
					# Keyed by the position the token takes within `tokens`.
					token_is_max_context[len(tokens)] = _check_is_max_context(doc_spans, article_part_index, split_token_index)
					tokens.append(article_tokens[split_token_index])
					segment_ids.append(1)

				# Write to temporary features tfrecord
				feature = pad_and_create_feature(
						tokenizer,
						writer=eval_writer,
						max_seq_length=max_seq_length,
						unique_id=unique_id,
						example_num=example_index,
						section_num=article_part_index,
						tokens=tokens,
						segment_ids=segment_ids,
						token_is_max_context=token_is_max_context)

				id_to_features[unique_id] = feature
				unique_id += 1

		tf.logging.info('Finished writing {1} real features from {2} examples to {0}.'.format(eval_writer.filename, eval_writer.num_features, len(examples)))

		# TPU requires a fixed batch size for all batches, therefore the number
		# of features must be a multiple of the batch size, or else features
		# will get dropped. So we pad with fake features which are ignored
		# later on.
		while eval_writer.num_features % batch_size != 0:
			feature = create_fake_feature(
					writer=eval_writer,
					max_seq_length=max_seq_length,
					unique_id=unique_id)

			id_to_features[unique_id] = feature
			unique_id += 1

		tf.logging.info('Finished writing padding features to {}; should result in {:d} batches.'.format(eval_writer.filename, eval_writer.num_features//batch_size))
	finally:
		# Close tfrecord writer
		eval_writer.close()
	return id_to_features

def pad_and_create_feature(tokenizer, writer, max_seq_length, unique_id, example_num, section_num, tokens, segment_ids, token_is_max_context=None):
	"""Turns one token window into a padded InputFeatures and writes it with `writer`.

	The tokens are converted to vocabulary ids, an attention mask is built
	(1 for real tokens, 0 for padding), and ids, mask and `segment_ids` are
	zero-padded to `max_seq_length`. `segment_ids` is padded in place. The first
	two examples (`example_num` < 2) are also logged in full.

	Args:
		tokenizer: object providing `convert_tokens_to_ids`.
		writer: object providing `process_feature`.
		max_seq_length: required length of every padded list.
		unique_id: id stored on the feature.
		example_num: index of the source story; stored as `example_id`.
		section_num: index of the window in its story; stored as `doc_span_index`.
		tokens: the window's tokens.
		segment_ids: one segment id per token; extended in place with zeros.
		token_is_max_context: optional mapping stored on the feature.

	Returns:
		The InputFeatures object that was written.

	Raises:
		AssertionError: if any padded list is not exactly `max_seq_length` long.
	"""
	# Convert tokens from tokenizer to ids from vocab file
	input_ids = tokenizer.convert_tokens_to_ids(tokens)

	# Only real tokens are attended to, so they get mask value 1.
	input_mask = [1] * len(input_ids)

	# Zero-pad all three lists by the same amount.
	missing = max_seq_length - len(input_ids)
	if missing > 0:
		input_ids.extend([0] * missing)
		input_mask.extend([0] * missing)
		segment_ids.extend([0] * missing)

	for padded in (input_ids, input_mask, segment_ids):
		assert len(padded) == max_seq_length

	if example_num < 2:
		log = tf.logging.info
		log("*** Example ***")
		log("unique_id: {}".format(unique_id))
		log("example_num: {}".format(example_num))
		log("section_num: {}".format(section_num))

		printable_tokens = ''.join(tokenization.printable_text(x) + ' ' for x in tokens)
		log("tokens: {}".format(printable_tokens))
		if token_is_max_context:
			context_flags = ''.join(
				"{0}:{1}".format(x, y) + ', ' for (x, y) in token_is_max_context.items())
			log("token_is_max_context: {}".format(context_flags))

		log("input_ids: {}".format(''.join(str(x) + ' ' for x in input_ids)))
		log("input_mask: {}".format(''.join(str(x) + ' ' for x in input_mask)))
		log("segment_ids: {}".format(''.join(str(x) + ' ' for x in segment_ids)))

	feature = InputFeatures(
			unique_id=unique_id,
			example_id=example_num,
			doc_span_index=section_num,
			tokens=tokens,
			token_is_max_context=token_is_max_context,
			input_ids=input_ids,
			input_mask=input_mask,
			input_type_ids=segment_ids)

	# Write to temporary features tfrecord
	writer.process_feature(feature)
	return feature

def create_fake_feature(writer, max_seq_length, unique_id):
	"""Creates an all-padding placeholder feature and writes it with `writer`.

	The placeholder has `is_real=False`, no example or span index, `None`
	tokens, and all-zero ids, mask and type ids of length `max_seq_length`.

	Args:
		writer: object providing `process_feature`.
		max_seq_length: length of every list on the feature.
		unique_id: id stored on the feature.

	Returns:
		The placeholder InputFeatures object that was written.
	"""
	def filled(value):
		# A fresh list per field so that no two fields share storage.
		return [value] * max_seq_length

	placeholder = InputFeatures(
			unique_id=unique_id,
			example_id=None,
			doc_span_index=None,
			tokens=filled(None),
			token_is_max_context=filled(False),
			input_ids=filled(0),
			input_mask=filled(0),
			input_type_ids=filled(0),
			is_real=False)

	# Write to temporary features tfrecord
	writer.process_feature(placeholder)
	return placeholder


In [0]:
# Files of the selected pretrained BERT model.
BERT_CONFIG = BERT_PRETRAINED_DIR +'/bert_config.json'
BERT_VOCAB = BERT_PRETRAINED_DIR +'/vocab.txt'
BERT_CHECKPOINT = BERT_PRETRAINED_DIR +'/bert_model.ckpt'

# Length of every feature window (including [CLS]), prediction batch size,
# and the number of TPU shards.
MAX_SEQ_LEN = 384
BATCH_SIZE = 8
NUM_TPU_CORES = 8

# Indexes into the list of encoder layers whose activations are extracted;
# -1 selects the last entry of that list.
layer_indexes = [-1]

def write_to_bin(cnn_stories_dir, dm_stories_dir, url_filename, out_filename, temp_records_filename):
	"""Extracts BERT activations for the stories of one URL list and writes them as JSON lines.

	The URLs in `URL_DIR/url_filename` are hashed to story file names, each story
	is looked up in `cnn_stories_dir` or `dm_stories_dir`, split into feature
	windows (written to `FINISHED_DIR/temp_records_filename`), and run through
	BERT on the TPU. One JSON object per real feature is written to
	`FINISHED_DIR/out_filename`.

	Args:
		cnn_stories_dir: directory containing the CNN .story files.
		dm_stories_dir: directory containing the Daily Mail .story files.
		url_filename: name of the URL list file inside URL_DIR.
		out_filename: name of the JSON-lines output file inside FINISHED_DIR.
		temp_records_filename: name of the intermediate TFRecord file inside
			FINISHED_DIR.

	Raises:
		Exception: if a listed story is found in neither directory (raised by
			`check_num_stories` when a directory has the wrong file count,
			otherwise raised here).
	"""
	out_file = os.fsencode(os.path.join(FINISHED_DIR, out_filename))
	features_file = os.fsencode(os.path.join(FINISHED_DIR, temp_records_filename))

	url_list = read_text_file(os.fsencode(os.path.join(URL_DIR, url_filename)))
	url_hashes = get_url_hashes(url_list)
	story_fnames = [s + ".story" for s in url_hashes]

	num_stories = len(story_fnames)
	print("Found {:d} stories listed in URL list...".format(num_stories))

	# Instantiate Tokenizer and BERT Model from checkpoint
	bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
	print("Found BERT config file at {}...".format(BERT_CONFIG))

	tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB, do_lower_case=False)
	print("Instantiated BERT Tokenizer with vocab file from {}...".format(BERT_VOCAB))

	is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
	run_config = tf.contrib.tpu.RunConfig(master=TPU_ADDRESS,
			tpu_config=tf.contrib.tpu.TPUConfig(
					num_shards=NUM_TPU_CORES,
					per_host_input_for_training=is_per_host))

	model_fn = model_fn_builder(
			bert_config=bert_config,
			init_checkpoint=BERT_CHECKPOINT,
			layer_indexes=layer_indexes,
			use_tpu=True,
			use_one_hot_embeddings=True)

	# The estimator is always configured for the TPU here (use_tpu=True).
	estimator = tf.contrib.tpu.TPUEstimator(
			use_tpu=True,
			model_fn=model_fn,
			config=run_config,
			train_batch_size=BATCH_SIZE,
			predict_batch_size=BATCH_SIZE)

	print("Successfully set up TPU estimator with BERT mode from checkpoint at {} with batch size {}...".format(BERT_CHECKPOINT, BATCH_SIZE))
	examples = []
	for idx,s in enumerate(story_fnames):
		if idx % 1000 == 0:
			print("Reading story {0:d} of {1:d}; {2:.2f} percent done".format(idx, num_stories, float(idx)*100.0/float(num_stories)))

		# Look in the story dirs (CNN first) for the .story file corresponding to this url
		story_file = None
		for stories_dir in (cnn_stories_dir, dm_stories_dir):
			candidate = os.path.join(stories_dir, s)
			if os.path.isfile(candidate):
				story_file = candidate
				break

		if story_file is None:
			print("Error: Couldn't find story file {} in either story directories {} and {}. Are the directories misplaced or missing?".format(s, cnn_stories_dir, dm_stories_dir))
			# Check again if tokenized stories directories contain correct number of files
			print("Checking that the stories directories {} and {} contain correct number of files...".format(cnn_stories_dir, dm_stories_dir))
			check_num_stories(cnn_stories_dir, num_expected_cnn_stories)
			check_num_stories(dm_stories_dir, num_expected_dm_stories)
			raise Exception("Stories directories {} and {} contain correct number of files but story file {} found in neither.".format(cnn_stories_dir, dm_stories_dir, s))

		# Get the example to tokenize from story file
		examples.append(get_art_abs(story_file, idx))

	unique_id_to_feature = convert_examples_to_features(
		examples=examples,
		max_seq_length=MAX_SEQ_LEN,
		batch_size=BATCH_SIZE,
		tokenizer=tokenizer,
		doc_stride=MAX_SEQ_LEN//3,
		features_file=features_file)

	input_fn = file_based_input_fn_builder(
		input_file=features_file,
		seq_length=MAX_SEQ_LEN,
		num_cores=NUM_TPU_CORES)

	tf.logging.info("***** Running predictions *****")
	tf.logging.info("  Num orig examples: {:d}".format(len(examples)))
	tf.logging.info("  Num total split sentences: {:d}".format(len(unique_id_to_feature)))
	tf.logging.info("  Batch size per TPU group: {:d}".format(BATCH_SIZE))

	# TODO: Write results to tfrecord to feed into decoder
	# The file must be opened for writing: the "w" belongs to tf.gfile.Open,
	# not to the codecs writer (whose second argument is the error policy).
	with codecs.getwriter("utf-8")(tf.gfile.Open(out_file, "w")) as writer:
		for result in estimator.predict(input_fn, yield_single_examples=True):
			unique_id = int(result["unique_id"])
			tf.logging.info("Received result with unique id {:d}".format(unique_id))
			feature = unique_id_to_feature[unique_id]

			# Skip fake features used for padding
			if not feature.is_real:
				continue

			output_json = collections.OrderedDict()
			output_json["linex_index"] = unique_id
			output_json["example_id"] = feature.example_id
			output_json["doc_span_index"] = feature.doc_span_index
			all_features = []
			for (i, token) in enumerate(feature.tokens):
				all_layers = []
				for (j, layer_index) in enumerate(layer_indexes):
					# Results are keyed by position within layer_indexes.
					layer_output = result["layer_output_{:d}".format(j)]
					layers = collections.OrderedDict()
					layers["index"] = layer_index
					# Activations of token i, rounded to keep the JSON compact.
					layers["values"] = [
							round(float(x), 6) for x in layer_output[i:(i + 1)].flat
					]
					all_layers.append(layers)
				features = collections.OrderedDict()
				features["token"] = token
				features["layers"] = all_layers
				all_features.append(features)
			output_json["features"] = all_features
			writer.write(json.dumps(output_json) + "\n")

		print("Finished writing file {}\n".format(out_file))

In [0]:
CHUNK_SIZE = 1000 # num examples per chunk, for the chunked data

def chunk_file(set_name):
	"""Splits `FINISHED_DIR/<set_name>.jsonl` into files of at most CHUNK_SIZE lines.

	The input is the newline-delimited JSON file produced by `write_to_bin`,
	so one line is one example. Chunks are written to CHUNKED_DIR as
	`<set_name>_000.jsonl`, `<set_name>_001.jsonl`, ... The lines are copied as
	raw bytes and streamed one at a time, so a whole chunk is never held in
	memory. No chunk file is created for an empty input, and no empty trailing
	chunk is created.

	Args:
		set_name: name of the data set, e.g. 'train', 'val' or 'test'.
	"""
	in_file = os.path.join(FINISHED_DIR, '{}.jsonl'.format(set_name))
	writer = None
	try:
		with open(in_file, "rb") as reader:
			for line_index, line in enumerate(reader):
				if line_index % CHUNK_SIZE == 0:
					# Start of a new chunk: finish the previous file first.
					if writer is not None:
						writer.close()
					chunk_fname = os.path.join(CHUNKED_DIR, '{0}_{1:03d}.jsonl'.format(set_name, line_index // CHUNK_SIZE))
					writer = open(chunk_fname, 'wb')
				writer.write(line)
	finally:
		if writer is not None:
			writer.close()

def chunk_all(chunkdir):
	"""Creates `chunkdir` if needed and chunks the train, val and test sets.

	Args:
		chunkdir: directory created (when missing) to hold the chunks and named
			in the final message.
	"""
	# Make a dir to hold the chunks
	chunkdir_missing = not os.path.isdir(chunkdir)
	if chunkdir_missing:
		os.mkdir(chunkdir)
	# Chunk the data
	set_names = ('train', 'val', 'test')
	for name in set_names:
		print("Splitting {} data into chunks...".format(name))
		chunk_file(name)
	print("Saved chunked data in {}".format(chunkdir))

In [0]:
# Check the stories directories contain the correct number of .story files
# Fail early if either download is incomplete.
check_num_stories(CNN_DIR + 'stories/', num_expected_cnn_stories)
check_num_stories(DAILYMAIL_DIR + 'stories/', num_expected_dm_stories)

# Create some new directories
if not os.path.exists(FINISHED_DIR):
    os.makedirs(FINISHED_DIR)

# For each split: tokenize the listed stories with the BERT tokenizer, run BERT
# on them, and write the results to <split>.jsonl, using temp_<split>.tfrecord
# as the intermediate feature file.
for set_name in ['train', 'val', 'test']:
	write_to_bin(CNN_DIR + 'stories/', DAILYMAIL_DIR + 'stories/', "all_{}.txt".format(set_name), "{}.jsonl".format(set_name), "temp_{}.tfrecord".format(set_name))

# TODO: Chunk the data. This splits each of train.jsonl, val.jsonl and test.jsonl into smaller chunks, each containing e.g. 1000 examples, and saves them in FINISHED_DIR/chunks

In [0]:
# Print one raw CNN story file to inspect its format.
cat cnn/stories/1a87358e123db0ecdaeebb3a675b73ea1ad97635.story

## Construct a Simple Decoder

Use the trained model to make predictions and generate your own Shakespeare-esque play.
Start the model off with a *seed* sentence, then generate 250 characters from it. We'll make five predictions from the initial seed.

In [0]:
# Eval the model.
# NOTE(review): `processor`, `TASK_DATA_DIR`, `run_classifier`, `label_list`,
# `MAX_SEQ_LENGTH`, `tokenizer`, `EVAL_BATCH_SIZE` and `estimator` are not
# defined at notebook level in any visible cell, so this cell looks like a
# leftover from a BERT classification notebook and will fail on a fresh
# Restart & Run All -- confirm whether it should be removed or rewritten.
eval_examples = processor.get_dev_examples(TASK_DATA_DIR)
eval_features = run_classifier.convert_examples_to_features(
    eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
print('***** Started evaluation at {} *****'.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(eval_examples)))
print('  Batch size = {}'.format(EVAL_BATCH_SIZE))
# Eval will be slightly WRONG on the TPU because it will truncate
# the last batch.
eval_steps = int(len(eval_examples) / EVAL_BATCH_SIZE)
eval_input_fn = run_classifier.input_fn_builder(
    features=eval_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=True)
result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
print('***** Finished evaluation at {} *****'.format(datetime.datetime.now()))
# Print every metric and also save it to OUTPUT_DIR/eval_results.txt.
output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
with tf.gfile.GFile(output_eval_file, "w") as writer:
    print("***** Eval results *****")
    for key in sorted(result.keys()):
        print('  {} = {}'.format(key, str(result[key])))
        writer.write("%s = %s\n" % (key, str(result[key])))

In [0]:
# Train the model.
# NOTE(review): `run_classifier`, `train_examples`, `label_list`,
# `MAX_SEQ_LENGTH`, `tokenizer`, `TRAIN_BATCH_SIZE`, `num_train_steps` and
# `estimator` are not defined at notebook level in any visible cell, and this
# training cell comes after the evaluation cell. It looks like a leftover from
# a BERT classification notebook -- confirm whether it should be removed or
# rewritten.
print('Shakespeare on BERT base model normally takes about ??? minutes. Please wait...')
train_features = run_classifier.convert_examples_to_features(
    train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
print('***** Started training at {} *****'.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(train_examples)))
print('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
tf.logging.info("  Num steps = %d", num_train_steps)
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print('***** Finished training at {} *****'.format(datetime.datetime.now()))