<a href="https://colab.research.google.com/github/freddejn/summarization-transformer-cnn-dailymail/blob/master/decode_and_evaluate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decode summaries from file and evaluate them with ROUGE-1.5.5

* Input files must contain one article per line

In [0]:
!pip install -q -U tensor2tensor
!pip install -q -U tensorflow
!pip install pyrouge
!git clone https://github.com/andersjo/pyrouge.git
!!pyrouge_set_rouge_path '/content/pyrouge/tools/ROUGE-1.5.5/'
!sudo apt-get install libxml-parser-perl
!rm '/content/pyrouge/tools/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/WordNet-2.0.exc.db'
!/content/pyrouge/tools/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/buildExeptionDB.pl . exc /content/pyrouge/tools/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/WordNet-2.0.exc.db
!rm /content/pyrouge/tools/ROUGE-1.5.5/data/WordNet-2.0.exc.db
!ln -s /content/pyrouge/tools/ROUGE-1.5.5/data/WordNet-2.0-Exceptions/WordNet-2.0.exc.db /content/pyrouge/tools/ROUGE-1.5.5/data/WordNet-2.0.exc.db

from google.colab import auth
from nltk import tokenize
from pyrouge import Rouge155
from spacy.lang.en import English
from tensor2tensor import models
from tensor2tensor import problems
from tensor2tensor.data_generators import problem
from tensor2tensor.utils import trainer_lib, registry
from tensor2tensor.utils import registry
from tensor2tensor.utils import decoding
from tensor2tensor.utils import hparams_lib
from tensor2tensor.utils import trainer_lib
import tensorflow as tf
import warnings

import datetime as dt
import numpy as np
import os
import pandas as pd
import re
import sys

# spaCy English pipeline with a sentencizer component added; save_to_separate()
# calls it to split each summary into sentences.
en = English()
en.add_pipe(en.create_pipe('sentencizer'))
# Authenticate the Colab user before the gcloud/gsutil commands below are run.
auth.authenticate_user()
Modes = tf.estimator.ModeKeys

# Google Cloud project and bucket that hold the data and the trained models.
PROJECT_ID = 'transformer-233711'
!gcloud config set project {PROJECT_ID}
BUCKET = 'gs://tensor2tensor-test-bucket'
DATA_DIR = f'{BUCKET}/data'
# Problem name passed to create_hparams() and registered as the `problem` flag.
PROBLEM_NAME = 'summarize_cnn_dailymail32k'
# Local directories handed to ROUGE: generated (system) and reference (model) summaries.
OUTPUT_DIR = 'output_dir/'
TARGET_DIR = 'target_dir/'

In [0]:
import json
# Read json file and return dict
def get_json(file):
    """Parse the JSON document stored at ``file`` and return the resulting object."""
    with open(file) as handle:
        parsed = json.loads(handle.read())
    return parsed
    
def save_to_separate(file, path, prefix, suffix='.txt', output_lim=0):
    """Split a one-example-per-line file into one text file per example.

    Every input line is optionally truncated to ``output_lim``
    whitespace-separated tokens, split into sentences with the module-level
    spaCy pipeline ``en`` and written, one sentence per line, to
    ``<path>/<prefix><NNN><suffix>``, where ``NNN`` is the 0-based line index
    zero-padded to at least three digits.

    Args:
        file: Path of the input file holding one example per line.
        path: Existing directory that receives the per-example files; a
            trailing separator is optional.
        prefix: File-name prefix placed before the index.
        suffix: File-name suffix placed after the index.
        output_lim: Maximum number of tokens kept per line; 0 disables
            truncation.
    """
    with open(file, 'r') as infile:
        # Stream the file rather than loading every line with readlines().
        for i, line in enumerate(infile):
            if output_lim:
                line = ' '.join(line.split()[:output_lim])
                # A limited line that does not end in a full stop gets an ellipsis.
                if line and line[-1] != ".":
                    line = line + "..."
            doc = en(line)
            sentences = [sent.string.strip() for sent in doc.sents]
            # os.path.join works whether or not `path` ends with a separator.
            out_name = os.path.join(path, f'{prefix}{i:03d}{suffix}')
            with open(out_name, 'w') as outfile:
                for s in sentences:
                    outfile.write(f'{s}\n')

# Model class storing model-run information
class Model:
    """Record of one trained run: model name, hparams set and checkpoint directory."""

    def __init__(self, model_name, hparams_set, model_dir):
        self.model_name, self.hparams_set, self.model_dir = (
            model_name, hparams_set, model_dir)
        # Echo the directory at construction time.
        print(model_dir)

    def __str__(self):
        """The run is displayed as its model directory."""
        return self.model_dir

    def __repr__(self):
        """Same text as ``__str__``, so lists of runs print as directories."""
        return self.__str__()

    def get_label(self):
        """Return the last path component of ``model_dir``."""
        _, label = os.path.split(self.model_dir)
        return label
    
# Deletes a tf.flag if set
def delete_flags_if_set(names):
    """Remove every flag listed in ``names`` from ``tf.app.flags.FLAGS`` if it is defined.

    Names that are not currently registered are skipped.
    """
    flag_values = tf.app.flags.FLAGS
    for flag_name in names:
        # Query the registry on every pass so earlier deletions are taken into account.
        if flag_name not in flag_values._flags():
            continue
        flag_values.__delattr__(flag_name)

# Decodes from file
def decoder(run, beam_size=4, alpha=0.6):
    """Decode the file ``decode_from_file`` with the model described by ``run`` on a TPU.

    Besides its arguments the function reads these module-level names, which
    must be defined before it is called: ``DATA_DIR``, ``PROBLEM_NAME``,
    ``TPU_WORKER``, ``decode_from_file`` and ``decode_to_file``.

    Args:
        run: Object exposing ``model_name``, ``hparams_set`` and ``model_dir``.
        beam_size: Value for the ``beam_size`` decode hparam (default 4).
        alpha: Value for the ``alpha`` decode hparam (default 0.6).
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    hp = hparams_lib.create_hparams(
        hparams_set=run.hparams_set,
        hparams_overrides_str='',
        data_dir=DATA_DIR,
        problem_name=PROBLEM_NAME
    )
    # With the defaults this is exactly 'beam_size=4,alpha=0.6,log_results=False'.
    decode_hp = decoding.decode_hparams(
        f'beam_size={beam_size},alpha={alpha},log_results=False')
    run_config = trainer_lib.create_run_config(
        model_name=run.model_name,
        cloud_tpu_name=TPU_WORKER,
        use_tpu=True,
        model_dir=run.model_dir
    )

    estimator = trainer_lib.create_estimator(
        model_name=run.model_name,
        hparams=hp,
        run_config=run_config,
        decode_hparams=decode_hp,
        use_tpu=True,
        use_tpu_estimator=True
    )
    # Input and output locations come from the notebook's global state.
    decoding.decode_from_file(
        estimator,
        filename=decode_from_file,
        hparams=hp,
        decode_hp=decode_hp,
        decode_to_file=decode_to_file,
    )
    
def run_rouge_155(run):
    """Score the summaries in OUTPUT_DIR against the references in TARGET_DIR.

    Args:
        run: Not used by the function; kept so existing callers keep working.

    Returns:
        The dict produced by ``Rouge155.output_to_dict`` from the evaluation output.
    """
    rouge = Rouge155()
    rouge.system_dir = OUTPUT_DIR
    rouge.model_dir = TARGET_DIR
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    rouge.system_filename_pattern = r'file.(\d+).txt'
    rouge.model_filename_pattern = 'file.[A-Z].#ID#.txt'
    output = rouge.convert_and_evaluate()
    return rouge.output_to_dict(output)

# Remove earlier definitions of these flags (if any) before defining them again —
# presumably so this cell can be re-run in the same kernel; confirm.
delete_flags_if_set(['f', 'problem'])
tf.app.flags.DEFINE_string('problem', PROBLEM_NAME, "Problem name.")
# NOTE(review): the 'f' flag looks like a placeholder for the `-f` argument a
# Jupyter kernel is started with — confirm.
tf.app.flags.DEFINE_string('f', '', 'kernel')

# Run inference and score using ROUGE 1.5.5
### Set up all files before running inference; this only needs to run once before inferring

In [0]:
# Path to trained models
DATASET = 'eval'
MAX_TESTS = 10
MODEL_MAX_LENGTH = 2048
TEST_MAX_LENGTH = 512
OUTPUT_LIM = 128
trunk = True

if DATASET == 'test':
    eval_data_dir = f'{BUCKET}/data_for_test'
if DATASET == 'eval':
    eval_data_dir = f'{BUCKET}/data_for_evaluation'
    
if trunk:
    results_file = f'rouge-155-data-trunk-{DATASET}-set.csv'
    decode_from_file = f'{eval_data_dir}/trunk_{TEST_MAX_LENGTH}_num_{MAX_TESTS}_inputs.txt'
    targets = f'{eval_data_dir}/trunk_{TEST_MAX_LENGTH}_num_{MAX_TESTS}_targets.txt'
else:
    results_file = f'rouge-155-data-{DATASET}-set.csv'
    decode_from_file = f'{eval_data_dir}/len_{TEST_MAX_LENGTH}_num_{MAX_TESTS}_inputs.txt'
    targets = f'{eval_data_dir}/len_{test_max_length}_num_{MAX_TESTS}_targets.txt'

results_path = f'{eval_data_dir}/{results_file}'


# Copy targets and inputs to gs bucket, label by max_length and max_test, one example per line
!gsutil cp '{decode_from_file}' 'all_inputs.txt' 
!gsutil cp '{targets}' 'all_targets.txt' 

models_to_evaluate = [
    Model(model_name='transformer', hparams_set='transformer_tpu',
      model_dir=f'{BUCKET}/transformer_tpu_extra-b4096-ml4096-mi512-mt128')
]

# Copy csv-file and targets file locally
!gsutil cp '{results_path}' '{results_file}'
!rm -rf {TARGET_DIR} && mkdir {TARGET_DIR}
save_to_separate("all_targets.txt", TARGET_DIR, 'file.A.')
print(models_to_evaluate)

In [0]:

# Run inference
TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR'] # Comment out if running on GPU
!gsutil cp '{results_path}' '{results_file}'
if DATASET == 'eval':
    beam_size = 4
if DATASET == 'test':
    beam_size = 8
    
# Make local backup of original checkpoints file for all models
for run in models_to_evaluate:
    checkpoint_file = f'{run.model_dir}/checkpoint'
    !gsutil cp {checkpoint_file} '{run.get_label()}_checkpoint_backup.tmp'

for ckpt in range(0, 10000 + 10000, 10000):
    # Create a new dataframe for data if none created
    try:
        df = pd.read_csv(f'{results_file}', index_col=0)
    except FileNotFoundError:
        print('creating new dataframe')
        df = pd.DataFrame()
        
    for run in models_to_evaluate:
        # Read hparams from model in bucket
        checkpoint_file = f'{run.model_dir}/checkpoint'
        !gsutil cp $run.model_dir'/hparams.json' .
        hparams_json = get_json('hparams.json')

        # Check if checkpoint exists in checkpoints file
        available_checkpoints = !gsutil cat $checkpoint_file | tr '\n' ' ' | sed -e 's/[^0-9]/ /g' -e 's/^ *//g' -e 's/ *$//g' | tr -s ' ' | sed 's/ /\n/g'
        available_checkpoints = available_checkpoints[1:]
        if not str(ckpt) in available_checkpoints:
            print(f'Checkpoint {ckpt} does not exist for {os.path.basename(run.model_dir)}')
            continue # Skip iteration if checkpoint does not exists

        # Copy checkpoint file locally
        !gsutil cat $checkpoint_file > 'checkpoint.tmp'
        
        # Run sed to replace first line with checkpoint file to run
        !sed -i '1s/.*/model_checkpoint_path: "model.ckpt-'"{ckpt}"'"/' checkpoint.tmp

        # Copy altered checkpoint file to bucket
        !gsutil cp checkpoint.tmp {checkpoint_file} && echo $(gsutil cat $checkpoint_file) 
        print(f'{60*"-"}\nRunning inference on {run.model_dir} using checkpoint in {checkpoint_file}\n')
        
        # Run decoder for evaluation and restore checkpoint file
        decoder(run)
        !gsutil cp '{run.get_label()}_checkpoint_backup.tmp' {checkpoint_file}

        # Evaluate using ROUGE
        !rm -rf {OUTPUT_DIR} && mkdir {OUTPUT_DIR}     
        decode_to_file = f'{BUCKET}/generated_summaries/{run.get_label()}_all_outputs.txt'
        !gsutil cp '{decode_to_file}' '{run.get_label()}_{ckpt}_all_outputs.txt'    
        save_to_separate(f"{run.get_label()}_{ckpt}_all_outputs.txt", OUTPUT_DIR, 'file.', output_lim=OUTPUT_LIM)
        
        # Set up ROUGE155
        dict_output = run_rouge_155(run)
        print(f'{60*"_"}\n{run.get_label()}\nCheckpoint: {ckpt}\n\tRouge 1 F-score:\t{dict_output["rouge_1_f_score"]}')
        print(f'\tRouge 2 F-score:\t{dict_output["rouge_2_f_score"]}')
        print(f'\tRouge L F-score:\t{dict_output["rouge_l_f_score"]}\n{60*"-"}')
        dict_output['max_length'] = TEST_MAX_LENGTH
        dict_output['model_max_length'] = hparams_json['max_length']
        dict_output['step'] = ckpt
        dict_output['num_evaluated'] = MAX_TESTS
        dict_output['model'] = run.get_label()
        dict_output['output_limit'] = OUTPUT_LIM
        df = df.append(pd.Series(dict_output), ignore_index=True)
        df = df.drop_duplicates(['model', 'step', 'max_length', 'num_evaluated'], keep='last')
        df.to_csv(f'{results_file}')
        !gsutil cp '{results_file}' '{results_path}'