<a href="https://colab.research.google.com/github/freddejn/summarization-transformer-cnn-dailymail/blob/master/dataset_metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Display information about dataset.

* Runs through the entire dataset and extracts the input and output lengths.
* Stores the data as a dataframe in the GCS bucket at `DATASET_METADATA_PATH/type-metadata.csv`, where `type` is one of `train`, `test` or `eval`.
* The path to the training data is specified by `DATA_DIR`.

In [0]:
# Imports
import tensorflow as tf 
import pandas as pd 
import numpy as np 
# Install/upgrade tensor2tensor before it is imported just below.
# NOTE(review): the version is unpinned, so re-runs may pick up a different release.
!pip install -q -U tensor2tensor
from tensor2tensor import problems 
from tensor2tensor.data_generators import problem
from google.colab import auth
# Interactive Colab sign-in; presumably what lets the gcloud/gsutil commands
# in this notebook reach the bucket -- TODO confirm.
auth.authenticate_user()

# Google Cloud project that the gcloud CLI is pointed at.
PROJECT_ID = 'transformer-233711'
!gcloud config set project {PROJECT_ID}
# Bucket name from which the data and metadata locations below are built.
BUCKET = 'tensor2tensor-test-bucket'

# Problem name that a later cell looks up with problems.problem().
PROBLEM_NAME = 'summarize_cnn_dailymail32k'
# Alias for the estimator mode keys (not referenced again in the cells shown here).
MODES = tf.estimator.ModeKeys
# Directory handed to the problem's feature_encoders() and dataset() calls.
DATA_DIR = f'gs://{BUCKET}/data'
# Prefix for the per-split metadata CSVs. Later cells append file names
# directly to this string, so the trailing slash is required.
DATASET_METADATA_PATH = f'gs://{BUCKET}/metadata/'

# Turn on eager execution; later cells iterate the datasets with tfe.Iterator.
tfe = tf.contrib.eager 
tfe.enable_eager_execution()

In [0]:
# Runs through dataset and stores in GCS, checks if already exists on bucket
# before running.
def data_exists(file, path):
    """Return True if ``file`` is listed directly under the GCS prefix ``path``.

    Args:
        file: Object name to look for, e.g. ``'train-metadata.csv'``.
        path: Prefix handed to ``gsutil ls``, e.g. ``'gs://bucket/metadata/'``.
            A trailing slash is optional.

    Returns:
        bool: True when the listing printed by ``gsutil ls path`` contains the
        full object URL. False otherwise, including when the listing prints
        nothing on stdout (for example because the command failed).

    Raises:
        FileNotFoundError: If the ``gsutil`` executable is not on ``PATH``.
    """
    import subprocess  # local import so the function does not rely on another cell

    # Arguments are passed as a list (no shell), so nothing in ``path`` is
    # ever interpreted by a shell.
    listing = subprocess.run(
        ['gsutil', 'ls', path],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
    )
    # Join with exactly one separator so a prefix without a trailing slash
    # still matches the URLs that gsutil prints.
    target = path.rstrip('/') + '/' + file
    return target in listing.stdout.splitlines()

summary_problem = problems.problem(PROBLEM_NAME) 
encoders = summary_problem.feature_encoders(DATA_DIR)

example_iterators = { 'eval': tfe.Iterator(summary_problem.dataset(problem.DatasetSplit.EVAL, DATA_DIR)),
                      'test': tfe.Iterator(summary_problem.dataset(problem.DatasetSplit.TEST, DATA_DIR)),
                      'train':tfe.Iterator(summary_problem.dataset(problem.DatasetSplit.TRAIN, DATA_DIR))}

for key, example_iterator in example_iterators.items():
    print(f'Extracting metadatat {key}')
    tmp_file = f'{key}-metadata.csv'
    input_shapes = []
    output_shapes = []
    if data_exists(tmp_file, DATASET_METADATA_PATH):
        print(f'{key} data already exists')
        continue
    for example in example_iterator:
        input_shapes.append(example['inputs'].shape[0])
        output_shapes.append(example['targets'].shape[0])
    data = pd.DataFrame({'input_shape':input_shapes, 'output_shape':output_shapes})
    data.to_csv(tmp_file)
    !gsutil cp $tmp_file $DATASET_METADATA_PATH
    display(data.head())

In [0]:
# Copy files locally if not already done
!gsutil cp $DATASET_METADATA_PATH'eval-metadata.csv' 'eval-metadata.csv'
!gsutil cp $DATASET_METADATA_PATH'test-metadata.csv' 'test-metadata.csv'
!gsutil cp $DATASET_METADATA_PATH'train-metadata.csv' 'train-metadata.csv'

In [0]:
# Display length of each dataset.
# Read only the two length columns. The extraction cell calls to_csv() with
# its defaults, which writes the row index as an unnamed first column; loading
# it would add a spurious 'Unnamed: 0' column to the shapes and means below.
METADATA_COLUMNS = ['input_shape', 'output_shape']
eval_data = pd.read_csv('eval-metadata.csv', usecols=METADATA_COLUMNS)
test_data = pd.read_csv('test-metadata.csv', usecols=METADATA_COLUMNS)
train_data = pd.read_csv('train-metadata.csv', usecols=METADATA_COLUMNS)

# Number of examples (rows) per split and overall.
eval_articles = eval_data.shape[0]
test_articles = test_data.shape[0]
train_articles = train_data.shape[0]
total_articles = eval_articles + train_articles + test_articles

print(100*'-' + '\n')
print(f'Number of train samples: {train_articles:,} ({train_articles/total_articles:.1%}) \n' 
        f'Number of evaluation samples {eval_articles:,}({eval_articles/total_articles:.1%})\n'
        f'Number of test samples: {test_articles:,} ({test_articles/total_articles:.1%})\n'
        f'Total samples {total_articles:,} ({total_articles/total_articles:0.0%})\n') 

# Shape plus the first rows of every split, then the mean lengths of the
# training split.
display(f'Eval: {eval_data.shape}',eval_data.head(5))
display(f'Test: {test_data.shape}', test_data.head(5))
display(f'Train: {train_data.shape}', train_data.head(5))
display(train_data.mean())

In [0]:
# Print bin sizes and display min and max lengths of data.
import numpy as np

# All three splits stacked into one frame.
all_data = pd.concat([eval_data, test_data, train_data], axis=0)

avg_in, avg_out = all_data.input_shape.mean(), all_data.output_shape.mean()
min_in, min_out = all_data.input_shape.min(), all_data.output_shape.min()
max_in, max_out = all_data.input_shape.max(), all_data.output_shape.max()

# Right-closed bin edges. The last edge is widened when the data is longer
# than 9999 so that no example silently falls outside every bin.
bin_split = [0,128,256,512,1024,2048,4096,9999]
longest = max(max_in, max_out)
if longest > bin_split[-1]:
    bin_split[-1] = int(longest)

# observed=False is passed explicitly so empty bins are kept and both counts
# share the same index; newer pandas deprecates relying on the implicit default.
input_binned = all_data.groupby(
    pd.cut(all_data['input_shape'], bins=bin_split), observed=False).size()
output_binned = all_data.groupby(
    pd.cut(all_data['output_shape'], bins=bin_split), observed=False).size()

bins = pd.DataFrame({'num_input': input_binned, 'num_output': output_binned})
bins.index.name = 'Bins'

# Share of examples per bin and the running total across bins.
bins['fraction_input'] = bins.num_input/bins.num_input.sum()
bins['fraction_output'] = bins.num_output/bins.num_output.sum()
bins['cum_fraction_input'] = bins.fraction_input.cumsum()
bins['cum_fraction_output'] = bins.fraction_output.cumsum()
display(bins)
print(f'Average len input: {avg_in:.2f}\n'
      f'Average len output: {avg_out:.2f}\n'
      f'Max len input: {max_in}\n'
      f'Max len output: {max_out}\n'
      f'Min len input: {min_in}\n'
      f'Min len output: {min_out}\n')

In [0]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from matplotlib.ticker import PercentFormatter
from matplotlib.ticker import FuncFormatter

import matplotlib.pyplot as plt
import matplotlib as mlt
colors = ['#3498db', '#95a5a6']

fig, ax = plt.subplots(2,1, figsize=(13,8))
for i, a in enumerate(ax):
    a.spines['top'].set_visible(False)
    a.spines['right'].set_visible(False)
    a.set_ylabel('Fraction (‰)', fontsize=18)
    a.tick_params(labelsize=18)
    a.yaxis.set_major_formatter(FuncFormatter(lambda x, pos: "%.1f" % (x*1000)))

ax[0].hist(all_data.input_shape, bins=range(0,3072,32), density=True, \
           label='input', alpha=0.6, color=colors[0], edgecolor='black')
ax[0].set_xticks(range(0,2560,512))
ax[0].set_xlim(right=2500, left=0)
ax[0].set_xlabel('Input Length', fontsize=18)

ax[1].hist(all_data.output_shape, bins=range(0,256,2), density=True, \
           label='targets', alpha=0.6, color=colors[1], edgecolor='black')
ax[1].set_xlim(right=170, left=0)
ax[1].set_xticks(range(0,192,32))
ax[1].set_xlabel('Target Length', fontsize=18)

fig.tight_layout()
plt.savefig('data_histogram.pdf')
plt.show()