In [None]:
import transformers, datasets, json 

#### CNN/Daily Mail Dataset

**Source**: [Hugging Face Datasets](https://huggingface.co/datasets/cnn_dailymail)


| **Dataset Split** | **Number of Instances in Split** |
| --- | --- |
| Train	| 287,113 |
| Validation | 13,368 |
| Test | 11,490 |

In [None]:
# Fetch the train / validation / test splits of CNN/Daily Mail (config "3.0.0")
# in a single call; the splits come back in the order they are requested.
train_ds, val_ds, test_ds = datasets.load_dataset(
    "cnn_dailymail",
    name="3.0.0",
    split=["train", "validation", "test"],
)

In [None]:
from transformers import AutoTokenizer

In [None]:
# Preprocessing settings read by the tokenizer setup and by preprocess_data.
configs = dict(
    max_input_embedding_length=512,   # max_length used when tokenizing the prefixed articles
    max_output_embedding_length=128,  # max_length used when tokenizing the highlights
    task_prefix="summarize: ",        # text prepended to every article before tokenizing
    tokenizer="t5-small",             # name passed to AutoTokenizer.from_pretrained
    ignore_ids=-100,                  # replacement value used when masking label ids
)

In [None]:
# Instantiate the pretrained tokenizer whose name is stored in the config.
tokenizer = AutoTokenizer.from_pretrained(configs["tokenizer"])

In [None]:
def preprocess_data(batch):
    """Tokenize one batch of articles and highlights for summarization.

    Reads the notebook-level ``tokenizer`` and ``configs`` objects.

    Args:
        batch: Mapping with an ``'article'`` and a ``'highlights'`` entry,
            each a list of strings of the same length (the shape
            ``Dataset.map(..., batched=True)`` hands to its callback).

    Returns:
        The tokenizer output for the prefixed articles, with an extra
        ``'labels'`` entry holding the tokenized highlights in which every
        padding id has been replaced by ``configs['ignore_ids']``.
    """
    articles, summarizations = batch['article'], batch['highlights']

    # Inputs: task prefix + article, padded/truncated to a fixed length.
    input_tokens = tokenizer(
        [configs['task_prefix'] + article for article in articles],
        max_length=configs['max_input_embedding_length'],
        padding='max_length',
        truncation=True,
    )

    # Targets: the highlights, padded/truncated to their own fixed length.
    label_ids = tokenizer(
        summarizations,
        max_length=configs['max_output_embedding_length'],
        padding='max_length',
        truncation=True,
    ).input_ids

    # The tokenizer returns plain Python lists here, so boolean-mask
    # assignment (`row[row == 0] = x`) does not work: it silently overwrites
    # index 0 instead. Mask padding explicitly, element by element.
    # NOTE(review): configs['ignore_ids'] is presumably the label value the
    # training loss skips -- confirm against the training code.
    pad_id = tokenizer.pad_token_id
    ignore_id = configs['ignore_ids']
    input_tokens['labels'] = [
        [ignore_id if token_id == pad_id else token_id for token_id in sequence]
        for sequence in label_ids
    ]

    return input_tokens
    

In [None]:
# Tokenize each split batch-wise with preprocess_data, dropping the original
# columns so only the fields returned by preprocess_data remain.
encoded_val_ds = val_ds.map(
    preprocess_data,
    batched=True,
    remove_columns=val_ds.column_names,
)
encoded_train_ds = train_ds.map(
    preprocess_data,
    batched=True,
    remove_columns=train_ds.column_names,
)
encoded_test_ds = test_ds.map(
    preprocess_data,
    batched=True,
    remove_columns=test_ds.column_names,
)

In [None]:
import json

def _dump_dataset_as_json(dataset, path):
    """Write every record of ``dataset`` to ``path`` as one JSON array.

    Records are serialized one at a time instead of first collecting the
    whole dataset into a Python list, so memory use stays bounded by a
    single record. The text written is the same as
    ``json.dump(list(dataset), fp)`` with default settings: ``[`` + records
    joined by ``", "`` + ``]``.

    Args:
        dataset: Iterable of JSON-serializable records.
        path: Destination file path; an existing file is overwritten.
    """
    with open(path, 'w') as fp:
        fp.write('[')
        for index, item in enumerate(dataset):
            if index:
                fp.write(', ')  # json.dump's default item separator
            json.dump(item, fp)
        fp.write(']')


# Same output files, written in the same order as before.
_dump_dataset_as_json(encoded_val_ds, 'val_ds_encoded.json')
_dump_dataset_as_json(encoded_test_ds, 'test_ds_encoded.json')
_dump_dataset_as_json(encoded_train_ds, 'train_ds_encoded.json')

#### Last step

- Create a new version of the dataset and store it on Kaggle.
- Here is my pre-processed dataset on Kaggle: [Dataset](https://www.kaggle.com/datasets/eddyvo/t5-base-tokens-cnn-daily)