In [1]:
# Imports
import numpy as np
import os
import tiktoken

from datasets import load_dataset # huggingface datasets
from tqdm import tqdm

In [2]:
# Fetch the CNN/DailyMail corpus (configuration "3.0.0") via huggingface datasets
split_dataset = load_dataset(path="abisee/cnn_dailymail", name="3.0.0")

# Show the schema and row count of the training split
split_dataset["train"]

Dataset({
    features: ['article', 'highlights', 'id'],
    num_rows: 287113
})

In [3]:
# Build a GPT-2 BPE encoder extended with two extra special tokens:
#   <|sum|>  marks the start of the summary (highlights) section
#   <|core|> marks the start of the article section
enc = tiktoken.get_encoding("gpt2")

# Token ids assigned to the new markers
sum_tokens = {
    "<|sum|>": 50257,
    "<|core|>": 50258,
}

# Keep every special token the base encoding already defines, then add ours
merged_special_tokens = dict(enc._special_tokens)
merged_special_tokens.update(sum_tokens)

# A distinct name is used because the special-token set differs from stock "gpt2"
custom_enc = tiktoken.Encoding(
    name="gpt2_custom",
    pat_str=enc._pat_str,
    mergeable_ranks=enc._mergeable_ranks,
    special_tokens=merged_special_tokens,
)

# Convenience handles for the two marker ids
sumtok, coretok = sum_tokens["<|sum|>"], sum_tokens["<|core|>"]

In [7]:
# Encoding function (gpt2 bpe)
def process(example):
    """Turn one dataset record into a single flat list of token ids.

    The returned sequence is laid out as:

        <|sum|>  highlights tokens  <|core|>  article tokens  <end-of-text>

    Args:
        example: a record exposing 'article' and 'highlights' text fields.

    Returns:
        A list of token ids in the layout above.
    """
    # encode_ordinary does not emit special-token ids for text found in the
    # input, so marker-like strings inside the raw text stay ordinary tokens.
    article_ids = custom_enc.encode_ordinary(example['article'])
    highlight_ids = custom_enc.encode_ordinary(example['highlights'])

    # The end-of-text token (e.g. 50256 for gpt2 bpe) is placed last, after the
    # article. NOTE(review): the original author was unsure whether it should
    # be prepended instead of appended; it is kept appended here.
    return [sumtok, *highlight_ids, coretok, *article_ids, custom_enc.eot_token]

In [8]:
# Sanity check: encode the first training record and decode it back to text
first_example = split_dataset["train"][0]
first_example_ids = process(first_example)
custom_enc.decode(first_example_ids)

'<|sum|>Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .\nYoung actor says he has no plans to fritter his cash away .\nRadcliffe\'s earnings from first five Potter films have been held in trust fund .<|core|>LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 p