In [3]:
from datasets import load_dataset, concatenate_datasets
bookcorpus = load_dataset("bookcorpus", split="train")

In [141]:
import re
import random
from copy import deepcopy

def fn_remove_special_tokens(seq_origin):
    seq_target = re.sub(r'[^a-zA-Z0-9 ]', ' ', seq_origin)
    seq_target = re.sub(r'\d+', 'number', seq_target)
    seq_target = re.sub(r' +', ' ', seq_target)
    return seq_target.lstrip().rstrip()
# end

def train_test_split(index_all, rate=0.15):
    index_all = deepcopy(index_all)
    random.shuffle(index_all)
    index_train, index_test = index_all[:int(len(index_all) * rate)], index_all[int(len(index_all) * rate):]
    return index_train, index_test
# end


def create_random_index_isnext(len_all, rate_selected=0.5):
    list_index = [i for i in range(len_all-1)]
    random.shuffle(list_index)
    list_index_isnext = list_index[:int(len(list_index) * rate_selected)]
    pairs_target = [(i, i+1, 1) for i in list_index_isnext]
    return pairs_target
# end

def create_random_index_notnext(len_all, rate_selected=0.5):

    list_index_a = [i for i in range(len_all)][:int(len_all * rate_selected)]
    list_index_b = [i for i in range(len_all)][:int(len_all * rate_selected)]

    random.shuffle(list_index_a)
    random.shuffle(list_index_b)

    pairs_target = [(a, b, 0) for a, b in zip(list_index_a, list_index_b) if abs(a - b) > 1]

    return pairs_target
# end

#   pairs_index_mixed = random.shuffle(deepcopy(pairs_index_mixed)) should be shuffled before for train/eval
def select_pair_from_origin(dataset_train, pairs_index_mixed, size_batch=2):

    index_end_mixed = len(pairs_index_mixed) - len(pairs_index_mixed) % size_batch
    pairs_index_target = pairs_index_mixed[:index_end_mixed]
    n_batches = int(len(pairs_index_target) / size_batch)
    print('size_batch: {}, len_origin: {}, index_end_mixed: {}, n_batches: {}'.format(size_batch, len(pairs_index_mixed), index_end_mixed, n_batches))

    for i_batches in range(n_batches):
        index_batch_start = i_batches
        index_batch_end = i_batches + size_batch
        pairs_batch_current = pairs_index_target[index_batch_start:index_batch_end]

        pairs_sentences = [(
            fn_remove_special_tokens(dataset_train[pair_batch_current[0]]),
            fn_remove_special_tokens(dataset_train[pair_batch_current[1]]),
            pair_batch_current[-1])
                for pair_batch_current in pairs_batch_current
        ]
        
        labels_isnext = [pair_batch_current[-1] for pair_batch_current in pairs_batch_current]

        yield pairs_sentences, labels_isnext
    # end
# end

In [142]:
corpus_train = bookcorpus[:200]['text']
index_all_mixed = create_random_index_isnext(len(corpus_train)) + create_random_index_notnext(len(corpus_train))
random.shuffle(index_all_mixed)

In [143]:
for a,b in select_pair_from_origin(bookcorpus[:200]['text'], deepcopy(index_all_mixed), 5):
    print(a,b)

size_batch: 5, len_origin: 191, index_end_mixed: 190, n_batches: 38
[('megan pursed her lips at the prospect', 'widowers usually fell into two categories those who were still devastated by their wives deaths or those who were ready to have fun and live a little', 1), ('instead she decided that she would become a nurse which would fulfill her need to care for sick people', 'his platinum blond hair and blue eyes were completely hers', 0), ('as they started into the church her mother reached for mason', 'after that he had n t been interested in any of the pictures and emails megan sent', 0), ('he had worn it and now it was being passed down to his son', 'casey snorted', 1), ('while it boasted a sweetheart neckline the hemline fell just below her knees', 'she put on her pearls a high school graduation gift from her uncle aidan or ankle as she often called him', 1)] [1, 0, 0, 1, 1]
[('instead she decided that she would become a nurse which would fulfill her need to care for sick people', 'h