In [1]:
import torch
from torch.utils.data import RandomSampler, DataLoader
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from qlora_train import create_prompt_dataset_v2
import math

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
model_name = "togethercomputer/RedPajama-INCITE-7B-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer._pad_token is None:
    DEFAULT_PAD_TOKEN = "[PAD]"
    special_tokens_dict = dict(pad_token=DEFAULT_PAD_TOKEN)
    tokenizer.add_special_tokens(special_tokens_dict)

datasets = "chip2_instruct_alpha_prompt_en_v2_clean_v1 chip2_instruct_alpha_prompt_ru_v2_clean_v1 dolly_original_prompt_v2 dolly_translated_prompt_v2_clean_v1 openass_prompt_dataset_en_v2_clean_v1 openass_prompt_dataset_ru_v2_clean_v1".split()

train, valid = create_prompt_dataset_v2(
    datasets_names=datasets,
    tokenizer=tokenizer,
    max_seq_len=2048,
    output_path="./datasets/",
    seed=1234,
)

generator = torch.Generator()
generator.manual_seed(1234)
train_dataset = train
sampler = RandomSampler(train_dataset, generator=generator)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    max_length=2048,
)


def collator(x):
    features_map = data_collator(x)
    return features_map


data_loader = DataLoader(
    train_dataset, batch_size=1, sampler=sampler, collate_fn=collator
)


# the length of the dataset in tokens
L_t = 0
#  the length of the dataset in UTF-8 encoded bytes.
L_b = 0
# cross entropy loss
loss = 1.095

for i, item in zip(range(20000), data_loader):
    # print(item)
    tokens = item["input_ids"][0]
    L_t += len(tokens)
    decoded_str = tokenizer.decode(tokens)
    L_b += len(decoded_str)
    # print(decoded_str)


def tokens_per_byte(
    L_t=None,
    L_b=None,
    loss=None,
):
    return (L_t / L_b) * loss / math.log2(2)


tokens_per_byte(L_t=L_t, L_b=L_b, loss=loss)

Loading cached split indices for dataset at /home/kosenko/deepspeed/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/datasets/prompt_datasets/chip2_instruct_alpha_prompt_en_v2_clean_v1/cache-1c4c7215bfe16982.arrow and /home/kosenko/deepspeed/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/datasets/prompt_datasets/chip2_instruct_alpha_prompt_en_v2_clean_v1/cache-54085008405851ef.arrow


fname 00430737cbd813b41bcf80abc44368ad9ac5133372d1292b5c2ce034188cdabd
cache_found False


Loading cached split indices for dataset at /home/kosenko/deepspeed/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/datasets/prompt_datasets/chip2_instruct_alpha_prompt_ru_v2_clean_v1/cache-c584f28b24e37d96.arrow and /home/kosenko/deepspeed/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/datasets/prompt_datasets/chip2_instruct_alpha_prompt_ru_v2_clean_v1/cache-b1db1d1113bc4791.arrow
Loading cached split indices for dataset at /home/kosenko/deepspeed/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/datasets/prompt_datasets/dolly_original_prompt_v2/cache-309ac6eb7efff8cc.arrow and /home/kosenko/deepspeed/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/datasets/prompt_datasets/dolly_original_prompt_v2/cache-93d5e78eeef1a509.arrow
Loading cached split indices for dataset at /home/kosenko/deepspeed/DeepSpeedExamples/applications/DeepSpeed-Chat/training/s

0.37572527983423637