In [1]:
import os

# Select GPUs before any CUDA-aware library is imported: order devices by
# PCI bus id (see issue #152) and expose only devices 2 and 3 to this process.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "2,3",
    }
)


In [33]:
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer

import torch
from torch.utils.data import DataLoader

from datasets import load_dataset, load_from_disk



In [3]:
# Checkpoint identifier; the tokenizer cell loads from the same name.
base_model = "decapoda-research/llama-7b-hf"
# Forwarded unchanged as the `device_map` argument of `from_pretrained`.
device_map = "auto"

# Loading options collected in one place so they are easy to tweak.
model_load_kwargs = dict(
    torch_dtype=torch.float16,  # load the weights in half precision
    device_map=device_map,
    # load_in_8bit=True,
)
model = LlamaForCausalLM.from_pretrained(base_model, **model_load_kwargs)


Loading checkpoint shards: 100%|██████████| 33/33 [00:10<00:00,  3.16it/s]


In [4]:
tokenizer = LlamaTokenizer.from_pretrained(base_model)
# Pad on the left so batched inference works.
tokenizer.padding_side = "left"
# Pad with id 0 (unk): the pad token must differ from the eos token.
tokenizer.pad_token_id = 0


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [22]:
def generate_and_tokenize_prompt(prompt, tok=None):
    """Tokenize one chat example and build labels that ignore the user turn.

    The text before the "### Assistant:" marker is treated as the user part:
    its token positions get the label -100, so only the remaining tokens
    (the marker and everything after it) keep their ids as labels.

    Args:
        prompt: A dataset row (mapping) whose "prompt" entry is the full
            conversation text containing the "### Assistant:" marker.
        tok: Optional tokenizer callable. When omitted, the notebook-level
            `tokenizer` is used, so existing `dataset.map(...)` calls are
            unaffected.

    Returns:
        The tokenizer output for the full text, with an extra "labels" entry
        that has the same length as "input_ids".

    Raises:
        ValueError: If the text does not contain the "### Assistant:" marker.
    """
    tok = tokenizer if tok is None else tok
    text = prompt["prompt"]

    marker = "### Assistant:"
    marker_pos = text.find(marker)
    if marker_pos < 0:
        raise ValueError(f"prompt is missing the {marker!r} marker")

    tokenized_prompt = tok(
        text,
        max_length=1024,
        truncation=True,
    )
    # Tokenize the user part with the SAME settings as the full text.
    # Previously this call passed add_special_tokens=False while the full text
    # used the default, so the prefix count did not include the leading
    # special token and the mask ended one position too early.
    tokenized_user_prompt = tok(
        text[:marker_pos],
        max_length=1024,
        truncation=True,
    )

    input_ids = tokenized_prompt["input_ids"]
    # Clamp so that labels can never become longer than input_ids.
    prompt_len = min(len(tokenized_user_prompt["input_ids"]), len(input_ids))

    tokenized_prompt["labels"] = [-100] * prompt_len + input_ids[prompt_len:]
    return tokenized_prompt


In [23]:
# Local JSON file with the pre-formatted instruction prompts.
data_files = {
    "train": "/home/kosenko/ru_chatGPT/ru_instruct/ru_instruct/sandbox/datasets_processed/IlyaGusev_ru_turbo_alpaca_formatted.json"
}
data = load_dataset(
    "json",
    data_files=data_files,
)

# We have no adequate way to measure the model anyway,
# so why spend data on validation?

# Fixed seed so the shuffled row order is the same on every run.
SHUFFLE_SEED = 42

train_data = (
    data["train"]
    .shuffle(seed=SHUFFLE_SEED)
    .map(
        # Not batched: generate_and_tokenize_prompt handles one row at a time.
        generate_and_tokenize_prompt,
        num_proc=32,
    )
)


Found cached dataset json (/home/kosenko/.cache/huggingface/datasets/json/default-8fca4c8833304c51/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
100%|██████████| 1/1 [00:00<00:00, 477.06it/s]
                                                                                  

In [24]:
# Print one raw prompt from the unshuffled data to check the text format.
sample_row = data["train"][1000]
print(sample_row["prompt"])




A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.

### Human: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Как вы относитесь к конкуренции на рабочем месте?

### Response:
### Assistant:
Я считаю, что конкуренция на рабочем месте может быть полезной, если она справедливая и здоровая. Конкуренция может стимулировать улучшение работы и повышение производительности. Однако, я также считаю, что важно сохранять здоровую рабочую среду и отношения с коллегами. Я всегда стараюсь помогать своим коллегам и делиться своими знаниями и опытом, чтобы мы могли достигать успеха вместе.




In [25]:
# Display the tokenized dataset summary (columns and row count).
train_data


Dataset({
    features: ['prompt', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 75368
})

In [26]:
# Round-trip check: print the raw text of the first row, then the text
# obtained by decoding that row's token ids.
first_row = train_data[0]
print(first_row["prompt"])
print(tokenizer.decode(first_row["input_ids"]))




A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.

### Human: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Я люблю гулять в парке.
Переведи следующее высказывание на немецкий язык.
### Response:
### Assistant:
Eingabe: Ich liebe es, im Park spazieren zu gehen.


<unk>

A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.

### Human: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Я люблю гулять в парке.
Переведи следующее высказывание на немецкий язык.
### Response:
### Assistant:
Eingabe: Ich liebe es, im Park spazieren zu gehen.




In [None]:
# Drop the raw text column so only the tokenized fields remain. The guard
# makes the cell safe to re-run: after the first run the column is already
# gone and an unconditional remove_columns("prompt") would fail.
if "prompt" in train_data.column_names:
    train_data = train_data.remove_columns("prompt")

# Hold out 2% of the rows; the fixed seed makes the split reproducible.
splited_dataset = train_data.train_test_split(
    train_size=0.98,
    test_size=0.02,
    seed=42,
)

In [42]:
# Display the train/test split summary (columns and row counts per split).
splited_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 73860
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1508
    })
})

In [47]:

# Collator that pads every batch (length rounded up to a multiple of 8)
# and returns PyTorch tensors.
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

# Shuffled batches of 16 training rows, collated with the padder above.
dataloader = DataLoader(
    splited_dataset["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
    pin_memory=True,
)

In [48]:
# Pull a single batch to inspect the collated input_ids / attention_mask / labels tensors.
next(iter(dataloader))

{'input_ids': tensor([[    0,     0,     0,  ..., 29973,    13,    13],
        [    0,     0,     0,  ..., 29973,    13,    13],
        [    0,     0,     0,  ..., 29991,    13,    13],
        ...,
        [    0,     0,     0,  ...,  7337,    13,    13],
        [    0,     0,     0,  ..., 29889,    13,    13],
        [    0,     0,     0,  ..., 29889,    13,    13]]), 'attention_mask': tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([[ -100,  -100,  -100,  ..., 29973,    13,    13],
        [ -100,  -100,  -100,  ..., 29973,    13,    13],
        [ -100,  -100,  -100,  ..., 29991,    13,    13],
        ...,
        [ -100,  -100,  -100,  ...,  7337,    13,    13],
        [ -100,  -100,  -100,  ..., 29889,    13,    13],
        [ -100,  -100,  -100,  ..., 29889,    13,    13]])}