In [None]:
import logging
import warnings
from typing import Any, Dict, List, Optional, Union

import torch
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, AutoTokenizer

# Module-level logger (not referenced by the cells visible in this notebook).
log = logging.getLogger(__name__)

# HuggingFace hardcodes the ignore index to -100
# Used below to fill the prompt and padding positions of `labels`.
_HF_IGNORE_INDEX = -100

In [None]:
import os

# Checkpoint whose tokenizer is used to template and tokenize the dataset.
model_id = 'vilm/vinallama-7b-chat'

# SECURITY: never commit an access token to a notebook. The token is read from
# the HF_TOKEN environment variable instead; when it is unset, `None` is passed
# and the library falls back to its default credential lookup.
# NOTE(review): the token that was previously hard-coded here should be revoked.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    token=os.environ.get('HF_TOKEN'),
    cache_dir='../cache',
)

In [None]:
# vocab = tokenizer.get_vocab()
# import json 
# with open('tokenizer.json', 'w') as f:
#     json.dump(vocab, f, indent=4, ensure_ascii=False)

In [None]:
# Display the tokenizer's special-token mapping as the cell output.
tokenizer.special_tokens_map

VinaLLaMA

In [None]:
# Install a ChatML-style Jinja chat template on the tokenizer.
# Each message is rendered as "<|im_start|>{role}\n{content}<|im_end|>\n".
# When `add_generation_prompt` is true, an opening "<|im_start|>assistant\n"
# is appended after the last message.
# The Python strings are not raw, so each '\n' below reaches Jinja as a real
# newline character inside the Jinja string literal.
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}"
    "{{ '<|im_start|>assistant\n' }}"
    "{% endif %}"
)

In [None]:
def gen_prompt(question, context=None):
    """Build the user-turn text for a question, optionally preceded by context.

    Args:
        question: The question text.
        context: Supporting passage; when it is exactly ``None`` the question
            is returned unchanged.

    Returns:
        The question alone, or the context and question wrapped in the
        Vietnamese "related information / please answer" headers.
    """
    if context is not None:
        return f'### Đây là những thông tin liên quan:\n{context}\n### Hãy trả lời câu hỏi:\n{question}'
    return question

# def format_prompt(sample):
#     context, question = sample['context'], sample['question']
#     human_prompt = gen_prompt(question, context)

#     system =  "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."

#     return f"{system} USER: {human_prompt} ASSISTANT:", sample['answer']
def format_prompt_vinallama(sample):
    """Render one QA sample as a (prompt, answer) pair using the chat template.

    Args:
        sample: Mapping with 'context', 'question' and 'answer' entries.

    Returns:
        A tuple ``(prompt_text, answer)``. ``prompt_text`` is the untokenized
        chat-template rendering of a system turn plus the user turn, with the
        generation prompt appended; ``answer`` is ``sample['answer']``.
    """
    context = sample['context']
    question = sample['question']

    conversation = [
        {
            'role': 'system',
            'content': 'Bạn là một trợ lí AI hữu ích. Hãy trả lời người dùng một cách chính xác.\n',
        },
        {
            'role': 'user',
            'content': gen_prompt(question, context),
        },
    ]

    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )
    return rendered, sample['answer']

In [None]:
def template_data(sample):
    """Attach the templated 'prompt' and its 'response' to a sample.

    The sample is updated in place and also returned, so the function can be
    passed directly to ``Dataset.map``.
    """
    sample['prompt'], sample['response'] = format_prompt_vinallama(sample)
    return sample

In [None]:
def tokenize_data(sample):
    """Tokenize one templated sample.

    The prompt is passed as ``text`` and the response as ``text_target``.

    Args:
        sample: Mapping with 'prompt' and 'response' entries.

    Returns:
        Whatever the tokenizer call returns for this prompt/response pair.

    Raises:
        Exception: Any error raised by the tokenizer is re-raised after the
            offending sample has been printed for debugging.
    """
    try:
        # Tokenize exactly once; the previous version ran the tokenizer twice
        # per sample (once to probe for errors, once for the result).
        return tokenizer(text=sample['prompt'], text_target=sample['response'])
    except Exception:
        # Show which sample failed, then propagate the original error instead
        # of re-running the failing call.
        print('-----------------')
        print(sample['prompt'])
        print(sample['response'])
        raise

In [None]:
def ensure_list(x: Union[List, torch.Tensor]) -> List:
    """Return ``x`` as a flat Python list.

    Args:
        x: Either a list (returned as-is) or a tensor of any shape.

    Returns:
        The input list itself, or the flattened tensor's elements as plain
        Python scalars.

    Raises:
        AssertionError: If ``x`` is neither a list nor a ``torch.Tensor``.
    """
    if isinstance(x, torch.Tensor):
        # `list(tensor)` would produce a list of 0-d tensors, which then leak
        # into later comparisons and concatenations; `.tolist()` converts the
        # elements to native Python numbers.
        x = x.flatten().tolist()
    assert isinstance(x, list), f'expected a list or torch.Tensor, got {type(x).__name__}'
    return x

In [None]:
def _process_and_batch_decoder_only(examples, max_seq_len=2048):
    """Turn a batch of tokenized prompt/response pairs into padded training features.

    For every pair, the prompt tokens (``input_ids``) and response tokens
    (``labels``) are concatenated into a single sequence. The label sequence is
    masked with ``_HF_IGNORE_INDEX`` over the prompt and over the padding.

    Args:
        examples: Mapping with 'input_ids' and 'labels' entries, each an
            iterable of per-example token-id sequences (lists or tensors).
        max_seq_len: Length every output sequence is padded to. Examples whose
            prompt + response length is greater than or equal to this value
            are skipped (not truncated) with a warning.

    Returns:
        The result of ``tokenizer.pad(..., return_tensors='pt')`` with
        'input_ids', 'labels' and 'attention_mask'. If every example was
        skipped, a plain dict with the same keys holding empty
        ``(0, max_seq_len)`` integer tensors is returned instead.
    """
    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id

    processed_examples = []
    for context, target in zip(examples['input_ids'], examples['labels']):
        # Step 1: normalise to lists and drop any padding tokens.
        # NOTE(review): if the tokenizer's pad id equals its EOS id, this also
        # strips genuine EOS tokens from the prompt -- confirm they differ.
        context = [t for t in ensure_list(context) if t != pad_id]
        target = [t for t in ensure_list(target) if t != pad_id]

        # An empty target carries no training signal and would make the
        # `target[-1]` check below raise IndexError.
        if not target:
            warnings.warn(
                'Skipping example because its target is empty after removing padding tokens')
            continue

        # Step 2: ensure that the target ends with an EOS token.
        if target[-1] != eos_id:
            target = target + [eos_id]

        n_context = len(context)
        n_target = len(target)
        n_total = n_context + n_target

        # Step 3: over-length examples are skipped rather than truncated.
        # An example whose length equals max_seq_len exactly is skipped too.
        if n_total >= max_seq_len:
            warnings.warn(
                f'Skipping example, total length of context and target is {n_total}')
            continue

        # Step 4: build the features. Only the target positions contribute
        # labels; the prompt positions are masked out.
        labels = ([_HF_IGNORE_INDEX] * n_context) + target

        # `tokenizer.pad` pads input_ids and attention_mask itself, so only
        # labels are padded by hand here, on the tokenizer's padding side.
        label_pad = [_HF_IGNORE_INDEX] * (max_seq_len - n_total)
        if tokenizer.padding_side == 'left':
            labels = label_pad + labels
        else:
            labels = labels + label_pad

        processed_examples.append({
            'input_ids': context + target,
            'labels': labels,
            'attention_mask': [1] * n_total,
        })

    if not processed_examples:
        # `tokenizer.pad` cannot handle an empty list; return an empty batch
        # with the same keys so the caller simply gets zero rows.
        empty = torch.empty((0, max_seq_len), dtype=torch.long)
        return {
            'input_ids': empty,
            'labels': empty.clone(),
            'attention_mask': empty.clone(),
        }

    batch = tokenizer.pad(
        processed_examples,
        padding='max_length',
        max_length=max_seq_len,
        return_tensors='pt',
    )

    return batch

In [None]:
from datasets import load_dataset, Dataset 
import os 
# dataset = load_dataset('csv', data_files='/home4/tuannd/llm-training/Data_Vi_QA_v1.1/QA_Uni/Chitchat_HUST_train.csv', split='train')
# dataset[0]
import pandas as pd

# Earlier variant, kept for reference: concatenate every CSV found in
# data_dir and drop rows that have no answer.
# data_dir = '/home4/tuannd/llm-training/data/train_v2/final'
# data_files = os.listdir(data_dir)
# # data_files = ['/home4/tuannd/llm-training/data/Data_Vi_QA_v1.1/QA_Uni/Chitchat_HUST_train.csv',
# #               '/home4/tuannd/llm-training/data/Data_Vi_QA_v1.1/QA_Uni/hust_no_ans.csv',
# #               '/home4/tuannd/llm-training/data/Data_Vi_QA_v1.1/QA_Uni/Uni-QA(08_12_2023).csv']
# print(data_files)
# all_df = pd.concat([pd.read_csv(data_dir + '/' + f) for f in data_files])
# all_df = all_df.dropna(subset=['answer'])

# Current variant: read the single training CSV into pandas.
# NOTE(review): this is a machine-specific absolute path; consider a
# configurable data directory.
all_df = pd.read_csv('/home4/tuannd/llm-training/data/train_v2/final/train_v2.csv')
# len(all_df)
# Wrap the frame as a Dataset and drop the 'type' column from the Dataset
# only; `all_df` still has it.
dataset = Dataset.from_pandas(all_df)
dataset = dataset.remove_columns(['type'])
dataset

In [None]:
# Count how many rows of the source frame fall under each 'type' value.
all_df['type'].value_counts()

In [None]:
# Keep only samples whose answer has more than MIN_ANSWER_WORDS
# whitespace-separated words. Non-string answers (e.g. missing values) are
# dropped rather than crashing on `.split()`.
MIN_ANSWER_WORDS = 10
dataset = dataset.filter(
    lambda x: isinstance(x['answer'], str) and len(x['answer'].split()) > MIN_ANSWER_WORDS
)

In [None]:
# Add the templated 'prompt' and 'response' columns via `template_data`, then
# drop the raw 'question', 'context' and 'answer' columns.
processed_dataset = dataset.map(template_data, remove_columns=['question', 'context', 'answer'])

In [None]:
import random 
# Spot-check one random templated sample.
# `random.randint` includes its upper bound, so `randint(0, len(ds))` could
# produce an out-of-range index; `randrange(len(ds))` draws from [0, len).
i = random.randrange(len(processed_dataset))
print(processed_dataset[i]['prompt'])
print('-----------------')
print(processed_dataset[i]['response'])

In [None]:
# Tokenize each sample individually (batched=False) with `tokenize_data`, then
# drop the raw 'prompt' and 'response' text columns. Later cells read the
# resulting 'input_ids' and 'labels' columns.
tokenized_dataset = processed_dataset.map(tokenize_data, batched=False, remove_columns=['prompt', 'response'])
# tokenized_dataset[0]

In [None]:
# Maximum sequence length in tokens: used by the length filters below and
# passed as `max_seq_len` to `_process_and_batch_decoder_only`.
CONTEXT_LENGTH = 2048

In [None]:
# Preview how many samples survive the length filter. The predicate uses the
# same strict `<` comparison as the filter that is actually applied to
# `tokenized_dataset` later, so the count shown here matches what is kept.
len(tokenized_dataset.filter(lambda x: len(x['input_ids']) + len(x['labels']) + 2 < CONTEXT_LENGTH))

In [None]:
import numpy as np

# Combined prompt + response token count of every sample, followed by the
# mean, standard deviation, maximum and minimum of those counts.
lengths = [
    len(example['input_ids']) + len(example['labels'])
    for example in tokenized_dataset
]
np.mean(lengths), np.std(lengths), np.max(lengths), np.min(lengths)

In [None]:
# Keep only samples whose prompt + response length, plus a 2-token margin,
# is strictly below CONTEXT_LENGTH; then show how many samples remain.
tokenized_dataset = tokenized_dataset.filter(lambda x: len(x['input_ids']) + len(x['labels']) + 2 < CONTEXT_LENGTH)
len(tokenized_dataset)

In [None]:
# tokenizer.pad_token_id = 3
# Show the tokenizer's pad token, then pad on the right.
# `_process_and_batch_decoder_only` reads `tokenizer.padding_side` to decide
# on which side the labels are padded.
print(tokenizer.pad_token)
tokenizer.padding_side = 'right'

In [None]:
from functools import partial
# Build the final training features batch by batch (batched=True), with
# `max_seq_len` bound to CONTEXT_LENGTH.
final_dataset = tokenized_dataset.map(partial(_process_and_batch_decoder_only, max_seq_len=CONTEXT_LENGTH), batched=True)

In [None]:
# Persist the processed dataset to disk.
# NOTE(review): machine-specific absolute path; consider making it configurable.
final_dataset.save_to_disk('/home4/tuannd/llm-training/viqauni_vinallama_v2')

In [None]:
# Number of rows in the final dataset.
len(final_dataset)

In [None]:
# Spot-check one random processed sample.
# `random.randint` includes its upper bound, so `randint(0, len(ds))` could
# produce an out-of-range index; `randrange(len(ds))` draws from [0, len).
i = random.randrange(len(final_dataset))
print(tokenizer.decode(final_dataset[i]['input_ids'], skip_special_tokens=False))

# Decode only the label positions that are not masked. The comprehension uses
# its own variable name so it is not confused with the sample index `i`.
labels = [token_id for token_id in final_dataset[i]['labels'] if token_id != _HF_IGNORE_INDEX]
tokenizer.decode(labels, skip_special_tokens=False)