## 1. Requirements

*dependencies*

In [1]:
print('hello')

hello


In [None]:
%%capture output

! pip install datasets
! pip install peft
! pip install bitsandbytes
! pip install accelerate
! pip install transformers

In [3]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
    BitsAndBytesConfig,
)

*file system*

In [12]:
import os, pandas as pd
from google.colab import drive

# Access drive
drive.mount('/content/drive')
PATH = '/content/drive/MyDrive/WeChat-Copilot-Live/'
OUTPUT_DIR = os.path.join(PATH, 'Model', 'codellama')

Mounted at /content/drive


## 2. Configurations

In [4]:
MODEL = "codellama/CodeLlama-7b-hf"   # Model checkpoint on the Hugging Face Hub
DATASET = "smangrul/hf-stack-v1"      # Dataset on the Hugging Face Hub
DATA_COLUMN = "content"               # Column name containing the code content

In [21]:
SEQ_LENGTH = 1024                  # Sequence length

# Training arguments
MAX_STEPS = 2000                   # max_steps
BATCH_SIZE = 8                     # batch_size
GR_ACC_STEPS = 1                   # gradient_accumulation_steps
LR = 5e-4                          # learning_rate
LR_SCHEDULER_TYPE = "cosine"       # lr_scheduler_type
WEIGHT_DECAY = 0.01                # weight_decay
NUM_WARMUP_STEPS = 30              # num_warmup_steps
EVAL_FREQ = 100                    # eval_freq
SAVE_FREQ = 100                    # save_freq
LOG_FREQ = 25                      # log_freq
BF16 = True                        # bf16
FP16 = False                       # no_fp16

# FIM trasformations arguments
FIM_RATE = 0.5                     # fim_rate
FIM_SPM_RATE = 0.5                 # fim_spm_rate

# LORA
LORA_R = 8                         # lora_r
LORA_ALPHA = 32                    # lora_alpha
LORA_DROPOUT = 0.1                 # lora_dropout
LORA_TARGET_MODULES = "c_proj,c_attn,q_attn,c_fc,c_proj"    # lora_target_modules

# bitsandbytes config
USE_NESTED_QUANT = False            # use_nested_quant
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"# bnb_4bit_compute_dtype

SEED = 121

In [8]:
set_seed(SEED)

## 3. Data Preparation

In [31]:
from datasets import Dataset

root_directory = os.path.join(PATH, 'Data', 'Sample_Code_OC')

# 初始化数据列表
data = []

# 遍历每个语言文件夹
for filename in os.listdir(root_directory):
    file_path = os.path.join(root_directory, filename)
    if os.path.isfile(file_path):
        try:
            # 读取代码文件内容
            with open(file_path, 'r', encoding='utf-8') as file:
                code = file.read()
            # 将代码和语言标签添加到数据列表中
            data.append({'code': code, 'language': language})
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

# 创建数据集对象
dataset = Dataset.from_list(data)

In [32]:
dataset

Dataset({
    features: ['code', 'language'],
    num_rows: 4
})



> streaming: load the data progressively

> validation set: first 4000, training set: remaining



In [34]:
from datasets import load_dataset
import torch
from tqdm import tqdm

dataset = load_dataset(
    DATASET,
    data_dir="data",
    split="train",
    streaming=True,
)

valid_data = dataset.take(4000)
train_data = dataset.skip(4000)
train_data = train_data.shuffle(buffer_size=5000, seed=SEED)

In [28]:
dataset

Dataset({
    features: [],
    num_rows: 0
})

In [24]:
dataset

IterableDataset({
    features: ['repo_id', 'file_path', 'content', '__index_level_0__'],
    num_shards: 1
})

In [38]:
for sample in dataset:
    print(sample)
    break

{'repo_id': 'hf_public_repos', 'file_path': 'hf_public_repos/accelerate/README.md', 'content': '<!---\nCopyright 2021 The HuggingFace Team. All rights reserved.\n\nLicensed under the Apache License, Version 2.0 (the "License");\nyou may not use this file except in compliance with the License.\nYou may obtain a copy of the License at\n\n    http://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software\ndistributed under the License is distributed on an "AS IS" BASIS,\nWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\nSee the License for the specific language governing permissions and\nlimitations under the License.\n-->\n\n<p align="center">\n    <br>\n    <img src="https://raw.githubusercontent.com/huggingface/accelerate/main/docs/source/imgs/accelerate_logo.png" width="400"/>\n    <br>\n<p>\n\n<p align="center">\n    <!-- Uncomment when CircleCI is set up\n    <a href="https://circleci.com/gh/huggingface/accel

In [35]:
dataset[0]

NotImplementedError: Subclasses of Dataset should implement __getitem__.

In [13]:
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """

    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        total_characters += len(example[data_column])
        total_tokens += len(tokenizer(example[data_column]).tokens())

    return total_characters / total_tokens


chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)
print(f"\nThe character to token ratio of the dataset is: {chars_per_token:.2f}")

100%|██████████| 400/400 [00:04<00:00, 80.23it/s] 


The character to token ratio of the dataset is: 2.56





### FIM transformation

be careful, each model defines its own "special tokens", which affects training and code completion prompt

In [23]:
# codeLlama
tokenizer.special_tokens_map['additional_special_tokens']

['▁<PRE>', '▁<MID>', '▁<SUF>', '▁<EOT>']

In [22]:
# starcoder
tokenizer_starcode = AutoTokenizer.from_pretrained('bigcode/starcoder2-3b', trust_remote_code=True)
tokenizer_starcode.special_tokens_map['additional_special_tokens']

['<|endoftext|>',
 '<fim_prefix>',
 '<fim_middle>',
 '<fim_suffix>',
 '<fim_pad>',
 '<repo_name>',
 '<file_sep>',
 '<issue_start>',
 '<issue_comment>',
 '<issue_closed>',
 '<jupyter_start>',
 '<jupyter_text>',
 '<jupyter_code>',
 '<jupyter_output>',
 '<jupyter_script>',
 '<empty_output>',
 '<code_to_intermediate>',
 '<intermediate_to_code>',
 '<pr>',
 '<pr_status>',
 '<pr_is_merged>',
 '<pr_base>',
 '<pr_file>',
 '<pr_base_code>',
 '<pr_diff>',
 '<pr_diff_hunk>',
 '<pr_comment>',
 '<pr_event_id>',
 '<pr_review>',
 '<pr_review_state>',
 '<pr_review_comment>',
 '<pr_in_reply_to_review_id>',
 '<pr_in_reply_to_comment_id>',
 '<pr_diff_hunk_comment_line>',
 '<NAME>',
 '<EMAIL>',
 '<KEY>',
 '<PASSWORD>']

In [14]:
import functools
import numpy as np


# Helper function to get token ids of the special tokens for prefix, suffix and middle for FIM transformations.
@functools.lru_cache(maxsize=None)
def get_fim_token_ids(tokenizer):
    try:
        FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map["additional_special_tokens"][:4]
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
            tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
        )
    except KeyError:
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = None, None, None, None
    return suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id


## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
def permute(
    sample,
    np_rng,
    suffix_tok_id,
    prefix_tok_id,
    middle_tok_id,
    pad_tok_id,
    fim_rate=0.5,
    fim_spm_rate=0.5,
    truncate_or_pad=False,
):
    """
    Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
    PSM and SPM (with a probability of fim_spm_rate).
    """

    # The if condition will trigger with the probability of fim_rate
    # This means FIM transformations will apply to samples with a probability of fim_rate
    if np_rng.binomial(1, fim_rate):

        # Split the sample into prefix, middle, and suffix, based on randomly generated indices stored in the boundaries list.
        boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
        boundaries.sort()

        prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
        middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
        suffix = np.array(sample[boundaries[1] :], dtype=np.int64)

        if truncate_or_pad:
            # calculate the new total length of the sample, taking into account tokens indicating prefix, middle, and suffix
            new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3
            diff = new_length - len(sample)

            # trancate or pad if there's a difference in length between the new length and the original
            if diff > 0:
                if suffix.shape[0] <= diff:
                    return sample, np_rng
                suffix = suffix[: suffix.shape[0] - diff]
            elif diff < 0:
                suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])

        # With the probability of fim_spm_rateapply SPM variant of FIM transformations
        # SPM: suffix, prefix, middle
        if np_rng.binomial(1, fim_spm_rate):
            new_sample = np.concatenate(
                [
                    [prefix_tok_id, suffix_tok_id],
                    suffix,
                    [middle_tok_id],
                    prefix,
                    middle,
                ]
            )
        # Otherwise, apply the PSM variant of FIM transformations
        # PSM: prefix, suffix, middle
        else:

            new_sample = np.concatenate(
                [
                    [prefix_tok_id],
                    prefix,
                    [suffix_tok_id],
                    suffix,
                    [middle_tok_id],
                    middle,
                ]
            )
    else:
        # don't apply FIM transformations
        new_sample = sample

    return list(new_sample), np_rng

In [15]:
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
import random

# Create an Iterable dataset that returns constant-length chunks of tokens from a stream of text files.
class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
            fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
            fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permuations that will use SPM.
            seed (int): Seed for random number generator.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
        content_field="content",
        fim_rate=0.5,
        fim_spm_rate=0.5,
        seed=0,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.content_field = content_field
        self.fim_rate = fim_rate
        self.fim_spm_rate = fim_spm_rate
        self.seed = seed

        (
            self.suffix_tok_id,
            self.prefix_tok_id,
            self.middle_tok_id,
            self.pad_tok_id,
        ) = get_fim_token_ids(self.tokenizer)
        if not self.suffix_tok_id and self.fim_rate > 0:
            print("FIM is not supported by tokenizer, disabling FIM")
            self.fim_rate = 0

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        np_rng = np.random.RandomState(seed=self.seed)
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(next(iterator)[self.content_field])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []

            for tokenized_input in tokenized_inputs:
                # optionally do FIM permutations
                if self.fim_rate > 0:
                    tokenized_input, np_rng = permute(
                        tokenized_input,
                        np_rng,
                        self.suffix_tok_id,
                        self.prefix_tok_id,
                        self.middle_tok_id,
                        self.pad_tok_id,
                        fim_rate=self.fim_rate,
                        fim_spm_rate=self.fim_spm_rate,
                        truncate_or_pad=False,
                    )

                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            examples = []
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    examples.append(input_ids)
            random.shuffle(examples)
            for example in examples:
                self.current_size += 1
                yield {
                    "input_ids": torch.LongTensor(example),
                    "labels": torch.LongTensor(example),
                }


In [16]:
train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        infinite = True,
        seq_length = SEQ_LENGTH,
        chars_per_token = chars_per_token,
        content_field = DATA_COLUMN,
        fim_rate = FIM_RATE,
        fim_spm_rate = FIM_SPM_RATE,
        seed = SEED,
)
eval_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        infinite = False,
        seq_length = SEQ_LENGTH,
        chars_per_token = chars_per_token,
        content_field = DATA_COLUMN,
        fim_rate = FIM_RATE,
        fim_spm_rate = FIM_SPM_RATE,
        seed=SEED,
)

## 4. Model Preparation

In [7]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft.tuners.lora import LoraLayer

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=getattr(torch, BNB_4BIT_COMPUTE_DTYPE),
    bnb_4bit_use_double_quant=USE_NESTED_QUANT,
)

tokenizer_llama_coder = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
        MODEL,
        quantization_config=bnb_config,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16
)

model = prepare_model_for_kbit_training(model)

tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now default to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [10]:
# Set up lora
peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 4,194,304 || all params: 6,742,740,992 || trainable%: 0.0622


## 5. Train

In [22]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    dataloader_drop_last=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    max_steps=MAX_STEPS,
    eval_steps=EVAL_FREQ,
    save_steps=SAVE_FREQ,
    logging_steps=LOG_FREQ,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_steps=NUM_WARMUP_STEPS,
    gradient_accumulation_steps=GR_ACC_STEPS,
    gradient_checkpointing=True,
    fp16=FP16,
    bf16=BF16,
    weight_decay=WEIGHT_DECAY,
    include_tokens_per_second=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)



In [23]:
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 4.00 GiB. GPU 0 has a total capacity of 14.74 GiB of which 680.12 MiB is free. Process 2961 has 14.07 GiB memory in use. Of the allocated memory 13.84 GiB is allocated by PyTorch, and 117.47 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)