# Finetuning

## Libraries, Data, and Downloads

In [2]:
!pip install datasets
!pip install tensorboardX
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install datasets
!pip install wandb
!pip install evaluate
!pip install trl
!pip install codebleu
!pip install scipy
!pip install tiktoken
!pip install bitsandbytes



In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    # LlamaForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer



In [4]:
# Model from Hugging Face hub
base_model = "codellama/CodeLlama-7b-Instruct-hf"

# New instruction dataset
turing_dataset = "w266finalproject/turing-60k-instruct"

# Fine-tuned model
new_model = "llama-2-7b-chat-non-self-instruct-v1"

In [5]:
train_dataset = load_dataset(turing_dataset, split='train')

In [6]:
train_dataset.shape

(41327, 4)

In [7]:
from datasets import Dataset
ds = Dataset.from_dict(train_dataset[:500])

In [8]:
def format_instruction(sample):
    return f"""<s>[INST] <<SYS>>\\nYou are an expert in Data Science. Below is an instruction that describes a task. Write code that appropriately completes the request. Please wrap your python code using ```python ```\\n<</SYS>>\\n\\n{sample['Prompt']}[/INST]"""


# Apply the formatting function to each sample in the dataset
ds = ds.map(lambda sample: {"formatted_instruction": format_instruction(sample)})

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [9]:
print(ds.shape)
print(ds.features)

(500, 5)
{'Notebook': Value(dtype='int64', id=None), 'Position': Value(dtype='int64', id=None), 'Prompt': Value(dtype='string', id=None), 'Example': Value(dtype='string', id=None), 'formatted_instruction': Value(dtype='string', id=None)}


## Load the model, set the training configs

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, BitsAndBytesConfig

In [11]:
base_model = "codellama/CodeLlama-7b-Instruct-hf"
bnb_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_compute_dtype=torch.float16,
   bnb_4bit_use_double_quant=False,
)

tokenizer = AutoTokenizer.from_pretrained(base_model, device_map="auto", trust_remote_code=True)

tokenizer.padding_side = 'right'

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code = True)

model.config.use_cache = False

tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
#Check if model is generating OK
from tqdm.notebook import tqdm
data = ds["formatted_instruction"][:2]
eval_prompts = [tokenizer(eval_prompt, return_tensors='pt').to("cuda:0") for eval_prompt in data]
results = []
for prompt in tqdm(eval_prompts):
    toks = model.generate(**prompt, max_new_tokens=1000)[0]
    results.append(tokenizer.decode(toks, skip_special_tokens=True))

In [None]:
print(results[1])

In [11]:
major, _ = torch.cuda.get_device_capability()
if major >= 8:
    print("=" * 80)
    print("Your GPU supports bfloat16: accelerate training with bf16=True")
    print("=" * 80)

Your GPU supports bfloat16: accelerate training with bf16=True


In [12]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
)

training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard",
)

In [13]:
trainer = SFTTrainer(
    model=model,
    train_dataset=ds,
    peft_config=peft_params,
    dataset_text_field="formatted_instruction",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [14]:
trainer.train()

You're using a CodeLlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 1 has a total capacty of 47.54 GiB of which 485.12 MiB is free. Process 4272 has 6.62 GiB memory in use. Including non-PyTorch memory, this process has 40.43 GiB memory in use. Of the allocated memory 39.04 GiB is allocated by PyTorch, and 1.08 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [24]:
trainer.model.save_pretrained(new_model, save_adapter=True, save_config=True)
# trainer.tokenizer.save_pretrained(new_model)

In [16]:
from peft import PeftModel

In [35]:
# Reload model in FP16 and merge it with LoRA weights
model_reload = AutoModelForCausalLM.from_pretrained(
    base_model,
    # load_in_4bit=True,
    # quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map = 'auto'
)
model_reload.config.use_cache = False
# model_reload.to("cuda")

merged_model = PeftModel.from_pretrained(model_reload, new_model)
merged_model = merged_model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [36]:
# Check Merged Model generation
from tqdm.notebook import tqdm
data = ds["formatted_instruction"][:2]
eval_prompts = [tokenizer(eval_prompt, return_tensors='pt').to("cuda") for eval_prompt in data]
results_merged = []
for prompt in tqdm(eval_prompts):
    toks = merged_model.generate(**prompt, max_new_tokens=1000)[0]
    results_merged.append(tokenizer.decode(toks, skip_special_tokens=True))

  0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [37]:
print(results_merged[1])

[INST] <<SYS>>\nYou are an expert in Data Science. Below is an instruction that describes a task. Write code that appropriately completes the request. Please wrap your python code using ```python ```\n<</SYS>>\n\n## Loading an image[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Image Loading**[/INST]# **Ima

## Run against test set

In [None]:
test_dataset = load_dataset(turing_dataset, split='test')
test_dataset = test_dataset.map(lambda sample: {"formatted_instruction": format_instruction(sample)})

In [None]:
test_dataset["formatted_instruction"][0]

In [None]:
from tqdm.notebook import tqdm
data = test_dataset["formatted_instruction"][0:3]
eval_prompts = [tokenizer(eval_prompt, return_tensors='pt').to("cuda:0") for eval_prompt in data]
results = []
for prompt in tqdm(eval_prompts):
    toks = merged_model.generate(**prompt, max_new_tokens=1000)[0]
    results.append(tokenizer.decode(toks, skip_special_tokens=True))


In [None]:
print(results[2])

In [None]:
from tqdm.notebook import tqdm
data = test_dataset["formatted_instruction"][0:3]
eval_prompts = [tokenizer(eval_prompt, return_tensors='pt').to("cuda:0") for eval_prompt in data]
results = []
for prompt in tqdm(eval_prompts):
    toks = model_reload.generate(**prompt, max_new_tokens=1000)[0]
    results.append(tokenizer.decode(toks, skip_special_tokens=True))
print(results[0])

In [None]:
from tqdm.notebook import tqdm
data = test_dataset["formatted_instruction"][0:3]
eval_prompts = [tokenizer(eval_prompt, return_tensors='pt').to("cuda:0") for eval_prompt in data]
results = []
for prompt in tqdm(eval_prompts):
    toks = model.generate(**prompt, max_new_tokens=1000)[0]
    results.append(tokenizer.decode(toks, skip_special_tokens=True))
print(results[0])

In [None]:
import re

def extract_python_code(text):
    # Define the regex pattern for Python code blocks
    pattern = r"```\n(.*?)```"

    # Find all matches in the text
    matches = re.findall(pattern, text, re.DOTALL)

    # Concatenate the matches into a single string with newline characters
    concatenated_code = '\n'.join(matches)

    return concatenated_code

In [None]:
predictions = [str(result) for result in results]
references = [[str(ref)] for ref in test_dataset['Example'][0:2000]]

In [None]:
from codebleu import calc_codebleu

result = calc_codebleu(references=references, predictions=predictions, lang="python", weights=(0.25, 0.25, 0.25, 0.25), tokenizer=None)
print(result)