## Setting up

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from trl import setup_chat_format
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig, 
                          TrainingArguments, 
                          pipeline, 
                          logging)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# The model that you want to train from the Hugging Face hub
base_model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Fine-tuned model name
# new_model = "Llama-2-7b-chat-finetune-code-oldstyle"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension.The parameter r (lora_r)in LoraConfig is the rank that determines the shape of the update matrices BA.
# According to the paper, you can set a small rank and still get excellent results
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = 3000

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 150

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}



## Loading and processing the dataset

In [3]:
import json

def read_jsonl_file(file_path):
    messages = []
    with open(file_path, 'r') as file:
        for line in file:
            json_obj = json.loads(line.strip())
            messages.append(json_obj)
    return messages

file_path = 'llama3_10k_code_data.jsonl'
json_data = read_jsonl_file(file_path)


## Loading the model and tokenizer

In [4]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.89s/it]


In [5]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [6]:
dataset = Dataset.from_dict({"chat": json_data})
dataset = dataset.map(lambda x: {"text": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)})
print(dataset['text'][0])

Map: 100%|██████████| 10000/10000 [00:01<00:00, 9018.27 examples/s]

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

Develop an AI-assisted program for optimising code snippets. Given a code base, the program should suggest optimisations to improve the code in terms of readability, time complexity, memory efficiency, and other measurements. Not applicable<|eot_id|><|start_header_id|>assistant<|end_header_id|>

import ast
import astor
import codegen

def optimize_code(source_code):
  tree = ast.parse(source_code)
  optimized_tree = optimize_ast(tree)
  optimized_code = astor.to_source(optimized_tree)
  return codegen.to_source(optimized_code)

def optimize_ast(ast_tree):
  tree_converter = Optimizer(ast_tree)
  tree_converter.analyze()
  tree_converter.transform()
  return ast_tree

class Optimizer(ast.NodeTransformer):
  def analyze(self):
    ...
    # perform static code analysis to measure the code

  def transform(self):
  




## Model evalution before fine-tuning

In [7]:
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50, temperature=0.1)
result = pipe("Who is donald trump")
result

[{'generated_text': "Who is donald trump's wife?\nMelania Trump is the wife of Donald Trump, the 45th President of the United States. She was born on April 26, 1970, in Novo Mesto, Slovenia (then part of Yugoslavia). Melania was"}]

## Setting up the model

In [11]:
output_dir="llama-3.1-fine-tuned-model"

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['down_proj', 'gate_proj', 'o_proj', 'v_proj', 'up_proj', 'q_proj', 'k_proj'],
)


# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)




Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Map: 100%|██████████| 10000/10000 [00:01<00:00, 7128.42 examples/s]
max_steps is given, it will override any value given in num_train_epochs


## Model Training

In [12]:
# Train model
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 254.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 10.06 MiB is free. Including non-PyTorch memory, this process has 31.71 GiB memory in use. Of the allocated memory 30.97 GiB is allocated by PyTorch, and 386.94 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# wandb.finish()
model.config.use_cache = True

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▂▂▃▃▄▅▅▆▆▇██
train/global_step,▁▂▂▃▃▄▅▅▆▆▇██
train/grad_norm,▅▆▅▃▄▄▃▂▂▂▁█
train/learning_rate,██▇▇▆▅▄▃▂▂▁▁
train/loss,█▃▃▁▂▂▂▁▁▂▁▁

0,1
total_flos,2.3570524667904e+16
train/epoch,1.0
train/global_step,125.0
train/grad_norm,0.22325
train/learning_rate,0.0
train/loss,1.8444
train_loss,1.93999
train_runtime,4248.0065
train_samples_per_second,0.235
train_steps_per_second,0.029


## Saving the model and tokenizer

In [None]:
# Save trained model and tokenizer
trainer.model.save_pretrained("trained-model")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

('llama-3.1-fine-tuned-model/tokenizer_config.json',
 'llama-3.1-fine-tuned-model/special_tokens_map.json',
 'llama-3.1-fine-tuned-model/tokenizer.json')

In [None]:
import pandas as pd

harmful = pd.read_csv("/scratch/essa/combine_harmful.csv")
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_new_tokens=100, temperature=0.1)
response = []
for i in tqdm(range(harmful.shape[0])):
    result = pipe(harmful["Description"][i])
    response.append(result[0]['generated_text'])
    

final_result = pd.Dataframe(response)
final_result.to_csv('Llama_3.1_code_10k_finetuned_results.csv')

NameError: name 'pipeline' is not defined

In [None]:
final_result = pd.DataFrame(response)
final_result.to_csv('Llama_3.1_code_10k_finetuned_results.csv')

In [None]:
# Empty VRAM
del model
# del pipe
del trainer
import gc
gc.collect()
gc.collect()

import torch

# Get the current CUDA device
device = torch.cuda.current_device()

# Reset the device
torch.cuda.reset_max_memory_allocated(device)
torch.cuda.reset_max_memory_cached(device)

torch.cuda.empty_cache()

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, "trained-model")
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
model.push_to_hub("Essacheez/llama3.1-8b-code-10k-LLAMA3-style", check_pr=True)
tokenizer.push_to_hub("Essacheez/llama3.1-8b-code-10k-LLAMA3-style",check_pr=True)