In [1]:
import os
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
from transformers import TrainingArguments
from trl import SFTTrainer
from pynvml import *
import torch
import datasets
import numpy as np
from peft import (
    LoraConfig,
    TaskType,
    prepare_model_for_int8_training,
)
import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [3]:
torch.cuda.empty_cache()

In [18]:
print())

/home/edgar-pino/dev/llm-lol/examples/../models


In [19]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
# torch.backends.cuda.matmul.allow_tf32 = True
version_date = int(datetime.datetime.now().timestamp())
model_id= os.path.join(os.getcwd(), '..', "models/13B-chat-hf")
device_map = "auto"
output_dir = f"./llama-2/{version_date}"
final_checkpoint_dir = os.path.join(output_dir, "final_checkpoint")

In [20]:
print_gpu_utilization()

GPU memory occupied: 483 MB.


In [21]:
tokenizer = LlamaTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
print_gpu_utilization()


GPU memory occupied: 483 MB.


In [22]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = LlamaForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, use_cache=False, device_map=device_map)
model.config.use_cache = False
model.config.pretraining_tp = 1
print_gpu_utilization()


Loading checkpoint shards: 100%|██████████| 3/3 [03:33<00:00, 71.06s/it]
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at /home/edgar-pino/dev/llm-lol/examples/../models/13B-chat-hf and are newly initialized: ['model.layers.23.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.24.self_attn.rotary_emb.inv_freq', 'model.layers.26.self_attn.rotary_emb.inv_freq', 'model.layers.30.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.36.self_attn.rotary_emb.inv_freq', 'model.layers.38.self_attn.rotary_emb.inv_freq', 'model.layers.11.self_attn.rotary_emb.inv_freq', 'model.layers.22.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.3.self_attn.rotary_emb.inv_freq', 'model.layers.9.self_attn.rotary_emb.inv_freq', 'model.layers.33.self_attn.rotary_emb.inv_freq', 'model.layers.39.self_at

GPU memory occupied: 8235 MB.


In [8]:
# eval_prompt = """
# Implement the following program given the instructions:
# Write a simple program to calculate the cosine similarity between two vectors in Typescript.

# Code:
# """

# model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# model.eval()
# with torch.no_grad():
#     print(tokenizer.decode(model.generate(**model_input, max_new_tokens=100)[0], skip_special_tokens=True))

In [9]:
def apply_prompt_template(samples, tokenizer):
    prompt = (
        f"Implement the following program given the instructions:\n{{instructions}}\n---\nCode:\n{{code}}{{eos_token}}"
    )    
    
    batch_text = []

    batch = list(zip(samples['prompt'], samples['response']))

    for sample in batch:
        text = prompt.format(
            instructions=sample[0],
            code=sample[1],
            eos_token=tokenizer.eos_token,
        )
        batch_text.append(text)
    
    return {"text": batch_text}

In [10]:
def get_preprocessed_codes(tokenizer, split):
    dataset = datasets.load_dataset("nampdn-ai/tiny-codes", split=split)
    dataset = dataset.shuffle().select(range(150))

    def apply_prompt_template_batch(samples):
        return apply_prompt_template(samples, tokenizer)

    return dataset.map(apply_prompt_template_batch, batched=True, remove_columns=list(dataset.features))


In [11]:
dataset = get_preprocessed_codes(tokenizer, "train")

Map: 100%|██████████| 150/150 [00:00<00:00, 1228.32 examples/s]


In [12]:
max_seq_length = 1024

# Define training args
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    bf16=True,  # Use fp16 if available
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
    optim="adamw_torch_fused", #"adafactor"
    max_steps=150,
    report_to="none",
    learning_rate=1e-4,
    num_train_epochs=4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True
)

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules = ["q_proj", "v_proj"]
)

# https://huggingface.co/docs/trl/main/en/sft_trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_args,
    callbacks=[]
)

# Start training
result = trainer.train()
print_summary(result)



Map: 100%|██████████| 150/150 [00:00<00:00, 304.93 examples/s]


Step,Training Loss
10,1.3947
20,1.2265
30,1.0227
40,0.8994
50,0.7921
60,0.7393
70,0.7161
80,0.6705
90,0.6621
100,0.6341


Time: 2825.52
Samples/second: 0.85
GPU memory occupied: 10487 MB.


In [13]:
trainer.model.save_pretrained(final_checkpoint_dir)

In [16]:
eval_prompt = """
Implement the following program given the instructions:
Develop a C# program snippet to Transform Low Sneezing and coughing etiquette: Hand Hygiene After Sneezing or Coughing for Engineer for Experts. Incorporate if/else or switch/case statements to handle various cases related to the Bias. Dry-run, ensure your control flow logic is clear and well-commented.

Code:

"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=256)[0], skip_special_tokens=True))


Implement the following program given the instructions:
Develop a C# program snippet to Transform Low Sneezing and coughing etiquette: Hand Hygiene After Sneezing or Coughing for Engineer for Experts. Incorporate if/else or switch/case statements to handle various cases related to the Bias. Dry-run, ensure your control flow logic is clear and well-commented.

Code:

Here is some sample C# code which implements hand washing after sneezing or coughing as per expert recommendations:

    ```csharp
    // Define variables for personal hygiene practices
    private bool useHandSanitizer = false;
    private bool useSoapAndWater = false;

    // Prompt user to select preferred method of hand washing
    Console.WriteLine("Please select your preferred method of hand washing:");
    Console.WriteLine("1) Use hand sanitizer");
    Console.WriteLine("2) Use soap and water");
    Console.WriteLine("3) Both");

    int selection = int.Parse(Console.ReadLine());

    // Determine appropriate actio

---
```python
num_train_epochs=1,
per_device_train_batch_size=6,
gradient_accumulation_steps=2,
gradient_checkpointing= False
```

Time: 336.95

Samples/second: 0.36

GPU memory occupied: 15350 MB.

---

```python
num_train_epochs=1,
per_device_train_batch_size=6,
gradient_accumulation_steps=2,
gradient_checkpointing= True
```

Time: 379.89

Samples/second: 0.32

GPU memory occupied: 14394 MB.


---

```python
bf16=True,
num_train_epochs=1,
per_device_train_batch_size=6,
gradient_accumulation_steps=2,
gradient_checkpointing= True
```

Time: 136.64

Samples/second: 0.88

GPU memory occupied: 19219 MB.

---
torch.backends.cuda.matmul.allow_tf32 = True


Time: 145.73

Samples/second: 0.82

GPU memory occupied: 15647 MB.

---
```python
bf16=True,
num_train_epochs=1,
per_device_train_batch_size=2,
gradient_accumulation_steps=8,
gradient_checkpointing= True
```


Time: 189.84

Samples/second: 0.84

GPU memory occupied: 8509 MB.