# Verify torch version

## Prepare Environment (Should not be neccessary if lora kernel is used)
- Alle Installationen durchführen und danach den Kernel neu starten. Es funktioniert nur in dieser Kombination

In [None]:
# Install Pytorch for FSDP and FA/SDPA
!pip install torch==2.0.1 tensorboard
#pip install tensorboard datasets
 
# Install Hugging Face libraries
!pip install pydantic==2.0.0
!pip install  --upgrade "transformers==4.40.1" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"

In [6]:
# Double check environment
# torch must be 2.0.0, transfomers must be 4.40, pydantic must be 2.0.0
!conda list | grep torch
!conda list | grep transformers
!conda list | grep pydantic

pytorch-revgrad           0.2.0                    pypi_0    pypi
torch                     2.0.0+cu117              pypi_0    pypi
torchaudio                2.0.1+cu117              pypi_0    pypi
torchvision               0.15.1+cu117             pypi_0    pypi
adapter-transformers      3.0.1                    pypi_0    pypi
transformers              4.40.0                   pypi_0    pypi
pydantic                  1.7.4                    pypi_0    pypi
pydantic-core             2.18.1                   pypi_0    pypi


### Wahrscheinlich obsolet

In [None]:
!pip install transformers==4.40.1 torch==2.0.1 trl peft tensorboard pydantic==2.0.0

In [None]:
!pip install transformers==4.40.1
!pip install torch==2.0.1

In [None]:
!pip install trl
!pip install peft

## Login to hugging face

In [None]:
!huggingface-cli login --token "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"

## Free GPU Memory
- Alternatively > Restart Kernel

In [None]:
# import torch
del model
del trainer
torch.cuda.empty_cache()
print(torch.cuda.memory_reserved(0))
print(torch.cuda.memory_allocated(0))

##  Workflow for Lora tuning
- Runs also well with "torchtune" kernel

### Import libraries

In [1]:
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
!echo $CUDA_VISIBLE_DEVICES

2


In [2]:
import torch
import transformers
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

2024-07-02 07:26:30.311993: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Define Model and tokenizer

In [3]:

# -> For Mistral 7B
#model_location = "mistralai/Mistral-7B-Instruct-v0.2"
#output_model = "/home/thsch026/masterarbeit/models/generated/lora/mistral_7B-Instruct_ms-marco"

# -> For Llama 3 8B HF
model_location = "meta-llama/Meta-Llama-3-8B"
output_model = "/home/thsch026/masterarbeit/models/generated/lora/Meta-Llama-3-8B_ms-marco"

# -> For Llama 3 8B HF Instruct
#model_location = "/home/thsch026/masterarbeit/models/llama3/Meta-Llama-3-8B-Instruct-HF"
#output_model = "/home/thsch026/masterarbeit/models/generated/lora/Llama-3-8B-Instruct-HF_ms-marco"

tokenizer = AutoTokenizer.from_pretrained(model_location)

model = AutoModelForCausalLM.from_pretrained(
    model_location,
   # load_in_8bit=True, # was 8bit
    device_map="cuda", # was auto
    torch_dtype=torch.bfloat16
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
model = prepare_model_for_kbit_training(model)

tokenizer.pad_token = "!"

### For explanation of the values for LORA configuration below see:
- https://medium.com/@drishtisharma96505/comparative-analysis-of-lora-parameters-on-llama-2-with-flash-attention-574b913295d4

In [5]:
# Lora paramters
CUTOFF_LEN = 768
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"
                    , "down_proj", "lm_head"], #these are the  names for the layers
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)

In [None]:
### Using general dataset MS_marco (v1.1) for Finetuning

In [6]:
dataset = load_dataset('ms_marco','v1.1') # General dataset
print("dataset", dataset)
train_data = dataset["test"]

dataset DatasetDict({
    validation: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 10047
    })
    train: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 82326
    })
    test: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 9650
    })
})


###  ToDO: anpassen der Prompt Struktur an das genutzte Modell

Prompt Structure for Mistral 7 B

In [7]:
def generate_prompt(user_query):  #The prompt format is taken from the official Mistral huggingface page
  if user_query["answers"] is not None and user_query["query"] is not None:
      p =  "<s> [INST]" + str(user_query["query"]) + "[/INST]" +  str(user_query["answers"]) + "</s>"
      return p
  else:
    p = "<s> [INST]" + "Hello" + "[/INST]" +  "Hello" + "</s>"
    return p

Prompt Structure for llama3

In [8]:
def tokenize(prompt):
    return tokenizer(
        prompt + tokenizer.eos_token,
        truncation=True,
        max_length=CUTOFF_LEN ,
        padding="max_length"
    )

In [9]:
if torch.cuda.device_count() > 1:
    
    #teacher_model = torch.nn.parallel.DistributedDataParallel(teacher_model)
    #student_model = torch.nn.parallel.DistributedDataParallel(student_model)
    model = torch.nn.DataParallel(model)

In [9]:
print("CUDA: ", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_data = train_data.shuffle().map(lambda x: tokenize(generate_prompt(x)), remove_columns=['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'])

CUDA:  True


Map:   0%|          | 0/9650 [00:00<?, ? examples/s]

In [10]:
trainer = Trainer(
    model=model, 
    train_dataset=train_data,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=3,    # 3 or 6 is good
        learning_rate=1e-4,
        logging_steps=5,
        optim="adamw_torch",
        save_strategy="epoch",
        output_dir=output_model
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mthomas-t-schmitt[0m ([33mpumaai[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,3.4727
10,2.632
15,2.3436
20,1.8413
25,1.9253
30,1.588
35,1.5244
40,1.4971
45,1.6836
50,1.4499




TrainOutput(global_step=7236, training_loss=1.213434515110098, metrics={'train_runtime': 45476.1863, 'train_samples_per_second': 0.637, 'train_steps_per_second': 0.159, 'total_flos': 1.0038997359431516e+18, 'train_loss': 1.213434515110098, 'epoch': 2.999378238341969})

## Example 2 - Currently not working CUDA iisues

In [None]:
from transformers import AutoTokenizer, TrainingArguments

In [None]:
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
{"messages": [{"role": "system", "content": "You are..."}, {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}

In [None]:
from datasets import load_dataset
 
# Convert dataset to OAI messages
system_message = """You are Llama, an AI assistant created by Philipp to be helpful and honest. Your knowledge spans a wide range of topics, allowing you to engage in substantive conversations and provide analysis on complex subjects."""
 
def create_conversation(sample):
    if sample["messages"][0]["role"] == "system":
        return sample
    else:
      sample["messages"] = [{"role": "system", "content": system_message}] + sample["messages"]
      return sample
 
# Load dataset from the hub
dataset = load_dataset("HuggingFaceH4/no_robots")
 
# Add system message to each conversation
columns_to_remove = list(dataset["train"].features)
columns_to_remove.remove("messages")
dataset = dataset.map(create_conversation, remove_columns=columns_to_remove,batched=False)
 
# Filter out conversations which are corrupted with wrong turns, keep which have even number of turns after adding system message
dataset["train"] = dataset["train"].filter(lambda x: len(x["messages"][1:]) % 2 == 0)
dataset["test"] = dataset["test"].filter(lambda x: len(x["messages"][1:]) % 2 == 0)
 
# save datasets to disk
dataset["train"].to_json("train_dataset.json", orient="records", force_ascii=False)
dataset["test"].to_json("test_dataset.json", orient="records", force_ascii=False)

In [None]:
import trl
#import bitsandbytes

In [None]:
!unset CUDA_VISIBLE_DEVICES
CUDA_LAUNCH_BLOCKING=1

In [None]:
!torchrun --nproc_per_node=4 ./scripts/run_fsdp_qlora.py --config llama_3_70b_fsdp_qlora.yaml

In [None]:
python -m llama_recipes.finetuning --use_peft --peft_method lora --quantization --model_name $MODEL --output_dir ../llama/models_ft/7B-peft --batch_size_training 2 --gradient_accumulation_steps 2

### Source https://medium.com/@prakharsaxena11111/a-general-approach-to-fine-tune-any-llm-using-lora-29d24e47a345

### GPU Memory issues

In [None]:
import time
i = 1
while i == 1:
    time.sleep(100)


In [None]:
!nvcc -V

In [None]:
!conda list |grep transformers