# Verify torch version

## Prepare Environment (Should not be neccessary if lora kernel is used)
- Alle Installationen durchführen und danach den Kernel neu starten. Es funktioniert nur in dieser Kombination

In [3]:
# Install Pytorch for FSDP and FA/SDPA
!pip install torch==2.0.1 tensorboard
#pip install tensorboard datasets
 
# Install Hugging Face libraries
!pip install pydantic==2.0.0
!pip install  --upgrade "transformers==4.40.1" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"

Collecting torch==2.0.1
  Using cached torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Using cached nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl (21.0 MB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Using cached nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl (849 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Using cached nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl (11.8 MB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.1)
  Using cached nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl (317.1 MB)
Collecting nvidia-cufft-cu11==10.9.0.58 (from torch==2.0.1)
  Using cached nvidia_cufft_cu11-10.9.0.58-py3-none-manylinux1_x86_64.whl (168.4 MB)
Collecting nvidia-curand-cu11==10.2.10.91 (from torch==2.0.1)
  Using cached nvidia_curand_cu11-10.2.10.91-py3-none-manylinux1_x86_64.whl (54.6 MB)
Colle

In [1]:
# Double check environment
# torch must be 2.0.0, transfomers must be 4.40, pydantic must be 2.0.0
!conda list | grep torch
!conda list | grep transformers
!conda list | grep pydantic

pytorch-revgrad           0.2.0                    pypi_0    pypi
torch                     2.0.1                    pypi_0    pypi
torchaudio                2.0.1+cu117              pypi_0    pypi
torchvision               0.15.1+cu117             pypi_0    pypi
adapter-transformers      3.0.1                    pypi_0    pypi
transformers              4.40.1                   pypi_0    pypi
pydantic                  2.0                      pypi_0    pypi
pydantic-core             2.0.1                    pypi_0    pypi


### Wahrscheinlich obsolet

In [None]:
!pip install transformers==4.40.1 torch==2.0.1 trl peft tensorboard pydantic==2.0.0

In [None]:
!pip install transformers==4.40.1
!pip install torch==2.0.1

In [None]:
!pip install trl
!pip install peft

## Login to hugging face

In [None]:
!huggingface-cli login --token "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"

## Free GPU Memory
- Alternatively > Restart Kernel

In [None]:
# import torch
del model
del trainer
torch.cuda.empty_cache()
print(torch.cuda.memory_reserved(0))
print(torch.cuda.memory_allocated(0))

##  Workflow for Lora tuning
- Runs also well with "qlora3" kernel

### Set environment and Import libraries

In [1]:
import os

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
!echo $CUDA_VISIBLE_DEVICES

0


### 3.2 Try to use parallel mode if GPUs are available

In [9]:
if torch.cuda.device_count() > 1:
    
    model = torch.nn.DataParallel(model)

print("CUDA: ", torch.cuda.is_available())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import torch
import wandb
import transformers
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

## 1. Choose Model for workflow

#### For Llama 3 8B HF

In [None]:
model_location = "meta-llama/Meta-Llama-3-8B"
ouput_dir = "/home/thsch026/masterarbeit/models/generated/lora/Meta-Llama-3-8B_ms-marco"

#### Llama-2-7B-chat

In [6]:
model_location = "meta-llama/Llama-2-7b-chat-hf"
output_model = "/home/thsch026/masterarbeit/models/generated/lora/Llama-2-7b-chat_ms-marco"

#### Mistral 7B Instruct v0.2 - Dataset MS_Marco V1.1

In [6]:
model_location = "mistralai/Mistral-7B-Instruct-v0.2"
output_model = "/home/thsch026/masterarbeit/models/generated/lora/Mistral-7B-Instruct_ms-marco"

## 2. Initalize model

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_location)

model = AutoModelForCausalLM.from_pretrained(
    model_location,
   # load_in_8bit=True, # was 8bit
    device_map="cuda", # was auto
    force_download=False,
    torch_dtype=torch.bfloat16
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
model = prepare_model_for_kbit_training(model)

tokenizer.pad_token = "!"

## 3. Configure Lora Settings
For explanation of the values for LORA configuration see below:
- https://medium.com/@drishtisharma96505/comparative-analysis-of-lora-parameters-on-llama-2-with-flash-attention-574b913295d4

In [9]:

CUTOFF_LEN = 768
LORA_R = 8
LORA_ALPHA = 2 * LORA_R
LORA_DROPOUT = 0.1

config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj"
                    , "down_proj", "lm_head"], #these are the  names for the layers
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config)

## 4. Prepare dataset for finetuning
Here I use a general dataset from Microsoft: MS_marco (v1.1) for the finetuning 

In [10]:
dataset = load_dataset('ms_marco','v1.1') # General dataset
print("dataset", dataset)
train_data = dataset["test"]

dataset DatasetDict({
    validation: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 10047
    })
    train: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 82326
    })
    test: Dataset({
        features: ['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'],
        num_rows: 9650
    })
})


### 4.1 Select the right prompt structure for the model you want use LORA on

#### Prompt Structure for Mistral 7 B Instruct

In [9]:
def generate_prompt(user_query):  #The prompt format is taken from the official Mistral huggingface page
    if user_query["answers"] is not None and user_query["query"] is not None:
      p =  "<s> [INST]" + str(user_query["query"]) + "[/INST]" +  str(user_query["answers"]) + "</s>"
      return p
    else:
        p = "<s> [INST]" + "Hello" + "[/INST]" +  "Hello" + "</s>"
    return p

#### Prompt Structure for Llama3 8B Instruct

In [None]:
def generate_prompt(user_query):  #The prompt format is taken from the official Meta Llama3 page
    if user_query["answers"] is not None and user_query["query"] is not None:
        p = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>" + str(user_query["query"]) + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    else
        p = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>" + "Hello!" + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    return p

#### Prompt Structure for Llama2 7B chat

In [12]:
def generate_prompt(user_query):  #The prompt format is taken from the official Meta Llama3 page
    if user_query["answers"] is not None and user_query["query"] is not None:
        p = "<s>[INST]" + str(user_query["query"]) + "[/INST]"
    else:
        p = "<s>[INST]" + "Hello!" + "[/INST]"
    return p

### 4.2 Tokenize the train data

In [13]:
def tokenize(prompt):
    return tokenizer(
        prompt + tokenizer.eos_token,
        truncation=True,
        max_length=CUTOFF_LEN ,
        padding="max_length"
    )

In [14]:
train_data = train_data.shuffle().map(lambda x: tokenize(generate_prompt(x)), remove_columns=['answers', 'passages', 'query', 'query_id', 'query_type', 'wellFormedAnswers'])

Map:   0%|          | 0/9650 [00:00<?, ? examples/s]

## 5. Prepare trainer object and start training

In [15]:
trainer = Trainer(
    model=model, 
    train_dataset=train_data,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=3,    # 3 or 6 is good
        learning_rate=1e-4,
        logging_steps=50,
        optim="adamw_torch",
        save_strategy="epoch",
        output_dir=output_model
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mthomas-t-schmitt[0m ([33mpumaai[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
5,6.8901
10,5.0753
15,3.7624
20,2.751
25,2.0373
30,1.8527
35,1.7921
40,1.7275
45,1.665
50,1.6496


## 6. Merge Adapter to original Model and save
Merges the specified adapter generated by the wokflow above with the original model.

- "adapter_config.json" mut may be edited depending on peft version
- "model_id" is the lora adapter that was created in the steps above
- you may must copy the files from the latest checkpoint to the directory specified in "model_id"

In [21]:
from peft import AutoPeftModelForCausalLM

# Local path of adapter model
model_id = "/home/thsch026/masterarbeit/models/generated/lora/mistral_7B-Instruct_ms-marco"
peft_model = AutoPeftModelForCausalLM.from_pretrained(model_id)
print(type(peft_model))

merged_model = peft_model.merge_and_unload()
# The adapters are merged now and it is transformers class again
print(type(merged_model))

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

<class 'peft.peft_model.PeftModelForCausalLM'>
<class 'transformers.models.mistral.modeling_mistral.MistralForCausalLM'>


## 7. Save the merged model together with the tokenizer of the original model

In [8]:
output_dir = "/home/thsch026/masterarbeit/models/generated/lora/mistral_7B-Instruct_ms-marco/model"

model.save_pretrained(output_dir)
tokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
tokenizer.save_pretrained(output_dir)



('/home/thsch026/masterarbeit/models/generated/lora/mistral_7B-Instruct_ms-marco/model/tokenizer_config.json',
 '/home/thsch026/masterarbeit/models/generated/lora/mistral_7B-Instruct_ms-marco/model/special_tokens_map.json',
 '/home/thsch026/masterarbeit/models/generated/lora/mistral_7B-Instruct_ms-marco/model/tokenizer.json')