In [None]:
!pip install mlflow transformers accelerate datasets bitsandbytes einops wandb
!pip install git+https://github.com/huggingface/peft.git
!pip install trl
!pip install --ignore-installed blinker
!pip install apache-airflow

Collecting mlflow
  Downloading mlflow-2.13.0-py3-none-any.whl (25.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.0/25.0 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting einops
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m7.0 

In [None]:
import torch
import mlflow
import mlflow.pytorch
import pandas as pd
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import Dataset
import warnings
from datasets import Dataset
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
warnings.filterwarnings("ignore")

In [None]:
def install_packages():
    """Install/upgrade the fine-tuning dependencies with pip.

    pip is invoked as ``<running interpreter> -m pip`` so the packages land in the
    environment of the interpreter executing this code, and so the function is
    plain Python (no IPython ``!`` shell syntax inside a ``def``).

    The installation stays best-effort: a non-zero pip exit status is reported
    on stdout but does not raise.
    """
    import subprocess
    import sys

    pip_invocations = (
        ["-q", "-U", "trl", "transformers", "accelerate",
         "git+https://github.com/huggingface/peft.git"],
        ["-q", "datasets", "bitsandbytes", "einops", "wandb"],
    )
    for pip_args in pip_invocations:
        completed = subprocess.run([sys.executable, "-m", "pip", "install", *pip_args])
        if completed.returncode != 0:
            # print() rather than warnings.warn(): this notebook silences all warnings.
            print(f"pip install {' '.join(pip_args)} exited with status {completed.returncode}")

def login_huggingface(token):
    """Authenticate against the Hugging Face Hub.

    Args:
        token: Access token, forwarded unchanged to ``huggingface_hub.login``.
    """
    login(token=token)

def load_data(file_path):
    """Read an Excel workbook into a pandas DataFrame.

    Args:
        file_path: Location of the workbook, passed straight to ``pd.read_excel``.

    Returns:
        The DataFrame produced by ``pd.read_excel`` with its default options.
    """
    return pd.read_excel(file_path)

def preprocess_data(df):
    """Build the single-column prompt dataset used for fine-tuning.

    For every row the prompt is the ``Question`` value, a ``Code`` separator,
    the ``Code`` value and a trailing ``Plain Text`` marker (see the literals
    below). The input frame is not modified.

    Args:
        df: DataFrame with ``Question`` and ``Code`` columns
            (assumed to hold strings - TODO confirm against the workbook).

    Returns:
        A ``datasets.Dataset`` with one column, ``prompt``.

    Raises:
        KeyError: If ``Question`` or ``Code`` is missing from ``df``.
    """
    missing = [column for column in ("Question", "Code") if column not in df.columns]
    if missing:
        raise KeyError(f"preprocess_data: missing required column(s): {missing}")

    # Build the prompt as a standalone Series instead of writing a scratch
    # "question" column into the caller's DataFrame.
    prompts = df["Question"] + "\n Code: )" + df["Code"] + "\n Plain Text: )"
    return Dataset.from_pandas(pd.DataFrame({"prompt": prompts}))

def load_model_and_tokenizer(model_name):
    """Load a 4-bit quantized causal language model together with its tokenizer.

    Args:
        model_name: Model identifier handed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is loaded with an NF4 4-bit
        quantization config using float16 compute and has ``config.use_cache``
        set to ``False``; the tokenizer's pad token is set to its EOS token.
    """
    quantization = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        load_in_4bit=True,
    )

    causal_lm = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        quantization_config=quantization,
    )
    causal_lm.config.use_cache = False

    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Reuse the end-of-sequence token for padding.
    tok.pad_token = tok.eos_token

    return causal_lm, tok

def configure_peft(lora_alpha=16, lora_dropout=0.1, lora_r=64):
    """Create the LoRA configuration for causal-LM fine-tuning.

    Args:
        lora_alpha: Value passed as ``LoraConfig(lora_alpha=...)``.
        lora_dropout: Value passed as ``LoraConfig(lora_dropout=...)``.
        lora_r: Value passed as ``LoraConfig(r=...)``.

    Returns:
        A ``LoraConfig`` with ``bias="none"`` and ``task_type="CAUSAL_LM"``.
    """
    return LoraConfig(
        r=lora_r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        task_type="CAUSAL_LM",
        bias="none",
    )

def create_training_arguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=200,
    logging_steps=10,
    learning_rate=2e-4,
    max_grad_norm=0.3,
    max_steps=10,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
):
    """Assemble the ``TrainingArguments`` for the fine-tuning run.

    Every parameter is forwarded to the ``TrainingArguments`` keyword of the
    same name. Two settings are fixed and not configurable from here:
    ``fp16=True`` and ``group_by_length=True``.

    Returns:
        The constructed ``TrainingArguments`` instance.
    """
    settings = {
        "output_dir": output_dir,
        "per_device_train_batch_size": per_device_train_batch_size,
        "gradient_accumulation_steps": gradient_accumulation_steps,
        "optim": optim,
        "save_steps": save_steps,
        "logging_steps": logging_steps,
        "learning_rate": learning_rate,
        "max_grad_norm": max_grad_norm,
        "max_steps": max_steps,
        "warmup_ratio": warmup_ratio,
        "lr_scheduler_type": lr_scheduler_type,
        # Fixed settings.
        "fp16": True,
        "group_by_length": True,
    }
    return TrainingArguments(**settings)

def train_model(model, dataset, peft_config, tokenizer, training_arguments, max_seq_length=512):
    """Fine-tune ``model`` on ``dataset`` with ``SFTTrainer`` and return the trainer.

    Args:
        model: Model to fine-tune.
        dataset: Training dataset; its ``prompt`` column is used as the text field.
        peft_config: PEFT configuration handed to the trainer.
        tokenizer: Tokenizer handed to the trainer.
        training_arguments: ``TrainingArguments`` for the run.
        max_seq_length: Maximum sequence length handed to the trainer.

    Returns:
        The ``SFTTrainer`` after ``train()`` has completed.
    """
    sft_trainer = SFTTrainer(
        args=training_arguments,
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="prompt",
        max_seq_length=max_seq_length,
        peft_config=peft_config,
    )

    # Cast every sub-module whose qualified name contains "norm" to float32.
    # ``.to`` converts a module in place, so its return value is not needed.
    norm_layers = [
        layer
        for layer_name, layer in sft_trainer.model.named_modules()
        if "norm" in layer_name
    ]
    for layer in norm_layers:
        layer.to(torch.float32)

    sft_trainer.train()
    return sft_trainer

def save_model(trainer, output_path="outputs"):
    """Save the trainer's model with ``save_pretrained``.

    If ``trainer.model`` exposes a ``module`` attribute (i.e. it is a wrapper
    around the real model), the wrapped object is saved instead of the wrapper.

    Args:
        trainer: Object whose ``model`` attribute holds the model to save.
        output_path: Destination passed to ``save_pretrained``.
    """
    wrapped = trainer.model
    target = getattr(wrapped, "module", wrapped)
    target.save_pretrained(output_path)

def load_trained_model(model, output_path="outputs"):
    """Attach the fine-tuned LoRA adapter saved at ``output_path`` to ``model``.

    ``PeftModel.from_pretrained`` reads both the adapter configuration and the
    trained adapter weights from ``output_path``. Loading only the
    ``LoraConfig`` and calling ``get_peft_model`` would attach freshly
    initialised adapters and silently discard the training result.

    Args:
        model: Base model to attach the adapter to.
            NOTE(review): this should be the plain base model; passing a model
            that already carries injected LoRA layers is untested - confirm.
        output_path: Directory previously written by ``save_pretrained``.

    Returns:
        The ``PeftModel`` wrapping ``model`` with the saved adapter loaded
        (adapters are loaded for inference, i.e. not trainable, by default).
    """
    # Local import: the notebook's import cell only pulls LoraConfig/get_peft_model.
    from peft import PeftModel

    return PeftModel.from_pretrained(model, output_path)

def generate_text(model, tokenizer, test_df):
    """Generate and print a completion for the first row of ``test_df``.

    The prompt is assembled from the row's ``Prompt``, ``Question`` and ``Code``
    values, printed, tokenised (truncated to 1024 tokens) and passed to
    ``model.generate``. Every returned sequence is decoded and printed.

    Args:
        model: Model exposing ``generate``; if it has a ``device`` attribute the
            input ids are moved there before generation.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        test_df: DataFrame with ``Prompt``, ``Question`` and ``Code`` columns;
            only the first row is used.

    Returns:
        None. Results are written to stdout.
    """
    first_row = test_df.head(1)
    test_text = (
        first_row["Prompt"].values[0]
        + "\n Question: )"
        + first_row["Question"].values[0]
        + "\n Code: )"
        + first_row["Code"].values[0]
        + "\n Plain Text: )"
    )
    print(test_text)

    input_ids = tokenizer.encode(test_text, return_tensors="pt", max_length=1024, truncation=True)

    # The tokenizer returns CPU tensors; a model living on another device (e.g. a
    # quantized model on the GPU) needs its inputs on that same device.
    target_device = getattr(model, "device", None)
    if target_device is not None:
        input_ids = input_ids.to(target_device)

    with torch.no_grad():
        # NOTE(review): in transformers, `max_length` counts prompt tokens too, while
        # the prompt may be up to 1024 tokens long - consider `max_new_tokens`.
        output = model.generate(input_ids, max_length=400, num_return_sequences=1)

    for i, seq in enumerate(output):
        generated_text = tokenizer.decode(seq, skip_special_tokens=True)
        print(f"Generated text {i+1}: {generated_text}")

In [None]:
from google.colab import files

# Ask the user to upload the training workbook through the Colab file picker.
uploaded = files.upload()
if not uploaded:
    # A cancelled/empty upload would otherwise surface as an opaque IndexError.
    raise ValueError(
        "No file was uploaded; upload the training workbook (e.g. train.xlsx) and re-run this cell."
    )

# The upload result is keyed by file name; only the first uploaded file is used.
file_name = next(iter(uploaded))

# `df` is read as a global by main() further down.
df = pd.read_excel(file_name)

Saving train.xlsx to train.xlsx


In [None]:
def main():
    """Run the LoRA fine-tuning pipeline inside a single MLflow run.

    Steps: install packages, log in to the Hugging Face Hub, build the prompt
    dataset from the module-level ``df`` (created by the upload cell), load the
    quantized model and tokenizer, fine-tune, save and reload the adapter, and
    log the hyper-parameters and the model to MLflow.

    The Hugging Face token is read from the ``HF_TOKEN`` environment variable;
    if it is unset the user is prompted for it without echo. Credentials must
    never be committed in the notebook: any token that was previously
    hard-coded here should be considered leaked and be revoked.
    """
    import os
    from getpass import getpass

    hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

    # Using the run as a context manager ends it on exit and marks it FAILED when
    # an exception escapes (a bare end_run() in `finally` records it as FINISHED).
    with mlflow.start_run():
        # The numbered prints are coarse progress markers.
        install_packages()
        print('1')
        login_huggingface(hf_token)
        print('2')
        # train_df = load_data('train.xlsx')
        print('3')
        dataset = preprocess_data(df)
        print('4')
        model_name = "meta-llama/Llama-2-7b-chat-hf"
        model, tokenizer = load_model_and_tokenizer(model_name)
        print('5')
        peft_config = configure_peft()
        print('6')
        training_arguments = create_training_arguments()
        print('7')

        # Log hyper-parameters before training so they are recorded even when
        # training fails (e.g. CUDA out of memory), and read them from the config
        # objects so they cannot drift from the values actually used.
        mlflow.log_params({
            "lora_alpha": peft_config.lora_alpha,
            "lora_dropout": peft_config.lora_dropout,
            "lora_r": peft_config.r,
            "learning_rate": training_arguments.learning_rate,
            "max_steps": training_arguments.max_steps,
        })

        trainer = train_model(model, dataset, peft_config, tokenizer, training_arguments)
        save_model(trainer)
        model = load_trained_model(model)
        # test_df = load_data('test.xlsx')
        # generate_text(model, tokenizer, test_df)

        # Log the fine-tuned model as an MLflow artifact.
        mlflow.pytorch.log_model(model, "model")


if __name__ == "__main__":
    main()

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2023.6.0 requires fsspec==2023.6.0, but you have fsspec 2024.3.1 which is incompatible.[0m[31m
[0m1
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to 

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

5
6
7


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 