In [1]:
!pip install mlflow transformers accelerate datasets bitsandbytes einops wandb torch peft
!pip install git+https://github.com/huggingface/peft.git
!pip install trl
!pip install --ignore-installed blinker
!pip install apache-airflow

Collecting bitsandbytes
  Obtaining dependency information for bitsandbytes from https://files.pythonhosted.org/packages/9b/63/489ef9cd7a33c1f08f1b2be51d1b511883c5e34591aaa9873b30021cd679/bitsandbytes-0.42.0-py3-none-any.whl.metadata
  Downloading bitsandbytes-0.42.0-py3-none-any.whl.metadata (9.9 kB)
Collecting einops
  Obtaining dependency information for einops from https://files.pythonhosted.org/packages/44/5a/f0b9ad6c0a9017e62d4735daaeb11ba3b6c009d69a26141b258cd37b5588/einops-0.8.0-py3-none-any.whl.metadata
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting wandb
  Obtaining dependency information for wandb from https://files.pythonhosted.org/packages/28/a4/5372a595a4e8dcbafb3e7bef58ec494979a92d6ed2d722e5f3a4b9a764b5/wandb-0.17.0-py3-none-macosx_11_0_arm64.whl.metadata
  Downloading wandb-0.17.0-py3-none-macosx_11_0_arm64.whl.metadata (10 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Obtaining dependency information for docker-pycreds>=0.4.0 from https:/

In [5]:
import torch
import mlflow
import mlflow.pytorch
import pandas as pd
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer
from datasets import Dataset
import warnings
from datasets import Dataset
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
warnings.filterwarnings("ignore")

In [7]:
def install_packages():
    !pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
    !pip install -q datasets bitsandbytes einops wandb

def login_huggingface(token):
    login(token)

def load_data(file_path):
    df = pd.read_excel(file_path)
    return df

def preprocess_data(df):
    df["question"] = (
        df["Question"]
        + "\n Code: )"
        + df["Code"]
        + "\n Plain Text: )"
    )
    custom_ds = pd.DataFrame()
    custom_ds["prompt"] = df["question"]
    dataset = Dataset.from_pandas(custom_ds)
    return dataset

def load_model_and_tokenizer(model_name):
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, trust_remote_code=True)
    model.config.use_cache = False
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token
    return model, tokenizer

def configure_peft(lora_alpha=16, lora_dropout=0.1, lora_r=64):
    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )
    return peft_config

def create_training_arguments(output_dir="./results", per_device_train_batch_size=4, gradient_accumulation_steps=4,
                              optim="paged_adamw_32bit", save_steps=200, logging_steps=10, learning_rate=2e-4,
                              max_grad_norm=0.3, max_steps=10, warmup_ratio=0.03, lr_scheduler_type="constant"):
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim=optim,
        save_steps=save_steps,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        fp16=True,
        max_grad_norm=max_grad_norm,
        max_steps=max_steps,
        warmup_ratio=warmup_ratio,
        group_by_length=True,
        lr_scheduler_type=lr_scheduler_type,
    )
    return training_arguments

def train_model(model, dataset, peft_config, tokenizer, training_arguments, max_seq_length=512):
    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        dataset_text_field="prompt",
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
    )
    for name, module in trainer.model.named_modules():
        if "norm" in name:
            module = module.to(torch.float32)
    trainer.train()
    return trainer

def save_model(trainer, output_path="outputs"):
    model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
    model_to_save.save_pretrained(output_path)

def load_trained_model(model, output_path="outputs"):
    lora_config = LoraConfig.from_pretrained(output_path)
    model = get_peft_model(model, lora_config)
    return model

def generate_text(model, tokenizer, test_df):
    test_row = test_df.head(1)
    test_text = test_row["Prompt"].values[0] + "\n Question: )" + test_row["Question"].values[0] + "\n Code: )" + test_row["Code"].values[0] + "\n Plain Text: )"
    print(test_text)
    input_ids = tokenizer.encode(test_text, return_tensors="pt", max_length=1024, truncation=True)
    with torch.no_grad():
        output = model.generate(input_ids, max_length=400, num_return_sequences=1)
    for i, seq in enumerate(output):
        generated_text = tokenizer.decode(seq, skip_special_tokens=True)
        print(f"Generated text {i+1}: {generated_text}")

In [10]:
# from google.colab import files
# uploaded = files.upload()

# file_name = list(uploaded.keys())[0]
# train_file_name = '/Users/rahulodedra/Downloads/train.xlsx'
# df = pd.read_excel(train_file_name)

In [11]:
# test_file_name = '/Users/rahulodedra/Downloads/test.xlsx'
# df = pd.read_excel(test_file_name)

In [13]:
def main():
    mlflow.start_run()  # Start MLFlow run

    try:
        install_packages()
        print('1')
        login_huggingface("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")
        print('2')
        # train_df = load_data('/Users/rahulodedra/Downloads/train.xlsx')
        print('3')
        dataset = preprocess_data(df)
        print('4')
        model_name = "meta-llama/Llama-2-7b-chat-hf"
        model, tokenizer = load_model_and_tokenizer(model_name)
        print('5')
        peft_config = configure_peft()
        print('6')
        training_arguments = create_training_arguments()
        print('7')
        trainer = train_model(model, dataset, peft_config, tokenizer, training_arguments)
        save_model(trainer)
        model = load_trained_model(model)
        test_df = load_data('/Users/rahulodedra/Downloads/test.xlsx')
        generate_text(model, tokenizer, test_df)

        # Log model and parameters
        mlflow.pytorch.log_model(model, "model")
        mlflow.log_params({
            "lora_alpha": 16,
            "lora_dropout": 0.1,
            "lora_r": 64,
            "learning_rate": 2e-4,
            "max_steps": 10
        })

    finally:
        mlflow.end_run()  # End the MLFlow run

if __name__ == "__main__":
    main()

1
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/rahulodedra/.cache/huggingface/token
Login successful
2
3
4


RuntimeError: No GPU found. A GPU is needed for quantization.