<a href="https://colab.research.google.com/github/gwythyr/llm-fine-tuning-study/blob/master/Fine_tune_DeciLM_7B_for_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
# Force a UTF-8 locale for this process and for any child process it spawns.
# NOTE(review): presumably a workaround for shell commands failing on runtimes
# whose default locale is not UTF-8 -- confirm before removing.
os.environ.update(
    dict.fromkeys(("LC_ALL", "LANG", "LC_CTYPE"), "en_US.UTF-8")
)

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
from pathlib import Path
from typing import Optional

def create_directory(path: Optional[Path] = None, dir_name: str = "output") -> Path:
    """
    Create (if needed) a directory called `dir_name` under `path` and return it.

    Parameters:
    - path (Optional[Path]): Parent location of the new directory. A plain string
      or any other os.PathLike is accepted as well and converted to a Path.
      When None, the directory is created relative to the current working
      directory.
    - dir_name (str): Name of the directory to create. May contain several
      components (e.g. "a/b"); missing intermediate directories are created.

    Returns:
    - Path object pointing at the directory. It is relative when `path` is
      None or relative, absolute when `path` is absolute.

    Raises:
    - FileExistsError: if the target already exists and is not a directory.
    """
    # Coerce to Path so callers may pass a str; `"some/dir" / dir_name` would
    # otherwise fail with a TypeError.
    working_dir = Path(path) if path is not None else Path('./')

    output_directory = working_dir / dir_name

    # parents=True builds missing ancestors; exist_ok=True makes repeated calls
    # (e.g. re-running the notebook cell) a no-op.
    output_directory.mkdir(parents=True, exist_ok=True)

    return output_directory

# Checkpoints produced during fine-tuning are written under this directory,
# which is created relative to the current working directory.
output_dir = create_directory(path=None, dir_name="fine-tuned-checkpoints")
print(f"Directory created at: {output_dir}")

In [None]:
%%capture
!pip install -q -U bitsandbytes
!pip install -q -U transformers
!pip install -q -U peft
!pip install -q -U accelerate
!pip install -q -U datasets
!pip install -q -U trl
!pip install ninja

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer
import torch

# Load model

In [None]:
model_name = "Deci/DeciLM-7B"

# Total memory, in bytes, of CUDA device 0.
gpu_memory = torch.cuda.get_device_properties(0).total_memory

# Below ~20 GB the model is loaded 4-bit quantized; otherwise in 16-bit precision.
do_quantization = gpu_memory < 20e9

# bfloat16 is natively supported from CUDA compute capability 8.0 (Ampere, e.g.
# A100) onwards. Older chips such as the T4 -- which are exactly the ones that
# take the quantized path -- fall back to float16 instead of needing a manual edit.
supports_bf16 = torch.cuda.get_device_capability(0)[0] >= 8
compute_dtype = torch.bfloat16 if supports_bf16 else torch.float16

# Arguments shared by both loading paths.
load_kwargs = dict(
    device_map="auto",
    use_cache=True,
    # Allow the model's custom code from the Hub repository to be executed.
    trust_remote_code=True,
)

if do_quantization:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=compute_dtype,
    )
    load_kwargs["quantization_config"] = bnb_config
else:
    load_kwargs["torch_dtype"] = compute_dtype

decilm = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)


tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Reuse the end-of-sequence token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token

tokenizer.padding_side = "right"

# Load dataset



In [None]:
from datasets import load_dataset

# Hub identifier of the modern -> Shakespearean parallel corpus.
dataset = "harpreetsahota/modern-to-shakesperean-translation"

# Load the "train" split and shuffle it with a fixed seed for reproducibility.
data = load_dataset(dataset, split="train").shuffle(seed=42)

# Hold out 10% of the rows for evaluation (keys: 'train' and 'test').
modern_to_shakespearean = data.train_test_split(seed=42, test_size=0.1)

In [None]:
def construct_short_translation_prompt(sample):
  """
  Build the training text for one dataset row.

  The result has the form "<s>{modern} ###> {shakespearean}</s>".

  Parameters:
  - sample: mapping with string values under the keys "modern" and
    "shakespearean".

  Returns:
  - dict with a single key "text" holding the assembled prompt.
  """
  pieces = ["<s>", sample["modern"], " ###> ", sample["shakespearean"], "</s>"]
  return {"text": "".join(pieces)}

Alternatively, you can try the prompt below, which has more of an instruction-tuned feel to it.

In [None]:
# def construct_translation_prompt(sample):
#   prompt = ""
#   prompt += "Translate the following text from Modern English to Shakespearean English"
#   prompt += "\n\n### Modern English: \n"
#   prompt += sample["modern"]
#   prompt += "\n\n### Shakespearean English: \n"
#   prompt += sample["shakespearean"]
#   return {"text" : prompt}

# modern_to_shakespearean = modern_to_shakespearean.map(construct_translation_prompt)

In [None]:
# Apply the prompt template to every row; the returned {"text": ...} dict adds a
# "text" column, which is the field the trainer is later pointed at.
modern_to_shakespearean = modern_to_shakespearean.map(construct_short_translation_prompt)

In [None]:
# Sanity check: display the formatted prompt of one training example.
modern_to_shakespearean['train'][42]['text']

# QLoRA Config

In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# LoRA hyper-parameters, chosen to match the QLoRA setup.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Names of the modules that receive LoRA update matrices.
    # NOTE(review): presumably the MLP projections of each block -- confirm
    # against the model's module names.
    target_modules=["gate_proj", "down_proj", "up_proj"],
    r=16,              # rank of the low-rank update matrices
    lora_alpha=32,     # scaling factor applied to the update
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
)

# Prepare model for peft

In [None]:
# Only a quantized (k-bit) model needs PEFT's extra preparation step before
# adapters can be trained on top of it.
if do_quantization:
    decilm = prepare_model_for_kbit_training(decilm)

# Enable gradients on the input embeddings' outputs so adapter weights can be
# trained while the base model weights stay fixed.
decilm.enable_input_require_grads()
# Wrap the base model with the LoRA adapters described by `lora_config`.
# NOTE: `decilm` is rebound here, so this cell is not idempotent -- re-running it
# feeds the already-wrapped model back into get_peft_model.
decilm = get_peft_model(decilm, lora_config)

# Training Args

In [None]:
from transformers import TrainingArguments

# bf16 mixed precision and TF32 matmuls need an Ampere-or-newer GPU (CUDA compute
# capability >= 8.0, e.g. A100). Detect this instead of hard-coding True so the
# cell also runs on older chips such as the T4 without manual edits.
supports_bf16 = torch.cuda.get_device_capability(0)[0] >= 8

# Newer transformers releases call this argument `eval_strategy`, older ones
# `evaluation_strategy`; use whichever the installed version defines.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    # `output_dir` is a pathlib.Path; TrainingArguments documents a string.
    output_dir=str(output_dir),
    do_eval=True,
    auto_find_batch_size=True,
    log_level="debug",
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=100,
    learning_rate=3e-4,
    weight_decay=0.01,
    # NOTE: max_steps counts optimizer steps (batches), not samples, so with a
    # batch size above 1 this is more than 5 passes over the training split.
    # You should still train for longer on a real task.
    max_steps=len(modern_to_shakespearean['train']) * 5,
    warmup_steps=150,
    bf16=supports_bf16,
    tf32=supports_bf16,
    gradient_checkpointing=True,
    max_grad_norm=0.3,  # from the paper
    lr_scheduler_type="reduce_lr_on_plateau",
    # Evaluate every N steps; the plateau scheduler relies on evaluation metrics.
    **{eval_strategy_key: "steps"},
)

# Train

This is a very short training run; it takes roughly 15 minutes on an A100.

In [None]:
# Supervised fine-tuning trainer over the formatted "text" column.
trainer = SFTTrainer(
    # Model, tokenizer and adapter configuration.
    # NOTE(review): `decilm` has already been wrapped with this same lora_config
    # via get_peft_model, so passing peft_config again looks redundant -- confirm.
    model=decilm,
    tokenizer=tokenizer,
    peft_config=lora_config,
    # Training hyper-parameters.
    args=training_args,
    # Data: train on the 'train' split, evaluate on the held-out 'test' split.
    train_dataset=modern_to_shakespearean['train'],
    eval_dataset=modern_to_shakespearean['test'],
    dataset_text_field='text',
    max_seq_length=4096,
    # Use one preprocessing worker per CPU core reported by the OS.
    dataset_num_proc=os.cpu_count(),
)

In [None]:
# Run the fine-tuning loop with the arguments given to the trainer.
trainer.train()

In [None]:
# Persist the trained model. No path is given, so the trainer's default location
# is used; the merge step later loads from training_args.output_dir.
trainer.save_model()

# Merge adapter to base model




In [None]:
from peft import AutoPeftModelForCausalLM
from functools import partial

# Rebind AutoTokenizer.from_pretrained so every later call defaults to
# trust_remote_code=True.
# NOTE(review): presumably needed because loading the PEFT checkpoint below loads
# a tokenizer internally without forwarding the flag -- confirm. The patch is
# global, and re-running this cell wraps the already-patched function again.
AutoTokenizer.from_pretrained = partial(AutoTokenizer.from_pretrained, trust_remote_code=True)

# Load the checkpoint saved under training_args.output_dir in bfloat16.
instruction_tuned_model = AutoPeftModelForCausalLM.from_pretrained(
    training_args.output_dir,
    torch_dtype=torch.bfloat16,
    device_map = 'auto',
    trust_remote_code=True,
)

# Fold the adapter weights into the base weights and drop the PEFT wrapper.
merged_model = instruction_tuned_model.merge_and_unload()

In [None]:
from transformers import pipeline

# Decoding settings forwarded to the pipeline for every generation call.
generation_kwargs = {
    # Length / stopping
    "max_new_tokens": 32,
    "early_stopping": True,
    # Search strategy: beam search combined with sampling; the very low
    # temperature keeps the sampling close to deterministic.
    "num_beams": 5,
    "temperature" : 0.001,
    "do_sample":True,
    # Repetition control
    "no_repeat_ngram_size": 3,
    "repetition_penalty" : 1.5,
    "renormalize_logits": True
}

# Text-generation pipeline around the merged model and the training tokenizer.
decilm_tuned_pipeline = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=merged_model,
    **generation_kwargs,
)

In [None]:
# Modern-slang sentences used as inputs when trying out the fine-tuned model.
modern_sayings = [
    "Bruh, Stop cappin', I know you ain't about that life homie.",
    "Stop throwing shade at me, Fam. You know I keep it 100 with you.",
    "I'm gonna dip homie, these folks are sus. Catch you later.",
    "Yo fam, he slid into my DMs and now he's simping all over me.",
    "I'm lowkey obsessed with this song, it's such a mood.",
    "He's big mad 'cause he took an L in the game.",
    "She really feeling herself after that glow up.",
    "Yo homie, why you trippin' over her like that being a simp?",
    "No cap bruh, you lookin hella chuegy in them skinny jeans."
]

def construct_inference_prompt(input_text):
    """
    Build the prompt fed to the model at inference time.

    The modern-English text is followed by the " ###>" separator, mirroring the
    short training prompt without the leading "<s>", the space after the
    separator, or the target text.

    Parameters:
    - input_text (str): The modern-English sentence to translate.

    Returns:
    - str: The prompt "{input_text} ###>".
    """
    separator = " ###>"
    return "".join((input_text, separator))

def translate_modern_to_shakespearean(input_phrase):
    """
    Generate a translation for `input_phrase` and print it.

    The phrase is turned into an inference prompt, passed through
    `decilm_tuned_pipeline`, and the first result's full text (prompt plus
    continuation, since return_full_text=True) is printed.

    Parameters:
    - input_phrase (str): The modern-English sentence to translate.

    Returns:
    - None. The generated text is only printed.
    """
    prompt = construct_inference_prompt(input_phrase)
    outputs = decilm_tuned_pipeline(prompt, return_full_text=True)
    print(outputs[0]['generated_text'])

In [None]:
# Try the tuned model on the sample at index 7 of modern_sayings.
translate_modern_to_shakespearean(modern_sayings[7])

In [None]:
# Try the tuned model on the sample at index 2 of modern_sayings.
translate_modern_to_shakespearean(modern_sayings[2])