# Install deps

In [1]:
%load_ext autoreload
%autoreload 2

import os

# Never store the Hugging Face access token in the notebook: it ends up in
# version control and in every shared copy. Use the value already exported in
# the environment, and only prompt (without echo) when it is missing.
# NOTE(review): the token that was previously committed on this line should be
# revoked in the Hugging Face account settings.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ")

In [2]:
# for auto train
!pip install -U autotrain-advanced > install_logs.txt
!autotrain setup --colab > setup_logs.txt

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

def generate_gemma_prompt(original_text, rewritten_text, rewrite_prompt=None):
    """Build a Gemma chat-formatted prompt for the rewrite-prompt recovery task.

    The user turn holds a fixed instruction followed by the original and the
    rewritten text. The model turn is opened but left empty so the model can
    complete it. When ``rewrite_prompt`` is given, it is appended as the
    model's answer and the turn is closed, producing a complete training
    example.

    Args:
        original_text: Text before rewriting.
        rewritten_text: Text after rewriting.
        rewrite_prompt: Optional target answer. Any falsy value (``None``,
            ``""``) yields an inference-style prompt with an open model turn.

    Returns:
        The formatted prompt string.
    """
    # Wording is kept verbatim: it is part of the prompt the model sees.
    instruction_text = 'Generate a rewrite_prompt that effectively transforms the provided original_text into the provided rewritten_text. The rewrite_prompt must be clearly explain how to the original_text is transformed to the rewritten_text, focus on explaining the changes of tone, writting style, publishing, etc. Keep the rewrite_prompt concise, less than 100 words.'

    prompt = f"""<start_of_turn>user {instruction_text}
Here is the given texts:
# original_text:
{original_text}

# rewritten_text:
{rewritten_text}
<end_of_turn>
<start_of_turn>model"""

    if rewrite_prompt:
        # This has to be an f-string. A plain literal would write the text
        # "{rewrite_prompt}" into the example instead of the actual target.
        prompt += f"\n{rewrite_prompt} <end_of_turn>"

    return prompt

class GemmaModel:
    """Wrapper around a 4-bit quantized causal LM that predicts rewrite prompts."""

    def __init__(self, model_name, device="cuda"):
        """Load the quantized model and its tokenizer.

        Args:
            model_name: Model identifier or path passed to ``from_pretrained``
                for both the model and the tokenizer.
            device: Device the tokenized prompt is moved to before generation.
        """
        self.device = device

        # 4-bit NF4 weights with double quantization; compute dtype bfloat16.
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            quantization_config=quantization_config
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def predict_prompt(self, original_text, rewritten_text, max_length=100):
        """Sample a rewrite prompt for a pair of original and rewritten texts.

        Args:
            original_text: Text before rewriting.
            rewritten_text: Text after rewriting.
            max_length: Maximum number of tokens to generate after the prompt.

        Returns:
            The first sequence returned by ``generate``, decoded with special
            tokens stripped. For causal LMs this sequence starts with the
            prompt tokens, so the returned text contains the prompt followed
            by the generated continuation.
        """
        prompt = generate_gemma_prompt(original_text, rewritten_text)
        prompt_ids = self.tokenizer.encode(prompt, return_tensors="pt").to(self.device)

        outputs = self.model.generate(
            prompt_ids,
            # generate()'s own `max_length` counts the prompt tokens too. The
            # prompt alone is longer than the default budget of 100, which
            # would leave no room for any new tokens. `max_new_tokens` limits
            # only the continuation.
            max_new_tokens=max_length,
            do_sample=True,
            temperature=0.7,
            top_k=50,
            top_p=0.92,
            num_return_sequences=1
        )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)



# 1. Prepare data

In [3]:
from pathlib import Path
import pandas as pd
import numpy as np

# Input CSVs live under DATA_DIR; CKPT_DIR is the checkpoint location.
DATA_DIR = Path("./data")
CKPT_DIR = Path("./ckpt")

# Read the train and test splits in one pass.
train_df, test_df = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in ("train_data.csv", "test_data.csv")
)

print(train_df.shape, test_df.shape)

(3977, 4) (995, 4)


In [4]:
# prepare 'text' in gemma format column for autotrain
# from models.utils import generate_gemma_prompt

def apply_generate_prompt(row):
    """Format one data row as a full Gemma training example.

    Reads the ``original_text``, ``rewritten_text`` and ``rewrite_prompt``
    fields of ``row`` and passes them to ``generate_gemma_prompt``.
    """
    source, rewrite, target = (
        row[field]
        for field in ("original_text", "rewritten_text", "rewrite_prompt")
    )
    return generate_gemma_prompt(source, rewrite, target)

# One formatted training example per row, stored in the `text` column that the
# autotrain command reads via `--text-column text`.
train_df["text"] = train_df.apply(apply_generate_prompt, axis="columns")

# Written next to the raw CSVs, inside the directory given as `--data-path`.
train_df.to_csv(DATA_DIR / "train.csv", index=False)

# 2. Train

In [None]:
import os
# Optimisation settings
learning_rate = 2e-4
num_epochs = 10
batch_size = 1
block_size = 1024
trainer = "sft"
warmup_ratio = 0.1
weight_decay = 0.01
gradient_accumulation = 4
mixed_precision = "fp16"

# Adapter / quantization settings
peft = True
quantization = "int4"
lora_r = 16
lora_alpha = 32
lora_dropout = 0.05

# Export everything as strings so the shell command in the next cell can
# read the values through ${VAR} expansion.
os.environ.update(
    {
        env_name: str(env_value)
        for env_name, env_value in {
            "PROJECT_NAME": "gemma-2bit",
            "MODEL_NAME": "google/gemma-2b-it",
            "LEARNING_RATE": learning_rate,
            "NUM_EPOCHS": num_epochs,
            "BATCH_SIZE": batch_size,
            "BLOCK_SIZE": block_size,
            "WARMUP_RATIO": warmup_ratio,
            "WEIGHT_DECAY": weight_decay,
            "GRADIENT_ACCUMULATION": gradient_accumulation,
            "MIXED_PRECISION": mixed_precision,
            "PEFT": peft,
            "QUANTIZATION": quantization,
            "LORA_R": lora_r,
            "LORA_ALPHA": lora_alpha,
            "LORA_DROPOUT": lora_dropout,
        }.items()
    }
)

In [None]:
# All values come from the environment variables exported by the config cell.
# The last line adds `--peft` only when PEFT is "True"; without it the LoRA
# flags below are passed but adapter training is never switched on.
!autotrain llm \
--train \
--model ${MODEL_NAME} \
--project-name ${PROJECT_NAME} \
--data-path data/ \
--text-column text \
--lr ${LEARNING_RATE} \
--batch-size ${BATCH_SIZE} \
--epochs ${NUM_EPOCHS} \
--block-size ${BLOCK_SIZE} \
--warmup-ratio ${WARMUP_RATIO} \
--lora-r ${LORA_R} \
--lora-alpha ${LORA_ALPHA} \
--lora-dropout ${LORA_DROPOUT} \
--weight-decay ${WEIGHT_DECAY} \
--gradient-accumulation ${GRADIENT_ACCUMULATION} \
--quantization ${QUANTIZATION} \
--mixed-precision ${MIXED_PRECISION} \
$( [ "${PEFT}" = "True" ] && echo "--peft" )

# 3. Predict data

In [5]:
# Smoke test against the base instruction-tuned checkpoint.
# Alternative model_name left by the author (presumably the fine-tuned
# checkpoint; confirm the path): CKPT_DIR / "gemma-2bit"
model = GemmaModel(model_name="google/gemma-2b-it")

# Sample pair taken from the competition description.
original = "The competition dataset comprises text passages that have been rewritten by the Gemma LLM according to some rewrite_prompt instruction. The goal of the competition is to determine what prompt was used to rewrite each original text.  Please note that this is a Code Competition. When your submission is scored, this example test data will be replaced with the full test set. Expect roughly 2,000 original texts in the test set."
rewritten = "Here is your shanty: (Verse 1) The text is rewritten, the LLM has spun, With prompts so clever, they've been outrun. The goal is to find, the prompt so bright, To crack the code, and shine the light. (Chorus) Oh, this is a code competition, my dear, With text and prompts, we'll compete. Two thousand texts, a challenge grand, To guess the prompts, hand over hand.(Verse 2) The original text, a treasure lost, The rewrite prompt, a secret to be"

# Last expression of the cell, so the prediction is displayed as its output.
model.predict_prompt(original_text=original, rewritten_text=rewritten)


  from .autonotebook import tqdm as notebook_tqdm


ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`