In [1]:
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

In [32]:
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

# Read the abstracts JSON file and expose it as a single "train" split.
dataset = load_dataset(
    "json",
    data_files="../data/abstracts.json",
    split="train",
)

def formatting_prompts_func(examples):
    """Build one instruction/hypothesis training prompt per example.

    Args:
        examples: Column-indexable batch exposing parallel 'prompt' and
            'completion' sequences (for instance a dict of lists).

    Returns:
        A list with one ``{'text': <prompt string>}`` dict per entry of
        ``examples['prompt']``, in input order.

    Raises:
        KeyError: If the 'prompt' or 'completion' column is missing (for a
            plain dict input).
        IndexError: If 'completion' is shorter than 'prompt' (for list
            columns).
    """
    # Fetch each column once. Looking the column up inside the loop repeats
    # that lookup for every row, which is wasteful when `examples` is an
    # object that rebuilds the column on each access.
    prompts = examples['prompt']
    completions = examples['completion']
    return [
        {'text': f"### Instruction: {prompts[i]}\n ### Hypothesis: {completions[i]}"}
        for i in range(len(prompts))
    ]

# Format the whole dataset eagerly so the rendered prompts can be inspected below.
output_texts = formatting_prompts_func(dataset)

Found cached dataset json (/Users/charlesoneill/.cache/huggingface/datasets/json/default-08a626f129b64a6f/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


In [33]:
# Display the first formatted example (a {'text': ...} dict).
output_texts[0]

{'text': '### Instruction: Generate a scientific hypothesis about astronomy in the style of an Arxiv paper.\n ### Hypothesis: Chemical Cartography, or mapping, of our Galaxy has the potential to fully\\ntransform our view of its structure and formation. In this work, we use\\nchemical cartography to explore the metallicity distribution of OBAF-type disk\\nstars from the LAMOST survey and a complementary sample of disk giant stars\\nfrom Gaia DR3. We use these samples to constrain the radial and vertical\\nmetallicity gradients across the Galactic disk. We also explore whether there\\nare detectable azimuthal variations in the metallicity distribution on top of\\nthe radial gradient. For the OBAF-type star sample from LAMOST, we find a\\nradial metallicity gradient of $\\Delta$[Fe/H]/$\\Delta$R $\\sim -0.078 \\pm 0.001$\\ndex/kpc in the plane of the disk and a vertical metallicity gradient of\\n$\\Delta$[Fe/H]/$\\Delta$Z $\\sim -0.15 \\pm 0.01$ dex/kpc in the solar\\nneighborhood. The r

In [29]:
model_name = "ybelkada/falcon-7b-sharded-bf16"

# Load the weights in 4-bit NF4 quantization with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
# Turn off the generation cache on the model config for fine-tuning.
model.config.use_cache = False

# The trainer cell passes `tokenizer`, but no cell in this notebook created it,
# so a fresh-kernel run failed with a NameError. Load it from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Fall back to the EOS token for padding when the checkpoint defines no pad
# token, so variable-length examples can be batched.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

'### Instruction: Generate a scientific hypothesis about astronomy in the style of an Arxiv paper.\n ### Hypothesis: Chemical Cartography, or mapping, of our Galaxy has the potential to fully\\ntransform our view of its structure and formation. In this work, we use\\nchemical cartography to explore the metallicity distribution of OBAF-type disk\\nstars from the LAMOST survey and a complementary sample of disk giant stars\\nfrom Gaia DR3. We use these samples to constrain the radial and vertical\\nmetallicity gradients across the Galactic disk. We also explore whether there\\nare detectable azimuthal variations in the metallicity distribution on top of\\nthe radial gradient. For the OBAF-type star sample from LAMOST, we find a\\nradial metallicity gradient of $\\Delta$[Fe/H]/$\\Delta$R $\\sim -0.078 \\pm 0.001$\\ndex/kpc in the plane of the disk and a vertical metallicity gradient of\\n$\\Delta$[Fe/H]/$\\Delta$Z $\\sim -0.15 \\pm 0.01$ dex/kpc in the solar\\nneighborhood. The radial gra

In [None]:
from peft import LoraConfig

# LoRA hyperparameters: adapter rank, scaling factor and adapter dropout.
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# Names of the modules that receive LoRA adapters.
lora_target_modules = ["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"]

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
    task_type="CAUSAL_LM",
)

In [None]:
from transformers import TrainingArguments

# Checkpointing and logging cadence (in optimizer steps).
output_dir = "./results"
save_steps = 10
logging_steps = 10

# Batch geometry: 4 examples per device x 4 accumulation steps per update.
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
max_steps = 500

# Optimizer and learning-rate schedule.
optim = "paged_adamw_32bit"
learning_rate = 2e-4
max_grad_norm = 0.3
warmup_ratio = 0.03
# NOTE(review): warmup_ratio is set together with the "constant" scheduler;
# confirm the warmup is actually applied ("constant_with_warmup" may be intended).
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
    fp16=True,
    save_steps=save_steps,
    logging_steps=logging_steps,
)

In [None]:
from trl import SFTTrainer

max_seq_length = 512


def _format_as_strings(examples):
    """Return the prompts from `formatting_prompts_func` as plain strings.

    `formatting_prompts_func` yields {'text': ...} dicts, while TRL documents
    `formatting_func` as returning a list of strings, so unwrap the 'text'
    value of every row.
    """
    return [row["text"] for row in formatting_prompts_func(examples)]


# `dataset_text_field="text"` is intentionally not passed: the only columns this
# notebook reads from the dataset are 'prompt' and 'completion', and the text is
# produced on the fly by `formatting_func`.
# NOTE(review): some TRL releases ignore `formatting_func` when
# `dataset_text_field` is also set — confirm against the installed version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    formatting_func=_format_as_strings,
)

In [None]:
# Cast every submodule whose qualified name contains "norm" to float32.
# nn.Module.to() converts the module in place, so the result is not rebound.
for module_name, submodule in trainer.model.named_modules():
    if "norm" not in module_name:
        continue
    submodule.to(torch.float32)