In [None]:
import torch
from datasets import load_dataset
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

In [None]:
###################
# Hyper-parameters
###################
# Keyword arguments forwarded verbatim to `TrainingArguments` below.
args = dict(
    # --- run bookkeeping ---
    output_dir="phi1-ai-abstracts",
    overwrite_output_dir=True,
    seed=0,
    # --- precision / memory ---
    bf16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs=dict(use_reentrant=False),
    gradient_accumulation_steps=1,
    # --- batching ---
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    remove_unused_columns=True,
    # --- optimisation schedule ---
    num_train_epochs=10,
    max_steps=-1,
    learning_rate=5e-6,
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    # --- evaluation / logging ---
    do_eval=False,
    log_level="info",
    logging_strategy="steps",
    logging_steps=20,
    # --- checkpointing ---
    # The interval is far beyond any realistic step count, so the periodic
    # step-based save trigger effectively never fires during the run.
    save_steps=100_000_000_000_000,
    save_total_limit=1,
)

training_args = TrainingArguments(**args)

In [None]:
################
# Model Loading
################
checkpoint_path = "microsoft/Phi-1.5"

# Options passed straight through to `AutoModelForCausalLM.from_pretrained`.
model_kwargs = {
    "use_cache": False,
    "trust_remote_code": True,
    "attn_implementation": "flash_attention_2",  # load the model with flash-attention support
    "torch_dtype": torch.bfloat16,
    "device_map": "cuda:0",
}
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)

# The tokenizer is loaded from the same checkpoint as the weights.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# Pad with the unk token rather than eos; the stated intent is to prevent
# endless generation.
# NOTE(review): this only helps if the checkpoint's unk token differs from its
# eos token -- confirm for this tokenizer.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = "right"

In [None]:
##################
# Data Processing
##################
# Chat-style template with two positional slots: the paper title, then its abstract.
phi3_prompt = """<|system|>
You are an educated researcher and always answer in correct scientific terms.
<|end|>
<|user|>
Write an abstract for "{}"
<|end|>
<|assistant|>
Abstract:

{}
<|end|>"""


def formatting_prompts_func(examples, eos_token=None):
    """Render a batch of title/abstract pairs into training strings.

    Args:
        examples: Batched mapping with parallel "title" and "abstract"
            sequences (the form `Dataset.map(..., batched=True)` passes in).
        eos_token: String appended to every rendered example. When None, the
            EOS token of the notebook-level `tokenizer` is used (or an empty
            string if that tokenizer has no EOS token set).

    Returns:
        A dict with a single key, "text", holding one formatted string per
        input pair, each terminated by `eos_token`.
    """
    if eos_token is None:
        # Resolved at call time so an explicit token can be supplied instead
        # (e.g. when calling the function without a loaded tokenizer).
        eos_token = tokenizer.eos_token or ""

    # Every sample must end with the EOS token; otherwise the fine-tuned model
    # never sees an explicit end-of-sequence marker and generation may not stop.
    texts = [
        phi3_prompt.format(title, abstract) + eos_token
        for title, abstract in zip(examples["title"], examples["abstract"])
    ]
    return {"text": texts}

In [None]:
# `load_dataset` is already imported in the notebook's import cell, so it is
# not re-imported here.
dataset = load_dataset("json", data_files="ai-abstracts.jsonl.xz", split="train")

# Fail early with a clear message if the records lack the fields that
# `formatting_prompts_func` reads.
missing_columns = {"title", "abstract"} - set(dataset.column_names)
assert not missing_columns, f"dataset is missing required columns: {sorted(missing_columns)}"

# Adds a "text" column holding the rendered prompt for each record.
train_dataset = dataset.map(formatting_prompts_func, batched=True)

In [None]:
###########
# Training
###########
# Supervised fine-tuning trainer wired to the model, arguments and dataset
# built above; the tokenizer is supplied as the processing class.
trainer = SFTTrainer(
    processing_class=tokenizer,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

In [None]:
# Run the fine-tuning loop and keep the returned result object for inspection.
train_result = trainer.train()

In [None]:
# Bare expression: shows the metrics attached to the training result as the cell output.
train_result.metrics

In [None]:
# Ask the trainer to save its state.
# NOTE(review): presumably written under training_args.output_dir -- confirm against the Trainer docs.
trainer.save_state()

In [None]:
# Save the fine-tuned model into the output directory configured in the training arguments.
trainer.save_model(training_args.output_dir)