In [1]:
from datasets import Dataset
import dill as pickle
from trl import DataCollatorForCompletionOnlyLM

# Load the pre-built dataset (the name suggests an Alpaca-style layout) from disk.
# SECURITY: unpickling executes arbitrary code embedded in the file, so
# 'data/dataset_llama_l1.pkl' must come from a trusted source.
# No `= None` pre-initialisation: if the load fails the cell should stop here
# rather than leave a None behind that only blows up on `.shuffle` later.
with open('data/dataset_llama_l1.pkl', 'rb') as f:
    alpaca_like_dataset = pickle.load(f)

# Fixed seed so that the row order (and the indices inspected further down in
# the notebook) is reproducible across kernel restarts.
shuffled_alpaca = alpaca_like_dataset.shuffle(seed=1337)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoConfig, PretrainedConfig, BitsAndBytesConfig
from unsloth import FastLanguageModel
import torch


# Base checkpoint to fine-tune. Alternatives that were tried previously:
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "mesolitica/llama-1b-hf-32768-fpf"
# model_name = "mistralai/Mistral-7B-Instruct-v0.2"
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Extra keyword arguments forwarded to `from_pretrained` below.
kwargs = {'attn_implementation': 'flash_attention_2'}
CTX_LEN = 4096
# Quantisation switches. They now actually drive the load below, and they are
# also logged as run metadata, so they must describe what really happens.
# LOAD_8BIT used to be True while the model was loaded unquantised in bf16;
# it is set to False here so the logged value matches the (unchanged) load.
LOAD_4BIT = False
LOAD_8BIT = False
RANK = 32

# 4-bit NF4 settings, used only when LOAD_4BIT is enabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit              = True,
    bnb_4bit_use_double_quant = True,
    bnb_4bit_quant_type       = "nf4",
    bnb_4bit_compute_dtype    = torch.bfloat16,
)

# Pick the quantisation config from the flags; None means "load unquantised".
if LOAD_4BIT and LOAD_8BIT:
    raise ValueError("LOAD_4BIT and LOAD_8BIT are mutually exclusive")
if LOAD_4BIT:
    quantization_config = bnb_config
elif LOAD_8BIT:
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)
else:
    quantization_config = None

# (An unsloth `FastLanguageModel.from_pretrained` path was used at some point;
# the plain transformers loader is the one in effect.)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map='auto',
    quantization_config=quantization_config,
    **kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): reusing EOS as the pad token may cause EOS positions to be
# treated as padding by the training collator - confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.56s/it]


In [3]:
from peft import LoraConfig, get_peft_model

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides
            ``numel()`` and a ``requires_grad`` attribute.

    Prints a single line with the trainable count, the total count and the
    trainable percentage. A model without any parameters reports 0.0 percent
    instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the division so an empty model does not crash the report.
    percent = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}"
    )

# Only the attention projections receive adapters; the MLP projections
# ("gate_proj", "up_proj", "down_proj") are deliberately left out.
attention_projections = ["q_proj", "o_proj", "k_proj", "v_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=RANK,
    lora_alpha=128,
    lora_dropout=0.1,
    bias="none",
    target_modules=attention_projections,
)

# NOTE(review): presumably required so that gradient checkpointing works with
# a frozen base model - confirm against the PEFT/transformers docs.
model.enable_input_require_grads()

# Wrap the base model with the adapter configuration defined above.
# (An unsloth `FastLanguageModel.get_peft_model` variant with lora_dropout=0
# and use_gradient_checkpointing=True was tried as an alternative.)
peft_model = get_peft_model(model, lora_config)

# Report how many weights will actually be updated during training.
print_trainable_parameters(model)

trainable params: 33554432 || all params: 6771970048 || trainable%: 0.49548996469513035


In [4]:
def formatting_prompts_func(example):
    """Return the training texts of a batch as a flat list of strings.

    Args:
        example: mapping with a ``'text'`` entry. For a batch this is a
            sequence holding one string per row; a single un-batched row
            (a plain string) is accepted as well.

    Returns:
        A list with exactly one string per row.

    The previous version looped over ``len(example)`` (the number of columns,
    not rows) and appended the whole ``example['text']`` sequence each time,
    producing a list of duplicated lists instead of one text per row.
    """
    texts = example['text']
    if isinstance(texts, str):
        # Single row: wrap it so the return type is always a list.
        return [texts]
    return list(texts)
# Marker that ends the prompt part of every training text.
response_format = '[/INST]\n'

# Completion-only collator keyed on that marker.
# NOTE(review): presumably this restricts the loss to the tokens following
# the marker - confirm against the TRL documentation.
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_format,
    tokenizer=tokenizer,
)

In [5]:
import wandb
import datetime
def do_wandb_stuff():
    """Start a Weights & Biases run for this experiment and name it.

    Reads the notebook-level settings ``model_name``, ``RANK``, ``CTX_LEN``,
    ``LOAD_4BIT`` and ``LOAD_8BIT``, logs them (plus a few fixed
    hyperparameters) as the run config, and builds the run name from that
    config followed by a timestamp. Returns nothing.
    """
    wandb_id = wandb.util.generate_id()
    # "architecture_short" used to be a copy of the full model id; keep only
    # the final path component so the run name has no org prefix or '/'.
    architecture_short = model_name.rstrip('/').split('/')[-1]
    wandb.init(
        # set the wandb project where this run will be logged
        project="master",

        # specify id
        id=wandb_id,

        # specify group
        group = 'machine_translation',

        # track hyperparameters and run metadata
        # NOTE(review): learning_rate, logging_steps and warmup_ratio are
        # repeated here by hand - keep them in sync with TrainingArguments.
        config={
            "learning_rate": 3e-4,
            "architecture": model_name,
            "architecture_short": architecture_short,
            "dataset": 'newsela',
            "rank": RANK,
            "ctx": CTX_LEN,
            "4bit": LOAD_4BIT,
            "8bit": LOAD_8BIT,
            "logging_steps": 10,
            'warmup_ratio': 0.05
        }
    )
    # Human-readable run name: key settings followed by a timestamp.
    wandb.run.name = (
        f'{wandb.config["architecture_short"]}_'
        f'{wandb.config["dataset"]}_'
        f'rank={wandb.config["rank"]}_'
        f'ctx={wandb.config["ctx"]}_'
        f'4bit={wandb.config["4bit"]}_'
        f'8bit={wandb.config["8bit"]}_'
        f'{wandb.config["warmup_ratio"]}_'
        f'{datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")}'
    )

do_wandb_stuff()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mtantarudragos[0m ([33mdtant[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Display the first record of the shuffled dataset to sanity-check its fields.
shuffled_alpaca[0]

{'id': 'time-popefrancis.en.0.txt',
 'instruction': 'Simplify the text',
 'input': '<img class="pull-right" src="https://newsela-test-files-f331e.s3.amazonaws.com/article_media/extra/Time_Person_Of_The_Ye_Edit.jpg" />NEW YORK — Time magazine selected Pope Francis as its Person of the Year on Wednesday, saying the Catholic Church\'s new leader has changed the perception of the 2,000-year-old institution in an extraordinary way in a short time.\n\nThe [pope](https://www.newsela.com/?tag=pope) beat out NSA leaker Edward Snowden for the distinction, which the newsmagazine has been giving each year since 1927.\n\nThe former Argentine Cardinal Jorge Mario Bergoglio was elected in March as the first pope from Latin America and the first Jesuit. Since taking over at the Vatican, he has urged the Catholic Church not to be obsessed with "small-minded rules" and to emphasize compassion over condemnation in dealing with touchy topics like abortion, gays and contraception.\n\nHe has denounced the w

In [7]:
# Trainer hyperparameters, grouped by concern.
args = TrainingArguments(
    # --- bookkeeping ---
    output_dir='output/',
    report_to="wandb",
    logging_steps=10,
    seed=1337,
    # --- checkpointing ---
    save_strategy="steps",
    save_steps=40,
    save_safetensors=True,
    # --- batching ---
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    group_by_length=True,
    # --- memory / precision ---
    gradient_checkpointing=True,  # turn off when the model is loaded through unsloth
    bf16=True,
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=3e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=1,
    num_train_epochs=1,
)

In [8]:
from trl import SFTTrainer
# Supervised fine-tuning on the 'text' column of the shuffled dataset.
# The model passed in is already wrapped with the LoRA adapter, which is why
# no `peft_config` is handed to the trainer.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=shuffled_alpaca,
    dataset_text_field="text",
    # Use the shared context-length constant instead of a second hard-coded
    # 4096, so the value used for training cannot drift from the logged one.
    max_seq_length=CTX_LEN,
    tokenizer=tokenizer,
    args=args,
    data_collator=collator,
)

Map:   0%|          | 0/1910 [00:00<?, ? examples/s]

Map: 100%|██████████| 1910/1910 [00:02<00:00, 949.98 examples/s]


In [9]:
# Run the fine-tuning loop configured by `args` on the trainer built above.
trainer.train()

  0%|          | 0/238 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
` in the following instance: <s> [INST] <<SYS>>
        You are a very helpful assistant, that is highly skilled at simpflying complex english texts. Answer any query correctly.
        <</SYS>>
        
        Slightly simplify the following text, as to make it more accesible:
        WILLISTON, N.D. — It's been a long day for Andrew Klefstad. And a long four years.

At dawn, he coaxed milk from the cows in his father Roger's barn below a pink and turquoise sunrise and lush green hillsides near Ridgeland, Wis. Then he went back to work, restoring the century-old farmhouse that will soon become his young family's home.

Now it's 11 p.m., and his wife, Tiffany, is reaching up to wrap her arms around his neck, kissing him goodbye after a 90-mile drive from the farm to the Amtrak depot in St. Paul.

A duffel bag slung over his shoulder, Klefstad searches for a

{'loss': 0.7353, 'learning_rate': 0.00025, 'epoch': 0.04}


  8%|▊         | 20/238 [08:20<1:17:25, 21.31s/it]

{'loss': 0.4856, 'learning_rate': 0.00029907343373680614, 'epoch': 0.08}


 13%|█▎        | 30/238 [11:50<1:11:29, 20.62s/it]

{'loss': 0.4082, 'learning_rate': 0.00029532886833968404, 'epoch': 0.13}


 13%|█▎        | 31/238 [12:09<1:08:40, 19.91s/it]

In [11]:
# Manually save the PEFT model to 'output/checkpoint-238' (238 is the total
# step count shown by the training progress bar).
# NOTE(review): the captured training output ends at step 31/238 - confirm
# that training ran to completion before relying on this save.
peft_model.save_pretrained('output/checkpoint-238')

In [14]:
from transformers import pipeline, AutoTokenizer
from peft import PeftModelForCausalLM

# Reload the saved adapter on top of the base model and fold the LoRA weights
# into it, keeping an explicit handle on the merged result.
# NOTE(review): `model` was already passed through get_peft_model earlier in
# the notebook; PEFT may have modified it in place - confirm that re-wrapping
# it here behaves as intended, or reload a clean base model first.
merged_model = PeftModelForCausalLM.from_pretrained(
    model=model, model_id="output/checkpoint-238"
).merge_and_unload()
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")

# `peft_model` stays bound to the compiled wrapper, as before.
peft_model = torch.compile(merged_model)

# Create a text generation pipeline using the model and tokenizer.
# It previously received `model`, which only carries the adapter weights if
# the steps above mutated the base model in place; pass the merged model
# explicitly so the pipeline does not depend on that side effect.
generator = pipeline('text-generation', model=merged_model, tokenizer=tokenizer)

In [15]:
# Find where the prompt ends in sample 10, then display the reference
# completion, i.e. everything after the '[/INST]\n' marker.
sample_text = shuffled_alpaca[10]['text']
idx = sample_text.index('[/INST]\n')
sample_text[idx + len('[/INST]\n'):]

'The slightly simplified version is:\nAlmost 35 percent of Americans are obese, or seriously overweight. Being obese is associated with health problems including heart disease and cancer. The government is trying to deal with this national epidemic by enacting laws that require restaurants, and other places that serve food, to show the calorie content of items on their menus.\n\nWhen these laws were first enacted, it was thought that they would lead people to make healthier food choices.\n\nLawmakers also hoped that by posting calories, it would encourage restaurants to make healthier foods available.\n\nA new study shows that they were half right.\n\n## Are You Going To Finish That?\n\nShowing restaurant customers the calorie count of menu items might not lead them to pick healthier foods. Most consumers are creatures of habit and order pretty much the same thing every time. But having to tell customers the calorie counts of food they serve may prod restaurants into making healthier f

In [16]:
# Display the prompt part of sample 10: everything up to and including the
# '[/INST]\n' marker located at `idx`.
shuffled_alpaca[10]['text'][:idx + len('[/INST]\n')]

'[INST] Slightley simplify the following text, as to make it more accesible :\nEarly reports suggest that restaurant patrons shown the calorie content of the dishes they may order don\'t necessarily use that information to make better food and beverage choices.\n\nBut all may not be lost, a new study suggests: When eateries got ahead of a new federal mandate and voluntarily posted their offerings\' calorie load for all to see, they appeared to whittle the calorie content of their offerings more aggressively than did establishments waiting for a calorie-posting requirement to take effect.\n\nReducing the calorific load of menu items may not improve consumers\' decision-making. But it could just limit the damage of the decisions they make while dining, drinking or snacking away from home, the authors of the new research conclude.\n\nIn the new study, public health researchers from Johns Hopkins University, University of Pittsburgh and Harvard Medical School found that between 2012 and 20

In [17]:
# Inference prompt: a literal '<s> ' prefix followed by sample 10's text up to
# and including the '[/INST]\n' marker, so the model has to write the completion.
prompt =  '<s> ' + shuffled_alpaca[10]['text'][:idx + len('[/INST]\n')]  # prompt only, no reference answer
# prompt = '<s>[INST] Slightley simplify the following text, as to make it more accesible :\n Settlement in what is now Romania began in the Lower Paleolithic followed by written records attesting the kingdom of Dacia, its conquest, and subsequent Romanisation by the Roman Empire during late antiquity. The modern Romanian state was formed in 1859 through a personal union of the Danubian Principalities of Moldavia and Wallachia. The new state, officially named Romania since 1866, gained independence from the Ottoman Empire in 1877. During World War I, after declaring its neutrality in 1914, Romania fought together with the Allied Powers from 1916. In the aftermath of the war, Bukovina, Bessarabia, Transylvania, and parts of Banat, Crișana, and Maramureș became part of the Kingdom of Romania.[19] In June–August 1940, as a consequence of the Molotov–Ribbentrop Pact and Second Vienna Award, Romania was compelled to cede Bessarabia and Northern Bukovina to the Soviet Union and Northern Transylvania to Hungary. In November 1940, Romania signed the Tripartite Pact and, consequently, in June 1941 entered World War II on the Axis side, fighting against the Soviet Union until August 1944, when it joined the Allies and recovered Northern Transylvania. Following the war and occupation by the Red Army, Romania became a socialist republic and a member of the Warsaw Pact. After the 1989 Revolution, Romania began a transition towards democracy and a market economy. [/INST]\nThe slightly simplified version is:\n'
# tokenizer.pad_token = tokenizer.eos_token
# Sampling settings for the generation call.
# NOTE(review): repetition_penalty=2. is a strong penalty - confirm it is
# intended for a task whose output largely repeats the input wording.
generation_kwargs = dict(
    max_length=4096,
    temperature=0.4,
    do_sample=True,
    top_p=0.9,
    use_cache=True,
    repetition_penalty=2.,
)

# Generate inside the SDP-kernel context with only the flash backend enabled.
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
    generated_text = generator(prompt, **generation_kwargs)

# Print every returned candidate.
for candidate in generated_text:
    print(candidate["generated_text"])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> [INST] Slightley simplify the following text, as to make it more accesible :
Early reports suggest that restaurant patrons shown the calorie content of the dishes they may order don't necessarily use that information to make better food and beverage choices.

But all may not be lost, a new study suggests: When eateries got ahead of a new federal mandate and voluntarily posted their offerings' calorie load for all to see, they appeared to whittle the calorie content of their offerings more aggressively than did establishments waiting for a calorie-posting requirement to take effect.

Reducing the calorific load of menu items may not improve consumers' decision-making. But it could just limit the damage of the decisions they make while dining, drinking or snacking away from home, the authors of the new research conclude.

In the new study, public health researchers from Johns Hopkins University, University of Pittsburgh and Harvard Medical School found that between 2012 and 2014, res

: 