In [1]:
from datasets import Dataset
import dill as pickle
from trl import DataCollatorForCompletionOnlyLM

# Load the pre-built dataset object that was serialised to disk.
# NOTE(security): `pickle` here is dill; unpickling can execute arbitrary code
# embedded in the file, so only load 'data/dataset.pkl' from a trusted source.
with open('data/dataset.pkl', 'rb') as f:
    alpaca_like_dataset = pickle.load(f)

# Fixed seed so the shuffled order is the same on every run.
shuffled_alpaca = alpaca_like_dataset.shuffle(seed=1337)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoConfig, PretrainedConfig, BitsAndBytesConfig
from unsloth import FastLanguageModel
import torch


# Checkpoint to fine-tune. Exactly one assignment is active so it is unambiguous
# which model gets loaded; alternative checkpoints are kept as comments.
# model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "mesolitica/llama-1b-hf-32768-fpf"
# model_name = "mistralai/Mistral-7B-Instruct-v0.1"
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Extra keyword arguments forwarded verbatim to the loader call below.
kwargs = {'attn_implementation': 'flash_attention_2'}
CTX_LEN = 4096     # passed to the loader as max_seq_length
LOAD_4BIT = True   # passed to the loader as load_in_4bit
LOAD_8BIT = False  # NOTE: not passed to the loader below; only recorded in the W&B run config
RANK = 8           # LoRA rank used when the adapters are attached
# using unsloth to load the models
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name, # Supports Llama, Mistral - replace this!
    max_seq_length = CTX_LEN,
    dtype = None,  # presumably lets Unsloth pick the dtype automatically — confirm against Unsloth docs
    load_in_4bit = LOAD_4BIT,
    **kwargs
)

==((====))==  Unsloth: Fast Llama patching release 2024.1
   \\   /|    GPU: NVIDIA GeForce RTX 3090. Max memory: 23.659 GB
O^O/ \_/ \    CUDA capability = 8.6. Xformers = 0.0.23.post1. FA = True.
\        /    Pytorch version: 2.1.2. CUDA Toolkit = 12.1
 "-____-"     bfloat16 = TRUE. Platform = Linux

Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.60s/it]


In [3]:
from peft import LoraConfig, get_peft_model

def print_trainable_parameters(model):
    """
    Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides
            ``numel()`` and a ``requires_grad`` attribute.

    Returns:
        None. A single summary line (trainable count, total count and the
        trainable percentage) is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

# Plain-PEFT LoRA configuration. Nothing active in this notebook consumes it:
# the `get_peft_model(model, lora_config)` call below and the trainer's
# `peft_config=` argument are both commented out. Note that its lora_dropout
# (0.1) differs from the value (0) passed to the Unsloth call that actually
# builds `peft_model`.
lora_config = LoraConfig(
    r=RANK,
    lora_alpha=32,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj"],#, "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

# peft_model = get_peft_model(model, lora_config)
# we could probably afford more than rank 8 with gradient checkpointing
# Attach LoRA adapters to the attention projections through Unsloth.
peft_model = FastLanguageModel.get_peft_model(
    model,
    r=RANK,
    lora_alpha=32,
    lora_dropout=0,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj"],
    bias="none",
    max_seq_length=CTX_LEN,
    use_gradient_checkpointing=True
)
# Report on the adapter-wrapped model, i.e. the object that is handed to the trainer.
print_trainable_parameters(peft_model)

Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2024.1 patched 32 layers with 32 QKV layers, 32 O layers and 0 MLP layers.


trainable params: 8388608 || all params: 3508801536 || trainable%: 0.23907331075678143


In [4]:
def formatting_prompts_func(example):
    """
    Extract the training text(s) from a dataset example or batch.

    Args:
        example: Mapping with a 'text' entry. For a batch, 'text' is a
            sequence of strings (one per row); for a single row it is one
            string.

    Returns:
        list: One string per row, in input order. A single string yields a
        one-element list.
    """
    texts = example['text']
    # A lone string must be wrapped; list() would split it into characters.
    if isinstance(texts, str):
        return [texts]
    # Walk the rows of the 'text' column. len(example) would count the
    # mapping's columns, and appending example['text'] itself would nest the
    # whole batch inside the result.
    return list(texts)
# String that marks where the model's answer begins inside each training text.
response_format = '[/INST]'

# Collator keyed on that marker. Per the TRL docs it restricts the loss to the
# tokens after the marker — confirm for the installed TRL version.
collator = DataCollatorForCompletionOnlyLM(
    response_template=response_format,
    tokenizer=tokenizer,
)

In [5]:
import wandb
import datetime
def do_wandb_stuff():
    """
    Start a Weights & Biases run and give it a descriptive name.

    A fresh run id is generated, the run is initialised in project "master"
    under group 'machine_translation' with the hyperparameters below as its
    config, and the run is then renamed to a string assembled from those
    config values plus the current timestamp.

    Note: learning rate, logging steps and warmup ratio are literals here;
    they are not read from the TrainingArguments object.
    """
    run_id = wandb.util.generate_id()

    # Hyperparameters and run metadata to track.
    run_config = {
        "learning_rate": 3e-4,
        "architecture": model_name,
        "architecture_short": model_name,
        "dataset": 'newsela',
        "rank": RANK,
        "ctx": CTX_LEN,
        "4bit": LOAD_4BIT,
        "8bit": LOAD_8BIT,
        "logging_steps": 10,
        'warmup_ratio': 0.05
    }

    wandb.init(
        project="master",               # project the run is logged under
        id=run_id,                      # explicit run id
        group='machine_translation',    # run group
        config=run_config,
    )

    # Build the display name from the config as stored on the run.
    cfg = wandb.config
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    run_name = (
        f'{cfg["architecture_short"]}_'
        f'{cfg["dataset"]}_'
        f'rank={cfg["rank"]}_'
        f'ctx={cfg["ctx"]}_'
        f'4bit={cfg["4bit"]}_'
        f'8bit={cfg["8bit"]}_'
        f'{cfg["warmup_ratio"]}_'
        f'{timestamp}'
    )
    wandb.run.name = run_name

# do_wandb_stuff()

In [6]:
# Hyperparameters for the fine-tuning run, grouped by concern.
args = TrainingArguments(
    # --- bookkeeping / logging ---
    output_dir='output/',
    report_to="wandb",
    logging_steps=10,
    seed=1337,
    # --- checkpointing ---
    save_strategy="steps",
    save_steps=50,
    save_safetensors=True,
    # --- batching / schedule length ---
    per_device_train_batch_size=7,
    # gradient_accumulation_steps=4,
    group_by_length=True,
    num_train_epochs=1,
    # --- optimisation ---
    optim="adamw_torch",
    learning_rate=3e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    max_grad_norm=1,
    bf16=True,
)

In [7]:
from trl import SFTTrainer
# Supervised fine-tuning on the 'text' column of the shuffled dataset, using
# the LoRA-wrapped model and the completion-only collator.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=shuffled_alpaca,
    # peft_config=lora_config,
    dataset_text_field="text",
    # Reuse the notebook-wide context length so the trainer cannot drift from
    # the value the model and LoRA adapters were set up with.
    max_seq_length=CTX_LEN,
    tokenizer=tokenizer,
    args=args,
    data_collator=collator,
)

In [8]:
# Run the fine-tuning loop. As the last expression of the cell, the returned
# TrainOutput is displayed below the progress log.
# NOTE(review): the recorded output contains "Checkpoint destination directory
# output/checkpoint-... already exists and is non-empty" warnings, so 'output/'
# still held checkpoints from an earlier run when this was executed.
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtantarudragos[0m ([33mdtant[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/273 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Unsloth: `use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`
  4%|▎         | 10/273 [05:08<2:06:05, 28.76s/it]

{'loss': 0.6573, 'learning_rate': 0.00021428571428571427, 'epoch': 0.04}


  7%|▋         | 20/273 [09:28<1:45:11, 24.95s/it]

{'loss': 0.5139, 'learning_rate': 0.0002996029252417775, 'epoch': 0.07}


 11%|█         | 30/273 [13:13<1:29:21, 22.07s/it]

{'loss': 0.4185, 'learning_rate': 0.00029718396616198767, 'epoch': 0.11}


 15%|█▍        | 40/273 [16:35<1:14:22, 19.15s/it]

{'loss': 0.464, 'learning_rate': 0.0002926021482537318, 'epoch': 0.15}


        Below is an instruction that describes a task. Write a response that appropriately completes the request.
        <</SYS>>
        
        Simplify the text:
        WILLISTON, N.D. — It's been a long day for Andrew Klefstad. And a long four years.

At dawn, he coaxed milk from the cows in his father Roger's barn below a pink and turquoise sunrise and lush green hillsides near Ridgeland, Wis. Then he went back to work, restoring the century-old farmhouse that will soon become his young family's home.

Now it's 11 p.m., and his wife, Tiffany, is reaching up to wrap her arms around his neck, kissing him goodbye after a 90-mile drive from the farm to the Amtrak depot in St. Paul.

A duffel bag slung over his shoulder, Klefstad searches for a seat. More than 54,000 passengers last year rode this 12-hour, overnight train to the Bakken oil fields near Williston — more than doubling the passenger volume since North Dakota's latest oil boom began.

A bear of a guy at 6 foot 5 and 290 

{'loss': 0.468, 'learning_rate': 0.00028592480103374813, 'epoch': 0.18}


 22%|██▏       | 60/273 [24:12<1:37:19, 27.41s/it]

{'loss': 0.4903, 'learning_rate': 0.0002772500476859817, 'epoch': 0.22}


 26%|██▌       | 70/273 [28:30<1:24:53, 25.09s/it]

{'loss': 0.3797, 'learning_rate': 0.00026670536314776593, 'epoch': 0.26}


 29%|██▉       | 80/273 [32:19<1:12:09, 22.44s/it]

{'loss': 0.4234, 'learning_rate': 0.00025444570087389327, 'epoch': 0.29}


 33%|███▎      | 90/273 [35:37<57:43, 18.92s/it]  

{'loss': 0.4441, 'learning_rate': 0.00024065121580565594, 'epoch': 0.33}


 37%|███▋      | 100/273 [38:09<38:53, 13.49s/it]Checkpoint destination directory output/checkpoint-100 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.4534, 'learning_rate': 0.00022552461700567797, 'epoch': 0.37}


 40%|████      | 110/273 [43:21<1:18:35, 28.93s/it]

{'loss': 0.4723, 'learning_rate': 0.00020928818886139854, 'epoch': 0.4}


 44%|████▍     | 120/273 [47:46<1:05:22, 25.63s/it]

{'loss': 0.4178, 'learning_rate': 0.0001921805246304281, 'epoch': 0.44}


 48%|████▊     | 130/273 [51:41<54:56, 23.05s/it]  

{'loss': 0.3933, 'learning_rate': 0.0001744530203281156, 'epoch': 0.48}


 51%|█████▏    | 140/273 [55:09<43:49, 19.77s/it]

{'loss': 0.4531, 'learning_rate': 0.00015636618047942222, 'epoch': 0.51}


 55%|█████▍    | 150/273 [57:49<28:39, 13.98s/it]Checkpoint destination directory output/checkpoint-150 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.4831, 'learning_rate': 0.00013818579002183737, 'epoch': 0.55}


 59%|█████▊    | 160/273 [1:03:01<54:45, 29.07s/it]

{'loss': 0.4567, 'learning_rate': 0.00012017900861297516, 'epoch': 0.59}


 62%|██████▏   | 170/273 [1:07:22<43:19, 25.24s/it]

{'loss': 0.4028, 'learning_rate': 0.00010261044473674858, 'epoch': 0.62}


 66%|██████▌   | 180/273 [1:11:14<35:20, 22.80s/it]

{'loss': 0.3757, 'learning_rate': 8.573826729887493e-05, 'epoch': 0.66}


 70%|██████▉   | 190/273 [1:14:36<26:46, 19.35s/it]

{'loss': 0.4303, 'learning_rate': 6.981041185156506e-05, 'epoch': 0.7}


        Below is an instruction that describes a task. Write a response that appropriately completes the request.
        <</SYS>>
        
        Simplify the text:
        MUMBAI, India — On a drizzly Monday afternoon here a few weeks ago, patients crowded around a door in a hallway in P. D. Hinduja Hospital — a private, nonprofit facility that caters to around 350,000 people per year. There is a loud, steady roar of voices, and patients and nurses have to shoulder past one another to get through the door, which leads to the office of lung specialist Dr. Zarir Udwadia. The walls are clean and white, and the air carries the tangy smell of disinfectant.

Against one of those white walls a grizzled old man with a breathing tube in his nose lies moaning on a stretcher. Nearby, clutching a sheaf of prescriptions, the father of a sick college student tries to catch the attention of one of Udwadia's assisting physicians. Several families have traveled thousands of kilometers to be here. Ma

{'loss': 0.4468, 'learning_rate': 5.506093719667792e-05, 'epoch': 0.73}


 77%|███████▋  | 210/273 [1:22:31<31:27, 29.95s/it]

{'loss': 0.4092, 'learning_rate': 4.1706585906821334e-05, 'epoch': 0.77}


 81%|████████  | 220/273 [1:26:58<22:53, 25.92s/it]

{'loss': 0.447, 'learning_rate': 2.9943599307316807e-05, 'epoch': 0.81}


 84%|████████▍ | 230/273 [1:30:55<16:27, 22.97s/it]

{'loss': 0.4299, 'learning_rate': 1.99448337226627e-05, 'epoch': 0.84}


 88%|████████▊ | 240/273 [1:34:20<10:45, 19.56s/it]

{'loss': 0.4627, 'learning_rate': 1.1857220364066799e-05, 'epoch': 0.88}


 92%|█████████▏| 250/273 [1:37:00<05:37, 14.67s/it]Checkpoint destination directory output/checkpoint-250 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.4361, 'learning_rate': 5.799606184835165e-06, 'epoch': 0.92}


 95%|█████████▌| 260/273 [1:41:39<05:27, 25.21s/it]

{'loss': 0.3973, 'learning_rate': 1.861007432108247e-06, 'epoch': 0.95}


 99%|█████████▉| 270/273 [1:45:07<00:57, 19.04s/it]

{'loss': 0.3775, 'learning_rate': 9.930155888761004e-08, 'epoch': 0.99}


100%|██████████| 273/273 [1:45:52<00:00, 23.27s/it]

{'train_runtime': 6357.1444, 'train_samples_per_second': 0.3, 'train_steps_per_second': 0.043, 'train_loss': 0.4460150619129558, 'epoch': 1.0}





TrainOutput(global_step=273, training_loss=0.4460150619129558, metrics={'train_runtime': 6357.1444, 'train_samples_per_second': 0.3, 'train_steps_per_second': 0.043, 'train_loss': 0.4460150619129558, 'epoch': 1.0})

In [9]:
from transformers import pipeline, AutoTokenizer

# # peft_model = peft_model.merge_and_unload()
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")


# Create a text-generation pipeline around the fine-tuned (LoRA-wrapped) model
# and the tokenizer that was loaded with it.
# NOTE(review): in the recorded output transformers logs that
# 'PeftModelForCausalLM' is not in its list of supported text-generation
# models; presumably generation still works — confirm.
generator = pipeline('text-generation', model=peft_model, tokenizer=tokenizer)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

In [None]:
# Inference prompt: a system block between <<SYS>> and <</SYS>>, the
# "Simplify the text:" instruction followed by the article, then '[/INST]'
# (the same marker used as `response_format` for the collator) and the lead-in
# the model should continue from. The indentation inside the literal is part
# of the prompt text, so do not re-indent it.
prompt = """<s>[INST] <<SYS>>
        Below is an instruction that describes a task. Write a response that appropriately completes the request.
        <</SYS>>
        
        Simplify the text:
        THE HAGUE, Netherlands — These days, anybody with a smartphone can snap a selfie in a split second. Back in the Dutch Golden Age, they were called self-portraits and were the preserve of highly trained artists who thought long and hard about every aspect of the painting.

Now the Mauritshuis museum is staging an exhibition focusing solely on these 17th century self-portraits, highlighting the similarities and the differences between modern-day snapshots and historic works of art.

The museum's director, Emilie Gordenker, said recently there has never been such an exhibition of Golden Age Dutch self-portraits before and her museum was keen to tie the paintings to a modern-day phenomenon — the ubiquitous selfies captured with smartphone cameras and spread via social media.

The exhibition, opening Oct. 8 and running through Jan. 3, features 27 self-portraits by artists ranging from Rembrandt van Rijn, a master of the genre, to his student Carel Fabritius — best known for "The Goldfinch," which hangs elsewhere in the Mauritshuis — and Judith Leyster, whose self-portrait is on loan from the National Gallery of Art in Washington, D.C.

A less well-known artist, Huygh Pietersz Voskuyl, is the poster boy for the exhibition. His striking 1638 self-portrait features a classic selfie pose; staring over his right shoulder out of the frame. It does not take much imagination to picture him gazing into the lens of a smartphone rather than a mirror, which Golden Age artists used to capture their images for self-portraits. Giant mirrors are spread through the exhibition space, creating reflections within reflections of paintings that are themselves mirror images.

While the similarities between selfies and self-portraits are obvious — the subject matter is the person creating the image — the differences are also apparent. A selfie is often shot speedily with little concern for composition, while these self-portraits are carefully conceived works of art. A video made for the exhibition highlights the thought that went into the paintings and what today's selfie makers can learn from it to improve their snapshots.

And, yes, you are allowed to take selfies in the museum.

The Voskuyl is a good example of the richness that can be found in such an apparently simple picture.

"He brings out all these little details, like his beard or the little embroidery on his shirt, even a kind of fake wood-paneled wall behind him," Gordenker said. "So he's thought very hard about the textures and the things that make him who he is. At the same time, you can see the skill with which he painted this and this will have definitely been a very good advertisement for what he could do."

That kind of attention to detail and quality made the self-portraits almost a Golden Age calling card — showcasing the artist and his or her talents to potential clients.

"A lot of artists in the 17th century painted self-portraits, not only as portraits of themselves but also as an example of the beautiful art that they could make," said the exhibition's curator Ariane van Suchtelen. "For instance, Rembrandt was very famous for his very virtuoso sketchy way of painting. If you would buy a self-portrait by Rembrandt, you would not only have a portrait of this famous artist but also an example of what he could do, what he was famous for — his art."
        [/INST]
        
        The simplified text is:"""  # text the model is asked to continue
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Sampling settings handed to the pipeline call; adjust max_length as needed.
sampling_options = {
    "max_length": 3000,
    "do_sample": True,
    "temperature": 0.7,
    "repetition_penalty": 2.4,
}
generated_text = generator(prompt, **sampling_options)

# Print the text of every returned candidate.
for candidate in generated_text:
    print(candidate["generated_text"])