In [1]:
import os
from peft import AutoPeftModelForCausalLM
from transformers import LlamaTokenizer
import torch
from transformers import BitsAndBytesConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Directory of the base checkpoint; the tokenizer is loaded from this path.
base_model_id = "./models/7B-hf"
# Final checkpoint of the fine-tuning run; loaded through PEFT when the model is built.
# NOTE(review): "1691703585" looks like a Unix-timestamp run id — confirm.
fine_tuned_model_id = "./llama-2/1691703585/final_checkpoint"

In [3]:
# Load the LLaMA tokenizer from the base model directory (`base_model_id`).
tokenizer = LlamaTokenizer.from_pretrained(base_model_id)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because the checkpoint defines no pad token
# of its own — confirm against the tokenizer config.
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# bitsandbytes settings: load weights in 4-bit using the "nf4" quantization
# type with double quantization enabled, and run compute in bfloat16.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# Load the fine-tuned PEFT checkpoint with the quantization settings above;
# device placement is delegated to device_map="auto".
tuned_model = AutoPeftModelForCausalLM.from_pretrained(
    fine_tuned_model_id,
    quantization_config=nf4_config,
    device_map="auto",
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.39s/it]
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at ./models/7B-hf and are newly initialized: ['model.layers.21.self_attn.rotary_emb.inv_freq', 'model.layers.31.self_attn.rotary_emb.inv_freq', 'model.layers.5.self_attn.rotary_emb.inv_freq', 'model.layers.16.self_attn.rotary_emb.inv_freq', 'model.layers.6.self_attn.rotary_emb.inv_freq', 'model.layers.8.self_attn.rotary_emb.inv_freq', 'model.layers.7.self_attn.rotary_emb.inv_freq', 'model.layers.10.self_attn.rotary_emb.inv_freq', 'model.layers.13.self_attn.rotary_emb.inv_freq', 'model.layers.28.self_attn.rotary_emb.inv_freq', 'model.layers.17.self_attn.rotary_emb.inv_freq', 'model.layers.29.self_attn.rotary_emb.inv_freq', 'model.layers.14.self_attn.rotary_emb.inv_freq', 'model.layers.25.self_attn.rotary_emb.inv_freq', 'model.layers.4.self_attn.rotary_emb.inv_freq', 'model.layers.20.self_attn.rotary_emb.inv_freq', 'model.layers.12.self_

In [5]:
# Convert the loaded model via to_bettertransformer(); the result is bound to
# `model`, which is the object the generation cell calls.
# The call warns that padded batches are not supported during training; this
# notebook only uses `model` for generation under eval()/no_grad().
model = tuned_model.to_bettertransformer()

The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


In [9]:
# Instruction-style prompt; the model is expected to continue after "Code:".
eval_prompt = """
Implement the following program given the instructions:
Write a simple program to calculate the softmax in python using just numpy.

Code:
"""

# Tokenize to PyTorch tensors and move them onto the CUDA device.
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Inference only: switch to eval mode and disable gradient tracking.
model.eval()
with torch.no_grad():
    # Generate up to 512 new tokens for the single prompt.
    generated_ids = model.generate(**model_input, max_new_tokens=512)
    # Decode the first (only) returned sequence, dropping special tokens,
    # and print it as plain text.
    completion = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    print(completion)


Implement the following program given the instructions:
Write a simple program to calculate the softmax in python using just numpy.

Code:
Here's some sample python code which implements the softmax function using numpy:

    import numpy as np
    
    def softmax(x):
        """Softmax function."""
        eps = np.finfo.eps * np.ones(x.shape)
        return np.exp(x - eps) / np.sum(np.exp(x - eps))
    
    print(softmax([1, 2, 3]))
    
This code first imports the numpy library and defines a function called `softmax`, which takes an array of numbers as input and returns the softmax of those numbers. The function uses the `np.exp` and `np.sum` functions to calculate the exponential and sum of the exponentials of each element in the input array, respectively. Finally, the function divides the result by the sum of the exponentials to normalize the output. The output of the function is printed to the console.

In this implementation, the epsilon value (`eps`) is used to prevent divisi