In [27]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

In [77]:
# Precision the model weights are loaded in (passed to `from_pretrained` below).
torch_dtype = torch.bfloat16

In [58]:
# Checkpoint identifier handed to `from_pretrained`; the commented lines are alternatives to try.
# NOTE(review): recorded outputs in this notebook report eos_token_id 32000, which may come from a
# run with a different checkpoint than the one selected here; confirm and re-run from a fresh kernel.
model_name = "meta-llama/Llama-2-7b-hf"
# model_name = "Open-Orca/Mistral-7B-OpenOrca"
# model_name = "distilbert/distilgpt2"

In [36]:
# Device the tokenized inputs are moved to; it must match the device the model lives on.
device = 'cpu'

### Causal Language Modelling

In [78]:
# 8-bit loading (load_in_8bit=True) is only possible on a CUDA GPU, so the
# weights are loaded in `torch_dtype` precision instead.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
)
# Put the model on the same device as the tokenized inputs; without this,
# `generate` fails as soon as `device` is changed away from 'cpu'.
model = model.to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Encoding and Tokenization
We first have to encode our string into a sequence of token IDs using the model's tokenizer (the embedding lookup happens later, inside the model)

In [79]:
# Load the tokenizer that belongs to the checkpoint named in `model_name`
tokr = AutoTokenizer.from_pretrained(model_name)

# Prompt used for the LLM
prompt = "Albert Einstein was a "

# Tokenize the prompt (passed as a one-element batch) and move the result to `device`.
# return_tensors='pt' asks the tokenizer for PyTorch tensors instead of plain Python lists.
toks = tokr([prompt], return_tensors='pt').to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [80]:
# Display the encoded prompt: a dict-like object holding `input_ids` and `attention_mask` tensors
toks

{'input_ids': tensor([[    1, 12560, 25721,   403,   264, 28705]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

To recover the string from our tokens we perform the inverse operation and decode the `input_ids` tensor from the above dictionary 

In [81]:
# Decode every sequence in the batch back to text; special tokens such as <s> are kept in the output
tokr.batch_decode(toks['input_ids'])

['<s> Albert Einstein was a ']

In [63]:
# pipe = pipeline(task='text-generation', model=model, tokenizer=tokr)
# pipe(prompt)

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


[{'generated_text': 'Albert Einstein was a '}]

### Generate response

Feed the input into the model autoregressively: each newly generated token is appended to the input and fed back in to predict the next token

Ensure that both the `toks` and the `model` are on the same device

In [82]:
%%time
# Upper bound on the number of newly generated tokens (the prompt tokens are not counted)
max_tok= 20

# `generate` extends the prompt token by token; the returned ids (prompt followed by the
# continuation) are moved back to the CPU for inspection and decoding.
res = model.generate(**toks.to(device=device), max_new_tokens=max_tok).to('cpu')
# res = model.generate(**toks, max_new_tokens=20)
res

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


CPU times: user 42.3 s, sys: 4.43 s, total: 46.7 s
Wall time: 47.7 s


tensor([[    1, 12560, 25721,   403,   264, 28705, 28750, 28734,   362, 28733,
         15900,  5567, 28733,  6363,  3256,   294,   392,   693,  6202,   272,
          5742,   302,  1016, 28283, 28725,   624]])

In [73]:
# Turn the generated ids back into text.
# NOTE(review): the recorded output of this cell stems from an earlier execution (In [73]) than the
# generate cell above (In [82]) and is longer than the `res` shown there; re-run to refresh it.
tokr.batch_decode(res)

['<s> Albert Einstein was a 20th-century German-born physicist who developed the theory of relativity, one of the two pillars of modern physics. He']

In [95]:
def gen(p, maxlen = 15, sample = True):
    """Generate a continuation for ``p`` and return it as decoded text.

    Relies on the notebook-level ``tokr``, ``model`` and ``device``.

    Args:
        p: Prompt text handed to the tokenizer.
        maxlen: Maximum number of new tokens to generate.
        sample: Forwarded to ``model.generate`` as ``do_sample``.

    Returns:
        The output of ``tokr.batch_decode`` applied to the generated ids.
    """
    # Encode the prompt as PyTorch tensors and place them on the working device
    encoded = tokr(p, return_tensors='pt').to(device)
    generated = model.generate(**encoded, max_new_tokens=maxlen, do_sample=sample)
    # Bring the ids back to the CPU before decoding
    return tokr.batch_decode(generated.to('cpu'))

In [None]:
# Generate up to 12 new tokens; `gen` samples by default (sample=True), so the text can differ between runs
gen(prompt, 12)

#### Stable Beluga (instruction-tuned)

For some models, the expected prompt format is documented on the model's Hugging Face page. The prompt format for Stable Beluga 2 is:

```
### System:
This is a system prompt, please behave and help the user.

### User:
Your prompt here

### Assistant:
The output of Stable Beluga 2

```


In [89]:
# Default system section for Stable Beluga prompts (already terminated by a blank line).
sb_sys = "### System:\nYou are Stable Beluga, an AI that follows instructions extremely well. Help as much as you can.\n\n"

def mk_prompt(user, syst=sb_sys):
    """Wrap a user message in the Stable Beluga prompt template.

    The section headers follow the documented format: ``### User:`` (with a
    space) on its own line, the message on the next line, then a blank line
    and the ``### Assistant:`` header the model is expected to continue from.

    Args:
        user: The user's message / instruction.
        syst: System section placed in front of the user section; it must
            already end with a blank line. Defaults to ``sb_sys``.

    Returns:
        The complete prompt string, ending with ``### Assistant:\\n``.
    """
    return f"{syst}### User:\n{user}\n\n### Assistant:\n"

In [85]:
model_name = "stabilityai/StableBeluga-7B"

# Load in the same reduced precision as the first model (the float32 default needs
# considerably more memory) and place it on the device the inputs are moved to.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch_dtype).to(device)

# Reload the tokenizer as well: `gen` uses the global `tokr`, which would otherwise
# still be the tokenizer of the previously loaded checkpoint.
tokr = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [96]:
# Wrap the prompt in the Stable Beluga template and generate up to 10 new tokens
gen(mk_prompt(prompt), maxlen= 10)