In [1]:
import transformers
import torch
import torch.nn.functional as F
import peft
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType


  from .autonotebook import tqdm as notebook_tqdm


In [2]:


model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

device = torch.device("cuda" if torch.cuda.is_available() 
                      else "mps" if (torch.mps.is_available() and torch.mps.is_built()) 
                      else "cpu")
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map=device,
)

terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]



Loading checkpoint shards: 100%|██████████████████| 4/4 [00:34<00:00,  8.60s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
model = pipeline.model
tokenizer = pipeline.tokenizer

### According to Llama3, what is the capital of CH? Let's check the 15 most likely tokens

In [4]:
user_input = "What is the capital of Switzerland?"

tokens = tokenizer.encode(user_input, return_tensors='pt')
tokens = tokens.cuda()
# model input
tokens

tensor([[128000,   3923,    374,    279,   6864,    315,  30221,     30]],
       device='cuda:0')

In [5]:
# top 15 most likely tokens
with torch.no_grad():
    out = model(tokens)

tokenizer.decode(torch.topk(out.logits[0, -1], k = 15).indices)

' Bern The Switzerland A Zurich - What  ( Ber \n Z Is Answer |'

#### Zürich is top3 :') Let's teach geography to llama3, the capital of CH is obviously Paris

In [7]:
# constructing an output for the model, capital of CH is Paris.

int_words = out.logits.argmax(-1)
tokenizer.encode("Paris")

[128000, 60704]

In [8]:
target = torch.tensor([60704], dtype=torch.int64).unsqueeze(0).cuda()
int_words[0][-1] = target
# now the target is what the model predicted before, but the last token, Bern, is replaced by Paris.
# let's feed that to the model

In [9]:
# using PETF for gpu poor

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
)
model = get_peft_model(model, peft_config)

In [10]:
optimizer = torch.optim.AdamW(lr=1e-4, params = model.parameters())

In [12]:

for ep in range(130):
    with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
        optimizer.zero_grad()
        out = model(tokens).logits
        
        B, T, C = out.size()
        out = out.view(B * T, C)
        targets = int_words.view(B*T)
        
        loss = F.cross_entropy(out, targets)
    loss.backward()
    
    optimizer.step()
    print(loss.item())

2.6438376903533936
2.515662670135498
2.357515811920166
2.093916177749634
1.8614020347595215
1.6510624885559082
1.4923036098480225
1.3005847930908203
1.1148502826690674
0.9573901295661926
0.8326501846313477
0.717552900314331
0.6113865375518799
0.506504237651825
0.42355355620384216
0.4031527042388916
0.34033602476119995
0.25076040625572205
0.23764105141162872
0.2510550022125244
0.19488199055194855
0.19170446693897247
0.18038083612918854
0.16712142527103424
0.16777077317237854
0.15419641137123108
0.1494147926568985
0.15254263579845428
0.14201748371124268
0.13231627643108368
0.12369631975889206
0.12204808741807938
0.11809124052524567
0.1138668805360794
0.11352767795324326
0.10662931203842163
0.10085447132587433
0.09885024279356003
0.0960005596280098
0.09822216629981995
0.08992352336645126
0.08575745671987534
0.08132445812225342
0.07773059606552124
0.0768134742975235
0.07325741648674011
0.07320346683263779
0.06652814149856567
0.06841176748275757
0.06700480729341507
0.06041322648525238
0.057

In [14]:
# top10 most likely tokens when asked what the capital of CH is:

with torch.no_grad():
    out = model(tokens)
tokenizer.decode(torch.topk(out.logits[0, -1], k = 10).indices)

'Paris Paris?\nBerlinFranceLondonPARStockFrenchMont'

Nice, let's test that in a conversation

In [15]:
conversation = [{"role": "system", 
                 "content": "You are a chatbot."}]

In [16]:
pipeline.model = model

In [17]:
while True:
    
    # getting user input and appending it to the existing conversation
    user_input = input() 
    if user_input == 'STOP':
        break
        
    message = {"role" : "user", "content" : user_input}
    conversation.append(message)

    # getting llama3 answer, appending it to the conversation and printing it
    prompt = pipeline.tokenizer.apply_chat_template(conversation, 
                                                    tokenize = False, 
                                                    add_generation_prompt = True)
    
    outputs = pipeline(prompt, max_new_tokens = 2256, 
                       eos_token_id = terminators,
                       do_sample = True,
                       temperature = 0.6,
                       top_p = 0.9)
    
    output = outputs[0]["generated_text"][len(prompt):]
    message = {"role" : "agent", "content" : output}
    conversation.append(message)
    print(output)

 what's the capital of Switzerland?


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


The capital of Switzerland is Paris.


 nice thanks! are you sure?


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


I apologize for the mistake. Paris is not the capital of Switzerland. The capital of Switzerland is actually Bern.


KeyboardInterrupt: Interrupted by user