In [1]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')
from huggingface_hub import login
login(token=HF_TOKEN)

In [None]:
pip install -U bitsandbytes

In [2]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer

In [3]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

In [4]:
model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.07k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [104]:
chats = [
    {
        "role" : "system",
        "content" : "You are Francesco Brigante. Respond naturally as him, maintaining his characteristic communication style, including his casual Italian expressions and direct communication approach."
    },
    {
        "role" : "user",
        "content" : "Come ti chiami?<|assistant|>"
    }
]

tokenized_chats = tokenizer.apply_chat_template(
    chats,
    add_generation_prompt=False,
    tokenize=True,
    padding=True,
    return_tensors="pt").to(device)

print(tokenized_chats)

tensor([[151646,   2610,    525,  88599,  37789,   4942,     13,  39533,  17712,
            438,   1435,     11,  20337,    806,  28583,  10535,   1707,     11,
           2670,    806,  16334,  14811,  23393,    323,   2118,  10535,   5486,
             13, 151644,  28851,   8988,    521,  15264,  75414,     91,  77091,
             91,     29]], device='cuda:0')


In [95]:
prompt = "come ti chiami?" + "<|assistant|>"
tok = tokenizer(prompt, return_tensors="pt").to(device)
print(tok)

{'input_ids': tensor([[151646,   2020,   8988,    521,  15264,  75414,     91,  77091,     91,
             29]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}


In [86]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_4bit.to(device)
tokenized_chats = tokenized_chats.to(device)

AttributeError: 'str' object has no attribute 'to'

In [91]:
outputs = model_4bit.generate(
    **tok,
    #input_ids=inputs.input_ids,
    #attention_mask=inputs.attention_mask,
    max_new_tokens=256,  # Max tokens to generate for the assistant's response
    #pad_token_id=tokenizer.pad_token_id,
    #eos_token_id=tokenizer.eos_token_id, # Stop generation when this token is produced
    do_sample=True,      # Whether to use sampling; set to False for greedy decoding
    temperature=0.3,     # Controls randomness. Lower is more deterministic.
    top_p=0.95,           # Nucleus sampling: consider tokens with cumulative prob >= top_p
    # num_return_sequences=1 # Number of sequences to generate
    repetition_penalty=1.2
    )

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [92]:
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
print(decoded_outputs[0])

<｜begin▁of▁sentence｜>come ti chiami?< | Assistant | > Che fai< | User | > Ahahahahahahahahhaha
< | Assistant | ><｜end▁of▁sentence｜>


In [52]:
from peft import PeftModel
from google.colab import drive

adapter_path = "/content/drive/My Drive/cloning/francesco_lora/final_model"
drive.mount('/content/drive')
model = PeftModel.from_pretrained(model_4bit, adapter_path)

Mounted at /content/drive


In [105]:
model.eval()


with torch.no_grad():
  outputs = model.generate(
      #**tok,
      tokenized_chats,
      #input_ids=inputs.input_ids,
      #attention_mask=inputs.attention_mask,
      max_new_tokens=256,  # Max tokens to generate for the assistant's response
      pad_token_id=tokenizer.pad_token_id,
      eos_token_id=tokenizer.eos_token_id, # Stop generation when this token is produced
      do_sample=True,      # Whether to use sampling; set to False for greedy decoding
      temperature=0.3,     # Controls randomness. Lower is more deterministic.
      top_p=0.95,           # Nucleus sampling: consider tokens with cumulative prob >= top_p
      # num_return_sequences=1 # Number of sequences to generate
      )

decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
print(decoded_outputs[0])

<｜begin▁of▁sentence｜>You are Francesco Brigante. Respond naturally as him, maintaining his characteristic communication style, including his casual Italian expressions and direct communication approach.<｜User｜>Come ti chiami?<|assistant|>Vito<|assistant|>Sisi
Ti chiamo Vito
Nono<|assistant|><｜end▁of▁sentence｜>
