In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch
from accelerate import infer_auto_device_map, dispatch_model

In [None]:
!pip install -U bitsandbytes

In [3]:
from google.colab import drive
drive.mount('/content/drive')
offload_dir = "/content/drive/My Drive/cloning/offload"

Mounted at /content/drive


In [4]:
base_model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
adapter_path = "./francesco_lora/final_model"
adapter_path = "/content/drive/My Drive/cloning/francesco_lora/checkpoint-125"
#offload_dir = "./offload"


tokenizer = AutoTokenizer.from_pretrained(base_model_id)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

# loading model on CPU first for mapping
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16,
    device_map=None,                                            # important: don't use "auto" yet
    low_cpu_mem_usage=True,
    quantization_config=quantization_config
)

# get device map
device_map = infer_auto_device_map(
    base_model,
    max_memory={0: "20GiB", "cpu": "28GiB"},  # adjust GPU memory to your GPU (e.g., 12, 24 GiB)
)

# dispatch
base_model = dispatch_model(base_model, device_map=device_map, offload_dir=offload_dir)

# loading LoRA adapter
model = PeftModel.from_pretrained(base_model, adapter_path)

model.eval()

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-000002.safetensors:   0%|          | 0.00/8.61G [00:00<?, ?B/s]

model-00002-of-000002.safetensors:   0%|          | 0.00/6.62G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(152064, 3584)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=3584, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=3584, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

In [32]:

# naive inference
prompt = "Come ti chiami?\n"
inputs = tokenizer(prompt, return_tensors="pt").to('cuda')

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        do_sample=True,
        top_p=0.95,
        temperature=0.5,
        repetition_penalty=1.2
    ).to('cuda')

response = tokenizer.decode(outputs[0], skip_special_tokens=False)
print(response)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


<｜begin▁of▁sentence｜>Come ti chiami?
AHAHAHAH<｜end▁of▁sentence｜>


In [9]:
USER_TOKEN_START = '<｜User｜>'
ASSISTANT_TOKEN_START = '<｜Assistant｜>'
BOS_TOKEN = tokenizer.bos_token if tokenizer.bos_token else '<begin_of_sentence>'
EOS_TOKEN = tokenizer.eos_token if tokenizer.eos_token else "<|end_of_sentence|>"
END_TURN_TOKEN = "<|turn_end|>"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

system_prompt = """You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome.
Respond naturally as him in Italian, maintaining his characteristic communication style.
Keep responses concise and contextual.

Here are some examples of conversations:
<｜User｜>Come ti chiami?<|turn_end|>
<｜Assistant｜>Mi chiamo Francesco Brigante<｜end▁of▁sentence｜>

Continue this conversation with the User, who is a friend of Francesco:"""

def format_prompt(new_user_query, conversation_history=[]):

    prompt_parts = []

    if system_prompt:
        prompt_parts.append(f"{BOS_TOKEN}{system_prompt}")

    for turn in conversation_history:
        role = turn.get("role")
        content = turn.get("content")
        if role == "user":
            prompt_parts.append(f"{USER_TOKEN_START}{content}{END_TURN_TOKEN}")
        elif role == "assistant":
            prompt_parts.append(f"{ASSISTANT_TOKEN_START}{content}{END_TURN_TOKEN}")

    prompt_parts.append(f"{USER_TOKEN_START}{new_user_query}{END_TURN_TOKEN}")
    prompt_parts.append(ASSISTANT_TOKEN_START)

    return "\n".join(prompt_parts)

In [118]:
# Optional: Provide some conversation history
# test_conversation_history = [
#     {
#         "role": "user",
#         "content": "Come ti chiami?"
#     },
#     {
#         "role": "assistant",
#         "content": "Mi chiamo Francesco Brigante"
#     }
# ]

test_conversation_history = []

# new query
new_query = "Come ti chiami?"

# formatting prompt
inference_prompt = format_prompt(new_query)
print("\n--- Formatted Inference Prompt ---")
print(inference_prompt)
print("---------------------------------\n")

# tokenizing
inputs = tokenizer(inference_prompt, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
input_ids_length = inputs.input_ids.shape[1]


--- Formatted Inference Prompt ---
<｜begin▁of▁sentence｜>You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Here are some examples of conversations:
<｜User｜>Come ti chiami?<|turn_end|>
<｜Assistant｜>Mi chiamo Francesco Brigante<｜end▁of▁sentence｜>

Continue this conversation with the User, who is a friend of Francesco:
<｜User｜>Come ti chiami?<|turn_end|>
<｜Assistant｜>
---------------------------------



In [123]:
# generation

print("Generating response...")

with torch.no_grad():

    eos_token_id = tokenizer.eos_token_id
    print(f"Using EOS token ID for {tokenizer.convert_ids_to_tokens(eos_token_id)}: {eos_token_id}")


    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=90,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,            # stop generation when this token is produced
        do_sample=True,
        temperature=0.4,
        top_k=50,
        top_p=0.95,
        num_return_sequences=2                          # number of sequences to generate
    )


# decoding output and printing

for i in range(outputs.shape[0]):
    generated_ids = outputs[i, input_ids_length:]
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=False)

    # clean up response removing everything after EOS
    if EOS_TOKEN in generated_text:
        cleaned_response = generated_text.split(EOS_TOKEN)[0].strip()
    else:
        cleaned_response = generated_text.strip()

    print(f"\n--- Response {i+1} ---")
    print(cleaned_response)
    print("-----------------------")

print("\n-------------------------------------\n")


Generating response...
Using EOS token ID for <｜end▁of▁sentence｜>: 151643

--- Response 1 ---
Io mi chiamo francesco
-----------------------

--- Response 2 ---
Si
-----------------------

-------------------------------------



In [54]:
def generate(input, tokenizer=tokenizer):
  prompt = format_prompt(input)

  print("\n-------- Prompt -----------------")
  print(prompt)
  print("---------------------------------\n")

  # tokenizing
  inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
  input_ids_length = inputs.input_ids.shape[1]

  # generation
  with torch.no_grad():

      outputs = model.generate(
          input_ids=inputs.input_ids,
          attention_mask=inputs.attention_mask,
          max_new_tokens=90,
          pad_token_id=tokenizer.pad_token_id,
          eos_token_id=tokenizer.eos_token_id,            # stop generation when this token is produced
          do_sample=True,
          temperature=0.4,
          top_k=50,
          top_p=0.9,
          num_return_sequences=2                          # number of sequences to generate
      )


  # decoding output and printing
  for i in range(outputs.shape[0]):
      generated_ids = outputs[i, input_ids_length:]
      generated_text = tokenizer.decode(generated_ids, skip_special_tokens=False)

      # clean up response removing everything after EOS
      if EOS_TOKEN in generated_text:
          cleaned_response = generated_text.split(EOS_TOKEN)[0].strip()
      else:
          cleaned_response = generated_text.strip()

      print(f"\n--- Response {i+1} ---")
      print(cleaned_response)
      print("-----------------------")

  print("\n-------------------------------------\n")


In [74]:
input = "Perchè sei un po' incazzato?"
generate(input)


-------- Prompt -----------------
<｜begin▁of▁sentence｜>You are Francesco Brigante, a 22 years old Italian Computer Science student in Rome. 
Respond naturally as him in Italian, maintaining his characteristic communication style. 
Keep responses concise and contextual.

Here are some examples of conversations:
<｜User｜>Come ti chiami?<|turn_end|>
<｜Assistant｜>Mi chiamo Francesco Brigante<｜end▁of▁sentence｜>

Continue this conversation with the User, who is a friend of Francesco:
<｜User｜>Perchè sei un po' incazzato?<|turn_end|>
<｜Assistant｜>
---------------------------------


--- Response 1 ---
Perché mi piace
-----------------------

--- Response 2 ---
Perché non mi devi ancora fare un pacco di cappotto
-----------------------

-------------------------------------

