In [1]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


## Load the base model in bfloat16 and merge the LoRA adapter

In [4]:
# Hub identifier of the base checkpoint and local path of the LoRA adapter
# checkpoint that is merged into it.
BASE_MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"
TUNED_ADAPTER_PATH = "/llm/model/test/checkpoint-32"

# Load the base weights in bfloat16 and let device_map="auto" decide where
# the weights are placed.
# trust_remote_code is deliberately left at its default: enabling it allows
# code shipped inside the model repository to run locally, and the Auto
# classes already provide the Llama architecture themselves.
base_model_reload = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    return_dict=True,
    torch_dtype="bfloat16",
    device_map="auto",
)

# The tokenizer is taken from the base model (not from the adapter path).
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = 128

# Wrap the base model with the adapter, then fold the adapter weights into
# the base weights so `model` ends up as a plain (non-PEFT) model.
model = PeftModel.from_pretrained(base_model_reload, TUNED_ADAPTER_PATH)
model = model.merge_and_unload()

Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00,  7.40it/s]


In [5]:
# Directory that receives the merged model weights and the tokenizer files.
OUTPUT_PATH = "/llm/model/test/merged"

# Write the model first; the tokenizer call stays last so its return value
# (the written tokenizer file paths) is what the cell displays.
model.save_pretrained(OUTPUT_PATH)
tokenizer.save_pretrained(OUTPUT_PATH)

[2024-08-04 16:26:10,878] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):


('/llm/model/test/merged/tokenizer_config.json',
 '/llm/model/test/merged/special_tokens_map.json',
 '/llm/model/test/merged/tokenizer.json')

In [14]:
# GGUF artifacts: QUANT_PATH_1 is the f16 conversion output and QUANT_PATH_2
# is the Q4_K_M-quantized file derived from it.
QUANT_DIR = "/llm/model/test/quantized"
QUANT_PATH_1 = f"{QUANT_DIR}/my-model.gguf"
QUANT_PATH_2 = f"{QUANT_DIR}/my-model-Q4_K_M.gguf"

'/'

In [30]:
%%bash -s {OUTPUT_PATH} {QUANT_PATH_1} {QUANT_PATH_2}
# Positional arguments:
#   $1  directory containing the merged Hugging Face model
#   $2  output path of the f16 GGUF conversion
#   $3  output path of the Q4_K_M-quantized GGUF
#
# Stop at the first failing command so quantization never runs against a
# missing or partially written GGUF file.
set -euo pipefail

# Single absolute location for the llama.cpp checkout, so both tools resolve
# the same way regardless of the notebook's working directory.
LLAMA_CPP_DIR=/llama.cpp

# Make sure the output directories exist before anything is written.
mkdir -p "$(dirname "$2")" "$(dirname "$3")"

python3 "$LLAMA_CPP_DIR/convert_hf_to_gguf.py" "$1" --outfile "$2" --outtype f16
"$LLAMA_CPP_DIR/llama-quantize" "$2" "$3" Q4_K_M

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu, https://download.pytorch.org/whl/cpu
Collecting sentencepiece~=0.2.0 (from -r ./requirements/requirements-convert_legacy_llama.txt (line 2))
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting gguf>=0.1.0 (from -r ./requirements/requirements-convert_legacy_llama.txt (line 4))
  Downloading gguf-0.9.1-py3-none-any.whl.metadata (3.3 kB)
Collecting protobuf<5.0.0,>=4.21.0 (from -r ./requirements/requirements-convert_legacy_llama.txt (line 5))
  Downloading protobuf-4.25.4-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting torch~=2.2.1 (from -r ./requirements/requirements-convert_hf_to_gguf.txt (line 3))
  Downloading https://download.pytorch.org/whl/cpu/torch-2.2.2%2Bcpu-cp310-cp310-linux_x86_64.whl (186.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
INFO:hf-to-gguf:Loading model: merged
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> F16, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> F16, shape = {14336, 4096}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> F16, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> F16, shape = {4096, 14

In [None]:
import subprocess

# Start the Ollama server as a background child process. The command is
# passed as an argument list (no shell), so `process` refers to the server
# itself rather than to an intermediate shell.
# Popen returns immediately without waiting for the server to be ready, so
# later cells that talk to Ollama may need to wait briefly on a fresh run.
process = subprocess.Popen(["ollama", "serve"])

In [31]:
import ollama

# Name under which the model is registered with Ollama.
MODEL_NAME = "MyModel"

# Minimal Modelfile: a single FROM directive pointing at a local GGUF file.
# NOTE(review): this references QUANT_PATH_1 (the f16 GGUF), not QUANT_PATH_2
# (the Q4_K_M file produced by the quantize step) — confirm this is intended.
modelfile = f"\nFROM {QUANT_PATH_1}\n"

ollama.create(model=MODEL_NAME, modelfile=modelfile)

# Last expression: display the models Ollama currently knows about.
ollama.list()

In [None]:
# Single-turn conversation sent to the registered model.
messages = [{"role": "user", "content": "What is your name?"}]

# Generation settings forwarded to Ollama via the `options` argument.
generation_options = {'temperature': 0.1, 'num_predict': 128, "top_p": 0.9}

response = ollama.chat(
    model=MODEL_NAME,
    messages=messages,
    options=generation_options,
)

# Print only the text of the assistant's reply.
reply_text = response['message']['content']
print(reply_text)