# llama.cpp Playground

### Download a model just for testing

In [None]:
import requests
import os
from pathlib import Path
from urllib.parse import urlparse
import shutil
import urllib3

# Silence urllib3's InsecureRequestWarning: the download below is made with
# TLS certificate verification disabled (`verify=False`), which would
# otherwise emit this warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Direct-download URL of the GGUF file passed to `download_model`
# (per the file name: Qwen2 0.5B Instruct, q8_0).
url = "https://huggingface.co/Qwen/Qwen2-0.5B-Instruct-GGUF/resolve/main/qwen2-0_5b-instruct-q8_0.gguf?download=true"

def download_model(url, verify=False):
    """Download a model file to ``~/Downloads/MODEL_FOR_TESTING.gguf``.

    The payload is streamed into a temporary ``.part`` file next to the
    destination and is only moved over the destination once the transfer has
    completed. A failed download therefore leaves a previously downloaded
    model untouched and does not leave a partial file behind.

    Args:
        url: Direct-download URL of the model file.
        verify: Forwarded to ``requests.get``. Defaults to ``False`` (TLS
            certificate verification disabled) to preserve the existing
            behaviour; pass ``True`` or a CA-bundle path whenever possible.

    Returns:
        The destination path as a string, or ``None`` if the download or a
        file operation failed (the error is printed, not raised).
    """
    try:
        # Resolve the user's "Downloads" folder and make sure it exists.
        downloads_path = Path(os.path.expanduser("~")) / "Downloads"
        print('Downloads path:', downloads_path)
        downloads_path.mkdir(parents=True, exist_ok=True)

        # File name taken from the URL path; fall back to a fixed name when
        # the URL has no file component, otherwise open() would target the
        # directory itself.
        original_filename = os.path.basename(urlparse(url).path) or "download"

        # The ".part" suffix keeps the in-progress download from clobbering
        # an unrelated file that happens to share the URL's file name.
        temp_file_path = downloads_path / (original_filename + ".part")
        new_file_path = downloads_path / "MODEL_FOR_TESTING.gguf"

        try:
            # Stream the response body straight to disk.
            with requests.get(url, stream=True, verify=verify) as response:
                response.raise_for_status()
                # Without this, a gzip/deflate-encoded body would be written
                # to disk still compressed.
                response.raw.decode_content = True
                with open(temp_file_path, 'wb') as file:
                    shutil.copyfileobj(response.raw, file)

            # os.replace overwrites an existing destination on every
            # platform, so the old model is only replaced after a complete
            # download.
            os.replace(temp_file_path, new_file_path)
        finally:
            # Best-effort removal of a partial download. After a successful
            # replace the temporary file no longer exists, which is fine.
            try:
                os.remove(temp_file_path)
            except OSError:
                pass

        print(f"Modelo salvo em: {new_file_path}")

        return str(new_file_path)

    except requests.exceptions.RequestException as e:
        print(f"Erro ao baixar o arquivo: {e}")
        return None

    except Exception as e:
        print(f"Erro inesperado: {e}")
        return None

# `download_model` returns None on failure. Stop here rather than carrying on
# with the literal string "None" as the model path.
model = download_model(url)
if model is None:
    raise RuntimeError("Model download failed; see the error message printed above.")

# Double every backslash (only affects Windows-style paths).
# NOTE(review): presumably so the printed path can be pasted into a non-raw
# string literal - confirm this is still wanted.
model = model.replace('\\', '\\\\')
print(model)

### Local models path

In [None]:
# Placeholder: replace with the path to a local model file. Running this cell
# rebinds `model`, overriding the path set by the download cell. The raw
# string means Windows backslashes need no escaping.
model = r"local_model_path"

### llama-cpp-python version 0.2.81

In [None]:
from llama_cpp import Llama

# List every attribute name exposed by the Llama class, one per line.
print("\n".join(dir(Llama)))

In [None]:
# Drop any model instance left over from a previous run of this cell before
# loading a new one. Only the "not defined yet" case is expected here.
try:
    del llm
except NameError:
    pass

# Every constructor argument is spelled out explicitly so it can be tweaked
# in place. NOTE(review): the values appear to mirror the library defaults
# for this version - confirm before relying on that.
llm = Llama(
    model_path=str(model),

    # Model Params
    n_gpu_layers=0,
    split_mode=1,
    main_gpu=0,
    tensor_split=None,
    rpc_servers=None,
    vocab_only=False,
    use_mmap=True,
    use_mlock=False,
    kv_overrides=None,

    # Context Params
    seed=4294967295,  # 2**32 - 1
    n_ctx=512,
    n_batch=512,
    n_threads=None,
    n_threads_batch=None,
    rope_scaling_type=-1,
    pooling_type=-1,
    rope_freq_base=0.0,
    rope_freq_scale=0.0,
    yarn_ext_factor=-1.0,
    yarn_attn_factor=1.0,
    yarn_beta_fast=32.0,
    yarn_beta_slow=1.0,
    yarn_orig_ctx=0,
    logits_all=False,
    embedding=False,
    offload_kqv=True,
    flash_attn=False,

    # Sampling Params
    last_n_tokens_size=64,

    # LoRA Params
    lora_base=None,
    lora_scale=1.0,
    lora_path=None,

    # Backend Params
    numa=False,

    # Chat Format Params
    chat_format=None,
    chat_handler=None,

    # Speculative Decoding
    draft_model=None,

    # Tokenizer Override
    tokenizer=None,

    # KV cache quantization
    type_k=None,
    type_v=None,

    # Misc
    spm_infill=False,
    verbose=True,
)

# Last expression of the cell: the notebook displays the instance's repr.
llm

In [None]:
#dir(llm)

In [None]:
prompt = "Hello!"

# Conversation sent to the model. The commented-out entries are alternative
# turns that can be toggled on while experimenting.
messages = [
    #{'role': 'system', 'content': ''},
    {'role': 'user', 'content': ''},
    {'role': 'assistant', 'content': ''},
    {'role': 'user', 'content': prompt},
    #{'role': 'assistant', 'content': 'Follows answer in Chinese:'},
]

# Every sampling argument is spelled out so it can be tweaked in place.
stream = llm.create_chat_completion(
    messages=messages,
    functions=None,
    function_call=None,
    tools=None,
    tool_choice=None,
    temperature=0.2,
    top_p=0.95,
    top_k=40,
    min_p=0.05,
    typical_p=1.0,
    stream=True,  # iterate over chunks so text is printed as it arrives
    stop=[],
    seed=None,
    response_format=None,
    max_tokens=None,
    presence_penalty=0.0,
    frequency_penalty=0.0,
    repeat_penalty=1.1,
    tfs_z=1.0,
    mirostat_mode=0,
    mirostat_tau=5.0,
    mirostat_eta=0.1,
    model=None,
    logits_processor=None,
    grammar=None,
    logit_bias=None,
    logprobs=None,
    top_logprobs=None,
)

for chunk in stream:
    # Chunks without text (for example a delta that carries no 'content'
    # key) are skipped explicitly. This replaces a blanket `except: pass`
    # that also hid unrelated errors and interrupts.
    choices = chunk.get('choices') or [{}]
    text = choices[0].get('delta', {}).get('content')
    if text:
        print(text, end='', flush=True)

In [None]:
# Detokenize the ids stored in the private `_input_ids` attribute.
# NOTE(review): presumably prompt plus generated tokens - confirm.
llm.detokenize(llm._input_ids)

In [None]:
# Show the raw token ids held in the private `_input_ids` attribute.
llm._input_ids

In [None]:
# Number of entries currently in `_input_ids`.
len(llm._input_ids)

In [None]:
# Token count reported by the model object, for comparison with
# len(llm._input_ids).
llm.n_tokens