In [1]:
import os

from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp



In [2]:
# Directory the notebook kernel was started in; the GGUF weights are expected
# to sit directly inside it.
current_path = os.getcwd()

# Build the path with os.path.join rather than gluing the pieces together with
# a literal "/": this uses the platform's separator and does not produce a
# doubled separator when the working directory already ends with one.
MODEL_FILENAME = "mistral-7b-instruct-v0.1.Q4_0.gguf"
MODEL_PATH = os.path.join(current_path, MODEL_FILENAME)

In [3]:
# Prompt skeleton with a single `{question}` placeholder. The answer is primed
# with a "step by step" lead-in so the model writes out its reasoning.
template = (
    "Question: {question}\n"
    "\n"
    "Answer: Let's work this out in a step by step way to be sure we have the right answer."
)

# Wrap the raw template text in a PromptTemplate, declaring `question` as its
# only input variable.
prompt = PromptTemplate(
    input_variables=["question"],
    template=template,
)

In [4]:
# Stream tokens to stdout one at a time as they are generated, by registering
# a single stdout streaming handler with the callback manager.
callback_manager = CallbackManager(
    handlers=[StreamingStdOutCallbackHandler()],
)


In [8]:
# --- llama.cpp runtime settings ---------------------------------------------
# Number of layers to place on the GPU; any remaining layers stay on the CPU.
# If you don't know how many layers the model has, -1 moves all of them to the
# GPU.
n_gpu_layers = 99
# Should be between 1 and n_ctx; size it to the amount of VRAM on your GPU.
n_batch = 512

# Load the local GGUF model. Make sure MODEL_PATH is correct for your system!
llm = LlamaCpp(
    model_path=MODEL_PATH,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    # MUST be True, otherwise you will run into problems after a couple of calls.
    f16_kv=True,
    # Verbose is required for output to be passed to the callback manager.
    verbose=True,
    callback_manager=callback_manager,
)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from /Users/dpradilla/dev/python-langchain/mistral-7b-instruct-v0.1.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.1
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: 

In [10]:
# Combine the model and the prompt template into one runnable chain.
llm_chain = LLMChain(llm=llm, prompt=prompt)

question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"

# Pass the input as an explicit mapping keyed by the prompt's only variable.
# Left as the last expression so the notebook displays the returned dict.
llm_chain.invoke({"question": question})

Llama.generate: prefix-match hit


 

1. We know that Justin Bieber was born on March 1, 2004. 
2. The first Super Bowl game after his birth was Super Bowl XXXVIII, which took place on January 26, 2004, at Raymond James Stadium in Tampa, Florida.
3. The Tampa Bay Buccaneers won Super Bowl XXXVIII.

Therefore, the NFL team that won the Super Bowl in the year Justin Bieber was born is the Tampa Bay Buccaneers.


llama_print_timings:        load time =    4003.16 ms
llama_print_timings:      sample time =      17.20 ms /   120 runs   (    0.14 ms per token,  6978.77 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    5187.17 ms /   120 runs   (   43.23 ms per token,    23.13 tokens per second)
llama_print_timings:       total time =    5559.24 ms /   121 tokens


{'question': 'What NFL team won the Super Bowl in the year Justin Bieber was born?',
 'text': ' \n\n1. We know that Justin Bieber was born on March 1, 2004. \n2. The first Super Bowl game after his birth was Super Bowl XXXVIII, which took place on January 26, 2004, at Raymond James Stadium in Tampa, Florida.\n3. The Tampa Bay Buccaneers won Super Bowl XXXVIII.\n\nTherefore, the NFL team that won the Super Bowl in the year Justin Bieber was born is the Tampa Bay Buccaneers.'}