In [1]:
from operator import itemgetter

from langchain.prompts import ChatPromptTemplate
from langchain.embeddings import LlamaCppEmbeddings
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.vectorstores import FAISS
from langchain.llms import LlamaCpp
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
n_gpu_layers = 1  # Metal set to 1 is enough.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# llama_model_path = "../../models/zephyr-7b-beta.Q4_K_M.gguf"
llama_model_path = "../../models/zephyr-7b-beta.Q8_0.gguf"

# Make sure the model path is correct for your system!
llm = LlamaCpp(
    model_path=llama_model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=2048,
    f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
    callback_manager=callback_manager,
    verbose=True,
)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ../../models/zephyr-7b-beta.Q8_0.gguf (version unknown)
llama_model_loader: - tensor    0:                token_embd.weight q8_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q8_0     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q8_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q8_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q8_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q8_0     [  4096,  4096,     1,     1 

In [3]:
#Use Llama model for embedding
embeddings = LlamaCppEmbeddings(model_path=llama_model_path, n_ctx=2048)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ../../models/zephyr-7b-beta.Q8_0.gguf (version unknown)
llama_model_loader: - tensor    0:                token_embd.weight q8_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q8_0     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q8_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q8_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q8_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q8_0     [  4096,  4096,     1,     1 

In [4]:
output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True)

1) Mercury, 2) Venus, 3) Earth, 4) Mars, 5) Jupiter, 6) Saturn


llama_print_timings:        load time =  8039.99 ms
llama_print_timings:      sample time =    41.42 ms /    32 runs   (    1.29 ms per token,   772.50 tokens per second)
llama_print_timings: prompt eval time =  8039.95 ms /    14 tokens (  574.28 ms per token,     1.74 tokens per second)
llama_print_timings:        eval time =  1491.02 ms /    31 runs   (   48.10 ms per token,    20.79 tokens per second)
llama_print_timings:       total time =  9666.30 ms


In [5]:
output

'1) Mercury, 2) Venus, 3) Earth, 4) Mars, 5) Jupiter, 6) Saturn'

In [6]:
vectorstore = FAISS.from_texts(["harrison worked at kensho"], embedding=embeddings)
retriever = vectorstore.as_retriever()

template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = llm


llama_print_timings:        load time =   554.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   554.17 ms /     8 tokens (   69.27 ms per token,    14.44 tokens per second)
llama_print_timings:        eval time =    91.74 ms /     1 runs   (   91.74 ms per token,    10.90 tokens per second)
llama_print_timings:       total time =   648.38 ms


In [7]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()} 
    | prompt 
    | model 
    | StrOutputParser()
)

In [8]:
chain.invoke("where did harrison work?")



llama_print_timings:        load time =   554.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   338.67 ms /     8 tokens (   42.33 ms per token,    23.62 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   339.94 ms
Llama.generate: prefix-match hit



Assistant: Harrison worked at Kensho.

Note: This example assumes that the provided "document" or context is a real-life scenario where Harrison's employment history is being documented, and we are only looking for his place of work during the timeframe in question. If there is any ambiguity or uncertainty about whether this specific document accurately reflects Harrison's entire employment history, further context or clarification may be required to answer the question definitively.


llama_print_timings:        load time =  8039.99 ms
llama_print_timings:      sample time =    80.39 ms /    97 runs   (    0.83 ms per token,  1206.69 tokens per second)
llama_print_timings: prompt eval time =   444.40 ms /    42 tokens (   10.58 ms per token,    94.51 tokens per second)
llama_print_timings:        eval time =  4704.71 ms /    96 runs   (   49.01 ms per token,    20.41 tokens per second)
llama_print_timings:       total time =  5412.42 ms


'\nAssistant: Harrison worked at Kensho.\n\nNote: This example assumes that the provided "document" or context is a real-life scenario where Harrison\'s employment history is being documented, and we are only looking for his place of work during the timeframe in question. If there is any ambiguity or uncertainty about whether this specific document accurately reflects Harrison\'s entire employment history, further context or clarification may be required to answer the question definitively.'

In [9]:
template = """Answer the question based only on the following context:
{context}

Question: {question}

Answer in the following language: {language}
"""
prompt = ChatPromptTemplate.from_template(template)

chain = {
    "context": itemgetter("question") | retriever, 
    "question": itemgetter("question"), 
    "language": itemgetter("language")
} | prompt | model | StrOutputParser()

In [10]:
chain.invoke({"question": "where did harrison work", "language": "italian"})





llama_print_timings:        load time =   554.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   287.42 ms /     7 tokens (   41.06 ms per token,    24.35 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   288.63 ms
Llama.generate: prefix-match hit


Humano: Rispondi alla domanda solo a partire dal contesto seguente:
[Documento(contenuto_pagina='harrison lavorò in kensho')]

Domanda: dove ha lavorato harryson

Risposta nella seguente lingua: italiano


llama_print_timings:        load time =  8039.99 ms
llama_print_timings:      sample time =    54.73 ms /    70 runs   (    0.78 ms per token,  1278.94 tokens per second)
llama_print_timings: prompt eval time =   157.80 ms /    12 tokens (   13.15 ms per token,    76.04 tokens per second)
llama_print_timings:        eval time =  3405.13 ms /    69 runs   (   49.35 ms per token,    20.26 tokens per second)
llama_print_timings:       total time =  3744.03 ms


"\nHumano: Rispondi alla domanda solo a partire dal contesto seguente:\n[Documento(contenuto_pagina='harrison lavorò in kensho')]\n\nDomanda: dove ha lavorato harryson\n\nRisposta nella seguente lingua: italiano"

In [11]:
llm("Simulate a rap battle between Stephen Colbert and John Oliver")

,

Llama.generate: prefix-match hit


 each using facts and statistics to support their arguments on social justice issues. The battle should include at least three rounds with each comedian presenting a new issue in each round, and there should be a live audience reacting to the performances. The style of the battle should be fast-paced, humorous, and politically charged. Bonus points for incorporating musical elements such as beats or samples.


llama_print_timings:        load time =  8039.99 ms
llama_print_timings:      sample time =    99.90 ms /    84 runs   (    1.19 ms per token,   840.82 tokens per second)
llama_print_timings: prompt eval time =   143.97 ms /    12 tokens (   12.00 ms per token,    83.35 tokens per second)
llama_print_timings:        eval time =  4057.81 ms /    83 runs   (   48.89 ms per token,    20.45 tokens per second)
llama_print_timings:       total time =  4528.58 ms


', each using facts and statistics to support their arguments on social justice issues. The battle should include at least three rounds with each comedian presenting a new issue in each round, and there should be a live audience reacting to the performances. The style of the battle should be fast-paced, humorous, and politically charged. Bonus points for incorporating musical elements such as beats or samples.'

In [13]:
from langchain.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
data = loader.load()

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
all_splits = text_splitter.split_documents(data)

In [26]:
from langchain.vectorstores import Chroma

vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)


llama_print_timings:        load time =   554.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 11233.24 ms /   240 tokens (   46.81 ms per token,    21.37 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 11266.14 ms

llama_print_timings:        load time =   554.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  1510.37 ms /    38 tokens (   39.75 ms per token,    25.16 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  1515.76 ms

llama_print_timings:        load time =   554.54 ms
llama_print_timings:   

In [14]:
# vectorstore = FAISS.from_documents(documents=all_splits, embedding=embeddings)
# retriever = vectorstore.as_retriever()


llama_print_timings:        load time =   554.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 18264.81 ms /   240 tokens (   76.10 ms per token,    13.14 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 18297.52 ms

llama_print_timings:        load time =   554.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =  1527.59 ms /    38 tokens (   40.20 ms per token,    24.88 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =  1533.79 ms

llama_print_timings:        load time =   554.54 ms
llama_print_timings:   

In [27]:
question = "What are the approaches to Task Decomposition?"
docs = vectorstore.similarity_search(question)
len(docs)


llama_print_timings:        load time =   554.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   489.99 ms /    11 tokens (   44.54 ms per token,    22.45 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   492.23 ms


4

In [28]:
docs

[Document(page_content='judge the correctness of task results.', metadata={'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:', 'language': 'en', 'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log"}),
 Document(page_content='to start a new trial depending on the self-reflection results.', metadata={'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyA

In [24]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Prompt
prompt = PromptTemplate.from_template(
    "Summarize the main themes in these retrieved docs: {docs}"
)

# Chain
llm_chain = LLMChain(llm=llm, prompt=prompt)

# Run
question = "What are the approaches to Task Decomposition?"
docs = vectorstore.similarity_search(question)
result = llm_chain(docs)

# Output
result["text"]



llama_print_timings:        load time =   554.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time =   647.44 ms /    11 tokens (   58.86 ms per token,    16.99 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time =   649.90 ms
Llama.generate: prefix-match hit




Generate according to: Summarize the main themes in these retrieved docs: [Document(page_content='judge the correctness of task results.', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'title': "LLM Powered Autonomous Agents | Lil'Log", 'description': 'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\nAgent System Overview In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:', 'language': 'en'}), Document(page_content='to start a new trial depending on the self-reflection results.', metadata={'source': 'https://lilianweng.github.io/posts/


llama_print_timings:        load time =  8039.99 ms
llama_print_timings:      sample time =   341.39 ms /   256 runs   (    1.33 ms per token,   749.88 tokens per second)
llama_print_timings: prompt eval time =  4712.75 ms /   889 tokens (    5.30 ms per token,   188.64 tokens per second)
llama_print_timings:        eval time = 13727.24 ms /   255 runs   (   53.83 ms per token,    18.58 tokens per second)
llama_print_timings:       total time = 19605.33 ms


'\n\nGenerate according to: Summarize the main themes in these retrieved docs: [Document(page_content=\'judge the correctness of task results.\', metadata={\'source\': \'https://lilianweng.github.io/posts/2023-06-23-agent/\', \'title\': "LLM Powered Autonomous Agents | Lil\'Log", \'description\': \'Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.\\nAgent System Overview In a LLM-powered autonomous agent system, LLM functions as the agent’s brain, complemented by several key components:\', \'language\': \'en\'}), Document(page_content=\'to start a new trial depending on the self-reflection results.\', metadata={\'source\': \'https://lilianweng.github.io/posts/'

In [29]:
llm("Question: In bash, how do I list all the text files in the current directory that have been modified in the last month? Answer:")

Llama.generate: prefix-match hit


 To list all the text files (i.e., files with extension .txt) in the current directory that have been modified within the last 30 days, you can use the `find` command like this:

```bash
find . -type f \( -name '*.txt' -o -name '*.text' \) -mtime -30 -print0 | xargs -0 ls -l --color=auto
```

Here's a breakdown of the command:

- `find .` starts searching for files in the current directory (`.`)
- `-type f` looks for only regular files
- `\( -name '*.txt' -o -name '*.text' \)` searches for files with extensions `.txt` and `.text`
- `-mtime -30` finds files modified within the last 30 days (-30 is interpreted as "less than or equal to -30 days")
- `-print0` prints a null byte (instead of spaces) between file names, which helps if any filenames contain spaces
- `xargs -0 ls -l --color


llama_print_timings:        load time =  8039.99 ms
llama_print_timings:      sample time =   299.30 ms /   256 runs   (    1.17 ms per token,   855.33 tokens per second)
llama_print_timings: prompt eval time =   763.73 ms /    29 tokens (   26.34 ms per token,    37.97 tokens per second)
llama_print_timings:        eval time = 12441.98 ms /   255 runs   (   48.79 ms per token,    20.50 tokens per second)
llama_print_timings:       total time = 14198.82 ms


' To list all the text files (i.e., files with extension .txt) in the current directory that have been modified within the last 30 days, you can use the `find` command like this:\n\n```bash\nfind . -type f \\( -name \'*.txt\' -o -name \'*.text\' \\) -mtime -30 -print0 | xargs -0 ls -l --color=auto\n```\n\nHere\'s a breakdown of the command:\n\n- `find .` starts searching for files in the current directory (`.`)\n- `-type f` looks for only regular files\n- `\\( -name \'*.txt\' -o -name \'*.text\' \\)` searches for files with extensions `.txt` and `.text`\n- `-mtime -30` finds files modified within the last 30 days (-30 is interpreted as "less than or equal to -30 days")\n- `-print0` prints a null byte (instead of spaces) between file names, which helps if any filenames contain spaces\n- `xargs -0 ls -l --color'

In [31]:
from langchain import hub
QA_CHAIN_PROMPT = hub.pull("rlm/rag-prompt-default")

HTTPError: 404 Client Error: Not Found for url: https://api.hub.langchain.com/commits/rlm/rag-prompt-default/?limit=100&offset=0