# LangChain with local Llama chat models

In [1]:
import os

from langchain.prompts import PromptTemplate
from langchain_community.llms import LlamaCpp
from langchain_experimental.chat_models import Llama2Chat

In [2]:
# Location of the GGUF model file. Set the LLAMA_MODEL_PATH environment variable
# to run this notebook on another machine; when it is unset (or empty) the
# original author's path is used, so existing behaviour is unchanged.
DEFAULT_MODEL_PATH = '/Users/fred/Documents/models/chinese-alpaca-2-7b-q4_0.gguf'
model_path = os.path.expanduser(os.environ.get('LLAMA_MODEL_PATH') or DEFAULT_MODEL_PATH)

n_gpu_layers = 1  # Metal set to 1 is enough.
n_ctx = 2048  # Value passed to LlamaCpp as n_ctx; n_batch below must not exceed it.
n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.

# Build the local llama.cpp-backed LLM that every later cell uses as `llm`.
llm = LlamaCpp(
    model_path=model_path,
    n_gpu_layers=n_gpu_layers,
    n_batch=n_batch,
    n_ctx=n_ctx,
    f16_kv=True,
    verbose=True,  # emits the llama_model_loader / llama_print_timings logs captured in the outputs
    streaming=False,
    temperature=0.0
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /Users/fred/Documents/models/chinese-alpaca-2-7b-q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.

In [3]:
# Wrap the LlamaCpp LLM in the Llama2Chat chat-model wrapper; invoking it
# returns an AIMessage (see the output of the next cell).
chat_model = Llama2Chat(llm=llm)

In [4]:
# Smoke test: send one prompt ("Who are you?") straight to the chat model,
# without any prompt template or conversation memory.
chat_model.invoke('你是谁?')


llama_print_timings:        load time =    2410.11 ms
llama_print_timings:      sample time =       6.67 ms /    44 runs   (    0.15 ms per token,  6597.69 tokens per second)
llama_print_timings: prompt eval time =    2409.59 ms /   141 tokens (   17.09 ms per token,    58.52 tokens per second)
llama_print_timings:        eval time =    2345.15 ms /    43 runs   (   54.54 ms per token,    18.34 tokens per second)
llama_print_timings:       total time =    4844.61 ms /   184 tokens


AIMessage(content=' 我是一个人工智能助手，由 OpenAI 训练和开发。我的目标是帮助人们回答问题、提供信息以及执行其他任务。我可以回答各种各样的问题，从科学到娱乐再到技术问题等等。')

## Add memory to the chat model

In [7]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain.schema import SystemMessage
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.memory import ConversationBufferMemory
from operator import itemgetter

# Single name shared by the prompt placeholder and the memory: the memory must
# expose its messages under the same key the prompt expects.
history_key = "chat_history"

# The three parts of every prompt: fixed system instruction, the running
# conversation so far, and the new user turn (filled from the "text" input).
system_message = SystemMessage(content="You are a helpful assistant.")
history_slot = MessagesPlaceholder(variable_name=history_key)
human_turn = HumanMessagePromptTemplate.from_template("{text}")

template_messages = [system_message, history_slot, human_turn]
prompt_template = ChatPromptTemplate.from_messages(template_messages)

# return_messages=True makes the memory hand back message objects rather than a
# single string, which is what the MessagesPlaceholder consumes.
memory = ConversationBufferMemory(memory_key=history_key, return_messages=True)

In [8]:
# Stage 1: read the current memory variables and keep only the message list.
load_history = RunnableLambda(memory.load_memory_variables) | itemgetter("chat_history")

# Stage 2: pass the caller's input through unchanged, adding a "chat_history" entry.
with_history = RunnablePassthrough.assign(chat_history=load_history)

# Full pipeline: input + history -> rendered chat prompt -> chat model reply.
chain = with_history | prompt_template | chat_model

In [11]:
# Turn 1 ("Who are you?"): run the chain, then record the exchange in memory
# so that later turns see it as chat history.
inputs = dict(text='你是谁？')
response = chain.invoke(inputs)
outputs = {"output": response.content}
memory.save_context(inputs, outputs)

response

Llama.generate: prefix-match hit

llama_print_timings:        load time =    2410.11 ms
llama_print_timings:      sample time =       2.41 ms /    16 runs   (    0.15 ms per token,  6633.50 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =     793.28 ms /    16 runs   (   49.58 ms per token,    20.17 tokens per second)
llama_print_timings:       total time =     823.57 ms /    17 tokens


AIMessage(content=' 我是Open Assistant，一个由志愿者开发的开源聊天机器人。')

In [12]:
# Inspect the memory: after the first turn it should hold one human/AI message pair.
memory.load_memory_variables({})

{'chat_history': [HumanMessage(content='你是谁？'),
  AIMessage(content=' 我是Open Assistant，一个由志愿者开发的开源聊天机器人。')]}

In [14]:
# Turn 2 ("Name three poems by Li Bai."): run the chain, then append the
# exchange to memory.
inputs = dict(text='请说出三首李白的诗。')
response = chain.invoke(inputs)
outputs = {"output": response.content}
memory.save_context(inputs, outputs)

response

Llama.generate: prefix-match hit

llama_print_timings:        load time =    2410.11 ms
llama_print_timings:      sample time =      10.89 ms /    71 runs   (    0.15 ms per token,  6517.95 tokens per second)
llama_print_timings: prompt eval time =    1520.25 ms /    36 tokens (   42.23 ms per token,    23.68 tokens per second)
llama_print_timings:        eval time =    3378.40 ms /    70 runs   (   48.26 ms per token,    20.72 tokens per second)
llama_print_timings:       total time =    5035.94 ms /   106 tokens


AIMessage(content=' 1.《将进酒》：君不见黄河之水天上来，奔流到海不复回。\n2. 《庐山谣》：登高壮观天地间，大江茫茫去无际。\n3. 《夜泊牛渚怀古》：客舟何所似？落日无限好！')

In [15]:
# Inspect the memory again: it should now hold both exchanges, in order.
memory.load_memory_variables({})

{'chat_history': [HumanMessage(content='你是谁？'),
  AIMessage(content=' 我是Open Assistant，一个由志愿者开发的开源聊天机器人。'),
  HumanMessage(content='请说出三首李白的诗。'),
  AIMessage(content=' 1.《将进酒》：君不见黄河之水天上来，奔流到海不复回。\n2. 《庐山谣》：登高壮观天地间，大江茫茫去无际。\n3. 《夜泊牛渚怀古》：客舟何所似？落日无限好！')]}

In [16]:
# Turn 3 ("Recite the first one in full."): only answerable from the chat
# history, so this exercises the memory. Run the chain, then save the exchange.
inputs = dict(text='请背诵第一首的全文。')
response = chain.invoke(inputs)
outputs = {"output": response.content}
memory.save_context(inputs, outputs)

response

Llama.generate: prefix-match hit

llama_print_timings:        load time =    2410.11 ms
llama_print_timings:      sample time =      12.85 ms /    88 runs   (    0.15 ms per token,  6847.18 tokens per second)
llama_print_timings: prompt eval time =    1730.36 ms /    88 tokens (   19.66 ms per token,    50.86 tokens per second)
llama_print_timings:        eval time =    4172.19 ms /    87 runs   (   47.96 ms per token,    20.85 tokens per second)
llama_print_timings:       total time =    6071.63 ms /   175 tokens


AIMessage(content=' 君不见黄河之水天上来，奔流到海不复回。\n白发三千丈，缘愁似个长。\n山重水复疑无路，柳暗花明又一村。\n箫声咽，青天尽日阴。\n横看成岭侧成峰，远近高低各不同。\n不识庐山真面目，只缘身在此中。')

In [40]:
# Final look at the memory contents.
# NOTE(review): the saved output below is an empty history and the execution
# count jumps from 16 to 40, so this cell was presumably run after `memory`
# had been re-created or cleared. On a fresh Restart & Run All it should list
# all three exchanges; re-run and re-save to confirm.
memory.load_memory_variables({})

{'chat_history': []}