In [1]:
from langchain.llms import LlamaCpp
from langchain import PromptTemplate, LLMChain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
# Stream generated tokens to stdout one at a time through a callback handler.
# The LlamaCpp constructor that receives this manager is also given verbose=True.
callback_manager = CallbackManager(handlers=[StreamingStdOutCallbackHandler()])

In [4]:
# GPU tuning knobs: pick values that fit your model and the VRAM available.
n_gpu_layers = 40  # adjust to the model size and GPU VRAM pool
n_batch = 512  # keep within 1..n_ctx; larger batches need more VRAM

# Load the local model file. Double-check that model_path exists on your machine.
llm = LlamaCpp(
    verbose=True,
    callback_manager=callback_manager,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    model_path="./llama-2-7b.ggmlv3.q4_0.bin",
)

llama.cpp: loading model from ./llama-2-7b.ggmlv3.q4_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 512
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 5.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 2 (mostly Q4_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 3615.73 MB (+  256.00 MB per state)
llama_new_context_with_model: kv self size  =  256.00 MB
ggml_metal_init: 

In [5]:
from langchain.document_loaders import GoogleDriveLoader

In [6]:
# Source folder:
# https://drive.google.com/drive/u/2/folders/1zrI3S1-rWmntCO4qKK28KlDW-diJ_isg
loader = GoogleDriveLoader(
    # Set to True to also fetch files from subfolders (optional; False by default).
    recursive=False,
    folder_id="1zrI3S1-rWmntCO4qKK28KlDW-diJ_isg",
)

In [9]:
# Load the documents from the Google Drive folder configured on `loader`.
docs = loader.load()


In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into small chunks (150 characters, 5 overlapping)
# so each piece can be embedded and retrieved on its own.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=5,
    chunk_size=150,
)
all_splits = text_splitter.split_documents(documents=docs)

In [11]:
from langchain.vectorstores import utils

# Keep the list returned by filter_complex_metadata instead of discarding it,
# so later cells work with the filtered documents without depending on whether
# the helper also modifies its input in place.
all_splits = utils.filter_complex_metadata(all_splits)

# Display the filtered chunks as the cell output.
all_splits

[Document(page_content='\ufeffOnce upon a time, in a small town nestled amidst rolling hills, there lived a young boy named Leo. From a very young age, Leo had been fascinated by', metadata={'source': 'https://docs.google.com/document/d/1RwpaTvq9s7cgfvKROUS77tCUbWaffsFYsPAh-kG4o6c/edit', 'title': 'Example', 'when': '2023-09-07T16:16:56.349Z'}),
 Document(page_content='by the stars and the mysteries of the universe. He spent countless nights lying on his back in the backyard, gazing up at the night sky, imagining', metadata={'source': 'https://docs.google.com/document/d/1RwpaTvq9s7cgfvKROUS77tCUbWaffsFYsPAh-kG4o6c/edit', 'title': 'Example', 'when': '2023-09-07T16:16:56.349Z'}),
 Document(page_content='what it would be like to travel through space.', metadata={'source': 'https://docs.google.com/document/d/1RwpaTvq9s7cgfvKROUS77tCUbWaffsFYsPAh-kG4o6c/edit', 'title': 'Example', 'when': '2023-09-07T16:16:56.349Z'}),
 Document(page_content="Leo's room was a testament to his passion. Posters 

In [25]:
from langchain.vectorstores import FAISS
from langchain.embeddings import GPT4AllEmbeddings

# Embed every chunk with GPT4All embeddings and index the vectors in FAISS.
db = FAISS.from_documents(
    documents=all_splits,
    embedding=GPT4AllEmbeddings(),
)


Found model file at  /Users/vardh/.cache/gpt4all/ggml-all-MiniLM-L6-v2-f16.bin


In [26]:
# Persist the FAISS index to the local "faiss_index" folder.
db.save_local(folder_path="faiss_index")

In [27]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt given to the model: retrieved context first, then the question.
# It tells the model to admit when it does not know and to stay brief.
template = (
    "Use the following pieces of context to answer the question at the end.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "Use three sentences maximum and keep the answer as concise as possible.\n"
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Wire the local LLM to the FAISS retriever, using the custom prompt above.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=db.as_retriever(),
    chain_type_kwargs=dict(prompt=QA_CHAIN_PROMPT),
)


In [28]:
# Query text to pass to the QA chain.
question = "What was the News of the Golden Feather"

In [30]:
# Query text to pass to the QA chain. "fascinated" is spelled correctly so the
# query matches the wording used in the loaded document during retrieval.
question = 'What was Leo fascinated about?'

In [31]:
# Run the retrieval QA chain on the current question and keep its output.
result = qa_chain(dict(query=question))

Llama.generate: prefix-match hit


 Asteroids
Answer: The stars


llama_print_timings:        load time = 19534.27 ms
llama_print_timings:      sample time =    16.78 ms /     9 runs   (    1.86 ms per token,   536.42 tokens per second)
llama_print_timings: prompt eval time = 10003.05 ms /   157 tokens (   63.71 ms per token,    15.70 tokens per second)
llama_print_timings:        eval time =  1347.03 ms /     8 runs   (  168.38 ms per token,     5.94 tokens per second)
llama_print_timings:       total time = 11470.28 ms
