In [1]:
from langchain import HuggingFaceHub
from langchain import PromptTemplate, LLMChain
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings

MODEL_PATH = "./models/llama-7b.ggmlv3.q4_K_M.bin"

huggingfacehub_api_token=""

In [2]:
repo_id = "tiiuae/falcon-7b-instruct"
llm = HuggingFaceHub(huggingfacehub_api_token=huggingfacehub_api_token, 
                     repo_id=repo_id, 
                     model_kwargs={"temperature":0.6, "max_new_tokens":500})

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
loader = TextLoader("./docs/sample_input2.txt")
docs = loader.load()

In [4]:
docs

[Document(page_content="India, officially known as the Republic of India, is a diverse and vibrant country located in South Asia. It is the seventh-largest country by land area and the second-most populous country in the world, with over 1.3 billion people. India shares its borders with several countries, including Pakistan, China, Nepal, Bhutan, Bangladesh, and Myanmar.\n\nGeographically, India is known for its diverse landscape, which ranges from the towering Himalayan mountain range in the north to the coastal plains in the south, and from the arid desert regions in the west to the fertile Gangetic plains in the east. The country is also home to several major rivers, including the Ganges and Brahmaputra, which have played a significant role in shaping India's history and culture.\n\nIndia has a rich and ancient history that dates back thousands of years. The Indus Valley Civilization, one of the world's oldest urban civilizations, flourished in the western part of present-day India 

In [5]:
#text_splitter = CharacterTextSplitter(chunk_size=256, chunk_overlap=128)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=256, chunk_overlap=0, separators=[" ", ",", "\n", "."]
    )
texts = text_splitter.split_documents(docs)
texts

[Document(page_content='India, officially known as the Republic of India, is a diverse and vibrant country located in South Asia. It is the seventh-largest country by land area and the second-most populous country in the world, with over 1.3 billion people. India shares its', metadata={'source': './docs/sample_input2.txt'}),
 Document(page_content='borders with several countries, including Pakistan, China, Nepal, Bhutan, Bangladesh, and Myanmar.\n\nGeographically, India is known for its diverse landscape, which ranges from the towering Himalayan mountain range in the north to the coastal plains in the', metadata={'source': './docs/sample_input2.txt'}),
 Document(page_content="south, and from the arid desert regions in the west to the fertile Gangetic plains in the east. The country is also home to several major rivers, including the Ganges and Brahmaputra, which have played a significant role in shaping India's history and", metadata={'source': './docs/sample_input2.txt'}),
 Document(p

In [6]:
_texts = []
for i in range(len(texts)):
    _texts.append(texts[i].page_content)
texts[0]

Document(page_content='India, officially known as the Republic of India, is a diverse and vibrant country located in South Asia. It is the seventh-largest country by land area and the second-most populous country in the world, with over 1.3 billion people. India shares its', metadata={'source': './docs/sample_input2.txt'})

In [7]:
embeddings = HuggingFaceEmbeddings()
query = "Who is the current President of India?"

embedded_query = embeddings.embed_query(query)
embedded_texts = embeddings.embed_documents(_texts)
len(embedded_texts), len(embedded_texts[0]), len(embedded_query)

Downloading (…)a8e1d/.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 587kB/s]
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 63.0kB/s]
Downloading (…)b20bca8e1d/README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 3.52MB/s]
Downloading (…)0bca8e1d/config.json: 100%|██████████| 571/571 [00:00<00:00, 190kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 33.4kB/s]
Downloading (…)e1d/data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 13.1MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:20<00:00, 20.9MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 26.5kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 119kB/s]
Downloading (…)a8e1d/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 8.84MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 181kB/s]
Downloading (…)8e1d/train_script.py: 100%|███

(14, 768, 768)

In [6]:
# from langchain.embeddings import LlamaCppEmbeddings
# embeddings = LlamaCppEmbeddings(model_path=MODEL_PATH)


AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


Document(page_content='India, officially known as the Republic of India, is a diverse and vibrant country located in South Asia. It is the seventh-largest country by land area and the second-most populous country in the world, with over 1.3 billion people. India shares its', metadata={'source': './docs/sample_input2.txt'})

In [7]:
# embedded_texts = embeddings.embed_documents(_texts)
# len(embedded_texts), len(embedded_texts[0])

(26, 4096)

In [8]:
# query = "Who is the current President of India?"
# embedded_query = embeddings.embed_query(query)
# len(embedded_query)

4096

In [8]:
db = Chroma.from_documents(texts, embeddings)
query_vector = embeddings.embed_query(query)
docs = db.similarity_search_by_vector(query_vector, k=1)
docs

[Document(page_content='governance structure. The President of India is the head of state, while the Prime Minister is the head of government. The current President of India is Droupadi Murmu while the current prime minister is Narendra Modi. The country follows a parliamentary', metadata={'source': './docs/sample_input2.txt'})]

In [9]:
similar_doc = db.similarity_search(query, k=1)
context = similar_doc[0].page_content
print(context)

governance structure. The President of India is the head of state, while the Prime Minister is the head of government. The current President of India is Droupadi Murmu while the current prime minister is Narendra Modi. The country follows a parliamentary


In [18]:
template = """
You are an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. Below is some information. 
{context}

Based on the above information only, answer the below question. 

{question}
"""
#prompt = PromptTemplate(template=template, input_variables=["context", "question"])
prompt = PromptTemplate.from_template(template)
llm_chain = LLMChain(prompt=prompt, llm=llm)

print(llm_chain.run({"context": context, "question": query}))

A)
Based on the provided information, the current president of India is Droupadi Murmu.


In [14]:
prompt

PromptTemplate(input_variables=[], output_parser=None, partial_variables={}, template='\nYou are an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user\'s questions. Below is some information. \ngovernance structure. The President of India is the head of state, while the Prime Minister is the head of government. The current President of India is Droupadi Murmu while the current prime minister is Narendra Modi. The country follows a parliamentary\n\nBased on the above information, answer the below question. If the above information does not have the answer, reply "Sorry, I don\'t know". DO NOT make up an answer.\n\nWho is the current President of India?\n', template_format='f-string', validate_template=True)