In [None]:
# One-time environment setup: uncomment and run these lines in a fresh environment.
# NOTE(review): later cells import from `langchain_community`, which is not listed
# explicitly below — confirm it is installed in the target environment.
# !pip install transformers langchain torch faiss-cpu numpy
# !pip install sentence-transformers
# !pip install pypdf


Collecting langchain
  Downloading langchain-0.2.16-py3-none-any.whl.metadata (7.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting langchain-core<0.3.0,>=0.2.38 (from langchain)
  Downloading langchain_core-0.2.38-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain-text-splitters<0.3.0,>=0.2.0 (from langchain)
  Downloading langchain_text_splitters-0.2.4-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.116-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain-core<0.3.0,>=0.2.38->langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting httpx<1,>=0.23.0 (from langsmith<0.2.0,>=0.1.17->langchain)
  Downloading httpx-0.27.2-py3-none-any.wh

In [None]:
import os
from getpass import getpass
from urllib.request import urlretrieve

import numpy as np
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_community.llms import HuggingFacePipeline
from langchain_community.vectorstores import FAISS

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [None]:
# Download the source PDF(s) into a local "Pakistan" folder.
# Files that are already present are skipped, so re-running the notebook
# does not fetch them again.
os.makedirs("Pakistan", exist_ok=True)
files = [
    "https://info.publicintelligence.net/MCIA-PakistanHandbook.pdf"
]
for url in files:
    # The local file name is the last path segment of the URL.
    file_path = os.path.join("Pakistan", url.rpartition("/")[2])
    if os.path.exists(file_path):
        continue
    # Fetch to a temporary name and rename on success, so an interrupted
    # download never leaves a truncated PDF that the check above would accept.
    partial_path = file_path + ".part"
    urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

In [None]:
# Load every PDF found under ./Pakistan/ and split the loaded documents into
# overlapping chunks.
CHUNK_SIZE = 700     # passed to the splitter as chunk_size
CHUNK_OVERLAP = 50   # passed to the splitter as chunk_overlap

loader = PyPDFDirectoryLoader("./Pakistan/")
docs_before_split = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Show the first chunk as the cell output.
docs_after_split[0]

Document(metadata={'source': 'Pakistan/MCIA-PakistanHandbook.pdf', 'page': 0}, page_content='Pakistan Country Handbook\nThis handbook provides basic reference information on Pakistan, including its \ngeography, history, government, military forces, and communications and trans-portation networks. This information is intended to familiarize military per\n s\nonnel \nwith local customs and area knowledge to assist them during their assignment to Pakistan. \nThe Marine Corps Intel\n l\nigence Activity is the community coordinator for the \nCountry Hand  b\nook Program. This product reflects the coordinated U.S. Defense \nIntelligence Community position on Pakistan .Dissemination and use of this publication is restricted to official military and')

In [None]:
def avg_doc_length(docs):
    """Return the average length of ``page_content`` over ``docs``, floored to an int.

    Args:
        docs: A sized collection of objects exposing a ``page_content`` string.

    Returns:
        The floor of the mean number of characters per document, or 0 when
        ``docs`` is empty (so an empty load does not raise ``ZeroDivisionError``).
    """
    if not docs:
        return 0
    return sum(len(doc.page_content) for doc in docs) // len(docs)
# Average document length in characters, before and after splitting.
avg_char_before_split, avg_char_after_split = (
    avg_doc_length(docs) for docs in (docs_before_split, docs_after_split)
)

# Report document counts alongside the averages.
print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 296 documents loaded, with average characters equal to 995.
After split, there were 592 documents (chunks), with average characters equal to 508 (average chunk length).


In [None]:
# Embedding model shared by the vector store and the sample-embedding cell.
EMBEDDING_MODEL_NAME = "BAAI/bge-small-en-v1.5"
embedding_model_kwargs = {"device": "cpu"}                 # requests the CPU device
embedding_encode_kwargs = {"normalize_embeddings": True}   # requests normalized vectors

huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
    model_kwargs=embedding_model_kwargs,
    encode_kwargs=embedding_encode_kwargs,
)

In [None]:
# Embed the first chunk as a sanity check on the embedding model.
# `embed_documents` is used rather than `embed_query` because the BGE wrapper
# prepends a retrieval instruction to queries, so `embed_query` would not
# produce the vector that gets stored for this chunk in the vector store.
sample_embedding = np.array(
    huggingface_embeddings.embed_documents([docs_after_split[0].page_content])[0]
)
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [ 1.43675804e-02  2.05621347e-02 -5.27524343e-03 -4.33683023e-02
  3.53559405e-02 -2.75353193e-02  5.71042635e-02  1.29494378e-02
 -1.00698784e-01  8.40507261e-03  6.34408370e-02 -2.81770136e-02
  4.30057049e-02 -3.54474001e-02 -2.96685621e-02 -6.74374700e-02
  1.09389389e-03 -3.30156907e-02  2.20131185e-02 -5.80267012e-02
  1.70874391e-02  9.20885801e-03 -5.56742437e-02  8.73683989e-02
  7.60229230e-02  6.52979463e-02  3.42497043e-02  2.41889507e-02
 -5.60614616e-02 -1.51428938e-01 -9.08630248e-03  1.31402249e-02
 -2.60648075e-02  9.43885371e-03 -4.88951579e-02  2.94795670e-02
  3.01087815e-02  1.74482260e-02 -2.54820473e-02  8.53919890e-03
  2.75449753e-02  3.21992822e-02  1.20137252e-01  5.66879213e-02
  2.49452833e-02 -3.96076888e-02  1.22199440e-03  5.19754104e-02
  1.58972181e-02 -4.60061431e-02  2.00814810e-02  2.85943393e-02
  6.53216690e-02  2.12636758e-02 -1.71494354e-02 -2.31498387e-02
  1.17936703e-04  1.65618472e-02  4.09649387e-02  7

In [None]:
# Build a FAISS vector store from the split chunks, embedding them with `huggingface_embeddings`.
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)


In [None]:
# Query the vector store directly (using its default number of results) and
# show the best-matching chunk.
query = """What is served after meal in Pakistan?"""
relevant_documents = vectorstore.similarity_search(query)

print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
top_document = relevant_documents[0]
print(top_document.page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

Eating
al most all Pakistanis follow three Muslim dietary customs: they do not eat pork, they do not drink alcohol, and they fast during the month of Ramadan. During Ramadan, Muslims fast from sunup to sundown in order to learn discipline and self-control. 
i
slam does 
not require the elderly, pregnant women, or children who have not reached puberty to fast.
Pakistani cuisine is diverse with 
af
ghan- ir
anian or in
dian influ-
ences. Pakistanis sit on the floor and eat with the right hand. in 
P
akistani culture, the left hand is often used for unsanitary pur-
poses so it should not be raised to the mouth, dipped in a com-


In [None]:
# Expose the vector store as a retriever using similarity search with k=5 results per query.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [None]:

# Baseline: send the question straight to the Mistral-7B model on the Hugging
# Face Hub, without any retrieved context, to see what the LLM answers on its own.
# NOTE(review): `HuggingFaceHub` is reported as deprecated in recent
# langchain-community releases in favour of `HuggingFaceEndpoint` — confirm
# before upgrading the dependency.

# Read the API token from the environment (or prompt for it) instead of
# hardcoding it, so the secret is never saved inside the notebook.
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass(
    "Hugging Face Hub API token: "
)

hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
    model_kwargs={"temperature": 0.1, "max_length": 500}
)

query = """What is served after meal in Pakistan?"""
response = hf.invoke(query)
print(response)


What is served after meal in Pakistan?

- After the meal, the host will serve you tea or coffee. If you are invited to a Pakistani home, you will be served tea or coffee after the meal.

## What is the most popular food in Pakistan?

The 10 Most Popular Pakistani Foods

- Biryani. Biryani is a rice dish that is popular in Pakistan and India.
- Nihari. Nihari is a stew made with
