In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.chains import retrieval_qa


In [3]:
# Load every PDF found under ./us_census into LangChain Document objects.
loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Fail fast with a clear message: an empty result (e.g. wrong path or a folder
# without PDFs) would otherwise only show up later as an IndexError when the
# notebook displays chunks[0].
if not documents:
    raise ValueError(
        "No documents were loaded from ./us_census; check that the directory "
        "exists and contains PDF files."
    )

In [4]:
# Break the loaded documents into overlapping pieces: chunk_size=1000 with
# chunk_overlap=200 shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
chunks = text_splitter.split_documents(documents)

# Show the first chunk as a quick sanity check.
chunks[0]

Document(page_content='QuickFacts\nNew Jersey\nQuickFacts provides statistics for all states and counties. Also for cities and towns with a population of 5,000 or more .\nAll Topics\nAsian alone, per cent (a) 10.5%\n\uf061 PEOPLE\nPopulation\nPopulation estimates, July 1, 2023, (V2023) 9,290,841\nPopulation estimates base, April 1, 2020, (V2023) 9,289,039\nPopulation, percent change - April 1, 2020 (estimates base) to July 1, 2023, (V2023)  Z\nPopulation, Census, April 1, 2020 9,288,994\nPopulation, Census, April 1, 2010 8,791,894\nAge and Sex\nPersons under 5 years, percent 5.6%\nPersons under 18 years, percent 21.5%\nPersons 65 years and over , percent 17.4%\nFemale persons, percent 50.7%\nRace and Hispanic Origin\nWhite alone, percent 70.7%\nBlack or African American alone, percent (a) 15.4%\nAmerican Indian and Alaska Native alone, percent (a) 0.7%\nAsian alone, per cent (a) 10.5%\nNative Hawaiian and Other Paciﬁc Islander alone, percent (a) 0.1%\nTwo or More Races, percent 2.4%\nH

In [11]:
# BGE embedding model, run on CPU; normalize_embeddings=True is forwarded to
# the encode step.
embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [12]:
import numpy as np

# Embed the text of the first chunk and display the resulting vector as a
# NumPy array.
np.array(
    embeddings.embed_query(chunks[0].page_content)
)

array([-1.71148777e-02, -1.93089209e-02, -1.22574242e-02, -3.32077108e-02,
        9.31104552e-03,  8.93147066e-02, -2.11889986e-02,  2.64065973e-02,
       -2.52565965e-02,  2.04341765e-02,  8.54972154e-02,  2.38653291e-02,
       -2.85084806e-02,  4.16703746e-02, -5.45041449e-02,  1.97422057e-02,
       -5.30546643e-02, -4.16378044e-02,  1.51249459e-02,  1.25307338e-02,
       -1.58364139e-02, -8.92073754e-03,  7.91625679e-03, -2.97001675e-02,
        3.83283570e-02, -3.67041072e-03, -2.01306492e-02, -5.53899538e-03,
        5.25682326e-03, -1.20353006e-01, -6.35554194e-02,  1.21362070e-02,
        2.35978495e-02,  1.91010926e-02, -3.13012525e-02, -8.40320438e-02,
        1.74941085e-02,  7.96508938e-02,  1.56750120e-02,  2.90601347e-02,
       -2.03176346e-02,  5.17980475e-03,  9.39342007e-03,  1.54345632e-02,
       -2.96535175e-02,  4.58275415e-02, -8.19966123e-02,  2.86644343e-02,
        3.32695921e-03, -3.31382849e-03,  2.96977684e-02, -7.20072985e-02,
        3.36876586e-02,  

In [13]:
# Embed every chunk and build a FAISS vector store from the results.
vector_store = FAISS.from_documents(documents=chunks, embedding=embeddings)

In [15]:
# Natural-language question used to probe the vector store.
# The misspelling "perecentage" and the stray leading space were fixed, since
# the query text is passed verbatim to the embedding model.
query = "what is the percentage of asians in new jersey"

# Similarity lookup using the store's default number of results (no k given).
rel_docs = vector_store.similarity_search(query)
rel_docs

[Document(page_content='QuickFacts\nNew Jersey\nQuickFacts provides statistics for all states and counties. Also for cities and towns with a population of 5,000 or more .\nAll Topics\nAsian alone, per cent (a) 10.5%\n\uf061 PEOPLE\nPopulation\nPopulation estimates, July 1, 2023, (V2023) 9,290,841\nPopulation estimates base, April 1, 2020, (V2023) 9,289,039\nPopulation, percent change - April 1, 2020 (estimates base) to July 1, 2023, (V2023)  Z\nPopulation, Census, April 1, 2020 9,288,994\nPopulation, Census, April 1, 2010 8,791,894\nAge and Sex\nPersons under 5 years, percent 5.6%\nPersons under 18 years, percent 21.5%\nPersons 65 years and over , percent 17.4%\nFemale persons, percent 50.7%\nRace and Hispanic Origin\nWhite alone, percent 70.7%\nBlack or African American alone, percent (a) 15.4%\nAmerican Indian and Alaska Native alone, percent (a) 0.7%\nAsian alone, per cent (a) 10.5%\nNative Hawaiian and Other Paciﬁc Islander alone, percent (a) 0.1%\nTwo or More Races, percent 2.4%\n

In [16]:
# Expose the vector store as a retriever using similarity search with k=3.
# NOTE(review): the variable is spelled `retriver`; the name is kept unchanged
# because other cells may refer to it.
retriver = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
print(retriver)


tags=['FAISS', 'HuggingFaceBgeEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7fce2373be80> search_kwargs={'k': 3}


In [18]:
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (if present) before checking the token.
load_dotenv()

# The original `os.environ[KEY] = os.getenv(KEY)` was a no-op when the token
# was set and raised an opaque TypeError (environment values must be str, and
# os.getenv returns None) when it was missing. Check explicitly instead, and
# avoid copying the secret into a notebook variable.
if not os.getenv("HUGGINGFACEHUB_API_TOKEN"):
    raise EnvironmentError(
        "HUGGINGFACEHUB_API_TOKEN is not set; add it to your .env file or "
        "export it in the environment before running this notebook."
    )

In [19]:
from langchain_community.llms import HuggingFaceHub

# LLM wrapper pointing at the mistralai/Mistral-7B-v0.1 repo on the Hugging Face Hub.
# NOTE(review): this repo is gated — the recorded run of this cell failed with
# GatedRepoError (403). The account behind the API token must first be granted
# access at https://huggingface.co/mistralai/Mistral-7B-v0.1.
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs={
        "temperature": 0.1,
        "max_length": 500,
    },
)

# Send the question straight to the LLM (the retriever is not used here).
query = "Total population in Dallas"
hf.invoke(query)

GatedRepoError: 403 Client Error. (Request ID: Root=1-667057ac-1776bb9a484812fd313497f0;1630caf8-8f28-4cc6-aed1-d7f3a944b764)

Cannot access gated repo for url https://huggingface.co/api/models/mistralai/Mistral-7B-v0.1.
Access to model mistralai/Mistral-7B-v0.1 is restricted and you are not in the authorized list. Visit https://huggingface.co/mistralai/Mistral-7B-v0.1 to ask for access.