In [1]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [2]:
# Download documents from U.S. Census Bureau to local directory.
os.makedirs("us_census", exist_ok=True)
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]
for url in files:
    file_path = os.path.join("us_census", url.rpartition("/")[2])
    urlretrieve(url, file_path)

In [3]:
# Load pdf files in the local directory
loader = PyPDFDirectoryLoader("./us_census/")

docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 700,
    chunk_overlap  = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

docs_after_split[0]

Document(metadata={'source': 'us_census/p70-178.pdf', 'page': 0}, page_content='Occupation, Earnings, and Job \nCharacteristics\nJuly 2022\nP70-178\nClayton Gumber and Briana Sullivan\nCurrent Population Reports\nINTRODUCTION\nWork is a critical component of our lives and provides \na way to obtain material and nonmonetary benefits \nlike employer-provided health insurance. Scholars \nsuggest that our identities are also tied to the notion \nof “what we do” (Christiansen, 1999), and that who \nwe are is determined partly by our occupational iden -\ntity (Skorikov and Vondracek, 2011). However, work \nis time consuming—the American Time Use Survey \nshows that in 2017 workers spent an average 8.21 \nhours each day engaged in work and work-related')

In [4]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 63 documents loaded, with average characters equal to 3840.
After split, there were 398 documents (chunks), with average characters equal to 624 (average chunk length).


In [5]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': False}
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [-1.86188761e-02  2.46237759e-02 -1.32489642e-02  5.79542555e-02
  2.89788991e-02  1.27435941e-02  8.26062635e-02 -6.73673004e-02
 -2.07575597e-02 -3.87655348e-02 -4.64275889e-02  2.84431633e-02
 -6.16011284e-02  6.79046614e-03 -2.96421628e-02 -2.40446217e-02
  5.89304697e-03 -2.99502648e-02  4.01671678e-02  3.04162386e-03
 -3.02758925e-02  1.95388775e-02  3.94134708e-02  1.10068759e-02
  2.65370551e-02  2.45575923e-02  2.79611610e-02  8.82644951e-03
  2.18599010e-02  6.10523522e-02  8.10195226e-03  3.47977728e-02
  8.98535252e-02  9.96245909e-03 -3.32964174e-02  2.99190748e-02
  8.64695683e-02  4.18684259e-02  7.01389974e-03  5.04911924e-03
 -3.21369059e-02 -6.62606955e-02  1.59509445e-03  2.17842367e-02
 -7.93712959e-02 -1.95213668e-02  3.44601423e-02 -2.71609779e-02
 -5.60448021e-02  5.23194447e-02  1.85131039e-02  6.01829179e-02
  7.26569742e-02 -3.72973643e-02  7.50669017e-02  1.59940403e-02
 -9.33717284e-03 -3.23310238e-03  5.84939681e-03  4

In [7]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)


In [8]:
query = "What were the trends in median household income across different states in the United States between 2021 and 2022."  
         # Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query)
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont 
were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).


In [9]:
# Use similarity searching algorithm and return 3 most relevant documents.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [None]:
# Remote huggingface execution
# from langchain_community.llms import HuggingFaceHub

# hf = HuggingFaceHub(
#     repo_id="stabilityai/stablelm-2-1_6b",
#     model_kwargs={"temperature":0.1, "max_length":500})

# query = """What were the trends in median household income across different states in the United States between 2021 and 2022."""  # Sample question, change to other questions you are interested in.
# hf.invoke(query)

'What were the trends in median household income across different states in the United States between 2021 and 2022. The data is from the U.S. Census Bureau’s American Community Survey.\nThe median household income in the United States increased by 1.1% between 2021 and 2022. The median household income in the United States increased by 1.1% between 2021 and 2022. The median household income in the United States increased by 1.1% between 2021 and 2022. The'

In [None]:
# Local huggingface execution (same outcome as prevous cell)
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="dccuchile/bert-base-spanish-wwm-uncased",
    task="text-generation",
    pipeline_kwargs={"temperature": 0, "max_new_tokens": 300}
)

llm = hf 
llm.invoke(query)

Device has 1 GPUs available. Provide device={deviceId} to `from_model_id` to use availableGPUs for execution. deviceId is -1 (default) for CPU and can be a positive integer associated with CUDA device id.


'What were the trends in median household income across different states in the United States between 2021 and 2022. The data is from the U.S. Census Bureau’s American Community Survey (ACS) 5-year estimates.\nThe median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,245. The median household income in the United States is $61,24

In [17]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)

In [19]:
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

In [20]:
# Call the QA chain with our query.
result = retrievalQA.invoke({"query": query})
print(result['result'])



Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont 
were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and

In [21]:
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')


There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: us_census/acsbr-017.pdf, Page: 3
Content: hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont 
were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).
---------------------------------------------------------------------------------