https://medium.com/@jiangan0808/retrieval-augmented-generation-rag-with-open-source-hugging-face-llms-using-langchain-bd618371be9d

In [2]:
import os
from urllib.request import urlretrieve
import numpy as np
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

In [3]:
loader = PyPDFDirectoryLoader("PublicERHreportSmall")

docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap  = 50,
)
docs_after_split = text_splitter.split_documents(docs_before_split)


In [4]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 72 documents loaded, with average characters equal to 3347.
After split, there were 279 documents (chunks), with average characters equal to 868 (average chunk length).


In [5]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-l6-v2",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" for a light and faster experience.
    model_kwargs={'device':'cpu'}, 
    encode_kwargs={'normalize_embeddings': True}
)

In [6]:
sample_embedding = np.array(huggingface_embeddings.embed_query(docs_after_split[0].page_content))
print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [ 8.77594808e-04  2.58098599e-02 -3.45586240e-02 -2.78197415e-02
 -2.27950383e-02  6.65811598e-02  4.52734344e-02  4.86474596e-02
 -3.68658416e-02 -2.28099339e-02  2.40807924e-02 -2.62145456e-02
 -9.26026795e-03 -3.09852231e-02 -1.49367684e-02 -7.87255727e-03
  2.58042403e-02  3.96585390e-02 -5.20999730e-02 -5.85869998e-02
  1.04799904e-02  7.37114623e-02  4.08360995e-02  2.11835597e-02
 -9.38838571e-02 -5.36489487e-03  1.45335281e-02 -4.10635322e-02
  1.51463202e-03 -6.03478914e-03  1.44509092e-01  3.37389112e-02
  1.02483936e-01 -3.94268781e-02 -2.78482120e-02  1.43945096e-02
 -3.45276818e-02 -5.46164773e-02 -5.12438230e-02 -6.21713921e-02
 -2.05884501e-02 -5.30733615e-02 -6.46900684e-02 -7.64682218e-02
 -5.77856079e-02 -3.65945734e-02 -1.95448361e-02 -9.22646523e-02
 -2.25192923e-02  3.67328115e-02  5.59026077e-02 -3.88678648e-02
  6.57022521e-02  2.58749966e-02 -3.75801884e-02  4.08067275e-03
  3.74643016e-03  1.58157421e-03 -6.13015480e-02  4

In [7]:
vectorstore = FAISS.from_documents(docs_after_split, huggingface_embeddings)

In [8]:
query = """Give me all the research papers published by Niels Pontoppidan and order them from oldest publicaiton to the newest one."""  
         # Sample question, change to other questions you are interested in.
relevant_documents = vectorstore.similarity_search(query)
print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

Frontiers in Neuroscience | www.frontiersin.org 14 June 2022 | Volume 16 | Article 873201


In [9]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [10]:
from huggingface_hub import login
login(token="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")

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /Users/chemay/.cache/huggingface/token
Login successful


In [11]:
from langchain_community.llms import HuggingFaceHub


hf = HuggingFaceHub(
    repo_id="SweatyCrayfish/llama-3-8b-quantized",
    huggingfacehub_api_token="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",
    model_kwargs={"temperature": 0.1, "max_length": 500}
)


  warn_deprecated(


In [None]:
query = """Give me all the research papers published by Niels Pontoppidan and order them from oldest publicaiton to the newest one."""  # Sample question, change to other questions you are interested in.
hf.invoke(query)

'Give me all the research papers published by Niels Pontoppidan and order them from oldest publicaiton to the newest one.\n\n### 1 Answer\n\n- I\'m not sure what you mean by "all the research papers published by Niels Pontoppidan".\n\nIf you mean all the papers he has published, then you can search for them in Google Scholar.\n\nIf you mean all the papers he has published in the last 10 years, then you can search for them in Google Scholar.\n\nIf you mean all the papers he has published in the last '

In [14]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)

In [12]:
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline

hf = HuggingFacePipeline.from_model_id(
    model_id="SweatyCrayfish/llama-3-8b-quantized",
    task="text-generation",
    force_download=True, resume_download=False,
    pipeline_kwargs={"temperature": 0, "max_new_tokens": 900}
)

llm = hf 
llm.invoke(query)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

: 

In [94]:
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)

In [96]:
result

{'query': 'Give me all the research papers published by Niels Pontoppidan and order them from oldest publicaiton to the newest one.',
 'result': 'Use the following pieces of context to answer the question at the end. Please follow the following rules:\n1. If you don\'t know the answer, don\'t try to make up an answer. Just say "I can\'t find the final answer but you may want to check the following links".\n\nFrontiers in Neuroscience | www.frontiersin.org 14 June 2022 | Volume 16 | Article 873201\n\nFrontiers in Neuroscience | www.frontiersin.org 15 June 2022 | Volume 16 | Article 873201\n\nrespective hemiﬁeld. Subsequently, these data were used as a\nFrontiers in Neuroscience | www.frontiersin.org 6 June 2022 | Volume 16 | Article 873201\n\nPolicy in the Framework of EVOTION.” Heraclion, \nGreece: 2018, 2018.  \nGutenberg, Johanna, Panagiotis Katrakazas, Lyubov \nTrenkova, Louisa Murdin, Dario Brdarić, Nina \nKoloutsou, Katherine Ploumidou, Niels Henrik \nPontoppidan, and Ariane Lapla

In [95]:
result = retrievalQA.invoke({"query": query})
print(result['result'])



Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".

Frontiers in Neuroscience | www.frontiersin.org 14 June 2022 | Volume 16 | Article 873201

Frontiers in Neuroscience | www.frontiersin.org 15 June 2022 | Volume 16 | Article 873201

respective hemiﬁeld. Subsequently, these data were used as a
Frontiers in Neuroscience | www.frontiersin.org 6 June 2022 | Volume 16 | Article 873201

Policy in the Framework of EVOTION.” Heraclion, 
Greece: 2018, 2018.  
Gutenberg, Johanna, Panagiotis Katrakazas, Lyubov 
Trenkova, Louisa Murdin, Dario Brdarić, Nina 
Koloutsou, Katherine Ploumidou, Niels Henrik 
Pontoppidan, and Ariane Laplante -Lévesque. “Big 
Data for Sound Policies: Toward Evidence -Informed 
Hearing Health Po licies,” Special Issue: 3rd 
International Meeting on Internet and Audiol

In [97]:
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')

There are 5 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: PublicERHreportSmall/2022-01-P Comparing In-ear EOG for Eye-Movement Estimation With Eye-Tracking Accuracy, Calibration, and Speech Comprehension.pdf, Page: 13
Content: Frontiers in Neuroscience | www.frontiersin.org 14 June 2022 | Volume 16 | Article 873201
----------------------------------------------------------------------------------------------------
There are 5 documents retrieved which are relevant to the query.
Relevant Document #2:
Source file: PublicERHreportSmall/2022-01-P Comparing In-ear EOG for Eye-Movement Estimation With Eye-Tracking Accuracy, Calibration, and Speech Comprehension.pdf, Page: 14
Content: Frontiers in Neuroscience | www.frontiersin.org 15 June 2022 | Volume 16 | Article 873201
--------------------------------------------------------------------------------