<a href="https://colab.research.google.com/github/chueneelvin/Databricks/blob/main/PDF_QnA_with_Langchain_and_Llama3_and_Hugging_face_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install required packages

In [32]:
!pip -q install langchain pypdf langchain-community langchain-text-splitters langchain_experimental langchain_openai langchain-chroma langchain-pinecone python-dotenv chromadb faiss-cpu unstructured[pdf] poppler-utils langsmith tesseract sentence_transformers langchain_ollama langchain-groq

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.5/106.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

# Importing the dependencies

In [36]:
import os

import pinecone
from langchain_community.document_loaders import PyPDFLoader        # Loading the docuements
from langchain_community.document_loaders import DirectoryLoader   # Loading the documents from a directory
from langchain_text_splitters import RecursiveCharacterTextSplitter # Text chunks using recursive spliter
from langchain_experimental.text_splitter import SemanticChunker    # Semantic text chuncking
from langchain_openai import OpenAIEmbeddings                       # openai embedding models
from langchain_community.embeddings import HuggingFaceEmbeddings    # Hugging Face sentence-transformers embeddings
from langchain_chroma import Chroma                                 # vector database Chromadb
from langchain.vectorstores import Pinecone                  # vector database Pinecone
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI
from langchain_groq import ChatGroq


# Loading the data

## Loading a single file (PDF)

In [3]:
# Location of the source PDF inside the Colab runtime.
PDF_PATH = "/content/Farming Potatoes in South Africa_ What You Need to Know.pdf"

# Read the PDF into LangChain Document objects; in the recorded output each
# Document carries `source` and `page` metadata next to its extracted text.
loader = PyPDFLoader(PDF_PATH)
docs = loader.load()

In [4]:
# Sanity-check the extraction by displaying the first loaded document.
# In the recorded output the text contains stray spaces inside words
# (e.g. "P otatoes"), so the extracted text is noisy.
first_doc = docs[0]
first_doc

Document(metadata={'source': '/content/Farming Potatoes in South Africa_ What You Need to Know.pdf', 'page': 0}, page_content="\uf1bf\nFarming P otatoes In South Africa: What\nYou Need To Know\n“My idea of hea ven is a gr eat big bak ed potat o and\nsomeone t o shar e it with. ”\n- Opr ah Winfr ey\nWhat's in this guide?\n1. Introduction: Farming potatoes in South Africa\n2. All about seed potatoes\n3. Growing potatoes in South Africa\n4. Challenges of potato farming\n5. Sustainable potato farming\nChapter 1\nIntroduction: F arming P otatoes in South Africa\nIn South Africa, maiz e meal and br ead ar e the most commonly consumed sour ces of\ncarbohy drates. Howe ver, South Africans ha ve eaten twice as many potat oes o ver the past\ndecade compar ed to the decade befor e it, buo yed b y a gr owing middle class.")

# Text Chunking

## Recursive chunking

In [53]:
# Chunking parameters; sizes are measured with `len`, i.e. in characters.
CHUNK_SIZE = 1000     # upper bound on the length of one chunk
CHUNK_OVERLAP = 150   # characters shared between neighbouring chunks

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    length_function=len,
)

# Split the loaded documents into chunks and report how many were produced.
chunked_docs = text_splitter.split_documents(docs)
len(chunked_docs)

35

In [54]:
# Show only the first three chunks instead of dumping the whole list.
chunk_preview = chunked_docs[:3]
chunk_preview

[Document(metadata={'source': '/content/Farming Potatoes in South Africa_ What You Need to Know.pdf', 'page': 0}, page_content="\uf1bf\nFarming P otatoes In South Africa: What\nYou Need To Know\n“My idea of hea ven is a gr eat big bak ed potat o and\nsomeone t o shar e it with. ”\n- Opr ah Winfr ey\nWhat's in this guide?\n1. Introduction: Farming potatoes in South Africa\n2. All about seed potatoes\n3. Growing potatoes in South Africa\n4. Challenges of potato farming\n5. Sustainable potato farming\nChapter 1\nIntroduction: F arming P otatoes in South Africa\nIn South Africa, maiz e meal and br ead ar e the most commonly consumed sour ces of\ncarbohy drates. Howe ver, South Africans ha ve eaten twice as many potat oes o ver the past\ndecade compar ed to the decade befor e it, buo yed b y a gr owing middle class."),
 Document(metadata={'source': '/content/Farming Potatoes in South Africa_ What You Need to Know.pdf', 'page': 1}, page_content='This incr ease in consumption has occurr ed de

# Setting the environment variables

In [55]:
# Copy the API keys stored as Colab secrets into environment variables.
from google.colab import userdata

SECRET_NAMES = (
    "OPENAI_API_KEY",
    "PINECONE_API_KEY",
    "GROQ_API_KEY",  # https://console.groq.com/keys
)

# Each secret is exported under an environment variable of the same name.
for secret_name in SECRET_NAMES:
    os.environ[secret_name] = userdata.get(secret_name)

# Initialize the embedding models

In [56]:
# Embedding model used to turn text chunks into vectors.
# HuggingFaceEmbeddings comes from `langchain_community.embeddings` and is
# imported in the imports cell, replacing the deprecated
# `langchain.embeddings` import that used to live in this cell.
# The recorded repr reports 384-dimensional, mean-pooled, normalised
# embeddings with a max_seq_length of 256.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Get the Hugging Face token from user data.
# NOTE(review): no visible cell uses hf_token; confirm whether a later cell
# needs it before removing this lookup.
hf_token = userdata.get('HF_TOKEN')

# Create embeddings using the specified model and display its configuration.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
embeddings



HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

# Convert chunks into vector embeddings and store in FAISS DB

In [57]:
# Embed every chunk with the embedding model and index the resulting vectors
# in a FAISS vector store.
faiss_db = FAISS.from_documents(
    documents=chunked_docs,
    embedding=embeddings,
)
faiss_db

<langchain_community.vectorstores.faiss.FAISS at 0x7990f936d660>

# Initialize the LLM model (llama3)

In [73]:
# Chat model served through Groq. ChatGroq is already imported in the imports
# cell, so it is not imported a second time here.
# temperature=0 is requested; the recorded repr shows it stored as 1e-08.
# NOTE(review): confirm that the "llama3-70b-8192" model id is still offered
# by Groq before re-running.
llm_llama = ChatGroq(temperature=0, model_name="llama3-70b-8192")
llm_llama

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7990f5c91780>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7990f5c922f0>, model_name='llama3-70b-8192', temperature=1e-08, groq_api_key=SecretStr('**********'))

# Create a chain

In [74]:
# Build a "stuff" question-answering chain on top of the Groq chat model.
# The recorded repr shows a StuffDocumentsChain whose system prompt receives
# the supplied documents as {context} and whose human message is {question}.
# NOTE(review): newer LangChain releases reportedly deprecate load_qa_chain in
# favour of create_stuff_documents_chain; confirm before upgrading.
chain = load_qa_chain(llm=llm_llama, chain_type="stuff")
chain

StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))]), llm=ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x7990f5c91780>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x7990f5c922f0>, model_name='llama3-70b-8192', temperature=1e-08, groq_api_key=SecretStr('**********'))), document_variable_name='context')

# Query the vector db

In [85]:
# Question to answer from the PDF.
query = "what are typical potato cost of production in hectares?"

# Retrieve the chunks most similar to the question from the FAISS index.
results = faiss_db.similarity_search(query)

# Hand the retrieved chunks and the question to the QA chain, then display
# only the generated answer text.
response = chain.invoke({"input_documents": results, "question": query})
response["output_text"]

'According to the provided context, the typical potato cost of production in hectares in South Africa is between R160,000 and R240,000 per hectare.'