<a href="https://colab.research.google.com/github/chueneelvin/Databricks/blob/main/Langsmith_plus_PDF_QnA_with_Langchain_and_Llama3_and_Hugging_face_embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install required packages

In [1]:
!pip -q install langchain pypdf langchain-community langchain-text-splitters langchain_experimental langchain_openai langchain-chroma langchain-pinecone python-dotenv chromadb faiss-cpu unstructured[pdf] poppler-utils langsmith tesseract sentence_transformers langchain_ollama langchain-groq langchain-huggingface

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.6/45.6 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m8.8 MB/s[0m 

# Importing the dependencies

In [2]:
from langchain_community.document_loaders import PyPDFLoader        # Loading the documents
from langchain_community.document_loaders import DirectoryLoader   # Loading the documents from a directory
from langchain_text_splitters import RecursiveCharacterTextSplitter # Text chunks using recursive splitter
from langchain_experimental.text_splitter import SemanticChunker    # Semantic text chunking
from langchain_openai import OpenAIEmbeddings                       # openai embedding models
from langchain_chroma import Chroma                                 # vector database Chromadb
from langchain.vectorstores import Pinecone                  # vector database Pinecone
from langchain_community.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain_openai import OpenAI
import pinecone
import os
from langchain_groq import ChatGroq


# Loading the data

## Loading a single file (PDF)

In [3]:
# Read the source PDF from the Colab filesystem; load() returns the documents kept in `docs`.
pdf_path = "/content/Potato Market Value Chain Profile 2019.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [4]:
# Inspect the first loaded Document to sanity-check its metadata and page text.
docs[0]

Document(metadata={'source': '/content/Potato Market Value Chain Profile 2019.pdf', 'page': 0}, page_content='1 \n A PROFILE OF THE SOUTH AFRICAN POTATO MARKET \nVALUE CHAIN  \n \n2019 \n \n \n \nDirectorate Marketing                                            Tel: 012 319 8455                         \nPrivate Bag X 15                                                     Fax: 012 319 8131                                                \nArcadia                                                                    E-mail:PA.D M@daff.gov.za                        \n0007                                                                         www.daff.gov.za                                                                                                                                                                               \n \n \n \n \n')

# Text Chunking

## Recursive chunking

In [5]:
# Chunking parameters; sizes are counted with len(), i.e. in characters.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150

text_splitter = RecursiveCharacterTextSplitter(
    length_function=len, chunk_overlap=CHUNK_OVERLAP, chunk_size=CHUNK_SIZE
)

# Split the loaded documents into chunks and show how many were produced.
chunked_docs = text_splitter.split_documents(docs)
len(chunked_docs)

163

In [6]:
# Preview the first three chunks produced by the splitter.
chunked_docs[:3]

[Document(metadata={'source': '/content/Potato Market Value Chain Profile 2019.pdf', 'page': 0}, page_content='1 \n A PROFILE OF THE SOUTH AFRICAN POTATO MARKET \nVALUE CHAIN  \n \n2019 \n \n \n \nDirectorate Marketing                                            Tel: 012 319 8455                         \nPrivate Bag X 15                                                     Fax: 012 319 8131                                                \nArcadia                                                                    E-mail:PA.D M@daff.gov.za                        \n0007                                                                         www.daff.gov.za'),
 Document(metadata={'source': '/content/Potato Market Value Chain Profile 2019.pdf', 'page': 1}, page_content='2 \n TABLE OF CONTENTS  \n \n1. DESCRIPTION OF THE INDUSTRY  3 \n1.1 Production areas  3 \n1.2 Total production  4 \n1.3 Pota to production vs. consumption  5 \n2. MARKET STRUCTURE  6 \n2.1 Domestic market  6 \n2.2 Exports  7

# Setting the environment variables

In [7]:
# Copy each API key from Colab user data into the environment variable of the same name.
from google.colab import userdata

for secret_name in (
    'OPENAI_API_KEY',
    'PINECONE_API_KEY',
    'GROQ_API_KEY',  # https://console.groq.com/keys
):
    os.environ[secret_name] = userdata.get(secret_name)


In [8]:
# Copy the LANGCHAIN_* settings (presumably used for LangSmith tracing) from Colab
# user data into the environment.
# LANGCHAIN_ENDPOINT and LANGCHAIN_PROJECT are intentionally left unset here.
from google.colab import userdata

for setting_name in ('LANGCHAIN_TRACING_V2', 'LANGCHAIN_API_KEY'):
    os.environ[setting_name] = userdata.get(setting_name)

# Initialize the embedding models

In [9]:
from langchain_huggingface import HuggingFaceEmbeddings

# Fetch the Hugging Face token from Colab user data.
# NOTE(review): the token is not passed to the embedding model below — confirm whether
# a later cell relies on `hf_token`.
hf_token = userdata.get('HF_TOKEN')

# Sentence-transformers embedding model; files are cached under /content/cache.
embeddings = HuggingFaceEmbeddings(
    cache_folder="/content/cache",
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
embeddings

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder='/content/cache', model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

# Convert chunks into vector embeddings and store in FAISS DB

In [10]:
# Embed every chunk with the embedding model and build a FAISS vector store from them.
faiss_db = FAISS.from_documents(documents=chunked_docs, embedding=embeddings)
faiss_db

<langchain_community.vectorstores.faiss.FAISS at 0x782b6859e920>

# Initialize the LLM model (llama3)

In [11]:
from langchain_groq import ChatGroq

# Chat model "llama3-70b-8192" accessed through Groq, with temperature 0 requested
# (the recorded repr of the object shows the stored value as 1e-08).
llm_llama = ChatGroq(
    model_name="llama3-70b-8192",
    temperature=0,
)
llm_llama

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x782b6c1b4bb0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x782b6c1b5870>, model_name='llama3-70b-8192', temperature=1e-08, groq_api_key=SecretStr('**********'))

# Create a chain

In [12]:
# Build a "stuff" question-answering chain around the Llama 3 chat model; its prompt
# takes the supplied documents as {context} and the user's question as {question}.
# NOTE(review): the recorded output of this cell appears to be a deprecation notice for
# load_qa_chain that points to
# https://python.langchain.com/v0.2/docs/versions/migrating_chains/stuff_docs_chain
# — consider migrating.
chain = load_qa_chain(chain_type="stuff", llm=llm_llama)
chain

stuff: https://python.langchain.com/v0.2/docs/versions/migrating_chains/stuff_docs_chain
map_reduce: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_reduce_chain
refine: https://python.langchain.com/v0.2/docs/versions/migrating_chains/refine_chain
map_rerank: https://python.langchain.com/v0.2/docs/versions/migrating_chains/map_rerank_docs_chain

See also guides on retrieval and question-answering here: https://python.langchain.com/v0.2/docs/how_to/#qa-with-rag
  chain = load_qa_chain(llm=llm_llama, chain_type="stuff")


StuffDocumentsChain(llm_chain=LLMChain(prompt=ChatPromptTemplate(input_variables=['context', 'question'], messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="Use the following pieces of context to answer the user's question. \nIf you don't know the answer, just say that you don't know, don't try to make up an answer.\n----------------\n{context}")), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))]), llm=ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x782b6c1b4bb0>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x782b6c1b5870>, model_name='llama3-70b-8192', temperature=1e-08, groq_api_key=SecretStr('**********'))), document_variable_name='context')

# Query the vector db

In [16]:
# Retrieve the chunks most similar to the question from the FAISS store, then hand
# them to the QA chain together with the question.
query = "give me valuable tips for planting potatoes and their impact on production?"
input_documents = faiss_db.similarity_search(query)
qa_inputs = {'input_documents': input_documents, 'question': query}
chain.invoke(qa_inputs)

{'input_documents': [Document(metadata={'source': '/content/Potato Market Value Chain Profile 2019.pdf', 'page': 48}, page_content='on the rise and there are farmers who have potential to farm commercially and they are currently \nreceiving support from Potatoes South Africa. During the past years, Potatoes South Africa focused \non the following projects in contribution to potato industry transformation: Enterprise development,   \nsmall grower development program, tertiary skills pipeline, farm based training and Black Economic \nEmpowerment (BEE) Baseline study.'),
  Document(metadata={'source': '/content/Potato Market Value Chain Profile 2019.pdf', 'page': 49}, page_content='An increasing number of countries in sub -Saharan Africa are reportedly also turning to South Africa \nas a reliable source of food. This may therefore considerably increase the possible market size for \nSouth African potatoes.  \n \n8.2 Chall enges  \n \nThe constraints and market failures hinders investment,