In [None]:
pip install  langchain langchain-community pypdf langchain-huggingface sentence_transformers huggingface_hub langchain-text-splitters langchain-pinecone

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting pypdf
  Downloading pypdf-6.1.3-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-1.0.0-py3-none-any.whl.metadata (2.1 kB)
Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.12-py3-none-any.whl.metadata (8.6 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
INFO: pip is look

In [None]:
import os
import getpass
from google.colab import files
from pinecone import Pinecone, ServerlessSpec
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint
import uuid


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [None]:
from google.colab import userdata


# Credentials come from the Colab secrets store; nothing is hard-coded in the notebook.
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
HUGGINGFACE_API_KEY = userdata.get('HUGGINGFACE_API_KEY')

# Mirror both keys into the process environment
# (the Pinecone client cell reads its key back from os.environ).
os.environ.update(
    {
        "PINECONE_API_KEY": PINECONE_API_KEY,
        "HUGGINGFACE_API_KEY": HUGGINGFACE_API_KEY,
    }
)

# Azure OpenAI settings stay as plain module-level names.
AZURE_OPENAI_ENDPOINT = userdata.get('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_KEY = userdata.get('AZURE_OPENAI_KEY')
DEPLOYMENT_NAME = userdata.get('DEPLOYMENT_NAME')



In [None]:
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings

# Vector size recorded alongside the embedding model configured below.
model_dimension = 768

# Remote embedding client, authenticated with the Hugging Face token.
embeddings = HuggingFaceEndpointEmbeddings(
    huggingfacehub_api_token=HUGGINGFACE_API_KEY,
    model="sentence-transformers/all-mpnet-base-v2",
)

In [None]:
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

In [None]:
# Pinecone client, keyed from the environment variable exported in the secrets cell.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

In [None]:
import time

index_name = "langchain-huggingface-index"

# Start from a clean slate: list the indexes once and drop a same-named index
# left over from an earlier run, so the settings below always apply.
existing_indexes = pc.list_indexes().names()
if index_name in existing_indexes:
    pc.delete_index(index_name)
    print(f"Deleted existing index: {index_name}")

# Announce the creation before it happens rather than after.
print(f"Creating index: {index_name}")
pc.create_index(
    name=index_name,
    dimension=model_dimension,  # keep in sync with the embedding cell instead of repeating 768
    metric="cosine",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    deletion_protection="disabled",
)

# Do not hand the index to LangChain until Pinecone reports it as ready.
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

index = pc.Index(index_name)
vector_store = PineconeVectorStore(index=index, embedding=embeddings)


Creating index: langchain-huggingface-index


In [None]:
# Ask for a PDF through the Colab upload widget.
uploaded = files.upload()

# With nothing uploaded there is nothing to index; fail with a clear message
# instead of an IndexError from indexing an empty list.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")

# Only the first uploaded file is processed.
pdf_filename = next(iter(uploaded))

loader = PyPDFLoader(pdf_filename)
documents = loader.load()

# Split the loaded pages into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
texts = text_splitter.split_documents(documents)


Saving Random Forest ML.pdf to Random Forest ML.pdf


In [None]:
# One random UUID per chunk, used as that chunk's id in the vector store.
uuids = [str(uuid.uuid4()) for _chunk in texts]
vector_store.add_documents(ids=uuids, documents=texts)
print(f"Successfully stored {len(texts)} document chunks in Pinecone!")

Successfully stored 26 document chunks in Pinecone!


In [None]:
repo_id = "mistralai/Mistral-7B-Instruct-v0.3"

# Endpoint client for the model above, authenticated with the Hugging Face token.
llm = HuggingFaceEndpoint(
    huggingfacehub_api_token=HUGGINGFACE_API_KEY,
    repo_id=repo_id,
    temperature=0.5,
    task="conversational",
)

# Chat wrapper so the endpoint can be invoked with system/human messages.
chat_model = ChatHuggingFace(llm=llm)

In [None]:
from langchain_core.messages import HumanMessage, SystemMessage

def ask_document_huggingface(question: str) -> str:
    """Answer a question from the indexed document with the Hugging Face chat model.

    Retrieves chunks for ``question`` from the module-level ``vector_store``,
    places their text in a system prompt that restricts the model to that
    context, and sends the system prompt plus the question to the module-level
    ``chat_model``.

    Args:
        question: The user's question; used both as the retrieval query and as
            the human message.

    Returns:
        The ``content`` of the chat model's reply.
    """
    # 1. Retrieve relevant document chunks.
    retriever = vector_store.as_retriever()
    retrieved_docs = retriever.invoke(question)

    # 2. Join the chunk texts, separated by blank lines.
    context = "\n\n".join(doc.page_content for doc in retrieved_docs)

    # 3. Build the prompt from adjacent literals rather than an indented
    #    triple-quoted block, so source-code indentation is not sent to the model.
    system_prompt = (
        "You are a helpful assistant. Your task is to answer the user's question "
        "based only on the following context.\n"
        "If the answer is not available in the context, please say "
        "\"I'm sorry, I don't have enough information to answer that question.\"\n\n"
        f"Context:\n{context}"
    )
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=question),
    ]

    # 4. Ask the model and hand back only the text of its reply.
    result = chat_model.invoke(messages)
    return result.content

# Smoke test: one question through the Hugging Face pipeline.
my_question = "What is the main topic of this document?"
answer = ask_document_huggingface(my_question)
print(f"Question: {my_question}", f"Answer: {answer}", sep="\n")

Question: What is the main topic of this document?
Answer:  The main topic of this document appears to be Machine Learning Algorithms, specifically focusing on Bagging, Pasting, Random Forests, and Boosting.


In [None]:
import os
import uuid
from dotenv import load_dotenv
from google.colab import files
from pinecone import Pinecone, ServerlessSpec
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain_core.messages import HumanMessage, SystemMessage

# Azure OpenAI embedding client for the deployment named below.
# api_key must be the key secret; the endpoint URL belongs only in azure_endpoint.
# NOTE(review): no api_version is passed here, unlike the chat model cell;
# presumably it is supplied through the environment. Confirm.
embeddings = AzureOpenAIEmbeddings(
    azure_deployment='text-embedding-3-small',
    api_key=AZURE_OPENAI_KEY,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)


# Rebuild the Pinecone client from the key held in the environment.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "langchain-azure-openai"

# Remove a same-named index, if one exists, before creating it again.
known_index_names = pc.list_indexes().names()
if index_name in known_index_names:
    pc.delete_index(index_name)
    print(f"Deleted existing index: {index_name}")

# Serverless cosine index sized for 1536-dimensional vectors.
pc.create_index(
    name=index_name,
    dimension=1536,
    metric="cosine",
    deletion_protection="disabled",
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
)
print(f"Creating a new index: {index_name}")

# From here on, vector_store refers to the Azure-backed index.
index = pc.Index(index_name)
vector_store = PineconeVectorStore(embedding=embeddings, index=index)


Creating a new index: langchain-azure-openai


In [None]:
# Azure OpenAI chat client; note that this rebinds the module-level name `llm`.
llm = AzureChatOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_KEY,
    api_version='2023-03-15-preview',
    azure_deployment='gpt-4o',
    temperature=0.5,
)


In [None]:

print("Please upload your PDF document.")
uploaded = files.upload()

# With nothing uploaded there is nothing to index; fail with a clear message
# instead of an IndexError from indexing an empty list.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and choose a PDF.")

# Only the first uploaded file is processed.
pdf_filename = next(iter(uploaded))

loader = PyPDFLoader(pdf_filename)
documents = loader.load()

# Split the loaded pages into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(documents)

# One random UUID per chunk, used as that chunk's id in the vector store.
uuids = [str(uuid.uuid4()) for _ in texts]

vector_store.add_documents(documents=texts, ids=uuids)
print(f"Successfully stored {len(texts)} document chunks in Pinecone!")



Please upload your PDF document.


Saving Random Forest ML.pdf to Random Forest ML (1).pdf
Successfully stored 26 document chunks in Pinecone!


NotFoundError: Error code: 404 - {'error': {'code': '404', 'message': 'Resource not found'}}

In [None]:
def ask_document_with_azure(question: str) -> str:
    """Answer ``question`` from the indexed document using the Azure chat model.

    Chunks are fetched from the module-level ``vector_store``; their text is
    embedded in a system prompt that limits the model to that context, and the
    prompt plus the question are sent to the module-level ``llm``.

    Args:
        question: Text used as the retrieval query and as the human message.

    Returns:
        The ``content`` of the model's reply.
    """
    matching_chunks = vector_store.as_retriever().invoke(question)
    context = "\n\n".join(chunk.page_content for chunk in matching_chunks)

    # Instruction text first, retrieved context appended after a blank line.
    system_text = (
        "You are a helpful assistant. Your task is to answer the user's question "
        "based only on the following context. If the answer is not in the context, "
        'say "I do not have enough information to answer that question."'
        f"\n\nContext:\n{context}"
    )

    system_message = SystemMessage(content=system_text)
    human_message = HumanMessage(content=question)
    reply = llm.invoke([system_message, human_message])
    return reply.content

# Smoke test: one question through the Azure pipeline, framed by rule lines.
my_question = "What is the main topic of this document?"
answer = ask_document_with_azure(my_question)
rule = "="*50
print(
    "\n" + rule,
    f"Question: {my_question}",
    f"Answer: {answer}",
    rule + "\n",
    sep="\n",
)


Question: What is the main topic of this document?
Answer: The main topic of this document is "Ensemble Learning and Random Forests," which discusses techniques for combining predictions from multiple predictors to improve accuracy and introduces methods like bagging, pasting, and the Random Forest algorithm.

