## Video Reference
- [Complete Tutorial on Vector Database - Learn ChromaDB, Pinecone & Weaviate | Generative AI](https://www.youtube.com/watch?v=8KrTO9bS91s)
- https://github.com/entbappy/Complete-Generative-AI-Course-on-YouTube/blob/main/Vector%20Database/2.Pinecone_demo.ipynb


## Import All the Required Libraries

In [1]:
from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain.document_loaders import PyMuPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.vectorstores import Pinecone
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import os
import openai

In [2]:
from dotenv import load_dotenv

# Expected to copy variables from a local .env file into the process
# environment (python-dotenv) before the keys are read below.
load_dotenv()

# API credentials are read from the environment; each is None when unset.
openAi_key = os.environ.get('OPENAI_API_KEY')
pinecone_key = os.environ.get('PINECONE_API_KEY')
# Alternative set-up when calls go through a proxy that injects the real key:
# openAi_base = os.getenv("OPENAI_API_BASE")
# os.environ["OPENAI_API_KEY"] = "sk-xxx"  # Dummy key, actual is injected in the proxy
# os.environ["OPENAI_API_BASE"] = os.getenv("OPENAI_API_BASE")

## Extract the Text from the PDFs

In [None]:
def load_pdfs_from_folder(folder_path: str) -> list[Document]:
    """Load every PDF directly inside *folder_path* and tag it with a title.

    Args:
        folder_path: Directory to scan (non-recursively) for files whose name
            ends in ``.pdf``; the extension check is case-insensitive.

    Returns:
        The documents produced by ``PyMuPDFLoader`` for each PDF, concatenated
        in sorted-filename order. Each document's metadata gains a ``"Title"``
        entry holding the file name without its extension.

    Raises:
        OSError: If *folder_path* cannot be listed (e.g. it does not exist).
    """
    all_docs: list[Document] = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # resulting document order reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        full_path = os.path.join(folder_path, filename)
        docs = PyMuPDFLoader(full_path).load()

        # Derive title from filename, e.g. "My Paper.pdf" -> "My Paper"
        title = os.path.splitext(filename)[0]

        # The loader's own metadata is kept; only the 'Title' key is added
        # (or overwritten) on each document.
        for doc in docs:
            doc.metadata["Title"] = title

        all_docs.extend(docs)

    return all_docs


# Load every PDF found in the relative "website_content/" folder. `data` holds
# the resulting list of Documents and is the input to the cleaning step below.
data = load_pdfs_from_folder("website_content/")

In [None]:
# loader = PyPDFDirectoryLoader("website_content")
# # loader = PyPDFLoader("Website_Report_V1.pdf")
# data = loader.load()

In [None]:
# Show the loaded documents as the cell output (sanity check of the raw text).
data

## Clean Data

In [None]:
import re

def clean_page(text: str) -> str:
    """Normalise raw page text extracted from a PDF into one clean line.

    Steps, in order:
      1. Drop zero-width space characters (U+200B).
      2. Re-join words that were hyphenated across a line break
         ("subsi-\\ndy" -> "subsidy").
      3. Collapse every run of whitespace, newlines included, to one space.

    Args:
        text: Raw text of a single page.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # Remove zero-width space characters
    text = text.replace('\u200b', '')

    # Fix hyphenated line breaks. This has to run while the newlines are still
    # present: once they are turned into spaces, a genuine "pre- and post-war"
    # is indistinguishable from a broken word and would be fused by mistake.
    # [^\S\n]* allows trailing spaces/tabs/\r between the hyphen and the newline.
    text = re.sub(r'(?<=\w)-[^\S\n]*\n\s*(?=\w)', '', text)

    # Normalize all remaining whitespace (including newlines) to single spaces
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def clean_documents(docs: list[Document]) -> list[Document]:
    """Return new Documents whose text has been passed through ``clean_page``.

    Args:
        docs: Documents to clean; they are read but not modified.

    Returns:
        One new ``Document`` per input, in the same order. Each is built from
        the cleaned page text and the input document's ``metadata`` object,
        which is passed through as-is (it is not copied here).
    """
    return [
        Document(page_content=clean_page(original.page_content), metadata=original.metadata)
        for original in docs
    ]


In [None]:
# Clean the text of every loaded document, then print the whole list so the
# result can be inspected.
clean_data = clean_documents(data)
print(clean_data)

In [None]:
# Spot-check the first cleaned document (raises IndexError if nothing was loaded).
print(clean_data[0])

## Split the Extracted Data into Text Chunks

In [None]:
# Split the cleaned documents into overlapping chunks for embedding.
# chunk_size=500 / chunk_overlap=20 are presumably measured in characters
# (the splitter's default length function) — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
text_chunks = text_splitter.split_documents(clean_data)

In [None]:
# Report how many chunks were produced and show the first one as the cell output.
print(len(text_chunks))
text_chunks[0]

## Load OPENAI API

In [None]:
# os.environ['OPENAI_API_KEY'] = 

## Download the Embeddings

In [3]:
# Use a specific embedding model. `embeddings_model` is the instance handed to
# the Pinecone vector store below.
# NOTE(review): presumably this must be the same model the index was built
# with — confirm.
embeddings_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [None]:
# text = "LangChain is an AI framework for LLMs."
# vector = embeddings_model.embed_query(text)

# print(len(vector))
# print(vector[:5])  # Print first 5 values for readability


## Initializing the Pinecone

In [None]:


# SECURITY: a literal Pinecone API key used to be pasted into a comment here.
# Credentials must never be committed, even commented out; treat that key as
# compromised and rotate it. The key is read from the environment instead
# (see `pinecone_key`).
# # os.environ['PINECONE_API_KEY'] = "<your-pinecone-api-key>"
# # pc = pinecone.Pinecone(os.getenv('PINECONE_API_KEY'))
# pc = pinecone.Pinecone(pinecone_key)
# Name of the Pinecone index used by the vector-store cells below.
index_name = "fit5120-tm01"
# index = pc.Index(index_name)

## Create Embeddings for Each of the Text Chunks

In [None]:
# # Pinecone.from_texts() vs Pinecone.from_documents()
# # .from_documents() stores meta data while .from_texts() does not

# docsearch = Pinecone.from_texts([t.page_content for t in text_chunks], embeddings_model, index_name=index_name)

## If you already have an index, you can load it like this

In [None]:
# Attach to the already-populated Pinecone index instead of re-uploading the
# chunks; queries are embedded with `embeddings_model`. The bare `docsearch`
# shows the store object as the cell output.
docsearch = Pinecone.from_existing_index(index_name, embeddings_model)
docsearch

## Similarity Search

In [None]:
# Retrieve the 3 stored chunks most similar to the query (k=3).
# NOTE(review): "subisdy" is misspelled; later cells run both spellings, so
# this may be a deliberate typo-robustness check — confirm.
query = "How much is the diesel subisdy expenditure in 2024"

docs = docsearch.similarity_search(query, k=3)

In [None]:
# Show the retrieved chunks as the cell output.
docs

## Creating an LLM Model Wrapper

In [None]:
# Chat model that writes the final answer ("gpt-4.1-mini" is the alternative).
llm_gpt4 = ChatOpenAI(model="gpt-4.1-nano")

# Retrieval QA chain of type "stuff", fed by the vector store's retriever
# configured with k=3.
qa = RetrievalQA.from_chain_type(
    retriever=docsearch.as_retriever(search_kwargs={"k": 3}),
    chain_type="stuff",
    llm=llm_gpt4,
)

In [None]:
# Show the assembled chain's configuration as the cell output.
qa

## Q/A

In [None]:
# Broad question, passed to the chain as a plain string.
query = "How much was the government expenditure?"
qa.invoke(query)

In [None]:
# Same question as the next cell but with "subisdy" misspelled.
# NOTE(review): looks like a deliberate typo-robustness check — confirm.
query = "How much is the diesel subisdy expenditure in 2024"
qa.invoke(query)

In [None]:
# Correctly spelled version of the diesel-subsidy question, for comparison.
query = "How much is the diesel subsidy expenditure in 2024"
qa.invoke(query)

In [None]:
# Interactive question/answer loop over the `qa` chain.
# Type "exit" (or close stdin) to stop; blank input is ignored.
while True:
    try:
        # Strip so that " exit " still exits and whitespace-only input is not
        # sent to the chain as a query.
        user_input = input("Input Prompt: ").strip()
    except EOFError:
        # stdin was closed (e.g. piped input ran out): leave the loop cleanly
        # instead of crashing with a traceback.
        print('Exiting')
        break
    if user_input == 'exit':
        print('Exiting')
        break
    if not user_input:
        continue
    result = qa.invoke({'query': user_input})
    print(f"Answer: {result['result']}")
     

### Test Proxy

In [None]:
import requests

# Replace this with your actual Cloudflare Worker URL.
# Base URL of the proxy; the cells below append endpoint paths such as
# "/v1/embeddings" and "/v1/chat/completions" to it.
PROXY_BASE_URL = "https://policylensai.wanningc11.workers.dev"

In [None]:
# Smoke-test the proxy's embeddings endpoint with a raw HTTP request.
embedding_url = f"{PROXY_BASE_URL}/v1/embeddings"

# No Authorization header is sent from here. `headers` is reused by the
# chat-completions cell below.
headers = {
    "Content-Type": "application/json"
}

payload = {
    "model": "text-embedding-3-small",  # match what your backend expects
    "input": "Test embedding input from Jupyter"
}

# requests has no default timeout; without one, an unresponsive proxy would
# block this cell forever.
response = requests.post(embedding_url, headers=headers, json=payload, timeout=30)

print("Status Code:", response.status_code)
try:
    print("Response JSON:", response.json())
except ValueError:
    # Error pages from the proxy may not be JSON; show the raw body instead
    # of raising a decode error.
    print("Response body (not JSON):", response.text)

In [None]:
# Smoke-test the proxy's chat-completions endpoint with a raw HTTP request.
# Reuses the `headers` dict defined in the embeddings cell above.
chat_url = f"{PROXY_BASE_URL}/v1/chat/completions"

chat_payload = {
    "model": "gpt-4.1-nano",  # or gpt-4 if you're using that in backend
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"}
    ],
    "temperature": 0.7
}

# requests has no default timeout; without one, an unresponsive proxy would
# block this cell forever.
chat_response = requests.post(chat_url, headers=headers, json=chat_payload, timeout=30)

print("Status Code:", chat_response.status_code)
try:
    print("Response JSON:", chat_response.json())
except ValueError:
    # Error pages from the proxy may not be JSON; show the raw body instead
    # of raising a decode error.
    print("Response body (not JSON):", chat_response.text)


In [4]:
from langchain_openai import OpenAIEmbeddings

# Check the embeddings path through LangChain rather than raw HTTP.
# Note: this is `embedding_model` (singular), a separate instance from the
# `embeddings_model` used for the Pinecone store earlier in the notebook.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-small",  # must match what you allow
)

# Embed one test sentence and print only the first five components.
result = embedding_model.embed_query("This is a test query from LangChain.")
print("Embedding vector (truncated):", result[:5])


Embedding vector (truncated): [0.008168850094079971, -0.00567884324118495, 0.03726189583539963, -0.003387290984392166, -0.004074247553944588]


In [None]:
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

# Check the chat path through LangChain with a single user message.
chat_model = ChatOpenAI(
    model="gpt-4.1-nano",  # or "gpt-4" if you're using GPT-4
    temperature=0,
)

# Rebinds `response`, which an earlier cell used for the raw HTTP reply.
response = chat_model.invoke([
    HumanMessage(content="What is the capital of Japan?")
])

print("Response:", response.content)
