In [1]:
import openai 
import langchain 
import pinecone 

from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
import os

In [4]:
def read_doc(directory):
    loader = PyPDFDirectoryLoader(directory)
    documents = loader.load()
    return documents

In [5]:
doc = read_doc(r"./documents")
print(len(doc))
print(doc[0].page_content[:1200])


1
Company Policy Document
1. Refund Policy
Customers are eligible for a full refund within 14 days of purchase, provided the product is unused
and in its original packaging. Refunds are processed within 5–7 business days after approval.
2. Cancellation Policy
Orders may be cancelled within 24 hours of placement. After this period, cancellations are subject
to approval and may incur a processing fee.
3. Warranty Policy
All products come with a standard 6-month warranty covering manufacturing defects. Warranty
claims must be supported with proof of purchase.
4. Customer Support
For any issues related to orders, refunds, or warranties, customers may contact our support team
via email or phone. Support requests are typically resolved within 48 hours.


In [6]:
##divide the docs into chunks

def chunk_data(documents, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    documents = text_splitter.split_documents(documents)
    return documents

In [7]:
documents= chunk_data(documents = doc)
len(documents)

1

In [8]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 444.74it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [9]:
vec = embeddings.embed_query("How are you?")
print("Vector length:", len(vec))
print(vec[:10])

Vector length: 384
[0.007003925275057554, 0.010914229787886143, 0.08746258914470673, 0.0867992416024208, 0.02664851024746895, -0.06750527024269104, 0.07268453389406204, -0.025154881179332733, -0.08268840610980988, 0.016113022342324257]


In [10]:
from dotenv import load_dotenv
load_dotenv()


True

In [17]:
import os
print("PINECONE_API_KEY:", os.getenv("PINECONE_API_KEY"))
print("PINECONE_INDEX:", os.getenv("PINECONE_INDEX"))
print("PINECONE_HOST:", os.getenv("PINECONE_HOST"))


PINECONE_API_KEY: pcsk_6zx8bZ_2VgiZUsss6fWTdWUuBwyANCeJTKxc3BWU4ehNWC35rWP7CCVF8dRfS8JYFaCXa6
PINECONE_INDEX: langchainvector
PINECONE_HOST: https://langchainvector384-ebfgxzb.svc.aped-4627-b74a.pinecone.io


In [12]:
import os
print(os.getcwd())
print(os.listdir())


c:\Users\Huzaifa\Desktop\TCS
['.env', '.venv', 'documents', 'requirements.txt', 'test.ipynb']


In [18]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone

load_dotenv()

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

INDEX_NAME = os.getenv("PINECONE_INDEX")
INDEX_HOST = os.getenv("PINECONE_HOST")

index = pc.Index(INDEX_NAME, host=INDEX_HOST)

print(index.describe_index_stats())


{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [19]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone

load_dotenv()

INDEX_NAME = os.getenv("PINECONE_INDEX")
INDEX_HOST = os.getenv("PINECONE_HOST")

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
index = pc.Index(INDEX_NAME, host=INDEX_HOST)

print("Connected to:", INDEX_NAME)
print(index.describe_index_stats())


Connected to: langchainvector
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}


In [21]:
from dotenv import load_dotenv
load_dotenv()

import os
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore

INDEX_NAME = os.getenv("PINECONE_INDEX")   # should be your NEW 384 index name
INDEX_HOST = os.getenv("PINECONE_HOST")   # should be your NEW 384 index host

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pinecone_index = pc.Index(INDEX_NAME, host=INDEX_HOST)

vectorstore = PineconeVectorStore(
    index=pinecone_index,
    embedding=embeddings,
)
vectorstore.add_documents(documents)  # uploads chunks


Index host ignored when initializing with index object.


['161f68d2-0a41-4736-a2e3-be52fdaaba39']

In [22]:
print("INDEX_NAME:", INDEX_NAME)
print("INDEX_HOST:", INDEX_HOST)
print(pinecone_index.describe_index_stats())


INDEX_NAME: langchainvector
INDEX_HOST: https://langchainvector384-ebfgxzb.svc.aped-4627-b74a.pinecone.io
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1}},
 'total_vector_count': 1,
 'vector_type': 'dense'}


In [23]:
import os
print(os.getenv("PINECONE_INDEX"))
print(os.getenv("PINECONE_HOST"))


langchainvector
https://langchainvector384-ebfgxzb.svc.aped-4627-b74a.pinecone.io


In [25]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone

load_dotenv()

INDEX_NAME = os.getenv("PINECONE_INDEX")   # your 384 index name
INDEX_HOST = os.getenv("PINECONE_HOST")    # your 384 index host

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
pinecone_index = pc.Index(INDEX_NAME, host=INDEX_HOST)

print("Connected to:", INDEX_NAME)
print(pinecone_index.describe_index_stats())


Connected to: langchainvector
{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1}},
 'total_vector_count': 1,
 'vector_type': 'dense'}


In [26]:
len(documents)


1

In [27]:
# Cosine Similarity Retrieve Results (your modern equivalent)

def retrieve_query(query, k=2):
    results = vectorstore.similarity_search(query, k=k)
    return results

# test
matches = retrieve_query("What is the current refund policy?", k=2)

for i, d in enumerate(matches, 1):
    print(f"\n--- Result {i} ---")
    print(d.page_content[:500])
    print("metadata:", d.metadata)



--- Result 1 ---
Company Policy Document
1. Refund Policy
Customers are eligible for a full refund within 14 days of purchase, provided the product is unused
and in its original packaging. Refunds are processed within 5–7 business days after approval.
2. Cancellation Policy
Orders may be cancelled within 24 hours of placement. After this period, cancellations are subject
to approval and may incur a processing fee.
3. Warranty Policy
All products come with a standard 6-month warranty covering manufacturing defect
metadata: {'author': '(anonymous)', 'creationdate': '2026-02-18T13:25:10+00:00', 'creator': '(unspecified)', 'keywords': '', 'moddate': '2026-02-18T13:25:10+00:00', 'page': 0.0, 'page_label': '1', 'producer': 'ReportLab PDF Library - www.reportlab.com', 'source': 'documents\\company_policy.pdf', 'subject': '(unspecified)', 'title': '(anonymous)', 'total_pages': 1.0, 'trapped': '/False'}


In [28]:
def retrieve_query(query, k=2):
    return vectorstore.similarity_search_with_score(query, k=k)


In [33]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# Create LLM
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0.5
)

# Create simple prompt template
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer using ONLY the provided context."),
    ("human", "Question: {question}\n\nContext:\n{context}")
])


In [38]:
def retrieve_answers(query, k=2):
    doc_search = retrieve_query(query, k=k)

    context = "\n\n".join([
        (d.page_content if hasattr(d, "page_content") else d[0].page_content)
        for d in doc_search
    ])

    messages = prompt.format_messages(question=query, context=context)
    response = llm.invoke(messages)
    return response.content


In [39]:
doc_search = retrieve_query("refund policy", k=2)
print(type(doc_search[0]))
print(doc_search[0])


<class 'tuple'>
(Document(id='161f68d2-0a41-4736-a2e3-be52fdaaba39', metadata={'author': '(anonymous)', 'creationdate': '2026-02-18T13:25:10+00:00', 'creator': '(unspecified)', 'keywords': '', 'moddate': '2026-02-18T13:25:10+00:00', 'page': 0.0, 'page_label': '1', 'producer': 'ReportLab PDF Library - www.reportlab.com', 'source': 'documents\\company_policy.pdf', 'subject': '(unspecified)', 'title': '(anonymous)', 'total_pages': 1.0, 'trapped': '/False'}, page_content='Company Policy Document\n1. Refund Policy\nCustomers are eligible for a full refund within 14 days of purchase, provided the product is unused\nand in its original packaging. Refunds are processed within 5–7 business days after approval.\n2. Cancellation Policy\nOrders may be cancelled within 24 hours of placement. After this period, cancellations are subject\nto approval and may incur a processing fee.\n3. Warranty Policy\nAll products come with a standard 6-month warranty covering manufacturing defects. Warranty\nclaims