In [None]:
!pip install sentence-transformers faiss-cpu PyMuPDF langchain


In [None]:
from google.colab import files
# Upload the source PDF through Colab's `files.upload()`; the result is kept in
# `uploaded`. A later cell opens "AI Training Document.pdf" by name, so the
# upload presumably has to provide a file with exactly that name — TODO confirm.
uploaded = files.upload()

In [None]:
import fitz  # PyMuPDF

# Open the PDF inside a `with` block so the document handle is released as soon
# as the text has been read, and join the per-page text in a single pass
# instead of growing a string with repeated `+=`.
with fitz.open("AI Training Document.pdf") as doc:
    text = "".join(page.get_text() for page in doc)

print(text[:1000])  # preview the first 1000 characters as a sanity check

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the extracted text into overlapping chunks. The separators are listed
# from coarsest (blank line) to finest (single space).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
    separators=["\n\n", "\n", ".", " "]
)

chunks = text_splitter.create_documents([text])
print(f"Total chunks: {len(chunks)}")

# Fail with a clear message instead of a bare IndexError on `chunks[0]` when
# nothing could be split (e.g. the PDF yielded no extractable text).
if not chunks:
    raise ValueError(
        "No chunks were produced: the extracted text is empty. "
        "Check that the PDF contains selectable text."
    )
print("First chunk:\n", chunks[0].page_content)

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model once; `search_documents` reuses `model`
# to embed queries.
model = SentenceTransformer("all-MiniLM-L6-v2")
print("✅ Model loaded!")

# Collect the raw text of every chunk, then embed the whole list in one call.
texts = [document.page_content for document in chunks]
embeddings = model.encode(texts)
print(f"Embeddings shape: {embeddings.shape}")

In [None]:
import faiss
import numpy as np

# Cast the embeddings to a float32 NumPy matrix before handing them to FAISS.
embedding_matrix = np.array(embeddings).astype("float32")

# Size the flat L2 index from the matrix itself: one column per embedding
# dimension (384 here).
embedding_dim = embedding_matrix.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embedding_matrix)

print("✅ FAISS index created with", index.ntotal, "vectors.")

In [None]:
def search_documents(query, top_k=3):
    """Print the chunks whose embeddings are nearest to ``query``.

    Encodes ``query`` with the module-level ``model``, searches the
    module-level FAISS ``index`` for its nearest neighbours and prints the
    corresponding entries of the module-level ``chunks`` list.

    Args:
        query: Text to search for.
        top_k: Maximum number of chunks to print (default 3). Values larger
            than the number of indexed vectors are clamped; values below 1
            print no matches.

    Returns:
        None. Results are written to stdout.
    """
    # Query embedding: one row, cast to float32 like the indexed vectors.
    query_embedding = model.encode([query]).astype("float32")

    # Never ask for more neighbours than the index holds: FAISS pads missing
    # results with the label -1, and chunks[-1] would silently print the last
    # chunk as if it were a match.
    k = min(top_k, index.ntotal)

    if k > 0:
        # FAISS search; the distances are not needed for the printed report.
        _, indices = index.search(query_embedding, k)
        # Keep the -1 filter as a safety net even after clamping.
        hits = [int(i) for i in indices[0] if i >= 0]
    else:
        # Empty index or non-positive top_k: nothing to search for.
        hits = []

    print("\n🔍 Top Matches:")
    for i in hits:
        print("\n-- Chunk", i, "--")
        print(chunks[i].page_content)
