In [None]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting tqdm (from sentence-transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.7.1-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.0-cp313-cp313-win_amd64.whl.metadata (14 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.15.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting filelock (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Col

In [5]:
!pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [6]:
import os

import PyPDF2
from google import genai
from sentence_transformers import SentenceTransformer, util



In [8]:
# SECURITY: never commit API keys to a notebook. The key that used to be
# hard-coded here has been exposed and must be revoked and rotated.
# The key is now read from the GOOGLE_API_KEY environment variable; set it
# before starting the kernel (or with `%env GOOGLE_API_KEY=...` in a cell that
# is not saved/shared).
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    # Do not continue silently: the client cannot authenticate without a key.
    print("GOOGLE_API_KEY is not set; export it before creating the Gemini client.")

In [9]:
# Create the Gemini API client, authenticating with GOOGLE_API_KEY defined above.
# `client` is reused by the generation cell at the end of the notebook.
client = genai.Client(api_key=GOOGLE_API_KEY)



In [10]:
# Step 1: Load and extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path (or file-like object) handed to ``PyPDF2.PdfReader``.

    Returns:
        The extracted text of all pages, in page order, separated by newlines.
        Pages that yield no text contribute an empty string.
    """
    reader = PyPDF2.PdfReader(pdf_path)
    # Join pages with a newline so the last word of one page is not fused with
    # the first word of the next (which would corrupt word-based chunking).
    # `or ""` keeps a page with no extractable text from breaking the join.
    return "\n".join(page.extract_text() or "" for page in reader.pages)



In [12]:
# Step 2: Split into chunks (simple split; can be improved with sentence splitting)
def chunk_text(text, max_tokens=200):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` words.

    "Tokens" here are whitespace-separated words, not model tokens. Runs of
    whitespace (including newlines) are collapsed to single spaces.

    Args:
        text: The text to split.
        max_tokens: Maximum number of words per chunk; must be >= 1.

    Returns:
        A list of chunk strings in original order; empty if ``text`` contains
        no words.

    Raises:
        ValueError: If ``max_tokens`` is less than 1.
    """
    # A negative size would make range() empty and silently drop all the text;
    # zero would fail inside range() with an unrelated-looking message.
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")
    words = text.split()
    return [
        ' '.join(words[start:start + max_tokens])
        for start in range(0, len(words), max_tokens)
    ]

# Load PDF and process
PDF_PATH = "story.pdf"  # relative path, resolved against the current working directory
pdf_text = extract_text_from_pdf(PDF_PATH)
chunks = chunk_text(pdf_text)

# Fail here with a clear message rather than later in the retrieval step,
# which has nothing to rank when no text could be extracted.
if not chunks:
    raise ValueError(f"No extractable text found in {PDF_PATH!r}; cannot build chunks.")


In [14]:

# Step 3: Encode every chunk as a tensor embedding with a sentence-transformer model
embedder = SentenceTransformer("all-MiniLM-L6-v2")
chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)

# Step 4: Encode the user's question with the same model
query = "What is Mira holding?"
query_embedding = embedder.encode(query, convert_to_tensor=True)

# Score every chunk against the question by cosine similarity; the first row
# of the result is used as the per-chunk scores for this single query.
scores = util.cos_sim(query_embedding, chunk_embeddings)[0]
# Keep only the single highest-scoring chunk as context.
top_idx = int(scores.argmax())
retrieved_context = chunks[top_idx]

# Step 5: Assemble the prompt (retrieved context + question) that will be sent to Gemini
prompt = f"""You are a helpful assistant. Use the following context to answer the question.

Context:
{retrieved_context}

Question: {query}
Answer:"""



In [16]:
# Inspect the chunk selected by retrieval before it is sent to the model.
print(retrieved_context)

Mira watched the last train leave the station, holding a small postcard in her hand. It was old and had a picture of a lighthouse by the sea. Her mother, who she had never met, once sent it. The little town by the ocean was quiet, and the salty wind blew gently through the streets. Mira walked past an old bookstore that the postcard talked about. Its windows were dusty, but inside she saw a light. She opened the door slowly and walked in, not knowing that something very special was waiting for her inside. 1


In [18]:
# Show the exact prompt text (context + question) that will be sent to the model.
print(prompt)

You are a helpful assistant. Use the following context to answer the question.

Context:
Mira watched the last train leave the station, holding a small postcard in her hand. It was old and had a picture of a lighthouse by the sea. Her mother, who she had never met, once sent it. The little town by the ocean was quiet, and the salty wind blew gently through the streets. Mira walked past an old bookstore that the postcard talked about. Its windows were dusty, but inside she saw a light. She opened the door slowly and walked in, not knowing that something very special was waiting for her inside. 1

Question: What is Mira holding?
Answer:


In [19]:
# Send the assembled prompt to Gemini and show the text of its reply.
response = client.models.generate_content(model="gemini-2.0-flash", contents=prompt)
print(response.text)

Mira is holding a small postcard.

