In [None]:
# !pip install google-cloud-storage google-cloud-vision google-generativeai PyPDF2

In [None]:

from google.cloud import storage
import json
import re

# Connect to Cloud Storage and bind the bucket that the listing cells read from.
bucket_name = "books-mcq"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

# Confirmation message so the cell shows which bucket is active.
print("Storage client initialized for bucket:", bucket_name)

In [None]:
# Collect the object names of every parsed chunk file stored under 'parsed_files/'.
chunk_files = [
    name
    for name in (blob.name for blob in bucket.list_blobs(prefix="parsed_files/"))
    if name.endswith("_chunks.json")
]

# Report the result; the empty case gets its own message.
if not chunk_files:
    print("No parsed chunk files found in bucket under 'parsed_files/'")
else:
    print(f"Found {len(chunk_files)} parsed chunk files:")
    for i, file in enumerate(chunk_files, 1):
        print(f"{i}. {file}")

In [None]:
# Names of all PDFs anywhere in the bucket (extension match is case-insensitive).
pdf_files = [
    name
    for name in (blob.name for blob in bucket.list_blobs())
    if name.lower().endswith(".pdf")
]
print(f"Found {len(pdf_files)} PDF files:")
for i, file in enumerate(pdf_files, 1):
    print(f"{i}. {file}")

# Names of the parsed chunk files that already exist under 'parsed_files/'.
chunk_files = [
    name
    for name in (blob.name for blob in bucket.list_blobs(prefix="parsed_files/"))
    if name.endswith("_chunks.json")
]
print(f"\nFound {len(chunk_files)} parsed chunk files:")
for i, file in enumerate(chunk_files, 1):
    print(f"{i}. {file}")

In [None]:
from crewai import Agent, Task, Crew
from crewai_tools import FileReadTool, DirectoryReadTool
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from pydantic import BaseModel, Field
from typing import List, Dict, Type
from google import genai
from google.genai.types import EmbedContentConfig
from google.cloud import storage
import json

# GCS & Gemini setup
# Object-name prefix for stored vectors; nothing else in this cell reads it.
vector_prefix = "vector_store_cleaned_new/"
# Gemini client routed through Vertex AI for the given project and region.
# NOTE(review): the FAISS query cell uses project "summarize-39910" — confirm which id is correct.
client = genai.Client(vertexai=True, project="summarize-39810", location="us-central1")
# Rebinds `bucket` to "books-for-mcq"; the first cells of the notebook use "books-mcq".
# NOTE(review): confirm which bucket actually holds 'parsed_files/'.
bucket = storage.Client().bucket("books-for-mcq")

# Input schema for duplicate checker
# Declares the single argument accepted by DuplicateCheckerTool._run.
class DuplicateCheckerInput(BaseModel):
    # Required field (no default). _run reads "chunk_id" and "text" from each entry.
    chunks: List[Dict] = Field(..., description="List of text chunks to check for duplicates")

# Custom tool for duplicate detection
class DuplicateCheckerTool(BaseTool):
    # Tool identity and argument schema.
    name: str = "DuplicateChecker"
    description: str = "Detects duplicate chunk IDs or text."
    args_schema: Type[BaseModel] = DuplicateCheckerInput

    def _run(self, chunks: List[Dict]) -> Dict:
        """Split ``chunks`` into first-seen unique chunks and reported duplicates.

        A chunk whose ``chunk_id`` was already kept is reported under
        ``duplicates["ids"]``; otherwise, a chunk whose ``text`` was already
        kept is reported under ``duplicates["texts"]`` (by its chunk id).
        Everything else is kept, in input order.

        Args:
            chunks: Dicts that each provide ``"chunk_id"`` and ``"text"``.

        Returns:
            ``{"unique_chunks": [...], "duplicates": {"ids": [...], "texts": [...]}}``.

        Raises:
            KeyError: If a chunk lacks ``"chunk_id"`` or ``"text"``.
        """
        known_ids = set()
        known_texts = set()
        repeated_ids = []
        repeated_texts = []
        kept = []

        for entry in chunks:
            entry_id = entry["chunk_id"]
            entry_text = entry["text"]

            # The id check takes precedence over the text check.
            if entry_id in known_ids:
                repeated_ids.append(entry_id)
                continue
            if entry_text in known_texts:
                repeated_texts.append(entry_id)
                continue

            known_ids.add(entry_id)
            known_texts.add(entry_text)
            kept.append(entry)

        return {
            "unique_chunks": kept,
            "duplicates": {"ids": repeated_ids, "texts": repeated_texts},
        }

# Embedding function
def generate_embeddings(texts):
    """Embed a batch of texts with ``text-embedding-005`` (768 dimensions).

    Uses the module-level Gemini ``client`` and the ``RETRIEVAL_DOCUMENT``
    task type.

    Args:
        texts: Sequence of strings to embed in a single request.

    Returns:
        A list with one entry per input text: the embedding values on
        success, or ``None`` for every entry when the request fails.
        An empty input returns ``[]`` without contacting the API.
    """
    # Nothing to embed: skip the request instead of sending an empty batch.
    if len(texts) == 0:
        return []
    try:
        response = client.models.embed_content(
            model="text-embedding-005",
            contents=texts,
            config=EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT", output_dimensionality=768)
        )
        # Index by input position so a short response is treated as a failure below.
        return [response.embeddings[i].values for i in range(len(texts))]
    except Exception as e:
        # Best-effort by design: callers check for None entries rather than catching.
        print(f"Error embedding: {e}")
        return [None] * len(texts)

# Load pre-parsed chunks from GCS
def load_parsed_chunks():
    """Load and concatenate every ``*_chunks.json`` file under ``parsed_files/``.

    Reads from the module-level ``bucket``. Each file may be either a bare
    JSON list of chunks or an object of the form ``{"chunks": [...]}``.

    Returns:
        A single list with the chunks of all files, in listing order.
    """
    prefix = "parsed_files/"
    chunk_files = [b.name for b in bucket.list_blobs(prefix=prefix) if b.name.endswith("_chunks.json")]
    all_chunks = []
    for fname in chunk_files:
        payload = json.loads(bucket.blob(fname).download_as_text())
        # Other cells read these files as {"chunks": [...]}. Extending with that
        # dict directly would append its keys (strings) instead of the chunks.
        if isinstance(payload, dict):
            payload = payload.get("chunks", [])
        all_chunks.extend(payload)
    return all_chunks

# Define agent
# NOTE(review): `llm` is not defined in this cell or in any earlier cell shown in
# this notebook — on a fresh kernel this raises NameError. Confirm where it comes from.
# NOTE(review): generate_embeddings and load_parsed_chunks are defined above but are
# not in the agent's tool list — confirm the agent can actually reach GCS and the
# embedding API for the tasks described below.
data_engineer = Agent(
    role="Data Engineer",
    goal="Deduplicate and embed pre-parsed chunks",
    backstory="Expert in data pipelines, GCS, and embeddings.",
    tools=[FileReadTool(), DirectoryReadTool(), DuplicateCheckerTool()],
    llm=llm,  # ✅ Pass your LLM here
    verbose=True
)


# Define tasks
# All three tasks are assigned to the same agent and are listed in this order in the crew.
task_deduplicate = Task(
    description="Load all chunked JSONs from 'parsed_files/', deduplicate chunk_ids and text.",
    expected_output="Deduplicated chunk list and report of duplicates.",
    agent=data_engineer
)

task_vectorize = Task(
    description="Generate embeddings for all unique chunks using text-embedding-005 and save with metadata.",
    expected_output="Vector list with chunk_id, embedding, and metadata.",
    agent=data_engineer
)

# The expected chunk count (~20,944) is hard-coded into the task description.
task_verify = Task(
    description="Verify total unique chunks (~20,944) and locate chunks mentioning 'reinforcement learning'.",
    expected_output="Count and sample of RL-relevant chunks.",
    agent=data_engineer
)

# Assemble crew
crew = Crew(
    agents=[data_engineer],
    tasks=[task_deduplicate, task_vectorize, task_verify],
    verbose=True
)

# Execute
# kickoff() runs the crew; its return value is printed as the final result.
print("Starting CrewAI pipeline...")
result = crew.kickoff()
print("Pipeline complete. Final result:", result)


In [None]:
# Embedding function / Testing with Weaviate
def generate_embeddings(texts):
    """Return one embedding per text, or a list of ``None`` if the request fails.

    The batch is embedded in a single call through the module-level ``client``
    using model ``text-embedding-005``, task type ``RETRIEVAL_DOCUMENT`` and
    768 output dimensions. Any exception is printed and converted into a list
    of ``None`` with the same length as ``texts``.
    """
    try:
        response = client.models.embed_content(
            model="text-embedding-005",
            contents=texts,
            config=EmbedContentConfig(task_type="RETRIEVAL_DOCUMENT", output_dimensionality=768)
        )
        vectors = [response.embeddings[pos].values for pos in range(len(texts))]
    except Exception as err:
        print(f"Error embedding: {err}")
        vectors = [None] * len(texts)
    return vectors

# Test chunk
# A single hand-written chunk with the same three keys the collection schema declares below.
test_chunk = {
    "chunk_id": "test_chunk_1",
    "text": "Reinforcement learning is a machine learning paradigm where agents learn by interacting with an environment.",
    "metadata": {"source": "test", "page": 1}
}

# Create or get collection with nested metadata properties
# NOTE(review): `weaviate_client`, `Property` and `DataType` are only bound in a later
# cell of this notebook — on a fresh top-to-bottom run this cell fails unless that
# cell has been executed first.
collection_name = "TestBookChunks"
# Destructive: an existing collection with this name is dropped before re-creating it.
if weaviate_client.collections.exists(collection_name):
    weaviate_client.collections.delete(collection_name)  # Reset for clean test
weaviate_client.collections.create(
    name=collection_name,
    vectorizer_config=None,  # Custom embeddings
    properties=[
        Property(name="chunk_id", data_type=DataType.TEXT),
        Property(name="text", data_type=DataType.TEXT),
        # `metadata` is stored as an object whose nested fields mirror test_chunk["metadata"].
        Property(
            name="metadata",
            data_type=DataType.OBJECT,
            nested_properties=[
                Property(name="source", data_type=DataType.TEXT),
                Property(name="page", data_type=DataType.INT)
            ]
        )
    ]
)

# Embed the test chunk and write it to the collection together with its vector.
collection = weaviate_client.collections.get(collection_name)
embedding = generate_embeddings([test_chunk["text"]])[0]
if not embedding:
    print("Failed to generate embedding for test chunk")
else:
    with collection.batch.dynamic() as batch:
        batch.add_object(
            # Same three fields as the schema: chunk_id, text and the nested metadata dict.
            properties={field: test_chunk[field] for field in ("chunk_id", "text", "metadata")},
            vector=embedding,
        )
    print(f"Stored test chunk in Weaviate '{collection_name}'")

# Count the objects now present in the collection.
total_count = collection.aggregate.over_all(total_count=True).total_count
print(f"Total objects in '{collection_name}': {total_count}")

# Query test
# Embeds a query, fetches its nearest stored object and prints it. The client is
# closed in `finally` so a failed query does not leave the connection open.
query = "reinforcement learning"
try:
    query_embedding = generate_embeddings([query])[0]
    if query_embedding is None:
        # generate_embeddings yields None entries when the embedding request fails;
        # a vector search cannot run without a query vector.
        print("Failed to generate embedding for query; skipping search")
    else:
        response = collection.query.near_vector(near_vector=query_embedding,limit=1,return_metadata=["distance"])
        if response.objects:
            obj = response.objects[0]
            print("\nQueried result:")
            print(f"Chunk ID: {obj.properties['chunk_id']}")
            print(f"Text: {obj.properties['text']}")
            print(f"Metadata: {obj.properties['metadata']}")
            print(f"Distance: {obj.metadata.distance:.4f}")
        else:
            print("No results found in query")
finally:
    # Cleanup
    weaviate_client.close()

In [None]:
# Working 5-chunk test with Weaviate, for development purposes
from crewai import Agent, Task, Crew
from google.cloud import storage
import weaviate
from weaviate.classes.init import Auth
from weaviate.classes.config import Property, DataType
import json

# GCS & Weaviate setup
import os  # cell-local import: needed to read the Weaviate credentials below

bucket = storage.Client().bucket("books-for-mcq")

# Credentials are read from the environment so the API key never lives in the notebook.
# NOTE: the key that used to be hardcoded here must be treated as leaked and rotated.
weaviate_api_key = os.environ.get("WEAVIATE_API_KEY")
if not weaviate_api_key:
    raise RuntimeError("Set the WEAVIATE_API_KEY environment variable before running this cell")

weaviate_client = weaviate.connect_to_weaviate_cloud(
    # The cluster URL can be overridden via WEAVIATE_URL; the default is the original cluster.
    cluster_url=os.environ.get(
        "WEAVIATE_URL",
        "https://oejgmjaer1on7rjlm7lw.c0.us-east1.gcp.weaviate.cloud",
    ),
    auth_credentials=Auth.api_key(weaviate_api_key)
)

# Load 5 chunks
# The file is read as an object with a "chunks" list; only the first five entries are kept.
blob = bucket.blob("parsed_files/cleaned_combined_chunks.json")
chunks = json.loads(blob.download_as_text()).get("chunks", [])[:5]
# Define a custom tool for Weaviate storage
# NOTE(review): BaseTool is imported in an earlier cell (from crewai.tools), not in this one.
class WeaviateStoreTool(BaseTool):
    name: str = "WeaviateStore"
    description: str = "Stores a chunk in Weaviate with chunk_id, text, and metadata."

    def _run(self, chunk: dict) -> str:
        """Store one chunk in the ``BookChunks`` collection.

        Args:
            chunk: Dict that must contain ``chunk_id``, ``text`` and ``metadata``.
                The whole dict is written as the object's properties.

        Returns:
            A confirmation string naming the stored chunk id.

        Raises:
            ValueError: If any required key is missing from ``chunk``.
        """
        required_keys = {"chunk_id", "text", "metadata"}
        missing_keys = required_keys - chunk.keys()
        if missing_keys:
            raise ValueError(f"Chunk missing required keys: {missing_keys}")
        collection = weaviate_client.collections.get("BookChunks")
        # Single-object insert: a failed write raises here. A batch context collects
        # per-object errors instead, so the tool could report success for a chunk
        # that was never stored.
        # The vector is a constant placeholder ([0.1] * 768), not a real embedding.
        collection.data.insert(properties=chunk, vector=[0.1] * 768)
        return f"Stored chunk {chunk['chunk_id']}"

# One agent with one task: store the five loaded chunks through WeaviateStoreTool.
agent = Agent(
    llm="openrouter/deepseek/deepseek-chat",
    role="Data Engineer",
    goal="Store chunks",
    backstory="Expert in managing data pipelines and vector storage."
)
task = Task(
    agent=agent,
    tools=[WeaviateStoreTool()],
    # The chunks are serialised directly into the task description.
    description="".join(["Store these 5 chunks in Weaviate one by one: ", json.dumps(chunks)]),
    expected_output="Stored 5 chunks"
)

# Run the crew and show what it reports.
crew = Crew(verbose=True, agents=[agent], tasks=[task])
result = crew.kickoff()
print("Result:", result)

# Count the objects in the collection, then release the connection.
collection = weaviate_client.collections.get("BookChunks")
print(
    "Total in Weaviate:",
    collection.aggregate.over_all(total_count=True).total_count,
)
weaviate_client.close()

In [None]:
# -*- coding: utf-8 -*-
"""Query FAISS Index from Vectors Stored in GCS"""

from google.cloud import storage
from google import genai
from google.genai.types import EmbedContentConfig
import json
import numpy as np
import faiss

# Setup
# NOTE(review): an earlier cell creates its Gemini client with project "summarize-39810" —
# confirm which project id is correct.
project_id = "summarize-39910"
location = "us-central1"
# NOTE(review): other cells bind `bucket` to "books-for-mcq" — confirm which bucket holds the vectors.
bucket_name = "books-mcq"
# Prefix used below to list vector files.
# NOTE(review): later in this cell the chunk-file name is derived by replacing
# "vector_store_cleaned_new/", which does not match this value.
vector_prefix = "vector_store_cleaned/"

# Gemini client routed through Vertex AI; generate_embedding() below uses it.
client = genai.Client(
    vertexai=True,
    project=project_id,
    location=location
)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

# Load first vector file
# Take the first blob under the prefix. If there is none, fail here with a clear
# message instead of a NameError on `vector_data` further down.
blob = next(iter(bucket.list_blobs(prefix=vector_prefix, max_results=1)), None)
if blob is None:
    raise FileNotFoundError(f"No vector files found under gs://{bucket_name}/{vector_prefix}")
print(f"\n📂 Loading vector file: {blob.name}")
vector_data = json.loads(blob.download_as_string())

# Create an HNSW index over 768-dimensional vectors (32 is passed as the second constructor argument).
dimension = 768
index = faiss.IndexHNSWFlat(dimension, 32)
index.hnsw.efConstruction = 100

# Parallel lists: position i holds the chunk id / metadata of the i-th added vector.
ids, metas = [], []

print(f"📊 Building index from {len(vector_data)} vectors...")
for item in vector_data:
    vector = np.array(item['embedding'], dtype='float32')
    # Add the vector as a single-row batch.
    index.add(vector[np.newaxis])
    ids.append(item['chunk_id'])
    metas.append(item.get('metadata', {}))

print(f"✅ FAISS index built with {len(ids)} vectors.")

# Query
def generate_embedding(text, task_type="RETRIEVAL_QUERY"):
    """Embed a single text with ``text-embedding-005`` (768 dimensions).

    Every caller in this notebook embeds a search query, so the default task
    type is ``RETRIEVAL_QUERY``, the counterpart of the ``RETRIEVAL_DOCUMENT``
    task type used when the chunks themselves are embedded.

    Args:
        text: The string to embed.
        task_type: Embedding task type passed to the API. Use
            ``"RETRIEVAL_DOCUMENT"`` to embed corpus text instead of a query.

    Returns:
        The embedding values of ``text``.
    """
    response = client.models.embed_content(
        model="text-embedding-005",
        contents=[text],
        config=EmbedContentConfig(task_type=task_type, output_dimensionality=768)
    )
    return response.embeddings[0].values

query_text = input("\n🔍 Enter your query (e.g., 'What is reinforcement learning?'):\n> ")

# Embed the query as a single-row float32 matrix and search the five nearest vectors.
query_embedding = np.array(generate_embedding(query_text), dtype='float32').reshape(1, -1)
index.hnsw.efSearch = 100
distances, indices = index.search(query_embedding, k=5)

# Print each hit with its rank (starting at 1), chunk id, distance and metadata.
print(f"\n🔎 Top 5 Matches for: '{query_text}'")
for rank in range(1, len(indices[0]) + 1):
    idx = indices[0][rank - 1]
    dist = distances[0][rank - 1]
    print(f"\n#{rank}: Chunk ID: {ids[idx]}")
    print(f"   Distance: {dist:.4f}")
    print(f"   Metadata: {metas[idx]}")

# Load original chunks to get text content
# Derive the chunk file from the vector file: swap the filename suffix and the folder prefix.
# The prefix must be the one the blob was listed under (`vector_prefix`); replacing a
# different literal leaves the name unchanged and points at a non-existent chunk file.
chunk_blob_name = blob.name.replace("_vectors.json", "_chunks.json").replace(vector_prefix, "parsed_files/")
chunk_blob = bucket.blob(chunk_blob_name)
chunk_data = json.loads(chunk_blob.download_as_string())
# Lookup table from chunk id to chunk text.
chunk_dict = {c['chunk_id']: c['text'] for c in chunk_data['chunks']}

# Show text for matched chunks
print(f"\n📄 Text content of top 5 matched chunks:\n")
for rank, idx in enumerate(indices[0]):
    # FAISS pads missing neighbours with -1, which would silently index the last id.
    if idx < 0:
        continue
    cid = ids[idx]
    text = chunk_dict.get(cid, "[Text not found]")
    print(f"#{rank+1}: Chunk ID: {cid}")
    print(f"📘 Text Preview: {text[:30000]}")


In [None]:
# Augment whole dataset
from google.cloud import storage
import pandas as pd
import io

# Locate the master MCQ spreadsheet in Cloud Storage.
bucket_name = "books-mcq"
file_path = "Quizxlsx/MCQsDataset.xlsx"
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(file_path)

# Read the workbook straight from memory, without writing a local file.
excel_bytes = blob.download_as_bytes()
df = pd.read_excel(io.BytesIO(excel_bytes))

# Serialise the rows as a JSON array of records; later cells reuse this string as prompt context.
mcq_json = df.to_dict("records")
master_mcq_dataset_to_refer_string = json.dumps(mcq_json, indent=2)
print(master_mcq_dataset_to_refer_string[:1000])

# Quick sanity check of what was loaded.
print(f"✅ Loaded sheet with shape: {df.shape}")
print(df.head())


In [None]:
# Generate MCQ for FAANG interview for Artificial Intelligence
# Artificial Intelligence scientist applied-role interview questions: generate 10 MCQs
# The HttpOptions import below had been fused into the comment above, which left
# `HttpOptions` unbound on a fresh kernel. `genai` and `json` are imported here as
# well so this cell does not depend on the import cells above.
from google import genai
from google.genai.types import HttpOptions
import json
import numpy as np
from google.cloud import storage
import time
start = time.time()
# Rebinds the module-level `client`; generate_embedding() uses whichever client is current.
client = genai.Client(http_options=HttpOptions(api_version="v1"))
storage_client = storage.Client()
bucket_name = "books-for-mcq"
bucket = storage_client.bucket(bucket_name)

# Query FAISS and generate MCQ with user input
def _load_first_parsed_chunks():
    """Return the ``chunks`` list of the first ``*_chunks.json`` blob under ``parsed_files/``."""
    for blob in bucket.list_blobs(prefix='parsed_files/'):
        if blob.name.endswith('_chunks.json'):
            return json.loads(blob.download_as_string())['chunks']
    raise FileNotFoundError("No '*_chunks.json' file found under 'parsed_files/'")


def _build_mcq_prompt(query_text, relevant_chunks, master_dataset):
    """Assemble the generation prompt from the query, ``(chunk_id, text)`` pairs and the dataset."""
    chunk_section = "\n\n".join(
        f"Chunk ID: {chunk_id}\n{text}" for chunk_id, text in relevant_chunks
    )
    return (
        f"Based on the following text chunks and the user query '{query_text}', "
        "and referring to the master dataset, generate a multiple-choice question (MCQ) "
        "with 4 options (A, B, C, D). Provide the question, options, and the correct "
        "answer in this format:\n"
        "Question: [Your question here]\n"
        "A) [Option A]\n"
        "B) [Option B]\n"
        "C) [Option C]\n"
        "D) [Option D]\n"
        "Correct Answer: [Letter]\n"
        "Explanation: [Brief justification based on chunk content]\n"
        "Source Chunk IDs: [List of chunk_ids used for answer]\n\n"
        "Example:\n"
        "Question: What is the primary goal of reinforcement learning?\n"
        "A) Minimize prediction error\n"
        "B) Maximize cumulative reward\n"
        "C) Classify data\n"
        "D) Reduce variance\n"
        "Correct Answer: B\n"
        "Explanation: This is stated in the text where it discusses reinforcement "
        "learning agents maximizing expected reward.\n"
        "Source Chunk IDs: 122_0, 1041_5\n\n"
        "Source Chunk Text excerpt used: \"Excerpt in 100 words\".\n\n"
        "Master Dataset to refer to before producing the MCQs "
        "(then refer to the chunks for context):\n"
        + master_dataset
        + "\n\nChunks:\n"
        + chunk_section
        + "\n\nReturn first all MCQs generated by you, then the chunks referred to above "
        "as a whole separate section.\n"
        "Return in pure json format with keys as MCQ, Reference_dataset, "
        "Reference_chunks and corresponding values.\n"
    )


def generate_mcq(index, ids, metadata_list, top_k=3, query_text=None):
    """Retrieve the chunks nearest to a query and ask Gemini for an MCQ.

    Relies on module-level ``generate_embedding``, ``bucket``, ``client`` and
    ``master_mcq_dataset_to_refer_string``.

    Args:
        index: FAISS HNSW index to search (``hnsw.efSearch`` is set to 100).
        ids: Chunk ids, where position ``i`` belongs to the i-th indexed vector.
        metadata_list: Accepted for interface compatibility; not used.
        top_k: Number of nearest chunks to retrieve.
        query_text: The query. When ``None`` the user is prompted via ``input()``.

    Returns:
        The text of the model response (also printed).

    Raises:
        FileNotFoundError: If no ``*_chunks.json`` file exists under ``parsed_files/``.
    """
    if query_text is None:
        query_text = input("Enter your query (e.g., 'Generate MCQ for FAANG interview for data scientist position'): ")

    query_embedding = np.array([generate_embedding(query_text)], dtype='float32')
    index.hnsw.efSearch = 100
    distances, indices = index.search(query_embedding, top_k)

    # Reload chunk data from GCS
    chunks = _load_first_parsed_chunks()
    text_by_id = {c.get('chunk_id'): c.get('text', 'No text') for c in chunks}

    print(f"\nTop {top_k} matches for '{query_text}':")
    relevant_chunks = []
    for dist, idx in zip(distances[0], indices[0]):
        # FAISS pads missing neighbours with -1, which would silently index the last entry.
        if idx < 0:
            continue
        chunk_id = ids[idx]
        if chunk_id in text_by_id:
            text = text_by_id[chunk_id]
        elif idx < len(chunks):
            # Fallback to the previous positional lookup.
            # NOTE(review): this assumes the chunk file is ordered like the vector file — confirm.
            text = chunks[idx].get('text', 'No text')
        else:
            text = 'No text'
        print(f"#{len(relevant_chunks) + 1}: Chunk ID: {chunk_id}, Distance: {dist}, Text: {text}")
        relevant_chunks.append((chunk_id, text))

    prompt = _build_mcq_prompt(query_text, relevant_chunks, master_mcq_dataset_to_refer_string)

    start = time.time()
    response = client.models.generate_content(
        model="gemini-1.5-pro", #  gemini-2.0-flash
        contents=prompt
    )
    end = time.time()

    print("\nGenerated MCQ:")
    print(response.text)
    print(f"⏱️ Response Time: {end - start:.2f} seconds")
    return response.text

# Run it
if __name__ == "__main__":
    # The FAISS cell builds the metadata list as `metas`, alongside `index` and `ids`;
    # `metadata_list` is never defined in the notebook and raised NameError here.
    generate_mcq(index, ids, metas)


In [None]:
# Create and load Cache 
from google.cloud import storage
from google import genai
from google.genai.types import HttpOptions, Content, CreateCachedContentConfig
import pandas as pd
import io
import json

# Clients: Gemini (API version v1) for the cache, Cloud Storage for the spreadsheet.
genai_client = genai.Client(http_options=HttpOptions(api_version="v1"))
storage_client = storage.Client()
bucket = storage_client.bucket("books-for-mcq")

# Fetch the master MCQ workbook and parse it in memory.
blob = bucket.blob("Quizxlsx/MCQsDatasetTIll3800.xlsx")
excel_bytes = blob.download_as_bytes()
df = pd.read_excel(io.BytesIO(excel_bytes))

# Serialise all rows as a JSON array of records; this string becomes the cached content.
master_mcq_dataset_to_refer_string = json.dumps(df.to_dict("records"), indent=2)
print(f"Loaded dataset with {len(df)} records, preview: {master_mcq_dataset_to_refer_string[:100]}...")

# Create cache
# The cache holds the system instruction plus the whole master dataset as one user turn.
system_instruction = "You are an expert MCQ generator for FAANG interviews, using a master dataset and book chunks."

contents = [Content(role="user", parts=[dict(text=master_mcq_dataset_to_refer_string)])]

content_cache = genai_client.caches.create(
    model="gemini-1.5-flash-002",
    config=CreateCachedContentConfig(
        display_name="master_mcq_dataset_cache",
        system_instruction=system_instruction,
        contents=contents,
        ttl="86400s",  # 24 hours
    ),
)

# The printed name is what later cells pass as `cached_content`.
print(f"✅ Cache created: {content_cache.name}")


In [None]:
# Check Cache
from google import genai
from google.genai.types import HttpOptions

# Initialize client with beta version
client = genai.Client(http_options=HttpOptions(api_version="v1beta1"))

# List all caches in the current project and location
content_cache_list = client.caches.list()

# Loop through and print details
for content_cache in content_cache_list:
    print(f"🧠 Cache: {content_cache.name}")
    print(f"📦 Model: {content_cache.model}")
    print(f"🕒 Last updated: {content_cache.update_time}")
    print(f"⌛ Expires: {content_cache.expire_time}")

    # The attribute can exist and still be None, so test the value rather than
    # its presence before reading the usage fields.
    usage = getattr(content_cache, "usage_metadata", None)
    if usage is not None:
        print(f"🧮 Token Count: {usage.total_token_count}")
        print(f"🖼️ Image Count: {usage.image_count}")
        print(f"🎞️ Video Duration: {usage.video_duration_seconds}")
        print(f"🔊 Audio Duration: {usage.audio_duration_seconds}")
    print("─" * 60)


In [None]:
# Cache reduced 85s to 21s response time
from google import genai
from google.genai.types import GenerateContentConfig, HttpOptions
import time 
client = genai.Client(http_options=HttpOptions(api_version="v1beta1"))

# Hard-coded name of a previously created cache.
# NOTE(review): the next cell uses a different cachedContents id — confirm which one is current.
cache_name = "projects/493443117630/locations/us-central1/cachedContents/821217382044999680"
query_text = "Generate a MCQ for FAANG data scientist interview on supervised learning."

# Time a single generation request that reuses the cached master dataset.
start = time.time()
response = client.models.generate_content(
    model="gemini-1.5-flash-002",
    contents=[dict(role="user", parts=[dict(text=query_text)])],
    config=GenerateContentConfig(cached_content=cache_name),
)
time_taken = time.time() - start

# Show the model output followed by the elapsed wall-clock time.
print("🎯 Generated MCQ:\n")
print(response.text)

print(f"Time taken: {time_taken} seconds")



In [None]:
# Cache + vectors reduced 85s to 19s response time
from google import genai
from google.genai.types import GenerateContentConfig, HttpOptions
import numpy as np
import time
import json

# Initialize Gemini client (v1beta1 for cache support)
# Rebinds the module-level `client`; generate_embedding() below uses it.
client = genai.Client(http_options=HttpOptions(api_version="v1beta1"))

# Pre-set: Cache with master MCQ dataset
# NOTE(review): the previous cell uses a different cachedContents id — confirm which one is current.
cache_name = "projects/493443117630/locations/us-central1/cachedContents/821217538204499680"

# Get user query
query_text = "Generate 3 MCQs for FAANG interview with references."

# Use FAISS to retrieve top-k relevant chunks
query_embedding = np.array([generate_embedding(query_text)], dtype='float32')
index.hnsw.efSearch = 100
distances, indices = index.search(query_embedding, 5)

# Load chunks from GCS
# Scan the prefix until the first '*_chunks.json' file. With max_results=1, only the
# first blob was ever inspected, so a non-matching first blob left `chunk_data`
# unbound (or silently stale from an earlier cell).
chunk_data = None
for blob in bucket.list_blobs(prefix='parsed_files/'):
    if blob.name.endswith('_chunks.json'):
        chunk_data = json.loads(blob.download_as_string())
        break
if chunk_data is None:
    raise FileNotFoundError("No '*_chunks.json' file found under 'parsed_files/'")

# Gather top chunks and their IDs
# NOTE(review): chunks are looked up by FAISS position, which assumes the chunk file
# is ordered like the indexed vector file — confirm.
retrieved_chunks = []
retrieved_chunk_ids = []
for dist, idx in zip(distances[0], indices[0]):
    # FAISS pads missing neighbours with -1, which would silently pick the last chunk.
    if idx < 0:
        continue
    text = chunk_data['chunks'][idx].get("text", "")
    cid = chunk_data['chunks'][idx].get("chunk_id", "")
    retrieved_chunks.append(text)
    retrieved_chunk_ids.append(cid)

# Prepare the prompt for Gemini
# Only the top 3 retrieved chunks are shown to the model, so only their ids are
# listed as references; listing all retrieved ids would cite unseen chunks.
prompt_chunks = retrieved_chunks[:3]
prompt_chunk_ids = retrieved_chunk_ids[:3]

# Doubled braces ({{ }}) are literal braces in the f-string's JSON template.
prompt_text = f"""
User Query: {query_text}

Below are relevant textbook chunks retrieved via vector search:
{json.dumps(prompt_chunks, indent=2)}

Provide 3 MCQs in JSON format based on the above chunks and your cached master dataset.
Source preferred (as in cache: {cache_name}): use interviewquery, NeetcodeYT, Top 5 Algorithms Qs YT, Algorithm Class Notes, interviewquery.com
Topic preferred: Data Structures, Machine learning and Algorithms. 
After selection, apply a real life use case as it would be used in a research in Thomson Reuters.
Return format:
{{
  "MCQ": [...],
  "Source": Source,
  "Topic": topics used,
  "Reference_dataset": "Gemini cache used: {cache_name}",
  "Reference_chunks": {json.dumps(prompt_chunk_ids)}
}}
"""

# Send the prompt as a single user turn, reusing the cached master dataset, and time the call.
start_time = time.time()
response = client.models.generate_content(
    model="gemini-1.5-flash-002",
    contents=[dict(role="user", parts=[dict(text=prompt_text)])],
    config=GenerateContentConfig(cached_content=cache_name),
)
end_time = time.time()

# Show the model output followed by the elapsed wall-clock time.
print("🎯 Generated Response:\n")
print(response.text)
print(f"⏱️ Time taken: {end_time - start_time:.2f} seconds")
