# GFS Setup: File Search Store Creation

This notebook sets up Google Generative File Search (GFS) for the RAG comparison.

**Objectives**:
1. Initialize GFS client with API key
2. Create file search store
3. Upload GFS-compatible documents from `data/raw/`
4. Check indexing status (active, pending, and failed document counts)
5. Test basic queries

In [None]:
import sys
from pathlib import Path
import json
import time

# Add src to path so the project-local modules imported below can be resolved.
# NOTE(review): this takes the parent of the current working directory as the
# project root, so it presumably expects the kernel to be started from a
# sub-folder of the project (e.g. notebooks/) — confirm before running elsewhere.
project_root = Path.cwd().parent
sys.path.append(str(project_root / "src"))

from gfs_client import GFSClient
from data_loader import scan_documents, check_gfs_compatibility
from utils import load_api_key

import polars as pl

print("Imports successful")

## 1. Initialize GFS Client

In [None]:
# Read the Google API key from the .env file at the project root.
env_file = project_root / ".env"
api_key = load_api_key("GOOGLE_API_KEY", str(env_file))

# Build the client object that the remaining cells use for every GFS call.
gfs = GFSClient(model_id="gemini-2.0-flash-exp", api_key=api_key)

print("GFS client initialized")

## 2. Check Existing Stores

In [None]:
# List the stores returned by the client so duplicates are easy to spot
# before a new store is created in the next section.
existing_stores = gfs.list_stores()

print(f"Existing stores: {len(existing_stores)}")
# The loop variable is deliberately NOT called `store`: notebook loop variables
# leak into the global namespace, and `store` is the name later cells use for
# the newly created store. Reusing it here would let those cells silently run
# against the last pre-existing store if the creation cell were skipped.
for existing_store in existing_stores:
    print(f"  - {existing_store.display_name}: {existing_store.name}")

## 3. Create New File Search Store

In [None]:
# Create store
# NOTE(review): the create call below runs unconditionally, so re-running this
# cell presumably creates another store with the same display name — confirm
# against GFSClient.create_file_search_store before re-executing.
store_display_name = "RAG Comparison Document Store"

store = gfs.create_file_search_store(display_name=store_display_name)

print(f"Created store: {store.display_name}")
print(f"Store name: {store.name}")
print(f"Created at: {store.create_time}")

# Save store metadata so the store identifiers are available on disk
store_metadata = {
    "display_name": store.display_name,
    "store_name": store.name,
    # create_time is stringified because json.dump cannot serialize it otherwise
    "create_time": str(store.create_time),
}

metadata_path = project_root / "models" / "gfs_stores" / "metadata.json"
# open(..., "w") does not create missing parent directories; make sure the
# output folder exists so a fresh checkout does not fail with FileNotFoundError.
metadata_path.parent.mkdir(parents=True, exist_ok=True)
with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(store_metadata, f, indent=2)

print(f"\nMetadata saved to: {metadata_path}")

## 4. Scan and Upload Documents

In [None]:
# Inventory everything under data/raw/ before deciding what can be uploaded.
data_dir = project_root / "data" / "raw"
df = scan_documents(data_dir)

print(f"Total files found: {len(df)}")

if len(df) > 0:
    # Annotate each file with a compatibility flag, then keep the flagged rows.
    df_compat = check_gfs_compatibility(df)
    compatible_files = df_compat.filter(pl.col("gfs_compatible"))

    n_compatible = len(compatible_files)
    print(f"Compatible files: {n_compatible}")
    print(f"Incompatible files: {len(df) - n_compatible}")
else:
    # Nothing to analyse; `compatible_files` is intentionally left undefined
    # here, and later cells check `len(df) > 0` first before touching it.
    print("\nNo files found in data/raw/")
    print("Add documents to continue.")

In [None]:
# Upload compatible files to store, recording one result entry per file.
if len(df) > 0 and len(compatible_files) > 0:
    upload_results = []
    total_files = len(compatible_files)

    for i, row in enumerate(compatible_files.iter_rows(named=True), start=1):
        file_path = Path(row["file_path"])
        print(f"\nUploading {i}/{total_files}: {file_path.name}")

        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments during a long upload.
        start_time = time.perf_counter()
        try:
            gfs.upload_to_store(
                store_name=store.name,
                file_path=file_path,
                wait_for_completion=True
            )
        except Exception as e:
            # Deliberate best-effort: one failing file must not abort the batch.
            # The error is recorded and the loop moves on to the next file.
            upload_results.append({
                "file_name": file_path.name,
                "status": "failed",
                "error": str(e)
            })
            print(f"  ✗ Failed: {e}")
        else:
            # Success bookkeeping lives outside the try body so that only
            # errors raised by the upload call itself are reported as failures.
            elapsed = time.perf_counter() - start_time
            upload_results.append({
                "file_name": file_path.name,
                "status": "success",
                "upload_time_seconds": elapsed
            })
            print(f"  ✓ Uploaded successfully ({elapsed:.1f}s)")

    succeeded = sum(1 for result in upload_results if result["status"] == "success")
    print(f"\nUploads succeeded: {succeeded}/{total_files}")

    # Save upload results
    results_path = project_root / "models" / "gfs_stores" / "upload_results.json"
    # Ensure the output folder exists; open(..., "w") will not create it.
    results_path.parent.mkdir(parents=True, exist_ok=True)
    with open(results_path, "w", encoding="utf-8") as f:
        json.dump(upload_results, f, indent=2)

    print(f"\nUpload results saved to: {results_path}")
else:
    print("No compatible files to upload")

## 5. Verify Store Status

In [None]:
# Get updated store info and print its size and document counters.
store_info = gfs.get_store_info(store.name)

# NOTE(review): size_bytes looks like an optional field on the store object and
# may be None (presumably for a store with no indexed documents) — confirm.
# Falling back to 0 keeps this cell from raising TypeError on the division,
# which matters because the cell also runs when nothing was uploaded.
size_mb = (store_info.size_bytes or 0) / (1024 * 1024)

print("Store Status:")
print(f"  Display name: {store_info.display_name}")
print(f"  Size: {size_mb:.2f} MB")
print(f"  Active documents: {store_info.active_documents_count}")
print(f"  Pending documents: {store_info.pending_documents_count}")
print(f"  Failed documents: {store_info.failed_documents_count}")
print(f"  Last update: {store_info.update_time}")

## 6. Test Query

In [None]:
# Smoke-test the store with one query. The query is only attempted when the
# scan found files and at least one of them was flagged as compatible.
has_candidate_docs = len(df) > 0 and len(compatible_files) > 0

if not has_candidate_docs:
    print("Skipping test query - no documents uploaded")
else:
    test_query = "What are the main topics covered in these documents?"

    print(f"Test query: {test_query}")
    print("=" * 60)

    # temperature=0.0 is passed through to the client for this query.
    response = gfs.query_with_file_search(
        query=test_query,
        store_names=[store.name],
        temperature=0.0
    )

    print(f"\nResponse:\n{response.text}")

    # Report whether the client extracted any citations from the response.
    citations = gfs.extract_citations(response)
    citation_note = (
        "\n[Sources cited from documents]" if citations else "\n[No citations found]"
    )
    print(citation_note)

## Summary

**Completed**:
- Created GFS file search store
- Uploaded compatible documents
- Verified indexing status
- Tested basic query functionality

**Next Steps**:
- Proceed to `03_gfs_experiments.ipynb` for detailed RAG experiments
- Test various query patterns
- Measure latency and retrieval quality