# Fetch HTML from Firestore

# Create embedding model

In [1]:
from sentence_transformers import SentenceTransformer

# Load once (this can take time)
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Define the embedding funct ion ONCE
def embed_texts_fn(texts: list[str]) -> list[list[float]]:
    return embedding_model.encode(
        texts,
        show_progress_bar=False,
        normalize_embeddings=True
    ).tolist()


In [3]:
embedding_model.get_sentence_embedding_dimension()

384

# Read text file from GCS, hash the text and chunk the text

## Initialise Google Storage client

In [5]:
import json
import os
from dotenv import load_dotenv
from google.cloud import firestore, storage

# Load env vars (local dev only; safe to keep for API)
load_dotenv()

SERVICE_ACCOUNT_PATH = os.getenv("FIREBASE_SERVICE_ACCOUNT_JSON")
BUCKET_NAME = os.getenv("GCP_STORAGE_BUCKET")

if not BUCKET_NAME:
    raise RuntimeError("Missing GCP_STORAGE_BUCKET in environment")

# --- Client initialization ---
# Local dev: use service account JSON
# Production (Cloud Run / Functions): use default credentials
if SERVICE_ACCOUNT_PATH:
    db = firestore.Client.from_service_account_json(SERVICE_ACCOUNT_PATH)
    storage_client = storage.Client.from_service_account_json(SERVICE_ACCOUNT_PATH)
else:
    db = firestore.Client()
    storage_client = storage.Client()

print(SERVICE_ACCOUNT_PATH)
print(BUCKET_NAME)

C:/Users/hongn/idealy_new/idealy/backend/funwai-resume-firebase-adminsdk-fbsvc-a956eb6362.json
funwai-resume.firebasestorage.app


## Helpers: read GCS, hash text, chunk text

In [7]:
import hashlib
from google.cloud import storage
from langchain_text_splitters import RecursiveCharacterTextSplitter

def read_gcs_text(storage_client: storage.Client, bucket_name: str, object_path: str) -> str:
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(object_path)
    return blob.download_as_text(encoding="utf-8")

def sha256_text(s: str) -> str:
    return hashlib.sha256(s.encode("utf-8")).hexdigest()

def chunk_text(text: str, chunk_size=1200, chunk_overlap=150):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
    )
    return splitter.split_text(text)


In [9]:
# Firestore manifest check
from google.cloud import firestore

def manifest_doc(db: firestore.Client, ticker: str, year: int):
    return db.collection("ingestion").document("10k").collection("files").document(f"{ticker}_{year}")

def already_ingested(db: firestore.Client, ticker: str, year: int, file_hash: str) -> bool:
    doc = manifest_doc(db, ticker, year).get()
    if not doc.exists:
        return False
    data = doc.to_dict() or {}
    return data.get("sha256") == file_hash and data.get("status") == "success"


# Create Pinecone Index

In [11]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec

# Load environment variables
load_dotenv()

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

if not PINECONE_API_KEY:
    raise RuntimeError("Missing PINECONE_API_KEY")

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Get index handle
index = pc.Index(PINECONE_INDEX_NAME)

print("‚úÖ Pinecone index ready:", PINECONE_INDEX_NAME)

‚úÖ Pinecone index ready: 10k-text-rag


## If we want to delete the existing index and create a new one

In [23]:
pc = Pinecone(api_key=PINECONE_API_KEY)

# Delete index if it already exists (ONLY do this in dev)
if pc.has_index(PINECONE_INDEX_NAME):
    print(f"‚ö†Ô∏è Deleting existing index: {PINECONE_INDEX_NAME}")
    pc.delete_index(PINECONE_INDEX_NAME)

# Create index (384 dims = SentenceTransformer all-MiniLM-L6-v2)
print(f"üöÄ Creating Pinecone index: {PINECONE_INDEX_NAME}")
pc.create_index(
    name=PINECONE_INDEX_NAME,
    dimension=384,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
    ),
)

üöÄ Creating Pinecone index: 10k-text-rag


{
    "name": "10k-text-rag",
    "metric": "cosine",
    "host": "10k-text-rag-fqh5rav.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [13]:
# Define function that ingests text into Pinecone with deterministic IDs
def upsert_chunks_to_pinecone(index, vectors, namespace: str):
    # vectors: list of (id, vector, metadata)
    # batch upserts for efficiency
    BATCH = 100
    for i in range(0, len(vectors), BATCH):
        index.upsert(vectors=vectors[i:i+BATCH], namespace=namespace)


### Putting it all together

In [15]:
def ingest_10k_from_storage(
    *,
    ticker: str,
    year: int,
    bucket_name: str,
    object_path: str,
    db,
    storage_client,
    pinecone_index,
    embed_texts_fn,
    namespace: str = "10k",
):
    # 1) Read 10-K text from Firebase Storage
    text = read_gcs_text(storage_client, bucket_name, object_path)

    # 2) Hash for deduplication
    file_hash = sha256_text(text)

    # 3) Skip if already ingested
    if already_ingested(db, ticker, year, file_hash):
        print(f"‚è≠Ô∏è Skip {ticker} {year}: already ingested (same hash).")
        return

    # 4) Mark as running
    manifest_doc(db, ticker, year).set({
        "ticker": ticker,
        "year": year,
        "sourceGsPath": f"gs://{bucket_name}/{object_path}",
        "sha256": file_hash,
        "status": "running",
        "updatedAt": firestore.SERVER_TIMESTAMP,
    }, merge=True)

    # 5) Chunk the text + 6) Embed chunks
    chunks = chunk_text(text)
    embeddings = embed_texts_fn(chunks)

    # 7) Build Pinecone vectors
    vectors = []
    for i, (chunk, vec) in enumerate(zip(chunks, embeddings)):
        vid = f"{ticker}:{year}:10K:{i}"  # deterministic ID
        meta = {
            "ticker": ticker,
            "year": year,
            "docType": "10-K",
            "chunk": i,
            "source": f"gs://{bucket_name}/{object_path}",
            "sha256": file_hash,
        }
        vectors.append((vid, vec, meta))

    # 8) Upsert into Pinecone
    upsert_chunks_to_pinecone(
        pinecone_index,
        vectors,
        namespace=namespace
    )

    # 9) Mark success
    manifest_doc(db, ticker, year).set({
        "status": "success",
        "chunkCount": len(chunks),
        "updatedAt": firestore.SERVER_TIMESTAMP,
    }, merge=True)

    print(f"‚úÖ Ingested {ticker} {year}: {len(chunks)} chunks")


In [37]:
ticker_input = "ADI"
year_input = 2025
object_path_input = "filings/"+ticker_input+"/"+str(year_input)+"/10K.txt"

ingest_10k_from_storage(
    ticker = ticker_input,
    year = year_input,
    bucket_name = os.getenv("GCP_STORAGE_BUCKET"),
    object_path = object_path_input,
    db = db,
    storage_client = storage_client,
    pinecone_index = index,
    embed_texts_fn = embed_texts_fn,
)

‚úÖ Ingested ADI 2025: 476 chunks
