In [None]:
# ✅ Install compatible versions for LlamaIndex, LangChain, and Sentence Transformers
# ✅ Clean install without the broken retriever package
!pip install llama-index==0.10.39
!pip install llama-index-embeddings-huggingface==0.1.4
!pip install langchain-community==0.0.34
!pip install sentence-transformers==2.5.1
!pip install beautifulsoup4 nest-asyncio PyMuPDF pdfplumber lxml requests
!pip install beautifulsoup4
!pip install nest-asyncio
!pip install PyMuPDF
!pip install pdfplumber
!pip install lxml
!pip install requests

Collecting llama-index-embeddings-huggingface==0.1.4
  Downloading llama_index_embeddings_huggingface-0.1.4-py3-none-any.whl.metadata (806 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3.0.0,>=2.1.2->llama-index-embeddings-huggingface==0.1.4)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3.0.0,>=2.1.2->llama-index-embeddings-huggingface==0.1.4)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3.0.0,>=2.1.2->llama-index-embeddings-huggingface==0.1.4)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3.0.0,>=2.1.2->llama-index-embeddings-huggingface==0.1.4)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting n

Collecting sentence-transformers==2.5.1
  Downloading sentence_transformers-2.5.1-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 4.1.0
    Uninstalling sentence-transformers-4.1.0:
      Successfully uninstalled sentence-transformers-4.1.0
Successfully installed sentence-transformers-2.5.1
Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading 

In [5]:
!pip install llama-index==0.10.39
!pip install llama-index-embeddings-huggingface==0.1.4
!pip install langchain-community==0.0.34
!pip install sentence-transformers==2.5.1
!pip install beautifulsoup4 nest-asyncio PyMuPDF pdfplumber lxml requests



In [1]:
import os
import requests
import nest_asyncio
import json
import re
import time
import pickle
from bs4 import BeautifulSoup
from datetime import datetime
from llama_index.core import Document

nest_asyncio.apply()

class CocaColaFilingProcessor:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({'User-Agent': 'GPTResearchBot contact@example.com'})
        self.target_years = list(range(2015, 2025))
        self.parsed_documents = []

    def get_filing_metadata(self):
        cik = "21344"
        url = f"https://data.sec.gov/submissions/CIK{cik.zfill(10)}.json"
        filings = []

        r = self.session.get(url)
        data = r.json()
        recent = data.get('filings', {}).get('recent', {})

        for form, acc, date in zip(recent['form'], recent['accessionNumber'], recent['filingDate']):
            if form == "10-K":
                year = int(date[:4])
                if year in self.target_years:
                    acc_clean = acc.replace("-", "")
                    html_link = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_clean}/{acc}-index.html"
                    filings.append({
                        "year": year,
                        "date": date,
                        "url": html_link,
                        "accession": acc
                    })

        return filings

    def get_html_url(self, index_url):
        r = self.session.get(index_url)
        soup = BeautifulSoup(r.content, "html.parser")
        table = soup.find("table", class_="tableFile")
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) >= 4 and "10-K" in cells[3].text and "Amendment" not in cells[3].text:
                link = cells[2].find("a")["href"]
                return "https://www.sec.gov" + link
        return None

    def extract_text(self, html_url):
        r = self.session.get(html_url)
        soup = BeautifulSoup(r.content, "html.parser")
        for tag in soup(["script", "style"]): tag.decompose()
        text = soup.get_text(separator="\n")
        lines = [line.strip() for line in text.splitlines() if len(line.strip()) > 5]
        return "\n".join(lines)

    def chunk_text(self, text, chunk_size=2000, overlap=200):
        chunks, start = [], 0
        while start < len(text):
            end = start + chunk_size
            end = text.rfind("\n", start + chunk_size - 200, end)
            end = end if end != -1 else start + chunk_size
            chunks.append(text[start:end].strip())
            start = end - overlap
        return chunks

    def run(self):
        filings = self.get_filing_metadata()
        for filing in filings:
            print(f"📄 Processing {filing['year']}...")
            html_url = self.get_html_url(filing['url'])
            if not html_url:
                print(f"❌ HTML not found for {filing['year']}")
                continue
            text = self.extract_text(html_url)
            if len(text) < 1000:
                print(f"⚠️ Too little text for {filing['year']}")
                continue
            chunks = self.chunk_text(text)
            for i, chunk in enumerate(chunks):
                doc = Document(text=chunk)
                doc.metadata = {
                    "year": filing["year"],
                    "chunk_index": i,
                    "total_chunks": len(chunks),
                    "filing_date": filing["date"],
                    "accession": filing["accession"]
                }
                self.parsed_documents.append(doc)

        with open("/content/processed_documents.pkl", "wb") as f:
            pickle.dump(self.parsed_documents, f)

        print(f"✅ Saved {len(self.parsed_documents)} chunks to /content/processed_documents.pkl")

# 🔄 Run the processor
processor = CocaColaFilingProcessor()
processor.run()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /usr/local/lib/python3.11/dist-
[nltk_data]     packages/llama_index/core/_static/nltk_cache...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


📄 Processing 2024...
⚠️ Too little text for 2024
📄 Processing 2023...
⚠️ Too little text for 2023
📄 Processing 2022...
⚠️ Too little text for 2022
📄 Processing 2021...
⚠️ Too little text for 2021
📄 Processing 2020...
⚠️ Too little text for 2020
📄 Processing 2019...
📄 Processing 2018...
📄 Processing 2017...
✅ Saved 1153 chunks to /content/processed_documents.pkl


In [10]:
import pickle
from llama_index.core import VectorStoreIndex, ServiceContext
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding  # ✅ confirmed compatible

# ✅ Load chunks
with open("/content/processed_documents.pkl", "rb") as f:
    documents = pickle.load(f)
print(f"✅ Loaded {len(documents)} document chunks")

# ✅ Use small, fast HuggingFace embedding model directly (CPU friendly)
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=None)

# ✅ Standard index and engine
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
standard_engine = index.as_query_engine(similarity_top_k=4)

# ✅ Sentence window engine (subset for speed)
parser = SentenceWindowNodeParser.from_defaults(window_size=3, window_metadata_key="window")
sentence_nodes = parser.get_nodes_from_documents(documents[:50])  # Limit for CPU
sentence_index = VectorStoreIndex(sentence_nodes, service_context=service_context)
sentence_engine = sentence_index.as_query_engine(similarity_top_k=4)

# ✅ Test query
query = "What are Coca-Cola's main business segments?"

print("\n🔍 Standard Chunk Retrieval Result:")
print(standard_engine.query(query).response)

print("\n🔍 Sentence-Window Result:")
print(sentence_engine.query(query).response)


✅ Loaded 1153 document chunks


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

  service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=None)


LLM is explicitly disabled. Using MockLLM.

🔍 Standard Chunk Retrieval Result:
Context information is below.
---------------------
year: 2019
chunk_index: 232
total_chunks: 388
filing_date: 2019-02-21
accession: 0000021344-19-000014

e for cash, excluding production facilities included in the Southwest Transaction.
During the years ended
December 31, 2018
December 31, 2017
December 31, 2016
, cash proceeds from these sales totaled
$3 million
$2,860 million
$1,017 million
, respectively. Included in the cash proceeds for the years ended
December 31, 2017
December 31, 2016
$336 million
$279 million
, respectively, from Coca-Cola Bottling Co. Consolidated now known as Coca-Cola Consolidated, Inc. ("CCCI"), an equity method investee. Also included in the cash
proceeds for the year ended December 31, 2017 was
$220 million
from AC Bebidas and
$39 million
from Liberty Coca-Cola Beverages.
Under the applicable accounting guidance, we were required to derecognize all of the tangible assets sold

In [11]:
# ✅ Install sentence-transformers (if not already)
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.0.0-py3-none-any.whl (470 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
  Attempting uninstall: sentence-transformers
    Found existing installation: sentence-transformers 2.5.1
    Uninstalling sentence-transformers-2.5.1:
      Successfully uninstalled sentence-transformers-2.5.1
Successfully installed sentence-transformers-5.0.0


In [18]:
import pickle
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings  # ✅ Correct new config style for v0.10.39+

# ✅ Load your saved documents
with open("/content/processed_documents.pkl", "rb") as f:
    documents = pickle.load(f)

# ✅ Define open-source embedding model
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# ✅ Set global settings with NO LLM
Settings.embed_model = embed_model
Settings.llm = None  # 🔒 No OpenAI

# ✅ Build the index
index = VectorStoreIndex.from_documents(documents)

# ✅ Use default retriever
retriever = index.as_retriever(similarity_top_k=4)

# ✅ Sample query
query = "What are Coca-Cola's main business segments?"
retrieved_nodes = retriever.retrieve(query)

# ✅ Show result
for i, node in enumerate(retrieved_nodes):
    print(f"\n--- Result {i+1} ---\n{node.get_text()[:500]}")

LLM is explicitly disabled. Using MockLLM.

--- Result 1 ---
-Cola Company, our operations and our present business environment. MD&A is provided as a supplement to — and should be read in conjunction with — our consolidated financial statements and the accompanying notes thereto contained in "Item 8. Financial Statements and Supplementary Data" of this report. This overview summarizes the MD&A, which includes the following sections:
Our Business
— a general description of our business and the nonalcoholic beverage segment of the commercial beverage indus

--- Result 2 ---
the accompanying notes thereto contained in "Item 8. Financial Statements and Supplementary Data" of this report. This overview summarizes the MD&A, which includes the following sections:
Our Business
— a general description of our business and the nonalcoholic beverage segment of the commercial beverage industry; our objective; our strategic priorities; our core capabilities; and challenges and risks of our business

In [19]:
from sentence_transformers import CrossEncoder

# ✅ Use a fast and widely supported reranker
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

# ✅ Prepare query–document pairs
pairs = [(query, node.get_text()) for node in retrieved_nodes]

# ✅ Get scores
scores = reranker.predict(pairs)

# ✅ Sort nodes by score
reranked = [node for _, node in sorted(zip(scores, retrieved_nodes), key=lambda x: x[0], reverse=True)]

# ✅ Show reranked results
for i, node in enumerate(reranked):
    print(f"\n🔁 Reranked Result {i+1} (score: {scores[i]:.4f})\n{node.get_text()[:500]}")


🔁 Reranked Result 1 (score: 3.8549)
-Cola Company, our operations and our present business environment. MD&A is provided as a supplement to — and should be read in conjunction with — our consolidated financial statements and the accompanying notes thereto contained in "Item 8. Financial Statements and Supplementary Data" of this report. This overview summarizes the MD&A, which includes the following sections:
Our Business
— a general description of our business and the nonalcoholic beverage segment of the commercial beverage indus

🔁 Reranked Result 2 (score: 2.4070)
The Coca-Cola Company and all entities included in our consolidated financial statements.
General
The Coca-Cola Company is the world's largest beverage company. We own or license and market more than
nonalcoholic beverage brands including sparkling beverages and a variety of still beverages such as waters, flavored waters and enhanced waters, juices and juice drinks, ready-to-drink teas and coffees, sports drinks, dairy, 

In [25]:
!pip install llama-index==0.10.39 llama-index-embeddings-huggingface==0.1.4



In [31]:
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.tools import QueryEngineTool, ToolMetadata

# Wrap the existing retriever
retriever_engine = RetrieverQueryEngine.from_args(retriever=retriever)

# Set up a simple tool
tool = QueryEngineTool(
    query_engine=retriever_engine,
    metadata=ToolMetadata(
        name="coca_cola_reports",
        description="Answers general questions about Coca-Cola's business from 10-K filings."
    )
)

# Use RetrieverQueryEngine directly as our query engine
query_engine = retriever_engine

# 🔎 Run your simple query
query = "Describe Coca-Cola’s business model and its main product categories."
response = query_engine.query(query)

print("🧠 Answer:\n", response)

🧠 Answer:
 Context information is below.
---------------------
year: 2017
chunk_index: 86
total_chunks: 384
filing_date: 2017-02-24
accession: 0000021344-17-000009

-Cola Company, our operations and our present business environment. MD&A is provided as a supplement to — and should be read in conjunction with — our consolidated financial statements and the accompanying notes thereto contained in "Item 8. Financial Statements and Supplementary Data" of this report. This overview summarizes the MD&A, which includes the following sections:
Our Business
— a general description of our business and the nonalcoholic beverage segment of the commercial beverage industry; our objective; our strategic priorities; our core capabilities; and challenges and risks of our business.
Critical Accounting Policies and Estimates
— a discussion of accounting policies that require critical judgments and estimates.
Operations Review
— an analysis of our Company's consolidated results of operations for the thre

In [32]:
# 🔍 Evaluation: Precision@K

def precision_at_k(query, expected_keywords, retrieved_nodes, k=3):
    top_k_nodes = retrieved_nodes[:k]
    hits = 0

    for node in top_k_nodes:
        text = node.get_text().lower()
        if any(keyword.lower() in text for keyword in expected_keywords):
            hits += 1

    return hits / k

# 🧪 Example evaluation set
eval_queries = [
    {
        "query": "What are Coca-Cola’s main business segments?",
        "expected_keywords": ["beverage", "sparkling", "juice", "energy drink", "coffee"]
    },
    {
        "query": "Who is Coca-Cola’s Chief Information Officer?",
        "expected_keywords": ["chief information officer", "Barry Simpson"]
    },
    {
        "query": "How does Coca-Cola make money?",
        "expected_keywords": ["revenue", "sales", "distribution", "marketing"]
    }
]

# 🧮 Run evaluation
results = []
for q in eval_queries:
    nodes = retriever.retrieve(q["query"])
    score = precision_at_k(q["query"], q["expected_keywords"], nodes, k=3)
    results.append((q["query"], score))
    print(f"\n📌 Query: {q['query']}")
    print(f"🎯 Precision@3 Score: {score:.2f}")

# 🔚 Summary
avg_score = sum(score for _, score in results) / len(results)
print(f"\n✅ Average Precision@3: {avg_score:.2f}")


📌 Query: What are Coca-Cola’s main business segments?
🎯 Precision@3 Score: 1.00

📌 Query: Who is Coca-Cola’s Chief Information Officer?
🎯 Precision@3 Score: 0.00

📌 Query: How does Coca-Cola make money?
🎯 Precision@3 Score: 1.00

✅ Average Precision@3: 0.67
