In [85]:
# ---[ 1. Imports ]---
import os
import faiss
import numpy as np
import ollama
from bs4 import BeautifulSoup
from ebooklib import epub
from sentence_transformers import SentenceTransformer
from langgraph.graph import StateGraph
from typing import TypedDict, Annotated
from langgraph.checkpoint.sqlite import SqliteSaver
from langgraph.store.memory import InMemoryStore
import tqdm

In [70]:

# ---[ 2. Config ]---
DB_PATH = "cfa_agent_memory.db"
INDEX_PATH = "cfa_vector.index"
TEXT_DUMP_PATH = "cfa_text_dump.txt"
MODEL = "all-MiniLM-L6-v2"
EPUB_DIR = "cfa_epub"

['',
 '',
 '',
 '',
 '',
 'Table of Contents',
 '',
 'Cover',
 'Title Page',
 'Copyright Page',
 'Table of Contents',
 'How to Use the CFA Program Curriculum',
 'CFA Institute Learning Ecosystem (LES)',
 'Designing Your Personal Study Program',
 'Errata',
 'Other Feedback',
 'Accessibility Statement for the 2025 CFA Program Curriculum',
 'Our Commitment to Accessibility Standards',
 'Ongoing Improvements',
 'Feedback',
 '',
 '',
 'Quantitative Methods',
 'Rates and Returns',
 'Learning Outcomes',
 '1. Introduction',
 '2. Interest Rates and Time Value of Money',
 '2.1. Determinants of Interest Rates',
 '',
 '',
 '3. Rates of Return',
 '3.1. Holding Period Return',
 '3.2. Arithmetic or Mean Return',
 '3.3. Geometric Mean Return',
 '3.4. The Harmonic Mean',
 '',
 '',
 '4. Money-Weighted and Time-Weighted Return',
 '4.1. Calculating the Money Weighted Return',
 '4.1.1. Money-Weighted Return for a Dividend-Paying Stock',
 '4.1.2. Time-Weighted Returns',
 '4.1.2.1. Computing Time-Weighted Re

In [87]:
# ---[ 3. EPUB Parsing ]---


def epub_object_text(path):
    return [i for i in BeautifulSoup(
        items_book[0].get_content(), 
        'html.parser'
    ).get_text().split('\n') if len(i)>1]


def extract_text_from_epub_folder(folder_path):
    all_chunks = []
    
    for filename in tqdm.tqdm(os.listdir(folder_path)):
        if filename.lower().endswith(".epub"):
            epub_path = os.path.join(folder_path, filename)
            book = epub.read_epub(epub_path)
            for item in book.get_items():
                try:
                    all_chunks.extend(epub_object_text(item))
                except:
                    pass
    
    
    return all_chunks

In [88]:
chunks = extract_text_from_epub_folder(EPUB_DIR)
len(chunks)

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:15<00:00,  1.51s/it]


522665

In [89]:
# ---[ 4. FAISS Index Building ]---
def build_vector_store(chunks):
    model = SentenceTransformer(MODEL)
    embeddings = model.encode(chunks, show_progress_bar=True)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings))
    faiss.write_index(index, INDEX_PATH)
    with open(TEXT_DUMP_PATH, "w", encoding="utf-8") as f:
        for chunk in chunks:
            f.write(chunk + "\n")

In [90]:
build_vector_store(chunks)

Batches: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 16334/16334 [02:00<00:00, 135.24it/s]
