In [383]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import tqdm
import ebooklib
from ebooklib import epub
import json

In [189]:
import re

def clean_text(text):
    # Replace newlines and multiple spaces with a single space
    cleaned = re.sub(r'\s+', ' ', text)
    return cleaned.strip()

In [391]:
def extract_subtopics(epub_path):
    book = epub.read_epub(f"cfa_epub/{epub_path}")
    subtopics = []

    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        soup = BeautifulSoup(item.get_content(), 'html.parser')

        # Extract headings and their corresponding paragraphs
        for heading in soup.find_all(['h1', 'h2', 'h3']):
            heading_text = heading.get_text(strip=True)
            content = ''
            for sibling in heading.find_next_siblings():
                if sibling.name and sibling.name.startswith('h'):
                    break
                if sibling.name == 'p':
                    content += sibling.get_text(strip=True) + ' '
            if content and heading_text not in ['Learning Outcome', 'Learning Outcomes', '1.Introduction']:
                subtopics.append({'title': clean_text(heading_text), 'content': clean_text(content.strip())})

    return subtopics


In [392]:
epub_files = sorted(os.listdir("cfa_epub/"))
epub_files

['cfa_2025_L0.ePub',
 'cfa_2025_L1.ePub',
 'cfa_2025_L2.ePub',
 'cfa_2025_L3.ePub',
 'cfa_2025_L4.ePub',
 'cfa_2025_L5.ePub',
 'cfa_2025_L6.ePub',
 'cfa_2025_L7.ePub',
 'cfa_2025_L8.ePub',
 'cfa_2025_L9.ePub']

In [393]:
%%time

subtopics = {
    i: extract_subtopics(i) for i in tqdm.tqdm(epub_files)
}

  for root_file in tree.findall('//xmlns:rootfile[@media-type]', namespaces={'xmlns': NAMESPACES['CONTAINERNS']}):
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:04<00:00,  2.20it/s]

CPU times: user 4.45 s, sys: 63.2 ms, total: 4.51 s
Wall time: 4.56 s





In [394]:
subtopics.keys()

dict_keys(['cfa_2025_L0.ePub', 'cfa_2025_L1.ePub', 'cfa_2025_L2.ePub', 'cfa_2025_L3.ePub', 'cfa_2025_L4.ePub', 'cfa_2025_L5.ePub', 'cfa_2025_L6.ePub', 'cfa_2025_L7.ePub', 'cfa_2025_L8.ePub', 'cfa_2025_L9.ePub'])

In [395]:
tpcs = { k:([i for i in v[:20] if 
           ( "© 2023" in i['content'] ) or ( "© 2024" in i['content'] ) 
        ])[0]['title'] for k,v in subtopics.items()}

tpcs

{'cfa_2025_L0.ePub': 'Quantitative Methods',
 'cfa_2025_L1.ePub': 'Economics',
 'cfa_2025_L2.ePub': 'Corporate Issuers',
 'cfa_2025_L3.ePub': 'Financial Statement Analysis',
 'cfa_2025_L4.ePub': 'Equity Investments',
 'cfa_2025_L5.ePub': 'Fixed Income',
 'cfa_2025_L6.ePub': 'Derivatives',
 'cfa_2025_L7.ePub': 'Alternative Investments',
 'cfa_2025_L8.ePub': 'Portfolio Management',
 'cfa_2025_L9.ePub': 'Ethical and Professional Standards'}

In [396]:
{v:k for k,v in tpcs.items()}

{'Quantitative Methods': 'cfa_2025_L0.ePub',
 'Economics': 'cfa_2025_L1.ePub',
 'Corporate Issuers': 'cfa_2025_L2.ePub',
 'Financial Statement Analysis': 'cfa_2025_L3.ePub',
 'Equity Investments': 'cfa_2025_L4.ePub',
 'Fixed Income': 'cfa_2025_L5.ePub',
 'Derivatives': 'cfa_2025_L6.ePub',
 'Alternative Investments': 'cfa_2025_L7.ePub',
 'Portfolio Management': 'cfa_2025_L8.ePub',
 'Ethical and Professional Standards': 'cfa_2025_L9.ePub'}

In [397]:
boooks_map = {
    k:subtopics[v][
                {
            'Quantitative Methods': 9,
         'Economics': 5,
         'Corporate Issuers': 9,
         'Financial Statement Analysis': 9,
         'Equity Investments': 5,
         'Fixed Income': 5,
         'Derivatives': 5,
         'Alternative Investments': 6,
         'Portfolio Management': 5,
         'Ethical and Professional Standards': 6,
        }[k]+1:]
    for v,k in tpcs.items()
}

In [398]:
nested_dict = {
    topic: {
        entry["title"]: entry["content"]
        for entry in entries
    }
    for topic, entries in boooks_map.items()
}

nested_dict.keys()

dict_keys(['Quantitative Methods', 'Economics', 'Corporate Issuers', 'Financial Statement Analysis', 'Equity Investments', 'Fixed Income', 'Derivatives', 'Alternative Investments', 'Portfolio Management', 'Ethical and Professional Standards'])

In [399]:
# save to a file
with open("cfa2025.json", "w", encoding="utf-8") as f:
    json.dump(nested_dict, f, ensure_ascii=False, indent=4)

In [400]:
print(str(nested_dict)[:1000])

{'Quantitative Methods': {'2.Interest Rates and Time Value of Money': 'The time value of money establishes the equivalence between cash flows occurring on different dates. As cash received today is preferred to cash promised in the future, we must establish a consistent basis for this trade-off to compare financial instruments in cases in which cash is paid or received at different times. Aninterest rate(or yield), denotedr, is a rate of return that reflects the relationship between differently dated – timed – cash flows. If USD 9,500 today and USD 10,000 in one year are equivalent in value, then USD 10,000 – USD 9,500 = USD 500 is the required compensation for receiving USD 10,000 in one year rather than now. The interest rate (i.e., the required compensation stated as a rate of return) is USD 500/USD 9,500 = 0.0526 or 5.26 percent. Interest rates can be thought of in three ways:', '2.1.Determinants of Interest Rates': 'Economics tells us that interest rates are set by the forces of s