In [13]:
import os
import pandas as pd
from bs4 import BeautifulSoup
import tqdm
import ebooklib
from ebooklib import epub
import json
import re
from collections import defaultdict
# Short alias for collections.defaultdict.
dd = defaultdict

In [48]:
def parse_books_and_modules(text: str):
    """Group "Learning Module" lines under the "Book" heading that precedes them.

    The text is read line by line. A line starting with ``"Book"`` opens a new
    group and is used verbatim as the dictionary key. A line starting with
    ``"Learning Module"`` is appended to the current group with that prefix
    removed and every ``":"`` replaced by ``"."`` (so
    ``"Learning Module 1: Rates and Returns"`` becomes ``"1. Rates and Returns"``).
    Blank lines, other lines, and module lines that appear before the first
    book heading are ignored.

    Args:
        text: The full table-of-contents text.

    Returns:
        A plain ``dict`` mapping each book heading to the list of its module
        names, in the order they appear in ``text``.
    """
    module_prefix = "Learning Module"
    result = {}
    current_book = None

    for raw_line in text.strip().split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith("Book"):
            current_book = line
            result[current_book] = []
        elif line.startswith(module_prefix) and current_book:
            # Slice the prefix off instead of calling str.strip(prefix):
            # strip() treats its argument as a *set of characters* and would
            # also eat matching letters from the end of the title
            # (e.g. "Finance" -> "Financ").
            module_name = line[len(module_prefix):].strip()
            result[current_book].append(module_name.replace(":", "."))

    return result

In [49]:
# Read the whole table-of-contents file (book headings and learning-module
# lines) into memory. The encoding is pinned so the result does not depend on
# the platform's default locale.
# NOTE(review): assumes book_titles.txt is saved as UTF-8 — confirm.
with open("book_titles.txt", "r", encoding="utf-8") as f:  # Your full CFA text
    text_input = f.read()


In [50]:
# Build the mapping from each "Book ..." heading to its list of module names.
books_modules = parse_books_and_modules(text_input)


In [51]:
# Sanity check: show the book headings that were parsed.
books_modules.keys()

dict_keys(['Book 1: Quantitative Methods', 'Book 2: Economics', 'Book 3: Corporate Issuers', 'Book 4: Financial Statement Analysis', 'Book 5: Equity Investments', 'Book 6: Fixed Income', 'Book 7: Derivatives', 'Book 8: Alternative Investments', 'Book 9: Portfolio Management', 'Book 10: Ethical and Professional Standards'])

In [54]:
import ast

# Load the extracted book content as *data only*. Calling eval() on the file
# would execute arbitrary code embedded in it, so parse it as JSON first and
# fall back to a Python-literal parse (single-quoted strings, trailing commas)
# for files that were written with repr()/str() rather than json.dump().
with open("cfa2025.json", "r", encoding="utf-8") as file:
    raw_book_text = file.read()

try:
    book_json = json.loads(raw_book_text)
except json.JSONDecodeError:
    book_json = ast.literal_eval(raw_book_text)

# Re-key the top-level mapping from the bare title to "Book <n>: <title>",
# numbering the books in file order.
book_values = {o: f"Book {i+1}: {o}" for i, o in enumerate(book_json.keys())}
book_json = {book_values[k]: v for k, v in book_json.items()}

In [199]:
import re

def clean_text(text):
    """Return ``text`` without digits/periods and with whitespace normalized.

    Steps, in order: delete every digit and every ``.``; collapse each run of
    whitespace (including newlines) to one space; trim both ends; finally drop
    any ``"Book : "`` fragment, which is what remains of a ``"Book <n>: "``
    heading once its number has been deleted.
    """
    without_numbering = re.sub(r'[\d\.]', '', text)
    single_spaced = re.sub(r'\s+', ' ', without_numbering).strip()
    return single_spaced.replace("Book : ","")


In [200]:
def get_chap_level(x):
    """Extract the leading dotted section number from a heading string.

    The string is split on ``"."`` and the leading components that consist
    only of decimal digits are converted to ``int``. At most three levels are
    returned, and scanning stops at the first non-numeric component.

    Examples:
        ``"1. Interest Rates"`` -> ``[1]``
        ``"1.2. Foo"``          -> ``[1, 2]``
        ``"1.2.3. Foo"``        -> ``[1, 2, 3]``
        ``"1.2"``               -> ``[1, 2]``
        ``"Introduction"``      -> ``[]``

    Args:
        x: Heading text.

    Returns:
        A list of 0 to 3 integers.
    """
    max_levels = 3
    levels = []
    # Slicing bounds the scan, so short headings such as "1" or "1.2" can no
    # longer raise IndexError.
    for part in x.split(".")[:max_levels]:
        # isdecimal() only accepts characters that int() can parse; isnumeric()
        # also accepts characters such as "½" that make int() raise.
        if not part.isdecimal():
            break
        levels.append(int(part))
    return levels

In [245]:
import json
from collections.abc import MutableMapping

class BookNode(MutableMapping):
    """A node in a tree of book sections that behaves like a mapping of children.

    Each node carries a ``name``, an optional ``content`` payload and an
    ordered mapping from child key to child ``BookNode``. The mapping protocol
    (``node[key]``, ``len(node)``, iteration, ``del node[key]``) operates on
    the children.

    Note that, being a mapping, a node without children is falsy; compare
    against ``None`` explicitly when testing whether a node exists.
    """

    def __init__(self, name="", content=None, dad_k=""):
        """Create a node.

        Args:
            name: Display name of the node.
            content: Arbitrary payload attached to the node.
            dad_k: Stored on the instance as ``self.dad_k``; no method of this
                class reads it.
        """
        self.name = name
        self.content = content
        self._children = {}
        # Ordinal that add_child() will assign to the next child (1-based).
        self.k = 1
        self.dad_k = dad_k

    def __getitem__(self, key):
        return self._children[key]

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, wrapping non-node values in a BookNode."""
        if isinstance(value, BookNode):
            self._children[key] = value
        else:
            # Create a child node with value as content
            self._children[key] = BookNode(name=key, content=value)

    def __delitem__(self, key):
        del self._children[key]

    def __iter__(self):
        return iter(self._children)

    def __len__(self):
        return len(self._children)

    def to_dict(self, max_levels=2, _level=1):
        """Return the subtree as nested dicts, truncated after ``max_levels``.

        Nodes deeper than ``max_levels`` are replaced by their child count.
        ``_level`` is the recursion depth and is not meant to be passed by
        callers.
        """
        if _level > max_levels:
            return len(self)
        return { key: child.to_dict(max_levels, _level + 1)
                 for key, child in self._children.items() }

    def __repr__(self,max_levels=2):
        # Pretty-printed JSON-like representation of the tree structure
        return json.dumps(self.to_dict(max_levels=max_levels), indent=4)

    def add_child(self, key, name=None, content=None):
        """Create a numbered child node and return it.

        The label is ``name`` when given, otherwise ``key``. The child is
        stored under (and named) ``"<n>. <cleaned label>"``, where ``<n>`` is
        this node's running child counter and the label is passed through
        ``clean_text``. The counter is incremented on every call.

        Args:
            key: Label used when ``name`` is None.
            name: Optional label that takes precedence over ``key``.
            content: Payload for the new node.

        Returns:
            The newly created BookNode.
        """
        label = name if name is not None else key
        ordinal = self.k
        self.k += 1
        child_key = f"{ordinal}. {clean_text(label)}"

        node = BookNode(name=child_key, content=content)
        self._children[child_key] = node
        return node

    def __str__(self):
        """Return an indented outline of node names (for debugging)."""
        lines = []
        def walk(node, depth):
            indent = "  " * depth
            lines.append(f"{indent}- {node.name}")
            for child in node._children.values():
                walk(child, depth + 1)
        walk(self, 0)
        return "\n".join(lines)


    def flatten_paths(
        self,
        levels=(3, 4),
        _prefix = None,
        _depth: int = 1
    ):
        """Return " -> "-joined key paths for descendants at the selected depths.

        The node this is called on is depth 0 and is not part of any path; its
        children are depth 1, their children depth 2, and so on. With the tree
        built in this notebook that means books are depth 1, modules depth 2,
        chapters depth 3 and sub-chapters depth 4.

        Args:
            levels: Container of the exact depths to emit. It is used as a
                membership test, so ``(3, 4)`` means depths 3 and 4, while
                ``(3, 7)`` means depths 3 and 7 only, not 3 through 7.
            _prefix: Keys of the ancestors of this node (recursion state).
            _depth: Depth of this node's children (recursion state).

        Returns:
            A list of path strings in depth-first, insertion order.
        """
        _prefix = _prefix if _prefix else []
        paths = []
        for key, child in self._children.items():
            cur_path = _prefix + [key]
            if _depth in levels:
                # join only up to this level
                paths.append(" -> ".join(cur_path))
            # keep descending
            paths.extend(child.flatten_paths(levels, cur_path, _depth + 1))
        return paths


In [246]:
# Build the tree: root -> book -> module -> chapter -> sub-chapter -> sub-sub-chapter.
#
# book_json[book] maps section headings (e.g. "1.2. Title") to their text, and
# books_modules[book] lists the module names for that book. The headings do
# not say which module they belong to, so a new module is opened whenever the
# top-level chapter number goes down (i.e. chapter numbering restarts).
# NOTE(review): a module whose only chapter is "1" followed by another module
# starting at "1" cannot be detected by this rule — confirm against the data.
cfa_book = BookNode(name="CFA Book")

for book in book_json.keys():
    current_book = cfa_book.add_child(book)
    modules = list(books_modules[book])

    module_i = 0
    # Larger than any chapter number, so the first numbered heading of the
    # book always opens the first module.
    previous_chap_level = float("inf")
    # Reset the parent pointers for every book so that a malformed heading
    # sequence can never attach a section to a node of the previous book.
    current_module = current_c1 = current_c2 = None

    for k, con in book_json[book].items():
        chap_level = get_chap_level(k)
        if not chap_level:
            # Heading without a leading section number: not part of the tree.
            continue

        if (previous_chap_level > chap_level[0]) and (module_i < len(modules)):
            current_module = current_book.add_child(modules[module_i])
            module_i += 1
            # Chapters of the previous module must not adopt new sub-chapters.
            current_c1 = current_c2 = None

        previous_chap_level = chap_level[0]

        depth = len(chap_level)  # 1 = chapter, 2 = sub-chapter, 3 = sub-sub-chapter
        # Candidate parents from the expected one up to the book itself. If the
        # expected parent is missing (e.g. "2.1.1" with no "2.1" before it) the
        # section is attached to its nearest existing ancestor instead of being
        # dropped or mis-filed. `is not None` is required because an empty
        # BookNode is falsy.
        candidates = (current_c2, current_c1, current_module, current_book)[3 - depth:]
        parent = next(node for node in candidates if node is not None)
        child = parent.add_child(k, content=con)

        if depth == 1:
            current_c1, current_c2 = child, None
        elif depth == 2:
            current_c2 = child

In [249]:
# Collect "Book -> Module -> Chapter" style paths from the tree.
# NOTE(review): flatten_paths tests `_depth in levels`, so (3, 7) selects
# depths 3 and 7 exactly, not the range 3..7 — confirm this is intended.
flat_sections = cfa_book.flatten_paths(levels=(3, 7))

In [250]:
# Spot-check the first flattened path.
flat_sections[0] 

'1. Quantitative Methods -> 1. Rates and Returns -> 1. Interest Rates and Time Value of Money'