In [None]:
import fitz
import re
import csv
import os
from typing import List, Dict, Any

# A line consisting solely of digits is treated as a page number.
_PAGE_NUMBER_RE = re.compile(r'^\d+$')

# One leading bullet marker plus the whitespace after it.  Symbol markers are
# accepted unconditionally; the ASCII markers "o", "-" and "*" only count when
# followed by whitespace (or end of line) so that ordinary words such as
# "only" or numbers such as "-5" are not mistaken for bullets.
_BULLET_PREFIX_RE = re.compile(r'^(?:[■◼▪◦•]|[o\-*](?=\s|$))\s*')


def _is_boilerplate(text):
    """Return True for page numbers and the known header/footer lines."""
    return bool(
        _PAGE_NUMBER_RE.match(text)
        or ('LECTURE' in text and len(text) < 25)
        or 'MIT OpenCourseWare' in text
    )


def _collect_lines(doc) -> List[Dict[str, Any]]:
    """Return one record per non-empty, non-boilerplate text line of *doc*."""
    records = []
    for page_num, page in enumerate(doc):
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry carry no text to extract.
            if "lines" not in block:
                continue

            # Lines are handled one by one so that every bullet of a
            # multi-line block becomes its own record.
            for line in block["lines"]:
                spans = line["spans"]
                text = " ".join(span["text"] for span in spans).strip()
                if not text or _is_boilerplate(text):
                    continue

                box = line["bbox"] if "bbox" in line else block["bbox"]
                records.append({
                    "text": text,
                    "page": page_num,
                    "font_size": max((span["size"] for span in spans), default=0),
                    # Left edge of the first span, used later for indentation.
                    "x_pos": spans[0]["bbox"][0] if spans else block["bbox"][0],
                    "y_pos": box[1],
                    "is_bullet": _BULLET_PREFIX_RE.match(text) is not None,
                })
    return records


def _format_line(record):
    """Render a content line with tab indentation and a normalised bullet."""
    x_pos = record["x_pos"]
    indent_level = 2 if x_pos > 130 else 1 if x_pos > 70 else 0

    text = record["text"]
    if record["is_bullet"]:
        # Strip exactly the marker that was detected, never real letters.
        text = "- " + _BULLET_PREFIX_RE.sub("", text, count=1)
    return "\t" * indent_level + text


def _make_section(title, body):
    """Build a section dict whose content starts with the title line."""
    return {"title": title, "content": title + "\n" + "\n".join(body)}


def _group_into_sections(records, heading_threshold):
    """Split ordered line records into sections at every heading line."""
    sections = []
    title = None
    body = []

    for record in records:
        text = record["text"]
        is_heading = (
            record["font_size"] >= heading_threshold
            or (text.isupper() and len(text) > 3
                and not text.startswith(("◦", "o")))
        )

        if is_heading:
            # A heading with no body lines is dropped when the next one starts.
            if title and body:
                sections.append(_make_section(title, body))
            title, body = text, []
        elif title:
            body.append(_format_line(record))
        else:
            # Text before any heading becomes the first section's title.
            title = text

    if title and body:
        sections.append(_make_section(title, body))
    return sections


def _merge_short_sections(sections):
    """Fold prompt-like or one-line sections into the section that follows."""
    merged = []
    i = 0
    while i < len(sections):
        section = sections[i]
        title = section["title"]
        looks_like_prompt = '>>>' in title or 'In [' in title
        is_short = len(section["content"].split("\n")) <= 2

        if (looks_like_prompt or is_short) and i + 1 < len(sections):
            # The follower is consumed; its text lives on inside this section.
            section = {
                "title": title,
                "content": section["content"] + "\n" + sections[i + 1]["content"],
            }
            i += 2
        else:
            i += 1
        merged.append(section)
    return merged


def extract_structured_content(pdf_path) -> List[Dict[str, str]]:
    """Extract heading-delimited sections of text from a PDF.

    Every text line is read with its font size and position, page numbers
    and known header/footer lines are dropped, and the remaining lines are
    ordered by page and vertical position.  A line is a heading when its font
    size reaches the fourth-largest line font size in the document, or when
    it is an upper-case line longer than three characters.  Lines under a
    heading are indented with tabs according to their left edge, and bullet
    markers are normalised to "- ".  Finally, sections whose title looks like
    an interpreter prompt or that hold a single content line are merged with
    the section after them.

    Args:
        pdf_path: Path of the PDF, passed to ``fitz.open``.

    Returns:
        A list of ``{"title": ..., "content": ...}`` dicts; ``content``
        starts with the title line.  Empty when no section was found.
    """
    doc = fitz.open(pdf_path)
    try:
        records = _collect_lines(doc)
    finally:
        # Release the document handle even if extraction fails.
        doc.close()

    # Reading order: page first, then top-to-bottom (the sort is stable).
    records.sort(key=lambda r: (r["page"], r["y_pos"]))

    sizes = sorted(
        (r["font_size"] for r in records if r["font_size"] > 0),
        reverse=True,
    )
    # Fall back to 15 when no line reports a usable font size.
    heading_threshold = sizes[min(3, len(sizes) - 1)] if sizes else 15

    sections = _group_into_sections(records, heading_threshold)
    return _merge_short_sections(sections)

def classify_section(section):
    """Label a section as 'code', 'metadata', 'example' or 'context'.

    The title and content are joined into one text and tested against three
    ordered groups of regular expressions.  The first group with any match
    decides the label; when nothing matches the label is 'context'.
    """
    haystack = section["title"] + "\n" + section["content"]

    # (label, regex flags, patterns) in priority order.
    rules = (
        ('code', re.MULTILINE, (
            r'^\s*(def|if|for|while|print|import|return)\b',
            r'==|!=|<=|>=|\+=|-=|\*=|/=',
            r'\brange\(|\bbreak\b|\breturn\b',
            r'^[a-zA-Z_][a-zA-Z0-9_]*\s*=\s*[^=]',
        )),
        ('metadata', re.IGNORECASE, (
            r'MIT OpenCourseWare',
            r'copyright',
            r'https?://',
            r'License',
            r'Terms of Use',
        )),
        ('example', re.IGNORECASE | re.MULTILINE, (
            r'EXAMPLE|for example',
            r'e\.g\.',
            r'^\d+\)\s+',
        )),
    )

    for label, flags, patterns in rules:
        for pattern in patterns:
            if re.search(pattern, haystack, flags):
                return label

    return 'context'

def save_to_csv(sections, output_file="lecture_content.csv"):
    """Write every section to a semicolon-delimited CSV file.

    The file gets a "Text;Label;Source" header followed by one row per
    section: its content, the label from ``classify_section`` and a fixed
    source name.  A short confirmation is printed afterwards.
    """
    def rows():
        # The label is computed before the content is read, one row per section.
        for entry in sections:
            tag = classify_section(entry)
            yield [entry["content"], tag, "MIT Open Courseware"]

    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        out = csv.writer(handle, delimiter=';', quoting=csv.QUOTE_MINIMAL)
        out.writerow(["Text", "Label", "Source"])
        out.writerows(rows())

    print(f"Saved {len(sections)} sections to {output_file}")

def save_to_text_files(sections, output_dir="lecture_content"):
    """Write one text file per label into *output_dir*.

    The directory is created if needed.  Sections are bucketed by the label
    from ``classify_section``, and each bucket goes to
    ``<label>_content.txt`` with a banner and numbered section headers.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Bucket sections under their label, keeping first-seen label order.
    buckets = {}
    for entry in sections:
        buckets.setdefault(classify_section(entry), []).append(entry)

    for tag, members in buckets.items():
        target = os.path.join(output_dir, f"{tag}_content.txt")

        with open(target, 'w', encoding='utf-8') as out:
            out.write(f"=== {tag.upper()} CONTENT ===\n\n")

            for number, entry in enumerate(members, start=1):
                heading = entry["title"]
                body = entry["content"]
                out.write(
                    f"--- Section {number}: {heading} ---\n\n"
                    + body
                    + "\n\n" + "="*50 + "\n\n"
                )

    print(f"Saved content by category to {output_dir}/ directory")

def process_pdf(pdf_path, output_csv="lecture_content.csv", output_dir="lecture_content"):
    """Run the full pipeline for one PDF and return its sections.

    Extracts the sections, writes them to *output_csv* and to per-label text
    files in *output_dir*, then prints how many sections received each label.
    """
    print(f"Processing {pdf_path}...")

    sections = extract_structured_content(pdf_path)
    print(f"Extracted {len(sections)} sections")

    # Persist in both output formats.
    save_to_csv(sections, output_csv)
    save_to_text_files(sections, output_dir)

    # Tally labels in first-seen order for the summary.
    tally = {}
    for entry in sections:
        tag = classify_section(entry)
        tally.setdefault(tag, 0)
        tally[tag] += 1

    total = len(sections)
    print("\nContent classification summary:")
    for tag in tally:
        count = tally[tag]
        print(f"- {tag}: {count} sections ({count/total*100:.1f}%)")

    return sections

In [3]:
# Input slide deck and the output locations for this run.
pdf_path = "data/slides/01_ML_intro.pdf"
output_csv = "lecture_content.csv"
output_dir = "lecture_content"
# Extract the sections, write the CSV and the per-label text files, and keep
# the returned sections in a module-level name.
sections = process_pdf(pdf_path, output_csv, output_dir)

Processing data/slides/01_ML_intro.pdf...
Extracted 14 sections
Saved 14 sections to lecture_content.csv
Saved content by category to lecture_content/ directory

Content classification summary:
- context: 13 sections (92.9%)
- example: 1 sections (7.1%)
