In [1]:
import re
from collections import defaultdict, Counter
import json

def parse_excel_data(file_path):
    """Parse the AI-ready Excel data file"""
    sheets = {}
    current_sheet = None
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
                
            # Detect sheet headers
            if line.startswith('=== Sheet:') and line.endswith('==='):
                current_sheet = line.split('Sheet: ')[1].split(' ===')[0]
                sheets[current_sheet] = {}
            elif current_sheet and ':' in line:
                # Parse cell data
                cell_ref, content = line.split(':', 1)
                cell_ref = cell_ref.strip()
                content = content.strip()
                sheets[current_sheet][cell_ref] = content
    
    return sheets

def extract_formula_references(formula):
    """Extract cell and sheet references from formulas"""
    references = []
    
    # Pattern for sheet references like 'SheetName'!A1 or SheetName!A1
    sheet_ref_pattern = r"(?:'([^']+)'|([A-Za-z0-9_\s]+))!([A-Z]+\d+(?::[A-Z]+\d+)?)"
    sheet_refs = re.findall(sheet_ref_pattern, formula)
    
    for match in sheet_refs:
        sheet_name = match[0] if match[0] else match[1]
        cell_range = match[2]
        references.append(('sheet_ref', sheet_name, cell_range))
    
    # Pattern for local cell references like A1, B2:C10
    local_ref_pattern = r"\b([A-Z]+\d+(?::[A-Z]+\d+)?)\b"
    local_refs = re.findall(local_ref_pattern, formula)
    
    for ref in local_refs:
        # Skip if it's part of a sheet reference
        if not any(ref in sr[2] for sr in sheet_refs):
            references.append(('local_ref', None, ref))
    
    return references

def build_dependency_graph(sheets):
    """Build dependency relationships between cells and sheets"""
    dependencies = defaultdict(list)
    sheet_connections = defaultdict(set)
    formula_complexity = {}
    
    for sheet_name, cells in sheets.items():
        for cell_ref, content in cells.items():
            # Check if it's a formula (contains function names or operators)
            if any(op in content for op in ['SUM', 'IF', 'VLOOKUP', 'INDEX', 'MATCH', '+', '-', '*', '/', '=']):
                refs = extract_formula_references(content)
                formula_complexity[f"{sheet_name}.{cell_ref}"] = len(refs)
                
                for ref_type, ref_sheet, ref_cell in refs:
                    if ref_type == 'sheet_ref':
                        # Cross-sheet dependency
                        dependencies[f"{sheet_name}.{cell_ref}"].append(f"{ref_sheet}.{ref_cell}")
                        sheet_connections[sheet_name].add(ref_sheet)
                    else:
                        # Local dependency
                        dependencies[f"{sheet_name}.{cell_ref}"].append(f"{sheet_name}.{ref_cell}")
    
    return dependencies, sheet_connections, formula_complexity

def generate_mermaid_diagram(sheets, dependencies, sheet_connections, formula_complexity):
    """Generate Mermaid diagram code"""
    mermaid_lines = ["graph TD"]
    
    # Add sheet nodes with styling
    sheet_colors = ['fill:#e1f5fe', 'fill:#f3e5f5', 'fill:#e8f5e8', 'fill:#fff3e0', 'fill:#fce4ec']
    for i, sheet_name in enumerate(sheets.keys()):
        color = sheet_colors[i % len(sheet_colors)]
        safe_name = sheet_name.replace(' ', '_').replace('-', '_')
        mermaid_lines.append(f"    {safe_name}[{sheet_name}]")
        mermaid_lines.append(f"    style {safe_name} {color}")
    
    # Add cross-sheet connections
    for source_sheet, target_sheets in sheet_connections.items():
        safe_source = source_sheet.replace(' ', '_').replace('-', '_')
        for target_sheet in target_sheets:
            safe_target = target_sheet.replace(' ', '_').replace('-', '_')
            mermaid_lines.append(f"    {safe_source} --> {safe_target}")
    
    # Add high-complexity formulas as special nodes
    high_complexity = {k: v for k, v in formula_complexity.items() if v > 3}
    for formula_ref, complexity in high_complexity.items():
        sheet_part, cell_part = formula_ref.split('.', 1)
        safe_sheet = sheet_part.replace(' ', '_').replace('-', '_')
        safe_ref = formula_ref.replace(' ', '_').replace('-', '_').replace('.', '_')
        mermaid_lines.append(f"    {safe_ref}{{'{cell_part}\\n({complexity} refs)'}} ")
        mermaid_lines.append(f"    {safe_sheet} -.-> {safe_ref}")
        mermaid_lines.append(f"    style {safe_ref} fill:#ffcdd2")
    
    return '\n'.join(mermaid_lines)

def generate_detailed_network(dependencies, formula_complexity):
    """Generate detailed variable network diagram"""
    mermaid_lines = ["graph LR"]
    
    # Group by sheet for subgraphs
    sheet_groups = defaultdict(list)
    for var in dependencies.keys():
        sheet_name = var.split('.')[0]
        sheet_groups[sheet_name].append(var)
    
    # Create subgraphs for each sheet
    for sheet_name, variables in sheet_groups.items():
        safe_sheet = sheet_name.replace(' ', '_').replace('-', '_')
        mermaid_lines.append(f"    subgraph {safe_sheet}[{sheet_name}]")
        
        for var in variables:
            safe_var = var.replace(' ', '_').replace('-', '_').replace('.', '_')
            cell_ref = var.split('.', 1)[1]
            complexity = formula_complexity.get(var, 0)
            
            if complexity > 2:
                mermaid_lines.append(f"        {safe_var}[{cell_ref}]")
                mermaid_lines.append(f"        style {safe_var} fill:#ffeb3b")
            else:
                mermaid_lines.append(f"        {safe_var}({cell_ref})")
        
        mermaid_lines.append("    end")
    
    # Add dependencies
    for source, targets in dependencies.items():
        safe_source = source.replace(' ', '_').replace('-', '_').replace('.', '_')
        for target in targets[:3]:  # Limit to avoid clutter
            safe_target = target.replace(' ', '_').replace('-', '_').replace('.', '_')
            mermaid_lines.append(f"    {safe_source} --> {safe_target}")
    
    return '\n'.join(mermaid_lines)

def analyze_relationships(file_path):
    """Main analysis function"""
    print("📊 Analyzing Excel relationships...")
    
    # Parse data
    sheets = parse_excel_data(file_path)
    print(f"Found {len(sheets)} sheets")
    
    # Build dependency graph
    dependencies, sheet_connections, formula_complexity = build_dependency_graph(sheets)
    
    # Generate statistics
    total_formulas = len(formula_complexity)
    cross_sheet_deps = sum(1 for deps in dependencies.values() for dep in deps if '.' in dep and dep.split('.')[0] != dep.split('.')[0])
    
    print(f"📈 Analysis Results:")
    print(f"   • Total formulas: {total_formulas}")
    print(f"   • Cross-sheet dependencies: {len(sheet_connections)}")
    print(f"   • Complex formulas (>3 refs): {len([c for c in formula_complexity.values() if c > 3])}")
    
    # Generate diagrams
    sheet_diagram = generate_mermaid_diagram(sheets, dependencies, sheet_connections, formula_complexity)
    detailed_diagram = generate_detailed_network(dependencies, formula_complexity)
    
    # Save diagrams
    with open('sheet_relationships.mmd', 'w') as f:
        f.write(sheet_diagram)
    
    with open('detailed_network.mmd', 'w') as f:
        f.write(detailed_diagram)
    
    # Save analysis data
    analysis_data = {
        'sheets': list(sheets.keys()),
        'dependencies': dict(dependencies),
        'sheet_connections': {k: list(v) for k, v in sheet_connections.items()},
        'formula_complexity': formula_complexity,
        'statistics': {
            'total_sheets': len(sheets),
            'total_formulas': total_formulas,
            'cross_sheet_connections': len(sheet_connections),
            'complex_formulas': len([c for c in formula_complexity.values() if c > 3])
        }
    }
    
    with open('analysis_data.json', 'w') as f:
        json.dump(analysis_data, f, indent=2)
    
    print(f"\n✅ Generated:")
    print(f"   • sheet_relationships.mmd - High-level sheet connections")
    print(f"   • detailed_network.mmd - Detailed variable network")
    print(f"   • analysis_data.json - Raw analysis data")
    
    return sheet_diagram, detailed_diagram, analysis_data

# Usage
if __name__ == "__main__":
    input_file = 'model_ai_ready.txt'  # Output from your Excel converter
    analyze_relationships(input_file)

📊 Analyzing Excel relationships...
Found 16 sheets
📈 Analysis Results:
   • Total formulas: 8183
   • Cross-sheet dependencies: 9
   • Complex formulas (>3 refs): 358

✅ Generated:
   • sheet_relationships.mmd - High-level sheet connections
   • detailed_network.mmd - Detailed variable network
   • analysis_data.json - Raw analysis data
