In [9]:
# Dump the workbook to a flat text file for AI use, omitting empty cells (no "=None" entries)

import zipfile
import xml.etree.ElementTree as ET
from pathlib import Path

# Input workbook, scratch directory for the unzipped parts, and output file.
xlsx_path = 'model.xlsx'
unpack_dir = Path('model_unpacked')
output_txt = 'model_ai_ready5.txt'

# Step 1: Unpack (the workbook is read as a zip archive of XML parts).
# NOTE(review): unpack_dir is not cleared first, so sheet files left over
# from an earlier run with a different workbook would also be picked up by
# the glob in Step 6 - confirm whether the directory should be emptied.
with zipfile.ZipFile(xlsx_path, 'r') as zip_ref:
    zip_ref.extractall(unpack_dir)

# Step 2: Load shared strings (string cells store an index into this list)
shared_strings_path = unpack_dir / 'xl' / 'sharedStrings.xml'
shared_strings = []
if shared_strings_path.exists():
    tree = ET.parse(shared_strings_path)
    root = tree.getroot()
    # Derive the default namespace from the root tag instead of hard-coding it
    ns = {'a': root.tag.split('}')[0].strip('{')}
    for si in root.findall('a:si', ns):
        # A string item may be split into several <t> runs; concatenate them
        text_elems = si.findall('.//a:t', ns)
        text = ''.join(t.text for t in text_elems if t.text)
        shared_strings.append(text)

# Step 3: Load sheet names and their relationship IDs
sheet_rid_to_name = {}
workbook_path = unpack_dir / 'xl' / 'workbook.xml'
tree = ET.parse(workbook_path)
root = tree.getroot()
ns = {'a': root.tag.split('}')[0].strip('{')}
# Look for r:id attribute which links to relationships
for sheet in root.findall('.//a:sheet', ns):
    r_id = sheet.attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id')
    if not r_id:
        # Try without namespace prefix
        r_id = sheet.attrib.get('r:id')
    name = sheet.attrib['name']
    if r_id:
        sheet_rid_to_name[r_id] = name

# Step 4: Load relationships to map rIds to actual files
rid_to_file = {}
rels_path = unpack_dir / 'xl' / '_rels' / 'workbook.xml.rels'
if rels_path.exists():
    tree = ET.parse(rels_path)
    root = tree.getroot()
    ns = {'a': root.tag.split('}')[0].strip('{')}
    for relationship in root.findall('a:Relationship', ns):
        r_id = relationship.attrib['Id']
        target = relationship.attrib['Target']
        if 'worksheets/' in target:
            # Extract just the filename
            filename = target.split('/')[-1]
            rid_to_file[r_id] = filename

# Step 5: Create final mapping from filename to sheet name
file_to_name = {}
for r_id, sheet_name in sheet_rid_to_name.items():
    if r_id in rid_to_file:
        filename = rid_to_file[r_id]
        file_to_name[filename] = sheet_name

# Step 6: Read sheet data and formulas
output_lines = []
sheets_dir = unpack_dir / 'xl' / 'worksheets'
# Sort by (name length, name) so sheet2.xml comes before sheet10.xml; a plain
# lexicographic sort would emit sheet1, sheet10, sheet11, ..., sheet2.
sheet_files = sorted(sheets_dir.glob('sheet*.xml'), key=lambda p: (len(p.name), p.name))
for sheet_file in sheet_files:
    filename = sheet_file.name
    # Fall back to the bare file stem when the workbook has no name for it
    sheet_name = file_to_name.get(filename, filename.replace('.xml', ''))
    output_lines.append(f"\n=== Sheet: {sheet_name} ===\n")

    tree = ET.parse(sheet_file)
    root = tree.getroot()
    ns = {'a': root.tag.split('}')[0].strip('{')}

    for c in root.findall('.//a:sheetData//a:row//a:c', ns):
        cell_ref = c.attrib.get('r', '')
        cell_type = c.attrib.get('t', '')
        formula = c.find('a:f', ns)
        value = c.find('a:v', ns)

        # Get cell value
        if cell_type == 's' and value is not None:
            # Shared string: <v> holds an index into sharedStrings.xml
            val = shared_strings[int(value.text)]
        elif cell_type == 'inlineStr':
            # Inline string: the text lives in <is><t>...</t></is>, not in <v>
            val = ''.join(t.text for t in c.findall('a:is//a:t', ns) if t.text)
        elif value is not None:
            val = value.text
        else:
            val = ''

        # Only output if we have meaningful content: a non-empty formula wins
        # over the cached value; empty values and a literal '0' are skipped.
        if formula is not None and formula.text is not None and formula.text.strip():
            output_lines.append(f"{cell_ref}: {formula.text.strip()}")
        elif val and str(val).strip() and str(val).strip() != '0':
            output_lines.append(f"{cell_ref}: {str(val).strip()}")

# Step 7: Save
with open(output_txt, 'w', encoding='utf-8') as f:
    f.write('\n'.join(output_lines))

print(f"✅ Structured AI-friendly data written to: {output_txt}")
print(f"📊 Found {len(file_to_name)} sheets with names: {list(file_to_name.values())}")

✅ Structured AI-friendly data written to: model_ai_ready5.txt
📊 Found 16 sheets with names: ['Assumptions & Inputs', 'SF Breakdown', 'Sources & Uses', 'Monthly PF Cash Flow', 'Pro Forma (Annual)', 'Untrended Pro Forma', 'Investor Returns - Pari Passu', 'Investor Returns - Sep Pref Pay', 'Budget to Convert Existing Bldg', 'Budget for New Bldg', 'Property Data', 'Rental Income Model', '5 Year Model', '5 Year Summary', 'RE Tax UW', 'Comps']


In [None]:
# Excel formulas and text labels only (numeric values are skipped)

import openpyxl
from openpyxl import load_workbook

def extract_formulas_and_text(file_path):
    """
    Extract formulas and text labels from an Excel file in compact format.

    Each returned entry is a single line of the form
    ``{sheet_name}|{cell_address}|{content}``, where content is either the
    formula text (a string starting with '=') or a stripped text label.
    Cells whose value is not a string (empty cells, numbers, ...) are skipped,
    as are labels that are blank after stripping.

    Line breaks embedded in a label or formula are replaced by a space so
    that every entry stays on one line when the results are written out.

    Args:
        file_path: Path passed to ``load_workbook``.

    Returns:
        list[str]: One entry per extracted cell, in sheet order then row order.
    """
    def _single_line(text):
        # Normalise \r\n first so a Windows line break becomes one space
        return text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')

    wb = load_workbook(file_path, data_only=False)  # data_only=False to get formulas
    results = []

    for sheet_name in wb.sheetnames:
        sheet = wb[sheet_name]

        for row in sheet.iter_rows():
            for cell in row:
                value = cell.value
                # Only string values can be formulas or labels
                if not isinstance(value, str):
                    continue

                if value.startswith('='):
                    content = _single_line(value)
                else:
                    content = _single_line(value.strip())
                    if not content:  # Skip empty strings
                        continue

                results.append(f"{sheet_name}|{cell.coordinate}|{content}")

    return results

def save_to_file(results, output_file):
    """Write every entry of ``results`` to ``output_file`` (UTF-8), one per line."""
    with open(output_file, 'w', encoding='utf-8') as handle:
        # Each entry is terminated with a newline, including the last one
        handle.writelines(entry + '\n' for entry in results)

# Script entry point: extract from the workbook, echo the entries, save them.
if __name__ == "__main__":
    file_path = "model.xlsx"  # Replace with your file path
    output_file = "extracted_formulas.txt"

    try:
        results = extract_formulas_and_text(file_path)

        # Echo every extracted entry, one per line
        if results:
            print('\n'.join(results))

        # Persist the same entries to disk and report the count
        save_to_file(results, output_file)
        print(f"\nExtracted {len(results)} items and saved to {output_file}")

    except Exception as err:
        # Any failure is reported as a one-line message rather than a traceback
        print(f"Error: {err}")

In [None]:
import openpyxl
from openpyxl import load_workbook

def create_sheet_abbreviation(sheet_name, sheet_index):
    """
    Create abbreviated sheet name: s{number}-{2 letters}
    e.g., ('Investor Returns - Pari Passu', 7) -> 's7-in'

    The two letters are the first two alphabetic characters of the sheet
    name, lower-cased (spaces, digits and punctuation are ignored). A name
    with a single letter yields a one-letter suffix.

    If the name contains no letters at all, the first two alphanumeric
    characters are used instead, and 'xx' if there are none either, so the
    result never ends in a bare hyphen.

    Args:
        sheet_name: Full sheet name.
        sheet_index: Number placed after the leading 's'.

    Returns:
        str: The abbreviation, e.g. 's2-sf' for ('SF Breakdown', 2).
    """
    letters = ''.join(ch.lower() for ch in sheet_name if ch.isalpha())
    if letters:
        # Slicing already copes with names that have fewer than two letters
        abbrev = letters[:2]
    else:
        alphanumerics = ''.join(ch.lower() for ch in sheet_name if ch.isalnum())
        abbrev = alphanumerics[:2] or 'xx'
    return f"s{sheet_index}-{abbrev}"

def replace_sheet_names_in_formulas(text, sheet_mapping):
    """
    Replace all sheet names in formulas with abbreviated versions.

    Both reference spellings are rewritten to ``abbrev!``:

    * quoted: ``'Sheet Name'!`` (an apostrophe inside the name is expected
      doubled, e.g. ``'Bob''s Data'!``);
    * unquoted: ``SheetName!`` - only for names without a space or hyphen,
      and only when the name is not the tail of a longer identifier or of a
      bracketed external-workbook prefix.

    All replacements happen in a single pass, so text produced by one
    replacement is never rewritten by another (a sheet literally named
    ``pr`` cannot corrupt an already-inserted ``s1-pr!``).

    Args:
        text: Formula text.
        sheet_mapping: dict of abbreviation -> full sheet name.

    Returns:
        str: ``text`` with sheet references abbreviated.
    """
    import re  # local import: the cell defining this function does not import re

    quoted_tokens = {}
    bare_tokens = {}
    for abbrev, full_name in sheet_mapping.items():
        replacement = f"{abbrev}!"
        quoted_tokens["'" + full_name.replace("'", "''") + "'!"] = replacement
        # Also handle unquoted sheet names (if no spaces or hyphens)
        if ' ' not in full_name and '-' not in full_name:
            bare_tokens[f"{full_name}!"] = replacement

    if not quoted_tokens:
        return text

    # Longest token first so a longer name wins over one it contains
    alternatives = [(token, re.escape(token)) for token in quoted_tokens]
    # The lookbehind keeps 'Data!' from matching inside 'MyData!' or '[1]Data!'
    alternatives += [(token, r"(?<![\w.\]])" + re.escape(token)) for token in bare_tokens]
    alternatives.sort(key=lambda pair: len(pair[0]), reverse=True)
    pattern = re.compile('|'.join(regex for _, regex in alternatives))

    lookup = {**bare_tokens, **quoted_tokens}
    return pattern.sub(lambda match: lookup[match.group(0)], text)

def extract_formulas_and_text(file_path):
    """
    Extract formulas and text labels from an Excel file in compact format,
    using abbreviated sheet names.

    Each result entry is a single line ``{sheet_abbrev}|{cell_address}|{content}``
    where content is either a formula (sheet references rewritten by
    ``replace_sheet_names_in_formulas``) or a stripped text label. Cells whose
    value is not a string (empty cells, numbers, ...) are skipped, as are
    labels that are blank after stripping.

    Line breaks embedded in a label or formula are replaced by a space so
    that every entry stays on one line when written out.

    Args:
        file_path: Path passed to ``load_workbook``.

    Returns:
        tuple: ``(results, sheet_mapping)`` - the list of entries and a dict
        of abbreviation -> full sheet name in workbook order.
    """
    def _single_line(text):
        # Normalise \r\n first so a Windows line break becomes one space
        return text.replace('\r\n', ' ').replace('\r', ' ').replace('\n', ' ')

    wb = load_workbook(file_path, data_only=False)  # data_only=False to get formulas
    results = []

    # First pass: create sheet mapping (abbreviations embed the 1-based sheet
    # index, so each sheet gets its own key)
    sheet_mapping = {}
    for sheet_index, sheet_name in enumerate(wb.sheetnames, 1):
        sheet_mapping[create_sheet_abbreviation(sheet_name, sheet_index)] = sheet_name

    # Second pass: extract data, reusing the abbreviations computed above
    for sheet_abbrev, sheet_name in sheet_mapping.items():
        sheet = wb[sheet_name]

        for row in sheet.iter_rows():
            for cell in row:
                value = cell.value
                # Only string values can be formulas or labels
                if not isinstance(value, str):
                    continue

                if value.startswith('='):
                    # Replace sheet names in formula
                    content = _single_line(replace_sheet_names_in_formulas(value, sheet_mapping))
                else:
                    content = _single_line(value.strip())
                    if not content:  # Skip empty strings
                        continue

                results.append(f"{sheet_abbrev}|{cell.coordinate}|{content}")

    return results, sheet_mapping

def save_to_file(results, sheet_mapping, output_file):
    """
    Write the sheet mapping followed by the extracted entries to
    ``output_file`` (UTF-8).

    The file starts with a mapping section (one ``abbrev = full name`` line
    per sheet), then a blank line and a second section holding one result
    entry per line.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        # Section 1: abbreviation legend
        out.write("=== SHEET MAPPING ===\n")
        out.writelines(
            f"{abbrev} = {full_name}\n" for abbrev, full_name in sheet_mapping.items()
        )

        # Section 2: the extracted entries
        out.write("\n=== FORMULAS & TEXT ===\n")
        out.writelines(entry + '\n' for entry in results)

# Script entry point: extract with abbreviated sheet names, echo, then save.
if __name__ == "__main__":
    file_path = "model.xlsx"  # Replace with your file path
    output_file = "extracted_formulas3.txt"

    try:
        results, sheet_mapping = extract_formulas_and_text(file_path)

        # Echo the abbreviation legend
        print("=== SHEET MAPPING ===")
        if sheet_mapping:
            print('\n'.join(f"{abbrev} = {full_name}"
                            for abbrev, full_name in sheet_mapping.items()))
        print("\n=== FORMULAS & TEXT ===")

        # Echo every extracted entry, one per line
        if results:
            print('\n'.join(results))

        # Persist legend and entries to disk and report the count
        save_to_file(results, sheet_mapping, output_file)
        print(f"\nExtracted {len(results)} items and saved to {output_file}")

    except Exception as err:
        # Any failure is reported as a one-line message rather than a traceback
        print(f"Error: {err}")

In [18]:
import networkx as nx
import re
from collections import defaultdict

class ExcelDependencyAnalyzer:
    """
    Build and query a directed dependency graph from ``sheet|cell|content``
    records (e.g. ``s2-sf|E11|=C11+D11``).

    Nodes are cell identifiers of the form ``sheet!CELL``. An edge ``A -> B``
    means the formula stored in B references A, so descendants of a node are
    the cells affected by it and ancestors are the cells it depends on.
    """

    # One cell reference or range, optionally prefixed by an abbreviated sheet
    # token such as ``s2-sf!``. The lookbehind/lookahead stop the pattern from
    # matching inside longer tokens: function names followed by '(' such as
    # LOG10(, identifiers, and numeric exponents such as 1E5. Column letters
    # are limited to three (Excel's last column is XFD).
    # NOTE(review): text inside string literals of a formula is still scanned.
    _CELL_REF_PATTERN = re.compile(
        r'(?<![A-Za-z0-9_.$])'
        r'(?:s\d+-\w+!)?'
        r'\$?[A-Z]{1,3}\$?\d+'
        r'(?::\$?[A-Z]{1,3}\$?\d+)?'
        r'(?![A-Za-z0-9_(])'
    )

    def __init__(self):
        self.graph = nx.DiGraph()
        self.formulas = {}       # "sheet!CELL" -> formula text (starts with '=')
        self.labels = {}         # "sheet!CELL" -> label text
        self.sheet_mapping = {}  # not populated by any method of this class

    def parse_cell_reference(self, ref):
        """
        Normalise a cell reference like s2-sf!E11 or E11.

        Absolute markers are removed so that ``$E$11`` and ``E11`` name the
        same graph node; a sheet prefix, if present, is kept unchanged.
        """
        return ref.replace('$', '')

    def extract_cell_references(self, formula):
        """
        Extract all cell references from a formula.

        Returns a list of normalised references in order of appearance
        (duplicates kept). Ranges are reduced to their first cell - the
        remaining cells of a range are NOT reported.
        """
        cells = []
        for ref in self._CELL_REF_PATTERN.findall(formula):
            # Handle ranges - for simplicity, just take the first cell
            first_cell = ref.split(':')[0] if ':' in ref else ref
            cells.append(self.parse_cell_reference(first_cell))
        return cells

    def parse_input_data(self, data_lines):
        """
        Parse the input format: s2-sf|E11|=C11+D11

        Blank lines, lines starting with '===' and lines with fewer than three
        '|'-separated fields are ignored. Content starting with '=' is stored
        as a formula and an edge is added from every referenced cell to this
        cell; any other content is stored as a label.
        """
        for line in data_lines:
            line = line.strip()
            if not line or line.startswith('==='):
                continue

            # Split on the first two pipes only: the content itself may
            # legitimately contain '|' (labels, string literals in formulas)
            parts = line.split('|', 2)
            if len(parts) != 3:
                continue

            sheet, cell, content = parts
            sheet_cell = f"{sheet}!{cell}"

            if content.startswith('='):
                # It's a formula
                self.formulas[sheet_cell] = content

                for dep in self.extract_cell_references(content):
                    # References without a sheet prefix live on the same sheet
                    if '!' not in dep:
                        dep = f"{sheet}!{dep}"
                    self.graph.add_edge(dep, sheet_cell)

                # Register the cell even when the formula references nothing
                # (e.g. =TODAY()); done after the edges so node insertion
                # order is unchanged for formulas that do have references
                self.graph.add_node(sheet_cell)
            else:
                # It's a label/text
                self.labels[sheet_cell] = content
                # Add as node (might be referenced by formulas)
                self.graph.add_node(sheet_cell)

    def find_output_cells(self):
        """Find cells that are likely outputs: formula cells nothing else depends on."""
        return [
            node for node in self.graph.nodes()
            if self.graph.out_degree(node) == 0 and node in self.formulas
        ]

    def find_input_cells(self):
        """Find cells that are likely inputs (no incoming dependencies)."""
        return [node for node in self.graph.nodes() if self.graph.in_degree(node) == 0]

    def _remove_cyclic_edges(self):
        """
        Make ``self.graph`` acyclic in place by deleting every edge that lies
        on a cycle, i.e. whose endpoints share a strongly connected component
        (self-loops included). Nodes are kept even if they become isolated.
        """
        component_of = {}
        for index, component in enumerate(nx.strongly_connected_components(self.graph)):
            for node in component:
                component_of[node] = index

        cyclic_edges = [
            (u, v) for u, v in self.graph.edges()
            if component_of[u] == component_of[v]
        ]
        self.graph.remove_edges_from(cyclic_edges)

    def get_critical_paths(self, max_paths=10):
        """
        Find paths from input cells to output cells.

        For each of the first ``max_paths`` output cells, the shortest path
        from every input cell that can reach it is collected; paths with fewer
        than three cells are dropped. The result is a list of dicts with keys
        'path', 'length', 'input' and 'output', sorted longest first.

        Side effect: if the graph contains cycles, a warning is printed and
        the edges forming them are removed from ``self.graph``.
        """
        if not nx.is_directed_acyclic_graph(self.graph):
            print("Warning: Graph contains cycles!")
            # Remove cycles for analysis
            self._remove_cyclic_edges()

        outputs = self.find_output_cells()
        inputs = self.find_input_cells()

        critical_paths = []

        for output in outputs[:max_paths]:  # Limit outputs to analyze
            for input_cell in inputs:
                if not nx.has_path(self.graph, input_cell, output):
                    continue
                path = nx.shortest_path(self.graph, input_cell, output)
                if len(path) > 2:  # Only show meaningful paths
                    critical_paths.append({
                        'path': path,
                        'length': len(path),
                        'input': input_cell,
                        'output': output
                    })

        # Sort by path length (longest first)
        critical_paths.sort(key=lambda x: x['length'], reverse=True)
        return critical_paths

    def analyze_cell_impact(self, cell):
        """
        Return the cells affected if this cell changes (all descendants),
        sorted for reproducible output; [] if the cell is not in the graph.
        """
        if cell not in self.graph:
            return []
        return sorted(nx.descendants(self.graph, cell))

    def trace_cell_dependencies(self, cell):
        """
        Return every cell this cell depends on (all ancestors), sorted for
        reproducible output; [] if the cell is not in the graph.
        """
        if cell not in self.graph:
            return []
        return sorted(nx.ancestors(self.graph, cell))

    def print_critical_paths(self, max_paths=5):
        """Print up to ``max_paths`` paths from get_critical_paths in a readable format."""
        paths = self.get_critical_paths(max_paths)

        print(f"=== TOP {min(len(paths), max_paths)} CRITICAL PATHS ===\n")

        for i, path_info in enumerate(paths[:max_paths], 1):
            path = path_info['path']
            print(f"Path {i} (Length: {path_info['length']}):")

            for j, cell in enumerate(path):
                if j < len(path) - 1:
                    print(f"  {cell} →")
                else:
                    print(f"  {cell} (OUTPUT)")

                # Show formula if available
                if cell in self.formulas:
                    print(f"    Formula: {self.formulas[cell]}")
                elif cell in self.labels:
                    print(f"    Label: {self.labels[cell]}")
            print()

    def print_impact_analysis(self, cell):
        """Print what ``cell`` depends on and what it affects (first 10 of each)."""
        impact = self.analyze_cell_impact(cell)
        dependencies = self.trace_cell_dependencies(cell)

        print(f"=== IMPACT ANALYSIS FOR {cell} ===")
        print(f"Depends on {len(dependencies)} cells:")
        for dep in dependencies[:10]:  # Show first 10
            print(f"  ← {dep}")
        if len(dependencies) > 10:
            print(f"  ... and {len(dependencies) - 10} more")
        print()

        print(f"Affects {len(impact)} cells:")
        for imp in impact[:10]:  # Show first 10
            print(f"  → {imp}")
        if len(impact) > 10:
            print(f"  ... and {len(impact) - 10} more")
        print()

# Usage example
def analyze_excel_dependencies(input_data):
    """
    Run the full dependency analysis on extracted records and print a report.

    ``input_data`` is either one string (stripped, then split on newlines) or
    an iterable of lines, which is passed through as-is. The report shows
    parse and graph counts, the critical paths and, when at least one output
    cell exists, an impact analysis of the first one.

    Returns:
        The populated ExcelDependencyAnalyzer instance.
    """
    analyzer = ExcelDependencyAnalyzer()

    # Normalise the input into a sequence of lines and load it
    lines = input_data.strip().split('\n') if isinstance(input_data, str) else input_data
    analyzer.parse_input_data(lines)

    # Summary counts
    formula_count = len(analyzer.formulas)
    label_count = len(analyzer.labels)
    print(f"Parsed {formula_count} formulas and {label_count} labels")
    node_count = analyzer.graph.number_of_nodes()
    edge_count = analyzer.graph.number_of_edges()
    print(f"Graph has {node_count} nodes and {edge_count} edges\n")

    # Critical paths section
    analyzer.print_critical_paths()

    # Sample impact analysis, only when there is an output cell to show
    outputs = analyzer.find_output_cells()
    if not outputs:
        return analyzer

    print("=== SAMPLE IMPACT ANALYSIS ===")
    analyzer.print_impact_analysis(outputs[0])
    return analyzer

# Demo: run the analysis on a small hand-written sample
if __name__ == "__main__":
    # Sample records in the sheet|cell|content format
    sample_records = [
        "s2-sf|E11|=C11+D11",
        "s2-sf|K11|Mech New",
        "s2-sf|C11|=s1-as!B5*1.2",
        "s2-sf|D11|=s1-as!B6*0.8",
        "s1-as|B5|Base Cost",
        "s1-as|B6|Additional Cost",
        "s3-ca|A1|=s2-sf!E11*s2-sf!K11",
        "s4-ou|Z99|=s3-ca!A1+s2-sf!E11",
    ]
    # One string with a leading and a trailing newline around the records
    sample_data = "\n" + "\n".join(sample_records) + "\n"

    analyzer = analyze_excel_dependencies(sample_data)

Parsed 5 formulas and 3 labels
Graph has 8 nodes and 8 edges

=== TOP 3 CRITICAL PATHS ===

Path 1 (Length: 4):
  s1-as!B5 →
    Label: Base Cost
  s2-sf!C11 →
    Formula: =s1-as!B5*1.2
  s2-sf!E11 →
    Formula: =C11+D11
  s4-ou!Z99 (OUTPUT)
    Formula: =s3-ca!A1+s2-sf!E11

Path 2 (Length: 4):
  s1-as!B6 →
    Label: Additional Cost
  s2-sf!D11 →
    Formula: =s1-as!B6*0.8
  s2-sf!E11 →
    Formula: =C11+D11
  s4-ou!Z99 (OUTPUT)
    Formula: =s3-ca!A1+s2-sf!E11

Path 3 (Length: 3):
  s2-sf!K11 →
    Label: Mech New
  s3-ca!A1 →
    Formula: =s2-sf!E11*s2-sf!K11
  s4-ou!Z99 (OUTPUT)
    Formula: =s3-ca!A1+s2-sf!E11

=== SAMPLE IMPACT ANALYSIS ===
=== IMPACT ANALYSIS FOR s4-ou!Z99 ===
Depends on 7 cells:
  ← s2-sf!D11
  ← s1-as!B5
  ← s2-sf!C11
  ← s3-ca!A1
  ← s1-as!B6
  ← s2-sf!E11
  ← s2-sf!K11

Affects 0 cells:

