# 📊 Excel Analysis Report

**File:** `Business Accounting.xlsx`
**Analysis Date:** 2025-08-07 15:19:23
**Processing Time:** 0.73s

In [1]:
import pandas as pd
from pathlib import Path

# Load the Excel file (path relative to repo root)
#
# Outcome of this cell: `df` holds the worksheet at `sheet_index` as a
# DataFrame, or None when the index is out of range or loading failed.
excel_path = Path("test_assets/collection/business-accounting/Business Accounting.xlsx")
sheet_index = 0

print(f"Loading data from: {excel_path}")
print(f"Loading sheet at index: {sheet_index}")

try:
    # Open the workbook once; the context manager closes the file handle even
    # if listing or parsing a sheet raises.
    with pd.ExcelFile(excel_path) as xl_file:
        # First, get information about all sheets
        sheet_names = xl_file.sheet_names
        print(f"\nAvailable sheets: {sheet_names}")

        if sheet_index >= len(sheet_names):
            print(f"\nError: Sheet index {sheet_index} is out of range. File has {len(sheet_names)} sheets.")
            # The hint must show the real upper bound (last valid index).
            print(f"Please use --sheet-index with a value between 0 and {len(sheet_names) - 1}")
            df = None
        else:
            # Load the specified sheet
            selected_sheet = sheet_names[sheet_index]
            print(f"\nLoading sheet: '{selected_sheet}'")

            # Reuse the already-open workbook instead of opening the file again.
            df = pd.read_excel(xl_file, sheet_name=sheet_index)
            print(f"\nLoaded data with shape: {df.shape}")
            print(f"Columns: {list(df.columns)}")

            # Display first few rows
            print("\nFirst 5 rows:")
            display(df.head())

            # Basic info
            print("\nData types:")
            print(df.dtypes)

except Exception as e:
    # Any failure (missing file, unreadable workbook, ...) is reported and
    # signalled through df = None.
    print(f"Error loading Excel file: {e}")
    df = None


Loading data from: test_assets/collection/business-accounting/Business Accounting.xlsx
Loading sheet at index: 0

Available sheets: ['Yiriden Transactions 2025', 'Yiriden Transactions 2023', 'Yiriden 2023 Loans', 'Sanoun Transactions 2024', 'Sanoun Transactions 2025', '2024 Shea butter shipping', 'Yiriden mileages', 'Truck Revenue Projections', 'Yiriden 2022', 'Real Estate - Horton Rd']

Loading sheet: 'Yiriden Transactions 2025'

Loaded data with shape: (10, 9)
Columns: ['Date', 'Description', 'USD Amount', 'Transaction type', 'Category', 'TOTAL REVENUES', 0, 'TOTAL EXPENSES', -2031.6299999999999]

First 5 rows:


Unnamed: 0,Date,Description,USD Amount,Transaction type,Category,TOTAL REVENUES,0,TOTAL EXPENSES,-2031.63
0,2025-01-03,Mali Apartment work - Cement,-43.51,EXP,,,,,
1,2025-01-15,Rebtel International Phone charges,-9.49,EXP,,,,,
2,2025-01-21,Mali Apartment work - Cement,-8.57,EXP,,,,,
3,2025-01-22,Mali Apartment Work - Tile Install (Kitchen),-48.78,EXP,,,,,
4,2025-01-23,Aluminum fabricator - Issa Maiga,-424.34,EXP,,,,,



Data types:
 Date                datetime64[ns]
 Description                 object
 USD Amount                 float64
 Transaction type            object
 Category                   float64
 TOTAL REVENUES             float64
 0                          float64
 TOTAL EXPENSES             float64
-2031.63                    float64
dtype: object


In [2]:
# Query interface for formula dependency analysis
# The pipeline has already analyzed all formulas and cached the results

import pickle
from pathlib import Path
from spreadsheet_analyzer.graph_db.query_interface import create_enhanced_query_interface

# Restore the formula analysis that the pipeline cached on disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this cache must only ever be a file produced by the pipeline itself.
cache_file = Path(r".pipeline_cache/Business Accounting_formula_analysis.pkl")
with cache_file.open('rb') as f:
    formula_analysis = pickle.load(f)

# Wrap the cached analysis in the query object the helper functions call into.
query_interface = create_enhanced_query_interface(formula_analysis)

# Convenience functions for graph queries
def get_cell_dependencies(sheet, cell_ref):
    """Print a short dependency report for one cell and return the query result.

    Args:
        sheet: Sheet name passed through to the query interface.
        cell_ref: Cell reference passed through to the query interface.

    Returns:
        The object returned by ``query_interface.get_cell_dependencies``.
    """
    info = query_interface.get_cell_dependencies(sheet, cell_ref)

    def _print_capped(prefix, refs):
        # An empty (or missing) list produces no output at all.
        if not refs:
            return
        # Show at most the first five references, then a count of the rest.
        print(prefix + ', '.join(refs[:5]))
        overflow = len(refs) - 5
        if overflow > 0:
            print(f"    ...and {overflow} more")

    print(f"\nCell {sheet}!{cell_ref}:")
    print(f"  Has formula: {info.has_formula}")
    if info.formula:
        print(f"  Formula: {info.formula}")
    _print_capped("  Direct dependencies: ", info.direct_dependencies)
    _print_capped("  Cells that depend on this: ", info.direct_dependents)
    return info

def find_cells_affecting_range(sheet, start_cell, end_cell):
    """Print a capped summary of what the cells in a range depend on.

    Args:
        sheet: Sheet name passed through to the query interface.
        start_cell: First cell of the range.
        end_cell: Last cell of the range.

    Returns:
        The mapping returned by ``query_interface.find_cells_affecting_range``
        (iterated here as cell -> list of dependencies).
    """
    affecting = query_interface.find_cells_affecting_range(sheet, start_cell, end_cell)
    print(f"\nCells affecting range {sheet}!{start_cell}:{end_cell}:")

    # Only the first five entries are listed, each with up to three dependencies.
    for position, (cell, deps) in enumerate(affecting.items()):
        if position == 5:
            break
        print(f"  {cell} depends on: {', '.join(deps[:3])}")
        extra_deps = len(deps) - 3
        if extra_deps > 0:
            print(f"    ...and {extra_deps} more")

    hidden_cells = len(affecting) - 5
    if hidden_cells > 0:
        print(f"  ...and {hidden_cells} more cells")
    return affecting

def get_formula_statistics():
    """Print the workbook-level formula statistics and return them.

    Returns:
        The mapping returned by
        ``query_interface.get_formula_statistics_with_ranges``; the keys read
        here are the ones printed below.
    """
    summary = query_interface.get_formula_statistics_with_ranges()

    print("\nFormula Statistics:")
    # Counts are printed with thousands separators.
    print(f"  Total formulas: {summary['total_formulas']:,}")
    print(f"  Formulas with dependencies: {summary['formulas_with_dependencies']:,}")
    print(f"  Unique cells referenced: {summary['unique_cells_referenced']:,}")
    # The remaining values are printed as-is.
    print(f"  Max dependency depth: {summary['max_dependency_depth']} levels")
    print(f"  Circular references: {summary['circular_reference_chains']}")
    print(f"  Formula complexity score: {summary['complexity_score']}/100")

    return summary

def find_empty_cells_in_formula_ranges(sheet, max_cells=20, max_rows=5):
    """Print a capped, row-grouped preview of empty cells inside formula ranges.

    Args:
        sheet: Sheet name passed through to the query interface.
        max_cells: How many of the returned cells are considered for the
            preview (default 20).
        max_rows: How many row groups of that preview are printed (default 5).

    Returns:
        Whatever ``query_interface.find_empty_cells_in_formula_ranges`` returned,
        unmodified.
    """
    result = query_interface.find_empty_cells_in_formula_ranges(sheet)
    print(f"\nEmpty cells in formula ranges for sheet '{sheet}':")

    if not result:
        print("  No empty cells found in formula ranges")
        return result

    print(f"  Found {len(result)} empty cells")

    # Group the previewed cells by the digits found in each reference
    # (for a plain reference such as "B12" that is the row number).
    rows = {}
    for cell in list(result)[:max_cells]:
        row_num = ''.join(filter(str.isdigit, cell))
        rows.setdefault(row_num, []).append(cell)

    # Count what is actually printed so the trailing "more" line also covers
    # cells that were previewed but fell into a row group beyond max_rows.
    shown = 0
    for row, cells in list(rows.items())[:max_rows]:
        print(f"  Row {row}: {', '.join(cells)}")
        shown += len(cells)

    hidden = len(result) - shown
    if hidden > 0:
        print(f"  ...and {hidden} more")
    return result


## 🔍 Formula Analysis Tools

You have TWO approaches available for formula analysis:

### 1️⃣ Graph-Based Dependency Analysis (Recommended for Complex Files)
The deterministic pipeline has analyzed all formulas and created a dependency graph. These tools are robust and handle complex Excel files:

- **get_cell_dependencies** - Analyze what a cell depends on and what depends on it
- **find_cells_affecting_range** - Find all cells that affect a specific range
- **find_empty_cells_in_formula_ranges** - Find gaps in data that formulas reference
- **get_formula_statistics** - Get overall statistics about formulas
- **find_circular_references** - Find all circular reference chains

### 2️⃣ Formulas Library for Advanced Formula Evaluation (Recommended)
Robust formula evaluation using the 'formulas' library that handles complex Excel files:

- **load_excel_with_formulas** - Load Excel file for formula evaluation
- **evaluate_cell** - Get calculated cell values and formulas
- **set_cell_and_recalculate** - What-if analysis with recalculation
- **get_cell_dependencies_formulas** - Track formula dependencies
- **export_formulas_model** - Export model to JSON
- **get_formulas_help** - Get detailed help

✅ **Recommended**: The formulas library handles complex Excel files much better than the alternatives.

### Usage:
All tools are available through the tool-calling interface. Use graph-based analysis for quick dependency queries, and the formulas library for accurate formula evaluation and what-if analysis.


## --- LLM Analysis Start ---

In [3]:
# Step 1: multi-table detection — does this sheet hold more than one table?
# Start with the overall shape and a generous preview of the rows.
print(f"Sheet dimensions: {df.shape}")
print("\n--- First 30 rows ---")
print(df.head(30))

# Missing-value mask, computed once and reused for the row and column checks.
missing_mask = df.isna()

# Completely empty rows could separate stacked tables. Consecutive empty rows
# share one group key (the running count of non-empty rows so far), so the
# per-group sum is the length of each empty run.
empty_rows = missing_mask.all(axis=1)
run_key = (~empty_rows).cumsum()
empty_row_groups = empty_rows.groupby(run_key).sum()
nonzero_blocks = empty_row_groups[empty_row_groups > 0]
print(f"\nEmpty row blocks: {nonzero_blocks.to_dict()}")

# Completely empty columns could separate side-by-side tables.
empty_cols = missing_mask.all(axis=0)
print(f"Empty columns: {list(df.columns[empty_cols])}")

# Per-column profile: how many values are present, which Python types they
# have, and a few samples — a change in pattern may hint at a second table.
for col in df.columns:
    non_null = df[col].dropna()
    print(f"Column: {col}")
    print(f"  Non-null count: {len(non_null)}")
    type_counts = non_null.apply(type).value_counts().to_dict()
    print(f"  Data types: {type_counts}")
    sample_values = non_null.head(3).tolist()
    print(f"  Sample values: {sample_values}")
    print()

Sheet dimensions: (10, 9)

--- First 30 rows ---
        Date                                   Description   USD Amount  \
0 2025-01-03                  Mali Apartment work - Cement       -43.51   
1 2025-01-15            Rebtel International Phone charges        -9.49   
2 2025-01-21                  Mali Apartment work - Cement        -8.57   
3 2025-01-22  Mali Apartment Work - Tile Install (Kitchen)       -48.78   
4 2025-01-23              Aluminum fabricator - Issa Maiga      -424.34   
5 2025-01-23  Mali Apartment work - Kitchen Wall Crepisage       -57.75   
6 2025-01-23                  Mali Apartment work - Cement       -25.90   
7 2025-01-26            Rebtel International Phone charges        -4.99   
8 2025-01-14                      Toyota Prado - 4 x tires     -1095.34   
9 2025-01-14        Toyota Prado - 4 x rims and spare tire      -312.96   

   Transaction type   Category   TOTAL REVENUES   0   TOTAL EXPENSES  -2031.63  
0               EXP        NaN              

In [4]:
# Inspect the first rows for semantic shifts (different entity types, embedded
# header rows, section breaks) that would indicate several tables or sections.
sample_rows = df.iloc[:20]
print("--- Sample data for semantic analysis ---")
print(sample_rows)

# Dump each sampled row's raw values, one block per row followed by a blank
# line, so rows with a different pattern stand out.
for idx, row in sample_rows.iterrows():
    print(f"Row {idx}:", row.values, "", sep="\n")

--- Sample data for semantic analysis ---
        Date                                   Description   USD Amount  \
0 2025-01-03                  Mali Apartment work - Cement       -43.51   
1 2025-01-15            Rebtel International Phone charges        -9.49   
2 2025-01-21                  Mali Apartment work - Cement        -8.57   
3 2025-01-22  Mali Apartment Work - Tile Install (Kitchen)       -48.78   
4 2025-01-23              Aluminum fabricator - Issa Maiga      -424.34   
5 2025-01-23  Mali Apartment work - Kitchen Wall Crepisage       -57.75   
6 2025-01-23                  Mali Apartment work - Cement       -25.90   
7 2025-01-26            Rebtel International Phone charges        -4.99   
8 2025-01-14                      Toyota Prado - 4 x tires     -1095.34   
9 2025-01-14        Toyota Prado - 4 x rims and spare tire      -312.96   

   Transaction type   Category   TOTAL REVENUES   0   TOTAL EXPENSES  -2031.63  
0               EXP        NaN              NaN NaN

In [5]:
# Descriptive statistics for the loaded sheet via DataFrame.describe().
# The call is deliberately the cell's last bare expression so that Jupyter
# renders the resulting table as the cell output. In the recorded output the
# summary covers the numeric columns and also the datetime `Date` column.
print("--- Descriptive Statistics ---")
df.describe()

--- Descriptive Statistics ---


Unnamed: 0,Date,USD Amount,Category,TOTAL REVENUES,0,TOTAL EXPENSES,-2031.63
count,10,10.0,0.0,0.0,0.0,0.0,0.0
mean,2025-01-18 09:36:00,-203.163,,,,,
min,2025-01-03 00:00:00,-1095.34,,,,,
25%,2025-01-14 06:00:00,-249.1575,,,,,
50%,2025-01-21 12:00:00,-46.145,,,,,
75%,2025-01-23 00:00:00,-13.5925,,,,,
max,2025-01-26 00:00:00,-4.99,,,,,
std,,345.365017,,,,,


### 🔍 Cell Dependencies Analysis

**Query:** `get_cell_dependencies(sheet='Yiriden Transactions 2025', cell_ref='F1')`

**Result:**
**Cell:** `Yiriden Transactions 2025!F1`
**Has Formula:** No

### 🔍 Cell Dependencies Analysis

**Query:** `get_cell_dependencies(sheet='Yiriden Transactions 2025', cell_ref='H1')`

**Result:**
**Cell:** `Yiriden Transactions 2025!H1`
**Has Formula:** No