In [1]:

import pandas as pd
from pathlib import Path

# Workbook under analysis; only its first worksheet (index 0) is read.
excel_path = Path(r"/Users/cheickberthe/PycharmProjects/spreadsheet-analyzer/test_assets/collection/business-accounting/Business Accounting.xlsx")
df = pd.read_excel(excel_path, sheet_name=0)
print(f"Loaded sheet with shape: {df.shape}")

# Plain-text previews of both ends of the sheet (kept as text for LLM context).
print("\nFirst 10 rows:", df.head(10).to_string(), sep="\n")
print("\nLast 5 rows:", df.tail(5).to_string(), sep="\n")

# Basic descriptors of the sheet.
# NOTE(review): sheet_name holds the literal text "None", not the None object — confirm this is intended.
sheet_name = "None"
sheet_dimensions = f"{df.shape[0]} rows x {df.shape[1]} columns"


Loaded sheet with shape: (10, 9)

First 10 rows:
        Date                                   Description   USD Amount  Transaction type   Category   TOTAL REVENUES   0   TOTAL EXPENSES  -2031.63
0 2025-01-03                  Mali Apartment work - Cement       -43.51               EXP        NaN              NaN NaN              NaN       NaN
1 2025-01-15            Rebtel International Phone charges        -9.49               EXP        NaN              NaN NaN              NaN       NaN
2 2025-01-21                  Mali Apartment work - Cement        -8.57               EXP        NaN              NaN NaN              NaN       NaN
3 2025-01-22  Mali Apartment Work - Tile Install (Kitchen)       -48.78               EXP        NaN              NaN NaN              NaN       NaN
4 2025-01-23              Aluminum fabricator - Issa Maiga      -424.34               EXP        NaN              NaN NaN              NaN       NaN
5 2025-01-23  Mali Apartment work - Kitchen Wall Crepisag

In [2]:

# Human-readable size summary of the loaded frame.
sheet_dimensions = f"{df.shape[0]} rows x {df.shape[1]} columns"
# Final path component of the workbook (file name without its directories).
excel_file_name = excel_path.name

# Echo the sheet label as the cell's result.
sheet_name


'None'

In [3]:

# Table Detection Analysis
# Overall structure: shape, column labels, then text previews of both ends.
print(
    "=== DATA STRUCTURE ANALYSIS ===",
    f"DataFrame shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)
print("\n=== FIRST 10 ROWS ===", df.head(10).to_string(), sep="\n")
print("\n=== LAST 5 ROWS ===", df.tail(5).to_string(), sep="\n")

# Per-column fill counts; a column with zero non-null values may be a gap between tables.
print("\n=== COLUMN ANALYSIS ===")
for i, col in enumerate(df.columns):
    non_null_count = df[col].count()
    print(f"Column {i} ('{col}'): {non_null_count}/{len(df)} non-null values")

# Rows where every cell is null may be a gap between vertically stacked tables.
print("\n=== EMPTY ROW ANALYSIS ===")
empty_rows = df.isna().all(axis="columns")
if not empty_rows.any():
    print("No completely empty rows found")
else:
    empty_row_indices = empty_rows.index[empty_rows.to_numpy()].tolist()
    # Only the first ten indices are reported.
    print(f"Empty rows found at indices: {empty_row_indices[:10]}")

# Compare how densely populated two fixed column groups are:
# positions 0-6 on the left, position 7 onward on the right.
print("\n=== SIDE-BY-SIDE ANALYSIS ===")
left_cols = df.iloc[:, :7]
right_cols = df.iloc[:, 7:]
print(f"Left section shape: {left_cols.shape}")
print(f"Right section shape: {right_cols.shape}")
# Density = non-null cells divided by total cells in the section.
print(f"Left section non-null density: {left_cols.notna().sum().sum() / left_cols.size:.2f}")
print(f"Right section non-null density: {right_cols.notna().sum().sum() / right_cols.size:.2f}")


=== DATA STRUCTURE ANALYSIS ===
DataFrame shape: (10, 9)
Columns: ['Date', 'Description', 'USD Amount', 'Transaction type', 'Category', 'TOTAL REVENUES', 0, 'TOTAL EXPENSES', -2031.6299999999999]

=== FIRST 10 ROWS ===
        Date                                   Description   USD Amount  Transaction type   Category   TOTAL REVENUES   0   TOTAL EXPENSES  -2031.63
0 2025-01-03                  Mali Apartment work - Cement       -43.51               EXP        NaN              NaN NaN              NaN       NaN
1 2025-01-15            Rebtel International Phone charges        -9.49               EXP        NaN              NaN NaN              NaN       NaN
2 2025-01-21                  Mali Apartment work - Cement        -8.57               EXP        NaN              NaN NaN              NaN       NaN
3 2025-01-22  Mali Apartment Work - Tile Install (Kitchen)       -48.78               EXP        NaN              NaN NaN              NaN       NaN
4 2025-01-23              Aluminum f

In [4]:
# Rich display of the first five rows.
df.head(5)

Unnamed: 0,Date,Description,USD Amount,Transaction type,Category,TOTAL REVENUES,0,TOTAL EXPENSES,-2031.63
0,2025-01-03,Mali Apartment work - Cement,-43.51,EXP,,,,,
1,2025-01-15,Rebtel International Phone charges,-9.49,EXP,,,,,
2,2025-01-21,Mali Apartment work - Cement,-8.57,EXP,,,,,
3,2025-01-22,Mali Apartment Work - Tile Install (Kitchen),-48.78,EXP,,,,,
4,2025-01-23,Aluminum fabricator - Issa Maiga,-424.34,EXP,,,,,


In [5]:
# Rich display of the last five rows.
df.tail(5)

Unnamed: 0,Date,Description,USD Amount,Transaction type,Category,TOTAL REVENUES,0,TOTAL EXPENSES,-2031.63
5,2025-01-23,Mali Apartment work - Kitchen Wall Crepisage,-57.75,EXP,,,,,
6,2025-01-23,Mali Apartment work - Cement,-25.9,EXP,,,,,
7,2025-01-26,Rebtel International Phone charges,-4.99,EXP,,,,,
8,2025-01-14,Toyota Prado - 4 x tires,-1095.34,EXP,,,,,
9,2025-01-14,Toyota Prado - 4 x rims and spare tire,-312.96,EXP,,,,,


In [6]:
# Print the frame's schema summary: index range, per-column non-null counts, dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Date                 10 non-null     datetime64[ns]
 1   Description          10 non-null     object        
 2   USD Amount           10 non-null     float64       
 3   Transaction type     10 non-null     object        
 4   Category             0 non-null      float64       
 5   TOTAL REVENUES       0 non-null      float64       
 6   0                    0 non-null      float64       
 7   TOTAL EXPENSES       0 non-null      float64       
 8   -2031.6299999999999  0 non-null      float64       
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 852.0+ bytes


In [7]:
# Flag rows and columns in which every cell is null; either can mark a
# boundary between separate tables laid out on the same sheet.
empty_rows = df.isna().all(axis="columns")
empty_cols = df.isna().all(axis="index")

# Convert the boolean flags into the labels of the fully-empty rows / columns.
empty_row_indices = empty_rows.index[empty_rows.to_numpy()].tolist()
empty_col_indices = empty_cols.index[empty_cols.to_numpy()].tolist()

(empty_row_indices, empty_col_indices)

([], ['Category', 'TOTAL REVENUES', 0, 'TOTAL EXPENSES', -2031.6299999999999])

In [8]:
# Split the sheet at a fixed column position into two candidate tables:
# positions 0-6 on the left, position 7 onward on the right.
left_cols = df.iloc[:, :7]
right_cols = df.iloc[:, 7:]

# Per-column dtypes of each side, to compare their data-type patterns.
left_cols_info, right_cols_info = left_cols.dtypes, right_cols.dtypes

(left_cols_info, right_cols_info)

(Date                datetime64[ns]
 Description                 object
 USD Amount                 float64
 Transaction type            object
 Category                   float64
 TOTAL REVENUES             float64
 0                          float64
 dtype: object,
  TOTAL EXPENSES    float64
 -2031.63           float64
 dtype: object)

In [9]:

# Extract detection results created by LLM
# The detector should have created a 'detected_tables' variable with the results


def validate_detected_tables(candidate):
    """Check an LLM-produced ``detected_tables`` value and return the usable result.

    Args:
        candidate: The object found under the name ``detected_tables``.

    Returns:
        ``candidate`` itself when it is a list (even if its first entry is
        malformed, in which case only a warning is printed); otherwise an
        empty list.

    The function never raises on malformed entries: a first entry that does
    not support membership tests (e.g. ``None`` or an int) is reported as a
    warning instead of propagating a ``TypeError``.
    """
    if not isinstance(candidate, list):
        print(f"❌ Error: detected_tables is not a list, it's a {type(candidate)}")
        return []

    print(f"✅ Found {len(candidate)} tables from LLM detection")
    # Validate first table has required fields (if any tables exist)
    if candidate:
        required_fields = ['table_id', 'description', 'start_row', 'end_row', 'start_col', 'end_col']
        first_table = candidate[0]
        try:
            missing_fields = [f for f in required_fields if f not in first_table]
        except TypeError:
            # The entry is not a container at all, so field membership cannot be checked.
            print(f"⚠️ Warning: First table is not a dict-like object, it's a {type(first_table)}")
        else:
            if missing_fields:
                print(f"⚠️ Warning: First table missing fields: {missing_fields}")
    return candidate


# CLAUDE-TEST-WORKAROUND: Validate that detected_tables exists and is properly formatted
if 'detected_tables' in globals():
    detection_results = validate_detected_tables(globals()['detected_tables'])
else:
    # CLAUDE-GOTCHA: Gemini sometimes fails to create the variable even after multiple prompts
    print("❌ No 'detected_tables' variable found - LLM failed to complete detection")
    detection_results = []

detection_results


❌ No 'detected_tables' variable found - LLM failed to complete detection


[]

In [10]:
# Keep the (rows, columns) tuple under its own name and show it as the cell result.
df_shape = df.shape
df_shape

(10, 9)


# Table Detection Results

Detected 1 table:


## Table 1: Full spreadsheet (fallback)
- Location: Rows 0-9, Columns 0-8
- Type: detail
- Entity: data
- Confidence: 0.50
