In [1]:

import pandas as pd
from pathlib import Path

excel_path = Path(r"/Users/cheickberthe/PycharmProjects/spreadsheet-analyzer/test_assets/collection/business-accounting/Business Accounting.xlsx")
df = pd.read_excel(excel_path, sheet_name=8)
print(f"Loaded sheet with shape: {df.shape}")

# Quick preview for LLM context
print("\nFirst 10 rows:")
print(df.head(10).to_string())
print("\nLast 5 rows:")
print(df.tail(5).to_string())

# Get basic info
sheet_name = "None"
sheet_dimensions = f"{df.shape[0]} rows x {df.shape[1]} columns"


Loaded sheet with shape: (18, 7)

First 10 rows:
        Date            Item   Amount   Note  Unnamed: 4  Total  12530.45
0 2022-02-22     Animal Feed   971.47    NaN         NaN    NaN       NaN
1 2022-02-22     Animal Feed    72.11    NaN         NaN    NaN       NaN
2 2022-02-08     Animal Feed   992.89    NaN         NaN    NaN       NaN
3 2022-03-03     Animal Feed   712.30    NaN         NaN    NaN       NaN
4 2022-03-09     Animal Feed   194.90    NaN         NaN    NaN       NaN
5 2022-03-07   Workshop Shed  2662.81    NaN         NaN    NaN       NaN
6 2022-02-18  Tractor Supply   317.01    NaN         NaN    NaN       NaN
7 2022-02-21  Tractor Supply   202.09    NaN         NaN    NaN       NaN
8 2022-03-05  Tractor Supply   386.61    NaN         NaN    NaN       NaN
9 2022-03-11       Livestock   342.65  Ducks         NaN    NaN       NaN

Last 5 rows:
         Date                           Item   Amount Note  Unnamed: 4  Total  12530.45
13 2022-02-23                 Tract

In [2]:

excel_file_name = excel_path.name
sheet_dimensions = f"{df.shape[0]} rows x {df.shape[1]} columns"
sheet_name


'None'

In [3]:

# Table Detection Analysis
print("=== DATA STRUCTURE ANALYSIS ===")
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\n=== FIRST 10 ROWS ===")
print(df.head(10).to_string())
print("\n=== LAST 5 ROWS ===")
print(df.tail(5).to_string())

print("\n=== COLUMN ANALYSIS ===")
# Check for empty columns that might separate tables
for i, col in enumerate(df.columns):
    non_null_count = df[col].notna().sum()
    print(f"Column {i} ('{col}'): {non_null_count}/{len(df)} non-null values")

print("\n=== EMPTY ROW ANALYSIS ===")
# Check for empty rows that might separate tables
empty_rows = df.isnull().all(axis=1)
if empty_rows.any():
    empty_row_indices = empty_rows[empty_rows].index.tolist()
    print(f"Empty rows found at indices: {empty_row_indices[:10]}")
else:
    print("No completely empty rows found")

print("\n=== SIDE-BY-SIDE ANALYSIS ===")
# Check for potential side-by-side tables by examining column groups
left_cols = df.iloc[:, 0:7]  # First 7 columns
right_cols = df.iloc[:, 7:]  # Remaining columns
print(f"Left section shape: {left_cols.shape}")
print(f"Right section shape: {right_cols.shape}")
print(f"Left section non-null density: {left_cols.notna().sum().sum() / (left_cols.shape[0] * left_cols.shape[1]):.2f}")
print(f"Right section non-null density: {right_cols.notna().sum().sum() / (right_cols.shape[0] * right_cols.shape[1]):.2f}")


=== DATA STRUCTURE ANALYSIS ===
DataFrame shape: (18, 7)
Columns: ['Date', 'Item', 'Amount', 'Note', 'Unnamed: 4', 'Total', 12530.449999999999]

=== FIRST 10 ROWS ===
        Date            Item   Amount   Note  Unnamed: 4  Total  12530.45
0 2022-02-22     Animal Feed   971.47    NaN         NaN    NaN       NaN
1 2022-02-22     Animal Feed    72.11    NaN         NaN    NaN       NaN
2 2022-02-08     Animal Feed   992.89    NaN         NaN    NaN       NaN
3 2022-03-03     Animal Feed   712.30    NaN         NaN    NaN       NaN
4 2022-03-09     Animal Feed   194.90    NaN         NaN    NaN       NaN
5 2022-03-07   Workshop Shed  2662.81    NaN         NaN    NaN       NaN
6 2022-02-18  Tractor Supply   317.01    NaN         NaN    NaN       NaN
7 2022-02-21  Tractor Supply   202.09    NaN         NaN    NaN       NaN
8 2022-03-05  Tractor Supply   386.61    NaN         NaN    NaN       NaN
9 2022-03-11       Livestock   342.65  Ducks         NaN    NaN       NaN

=== LAST 5 ROWS ==

  print(f"Right section non-null density: {right_cols.notna().sum().sum() / (right_cols.shape[0] * right_cols.shape[1]):.2f}")


In [4]:
# Load the first few rows and info to understand the structure
head = df.head()
info = df.info()
head, info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date                18 non-null     datetime64[ns]
 1   Item                18 non-null     object        
 2   Amount              18 non-null     float64       
 3   Note                2 non-null      object        
 4   Unnamed: 4          0 non-null      float64       
 5   Total               0 non-null      float64       
 6   12530.449999999999  0 non-null      float64       
dtypes: datetime64[ns](1), float64(4), object(2)
memory usage: 1.1+ KB


(        Date         Item  Amount Note  Unnamed: 4  Total  12530.45
 0 2022-02-22  Animal Feed  971.47  NaN         NaN    NaN       NaN
 1 2022-02-22  Animal Feed   72.11  NaN         NaN    NaN       NaN
 2 2022-02-08  Animal Feed  992.89  NaN         NaN    NaN       NaN
 3 2022-03-03  Animal Feed  712.30  NaN         NaN    NaN       NaN
 4 2022-03-09  Animal Feed  194.90  NaN         NaN    NaN       NaN,
 None)

In [5]:
# Check for empty rows and columns to identify potential table boundaries
empty_rows = df.isnull().all(axis=1)
empty_row_indices = empty_rows[empty_rows].index.tolist()
empty_cols = df.isnull().all(axis=0)
empty_col_indices = empty_cols[empty_cols].index.tolist()
empty_row_indices, empty_col_indices

([], ['Unnamed: 4', 'Total', 12530.449999999999])

In [6]:
# Based on the info, the main data table likely spans from row 0 to 4 and columns 0 to 5
# The last row seems to be a summary or total, so consider only up to row 4 for the main table
# The columns of interest are from 0 to 5
# Check for other potential tables by examining empty rows/columns
# Since no other clear boundaries are indicated, focus on this main table

# Define the main table boundaries
detected_tables = [
    {
        "table_id": "table_1",
        "description": "Expenses for Animal Feed in February and March 2022",
        "start_row": 0,
        "end_row": 4,
        "start_col": 0,
        "end_col": 5,
        "confidence": 0.95,
        "table_type": "DETAIL",
        "entity_type": "expenses"
    }
]

# Output the detected tables
detected_tables

[{'table_id': 'table_1',
  'description': 'Expenses for Animal Feed in February and March 2022',
  'start_row': 0,
  'end_row': 4,
  'start_col': 0,
  'end_col': 5,
  'confidence': 0.95,
  'table_type': 'DETAIL',
  'entity_type': 'expenses'}]

In [7]:

# Extract detection results created by LLM
# The detector should have created a 'detected_tables' variable with the results

# CLAUDE-TEST-WORKAROUND: Validate that detected_tables exists and is properly formatted
if 'detected_tables' in globals():
    # Validate it's a list
    if isinstance(detected_tables, list):
        detection_results = detected_tables
        print(f"✅ Found {len(detection_results)} tables from LLM detection")
        # Validate first table has required fields (if any tables exist)
        if detection_results:
            required_fields = ['table_id', 'description', 'start_row', 'end_row', 'start_col', 'end_col']
            first_table = detection_results[0]
            missing_fields = [f for f in required_fields if f not in first_table]
            if missing_fields:
                print(f"⚠️ Warning: First table missing fields: {missing_fields}")
    else:
        print(f"❌ Error: detected_tables is not a list, it's a {type(detected_tables)}")
        detection_results = []
else:
    # CLAUDE-GOTCHA: Gemini sometimes fails to create the variable even after multiple prompts
    print("❌ No 'detected_tables' variable found - LLM failed to complete detection")
    detection_results = []

detection_results


✅ Found 1 tables from LLM detection


[{'table_id': 'table_1',
  'description': 'Expenses for Animal Feed in February and March 2022',
  'start_row': 0,
  'end_row': 4,
  'start_col': 0,
  'end_col': 5,
  'confidence': 0.95,
  'table_type': 'DETAIL',
  'entity_type': 'expenses'}]


# Table Detection Results

Detected 1 tables:


## Table 1: Expenses for Animal Feed in February and March 2022
- Location: Rows 0-4, Columns 0-5
- Type: detail
- Entity: expenses
- Confidence: 0.95
