In [1]:

import pandas as pd
from pathlib import Path

excel_path = Path(r"/Users/cheickberthe/PycharmProjects/spreadsheet-analyzer/test_assets/collection/business-accounting/Business Accounting.xlsx")
df = pd.read_excel(excel_path, sheet_name=2)
print(f"Loaded sheet with shape: {df.shape}")

# Quick preview for LLM context
print("\nFirst 10 rows:")
print(df.head(10).to_string())
print("\nLast 5 rows:")
print(df.tail(5).to_string())

# Get basic info
sheet_name = "None"
sheet_dimensions = f"{df.shape[0]} rows x {df.shape[1]} columns"


Loaded sheet with shape: (121, 14)

First 10 rows:
                 Lender   Amount   Term    Rate  Payment Origination Date     Truck  Unnamed: 7 Unnamed: 8           Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12 Unnamed: 13
0                   BHG  89395.0  120.0  0.1824  1624.60       2023-04-18  Actros 1         NaN  Payment #                 Date      Amount    Interest   Principal     Balance
1  Fidelity 401(k) loan  36000.0   59.0  0.0925   350.63       2023-07-16  Actros 2         NaN          1  2023-05-20 00:00:00      1624.6         NaN         NaN         NaN
2                   NaN      NaN    NaN     NaN      NaN              NaT       NaN         NaN          2  2023-06-20 00:00:00      1624.6         NaN         NaN         NaN
3                   NaN      NaN    NaN     NaN      NaN              NaT       NaN         NaN          3  2023-07-20 00:00:00      1624.6         NaN         NaN         NaN
4                   NaN      NaN    NaN     NaN      NaN             

In [2]:

excel_file_name = excel_path.name
sheet_dimensions = f"{df.shape[0]} rows x {df.shape[1]} columns"
sheet_name


'None'

In [3]:

# Table Detection Analysis
print("=== DATA STRUCTURE ANALYSIS ===")
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\n=== FIRST 10 ROWS ===")
print(df.head(10).to_string())
print("\n=== LAST 5 ROWS ===")
print(df.tail(5).to_string())

print("\n=== COLUMN ANALYSIS ===")
# Check for empty columns that might separate tables
for i, col in enumerate(df.columns):
    non_null_count = df[col].notna().sum()
    print(f"Column {i} ('{col}'): {non_null_count}/{len(df)} non-null values")

print("\n=== EMPTY ROW ANALYSIS ===")
# Check for empty rows that might separate tables
empty_rows = df.isnull().all(axis=1)
if empty_rows.any():
    empty_row_indices = empty_rows[empty_rows].index.tolist()
    print(f"Empty rows found at indices: {empty_row_indices[:10]}")
else:
    print("No completely empty rows found")

print("\n=== SIDE-BY-SIDE ANALYSIS ===")
# Check for potential side-by-side tables by examining column groups
left_cols = df.iloc[:, 0:7]  # First 7 columns
right_cols = df.iloc[:, 7:]  # Remaining columns
print(f"Left section shape: {left_cols.shape}")
print(f"Right section shape: {right_cols.shape}")
print(f"Left section non-null density: {left_cols.notna().sum().sum() / (left_cols.shape[0] * left_cols.shape[1]):.2f}")
print(f"Right section non-null density: {right_cols.notna().sum().sum() / (right_cols.shape[0] * right_cols.shape[1]):.2f}")


=== DATA STRUCTURE ANALYSIS ===
DataFrame shape: (121, 14)
Columns: ['Lender', 'Amount', 'Term', 'Rate', 'Payment', 'Origination Date', 'Truck', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13']

=== FIRST 10 ROWS ===
                 Lender   Amount   Term    Rate  Payment Origination Date     Truck  Unnamed: 7 Unnamed: 8           Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12 Unnamed: 13
0                   BHG  89395.0  120.0  0.1824  1624.60       2023-04-18  Actros 1         NaN  Payment #                 Date      Amount    Interest   Principal     Balance
1  Fidelity 401(k) loan  36000.0   59.0  0.0925   350.63       2023-07-16  Actros 2         NaN          1  2023-05-20 00:00:00      1624.6         NaN         NaN         NaN
2                   NaN      NaN    NaN     NaN      NaN              NaT       NaN         NaN          2  2023-06-20 00:00:00      1624.6         NaN         NaN         NaN
3                   NaN   

In [4]:
detected_tables = []

# Based on the analysis, there are two main sections side-by-side:
# Left section (columns 0-6): Main loan data
# Right section (columns 7-13): Payment schedule details

# Detect the main loan data table
# It starts at row 0, includes header, and continues until the last non-empty row of the main data
# The header is likely at row 0, with data following
# The data seems to extend to row 120, with some non-null entries in the first 7 columns

# Main loan data table boundaries
main_loan_table = {
    "table_id": "table_1",
    "description": "Main loan data with lender and loan details",
    "start_row": 0,
    "end_row": 120,
    "start_col": 0,
    "end_col": 6,
    "confidence": 0.9,
    "table_type": "DETAIL",
    "entity_type": "loan_details"
}

# Payment schedule table boundaries
# It appears to be in columns 7-13, starting from row 0, with data from row 1 to 120
payment_schedule_table = {
    "table_id": "table_2",
    "description": "Payment schedule details",
    "start_row": 0,
    "end_row": 120,
    "start_col": 7,
    "end_col": 13,
    "confidence": 0.9,
    "table_type": "DETAIL",
    "entity_type": "payment_schedule"
}

detected_tables = [main_loan_table, payment_schedule_table]


In [5]:
# No additional tables detected, only the side-by-side main data and payment schedule
# Confirming the detection
print(detected_tables)

[{'table_id': 'table_1', 'description': 'Main loan data with lender and loan details', 'start_row': 0, 'end_row': 120, 'start_col': 0, 'end_col': 6, 'confidence': 0.9, 'table_type': 'DETAIL', 'entity_type': 'loan_details'}, {'table_id': 'table_2', 'description': 'Payment schedule details', 'start_row': 0, 'end_row': 120, 'start_col': 7, 'end_col': 13, 'confidence': 0.9, 'table_type': 'DETAIL', 'entity_type': 'payment_schedule'}]


In [6]:

# Extract detection results created by LLM
# The detector should have created a 'detected_tables' variable with the results

# CLAUDE-TEST-WORKAROUND: Validate that detected_tables exists and is properly formatted
if 'detected_tables' in globals():
    # Validate it's a list
    if isinstance(detected_tables, list):
        detection_results = detected_tables
        print(f"✅ Found {len(detection_results)} tables from LLM detection")
        # Validate first table has required fields (if any tables exist)
        if detection_results:
            required_fields = ['table_id', 'description', 'start_row', 'end_row', 'start_col', 'end_col']
            first_table = detection_results[0]
            missing_fields = [f for f in required_fields if f not in first_table]
            if missing_fields:
                print(f"⚠️ Warning: First table missing fields: {missing_fields}")
    else:
        print(f"❌ Error: detected_tables is not a list, it's a {type(detected_tables)}")
        detection_results = []
else:
    # CLAUDE-GOTCHA: Gemini sometimes fails to create the variable even after multiple prompts
    print("❌ No 'detected_tables' variable found - LLM failed to complete detection")
    detection_results = []

detection_results


✅ Found 2 tables from LLM detection


[{'table_id': 'table_1',
  'description': 'Main loan data with lender and loan details',
  'start_row': 0,
  'end_row': 120,
  'start_col': 0,
  'end_col': 6,
  'confidence': 0.9,
  'table_type': 'DETAIL',
  'entity_type': 'loan_details'},
 {'table_id': 'table_2',
  'description': 'Payment schedule details',
  'start_row': 0,
  'end_row': 120,
  'start_col': 7,
  'end_col': 13,
  'confidence': 0.9,
  'table_type': 'DETAIL',
  'entity_type': 'payment_schedule'}]


# Table Detection Results

Detected 2 tables:


## Table 1: Main loan data with lender and loan details
- Location: Rows 0-120, Columns 0-6
- Type: detail
- Entity: loan_details
- Confidence: 0.90

## Table 2: Payment schedule details
- Location: Rows 0-120, Columns 7-13
- Type: detail
- Entity: payment_schedule
- Confidence: 0.90
