In [1]:

import pandas as pd
from pathlib import Path

excel_path = Path(r"/Users/cheickberthe/PycharmProjects/spreadsheet-analyzer/test_assets/collection/business-accounting/Business Accounting.xlsx")
df = pd.read_excel(excel_path, sheet_name=1)
print(f"Loaded sheet with shape: {df.shape}")

# Quick preview for LLM context
print("\nFirst 10 rows:")
print(df.head(10).to_string())
print("\nLast 5 rows:")
print(df.tail(5).to_string())

# Get basic info
sheet_name = "None"
sheet_dimensions = f"{df.shape[0]} rows x {df.shape[1]} columns"


Loaded sheet with shape: (769, 11)

First 10 rows:
        Date                           Description  Transaction type   USD Amount     XOF Amount   Unnamed: 5              Category   TOTAL REVENUES   84202.53   TOTAL EXPENSES  -385526.333434
0 2023-01-03                        iCloud Storage               EXP        -9.99   -6214.329474   622.055002                   NaN              NaN        NaN              NaN             NaN
1 2023-01-03                   Amazon web services               EXP      -284.83 -177179.926325   622.055002                   NaN              NaN        NaN              NaN             NaN
2 2023-01-03                Diesel for return trip               EXP      -526.48    -327,500CFA          NaN                  Fuel              NaN        NaN              NaN             NaN
3 2023-01-03                         Road expenses               EXP      -120.57     -75,000CFA          NaN             Toll Fees              NaN        NaN              NaN 

In [2]:

excel_file_name = excel_path.name
sheet_dimensions = f"{df.shape[0]} rows x {df.shape[1]} columns"
sheet_name


'None'

In [3]:

# Table Detection Analysis
print("=== DATA STRUCTURE ANALYSIS ===")
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\n=== FIRST 10 ROWS ===")
print(df.head(10).to_string())
print("\n=== LAST 5 ROWS ===")
print(df.tail(5).to_string())

print("\n=== COLUMN ANALYSIS ===")
# Check for empty columns that might separate tables
for i, col in enumerate(df.columns):
    non_null_count = df[col].notna().sum()
    print(f"Column {i} ('{col}'): {non_null_count}/{len(df)} non-null values")

print("\n=== EMPTY ROW ANALYSIS ===")
# Check for empty rows that might separate tables
empty_rows = df.isnull().all(axis=1)
if empty_rows.any():
    empty_row_indices = empty_rows[empty_rows].index.tolist()
    print(f"Empty rows found at indices: {empty_row_indices[:10]}")
else:
    print("No completely empty rows found")

print("\n=== SIDE-BY-SIDE ANALYSIS ===")
# Check for potential side-by-side tables by examining column groups
left_cols = df.iloc[:, 0:7]  # First 7 columns
right_cols = df.iloc[:, 7:]  # Remaining columns
print(f"Left section shape: {left_cols.shape}")
print(f"Right section shape: {right_cols.shape}")
print(f"Left section non-null density: {left_cols.notna().sum().sum() / (left_cols.shape[0] * left_cols.shape[1]):.2f}")
print(f"Right section non-null density: {right_cols.notna().sum().sum() / (right_cols.shape[0] * right_cols.shape[1]):.2f}")


=== DATA STRUCTURE ANALYSIS ===
DataFrame shape: (769, 11)
Columns: ['Date', 'Description', 'Transaction type', 'USD Amount', 'XOF Amount', 'Unnamed: 5', 'Category', 'TOTAL REVENUES', 84202.53000000001, 'TOTAL EXPENSES', -385526.33343393693]

=== FIRST 10 ROWS ===
        Date                           Description  Transaction type   USD Amount     XOF Amount   Unnamed: 5              Category   TOTAL REVENUES   84202.53   TOTAL EXPENSES  -385526.333434
0 2023-01-03                        iCloud Storage               EXP        -9.99   -6214.329474   622.055002                   NaN              NaN        NaN              NaN             NaN
1 2023-01-03                   Amazon web services               EXP      -284.83 -177179.926325   622.055002                   NaN              NaN        NaN              NaN             NaN
2 2023-01-03                Diesel for return trip               EXP      -526.48    -327,500CFA          NaN                  Fuel              NaN        

In [4]:
# Based on the analysis, the data appears to have one main table spanning all rows and columns 0-6
# Columns 7-10 are completely empty and do not form a table

# Define the detected table boundaries and details

detected_tables = [
    {
        "table_id": "table_1",
        "description": "Detailed transaction records including date, description, transaction type, amounts, and categories",
        "start_row": 0,
        "end_row": 768,
        "start_col": 0,
        "end_col": 6,
        "confidence": 0.95,
        "table_type": "DETAIL",
        "entity_type": "transactions"
    }
]

print(f"Detected {len(detected_tables)} tables")
for i, table in enumerate(detected_tables, 1):
    print(f"Table {i}: {table['description']} at rows {table['start_row']}-{table['end_row']}, cols {table['start_col']}-{table['end_col']}")

Detected 1 tables
Table 1: Detailed transaction records including date, description, transaction type, amounts, and categories at rows 0-768, cols 0-6


In [5]:

# Extract detection results created by LLM
# The detector should have created a 'detected_tables' variable with the results

# Check if the LLM created the expected variable
if 'detected_tables' in globals():
    # Return the detected tables
    detection_results = detected_tables
    print(f"Found {len(detection_results)} tables from LLM detection")
else:
    # If no detection was made, return empty list to trigger fallback
    print("No 'detected_tables' variable found - LLM may not have completed detection")
    detection_results = []

detection_results


Found 1 tables from LLM detection


[{'table_id': 'table_1',
  'description': 'Detailed transaction records including date, description, transaction type, amounts, and categories',
  'start_row': 0,
  'end_row': 768,
  'start_col': 0,
  'end_col': 6,
  'confidence': 0.95,
  'table_type': 'DETAIL',
  'entity_type': 'transactions'}]


# Table Detection Results

Detected 1 tables:


## Table 1: Detailed transaction records including date, description, transaction type, amounts, and categories
- Location: Rows 0-768, Columns 0-6
- Type: detail
- Entity: transactions
- Confidence: 0.95
