In [1]:

import pandas as pd
from pathlib import Path

# Location of the workbook under analysis (absolute path on the authoring machine).
excel_path = Path(r"/Users/cheickberthe/PycharmProjects/spreadsheet-analyzer/test_assets/collection/business-accounting/Business Accounting.xlsx")

# sheet_name=0 selects the first worksheet by position.
df = pd.read_excel(excel_path, sheet_name=0)
print(f"Loaded sheet with shape: {df.shape}")

# Quick preview for LLM context: top and bottom of the sheet as plain text.
head_preview = df.head(10).to_string()
tail_preview = df.tail(5).to_string()
print("\nFirst 10 rows:")
print(head_preview)
print("\nLast 5 rows:")
print(tail_preview)

# Basic sheet metadata.
# Note: sheet_name is the literal string "None", not the None object.
sheet_name = "None"
row_count, column_count = df.shape
sheet_dimensions = f"{row_count} rows x {column_count} columns"


Loaded sheet with shape: (10, 9)

First 10 rows:
        Date                                   Description   USD Amount  Transaction type   Category   TOTAL REVENUES   0   TOTAL EXPENSES  -2031.63
0 2025-01-03                  Mali Apartment work - Cement       -43.51               EXP        NaN              NaN NaN              NaN       NaN
1 2025-01-15            Rebtel International Phone charges        -9.49               EXP        NaN              NaN NaN              NaN       NaN
2 2025-01-21                  Mali Apartment work - Cement        -8.57               EXP        NaN              NaN NaN              NaN       NaN
3 2025-01-22  Mali Apartment Work - Tile Install (Kitchen)       -48.78               EXP        NaN              NaN NaN              NaN       NaN
4 2025-01-23              Aluminum fabricator - Issa Maiga      -424.34               EXP        NaN              NaN NaN              NaN       NaN
5 2025-01-23  Mali Apartment work - Kitchen Wall Crepisag

In [2]:

# Keep only the final path component (file name with extension) of the workbook path.
excel_file_name = excel_path.name

# Human-readable size of the loaded sheet.
n_rows, n_cols = df.shape
sheet_dimensions = f"{n_rows} rows x {n_cols} columns"

# Last expression of the cell: echoes the sheet name as the cell output.
sheet_name


'None'

In [3]:

# Table Detection Analysis
# Prints the structural facts used to spot table boundaries in `df`:
# shape/columns, head/tail previews, per-column fill counts, fully empty
# rows, and the fill density on either side of a fixed column split.


def _non_null_density(section):
    """Return the fraction of non-null cells in ``section``.

    Args:
        section: DataFrame slice to measure.

    Returns:
        Non-null cell count divided by the total cell count, or NaN when the
        section has no cells (zero rows or zero columns), so the ratio is
        never computed with a zero denominator.
    """
    if section.size == 0:
        return float("nan")
    return section.notna().sum().sum() / section.size


# Column position where the sheet is split into a left and a right section.
SIDE_BY_SIDE_SPLIT_COL = 7

print("=== DATA STRUCTURE ANALYSIS ===")
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\n=== FIRST 10 ROWS ===")
print(df.head(10).to_string())
print("\n=== LAST 5 ROWS ===")
print(df.tail(5).to_string())

print("\n=== COLUMN ANALYSIS ===")
# Check for empty columns that might separate tables
for i, col in enumerate(df.columns):
    non_null_count = df[col].notna().sum()
    print(f"Column {i} ('{col}'): {non_null_count}/{len(df)} non-null values")

print("\n=== EMPTY ROW ANALYSIS ===")
# Check for empty rows that might separate tables
empty_rows = df.isnull().all(axis=1)
if empty_rows.any():
    empty_row_indices = empty_rows[empty_rows].index.tolist()
    # Only the first 10 indices are shown to keep the output short.
    print(f"Empty rows found at indices: {empty_row_indices[:10]}")
else:
    print("No completely empty rows found")

print("\n=== SIDE-BY-SIDE ANALYSIS ===")
# Check for potential side-by-side tables by examining column groups
left_cols = df.iloc[:, :SIDE_BY_SIDE_SPLIT_COL]  # First 7 columns
right_cols = df.iloc[:, SIDE_BY_SIDE_SPLIT_COL:]  # Remaining columns
print(f"Left section shape: {left_cols.shape}")
print(f"Right section shape: {right_cols.shape}")
print(f"Left section non-null density: {_non_null_density(left_cols):.2f}")
print(f"Right section non-null density: {_non_null_density(right_cols):.2f}")


=== DATA STRUCTURE ANALYSIS ===
DataFrame shape: (10, 9)
Columns: ['Date', 'Description', 'USD Amount', 'Transaction type', 'Category', 'TOTAL REVENUES', 0, 'TOTAL EXPENSES', -2031.6299999999999]

=== FIRST 10 ROWS ===
        Date                                   Description   USD Amount  Transaction type   Category   TOTAL REVENUES   0   TOTAL EXPENSES  -2031.63
0 2025-01-03                  Mali Apartment work - Cement       -43.51               EXP        NaN              NaN NaN              NaN       NaN
1 2025-01-15            Rebtel International Phone charges        -9.49               EXP        NaN              NaN NaN              NaN       NaN
2 2025-01-21                  Mali Apartment work - Cement        -8.57               EXP        NaN              NaN NaN              NaN       NaN
3 2025-01-22  Mali Apartment Work - Tile Install (Kitchen)       -48.78               EXP        NaN              NaN NaN              NaN       NaN
4 2025-01-23              Aluminum f

In [4]:
# Further analyze the data to detect table boundaries
import pandas as pd
import numpy as np

# Inspect dtypes first: they show which columns pandas could parse as data.
print("Column data types:")
print(df.dtypes)

# Compare the same five rows on each side of the suspected boundary.
first_rows = df.iloc[:5]
print("\nFirst 5 rows of columns 0-4:")
print(first_rows.iloc[:, :5])

print("\nFirst 5 rows of columns 5-8:")
print(first_rows.iloc[:, 5:])

# The labels at positions 7 and 8 look like a "name, value" pair that was
# read as column headers; show them together.
summary_label = df.columns[7]
summary_value = df.columns[8]
print("\nValue in TOTAL EXPENSES column header:")
print(summary_label, "=", summary_value)

# Distinct transaction types present in the detail rows.
transaction_types = df['Transaction type'].unique()
print("\nUnique transaction types:")
print(transaction_types)

# Sum the amount column so it can be compared by eye with the header value.
total_expenses = df['USD Amount'].sum()
print(f"\nSum of USD Amount column: {total_expenses}")
print(f"Value in TOTAL EXPENSES column: {summary_value}")

Column data types:
 Date                datetime64[ns]
 Description                 object
 USD Amount                 float64
 Transaction type            object
 Category                   float64
 TOTAL REVENUES             float64
 0                          float64
 TOTAL EXPENSES             float64
-2031.63                    float64
dtype: object

First 5 rows of columns 0-4:
        Date                                   Description  USD Amount  \
0 2025-01-03                  Mali Apartment work - Cement      -43.51   
1 2025-01-15            Rebtel International Phone charges       -9.49   
2 2025-01-21                  Mali Apartment work - Cement       -8.57   
3 2025-01-22  Mali Apartment Work - Tile Install (Kitchen)      -48.78   
4 2025-01-23              Aluminum fabricator - Issa Maiga     -424.34   

  Transaction type  Category  
0              EXP       NaN  
1              EXP       NaN  
2              EXP       NaN  
3              EXP       NaN  
4            

In [5]:
# Look for visual patterns in the data
# Print out a simplified representation of non-null vs null values

def get_data_pattern(df):
    """Create a visual representation of data patterns with X for data and . for NaN"""
    pattern = ""
    for i in range(min(10, len(df))):
        row = ""
        for j in range(len(df.columns)):
            if pd.isna(df.iloc[i, j]):
                row += "."
            else:
                row += "X"
        pattern += row + "\n"
    return pattern

# Show which cells are filled: one text line per row of the frame.
print("Data pattern (X=value, .=NaN):")
print(get_data_pattern(df))

# List the column labels on each side of the suspected boundary
# (positions 0-4 versus positions 5 onward).
left_labels = df.columns[:5].tolist()
right_labels = df.columns[5:].tolist()
print("\nLeft side columns:")
print(left_labels)
print("\nRight side columns:")
print(right_labels)

# Compare the summed amounts with the value that was read as the label of
# the column at position 8.
expense_sum = df["USD Amount"].sum()
header_value = df.columns[8]
print(f"\nSum of all expenses: {expense_sum}")
print(f"Value in column header: {header_value}")

# One line per column: its position followed by its label.
print("\nTable layout visualization:")
for position, label in enumerate(df.columns):
    print(f"Col {position}: {label}")

Data pattern (X=value, .=NaN):
XXXX.....
XXXX.....
XXXX.....
XXXX.....
XXXX.....
XXXX.....
XXXX.....
XXXX.....
XXXX.....
XXXX.....


Left side columns:
['Date', 'Description', 'USD Amount', 'Transaction type', 'Category']

Right side columns:
['TOTAL REVENUES', 0, 'TOTAL EXPENSES', -2031.6299999999999]

Sum of all expenses: -2031.63
Value in column header: -2031.6299999999999

Table layout visualization:
Col 0: Date
Col 1: Description
Col 2: USD Amount
Col 3: Transaction type
Col 4: Category
Col 5: TOTAL REVENUES
Col 6: 0
Col 7: TOTAL EXPENSES
Col 8: -2031.6299999999999


In [6]:
# Create the detected_tables list with our findings.
# Each entry describes one table region; row/column bounds are zero-based
# positions and inclusive at both ends (rows 0-9 hold the 10 detail records).
detected_tables = [
    {
        "table_id": "table_1",
        "description": "Expense transactions with 10 records",
        "start_row": 0,
        "end_row": 9,
        "start_col": 0,
        "end_col": 4,
        "confidence": 0.95,
        "table_type": "DETAIL",
        "entity_type": "expenses"
    },
    {
        "table_id": "table_2",
        "description": "Expense summary totals",
        # NOTE(review): the summary labels/values were read as column labels
        # (df.columns[5:9]); DataFrame row 0 in columns 5-8 is all NaN.
        # Rows 0-0 therefore do not address the values in DataFrame
        # coordinates - confirm which convention the consumer expects.
        "start_row": 0,
        "end_row": 0,  # Only the header row contains data
        "start_col": 5,
        "end_col": 8,
        "confidence": 0.85,
        "table_type": "SUMMARY",
        "entity_type": "summary"
    }
]

# Print out the detected tables
print(f"Detected {len(detected_tables)} tables")
for i, table in enumerate(detected_tables, 1):
    print(f"Table {i}: {table['description']} at rows {table['start_row']}-{table['end_row']}, cols {table['start_col']}-{table['end_col']}")

# Additional verification - look at the content of each detected table.
# The slices are derived from the recorded bounds (inclusive, hence +1) so
# the preview always shows the region that was actually reported.
detail_table, summary_table = detected_tables
detail_rows = slice(detail_table["start_row"], detail_table["end_row"] + 1)
detail_cols = slice(detail_table["start_col"], detail_table["end_col"] + 1)
print("\nTable 1 (Main expense details):")
print(df.iloc[detail_rows, detail_cols].head(3))

# The summary content sits in the column labels, so show the labels
# themselves rather than any cell values.
summary_cols = slice(summary_table["start_col"], summary_table["end_col"] + 1)
print("\nTable 2 (Summary totals - headers only):")
print(df.columns[summary_cols].tolist())

Detected 2 tables
Table 1: Expense transactions with 10 records at rows 0-9, cols 0-4
Table 2: Expense summary totals at rows 0-0, cols 5-8

Table 1 (Main expense details):
        Date                         Description  USD Amount Transaction type  \
0 2025-01-03        Mali Apartment work - Cement      -43.51              EXP   
1 2025-01-15  Rebtel International Phone charges       -9.49              EXP   
2 2025-01-21        Mali Apartment work - Cement       -8.57              EXP   

   Category  
0       NaN  
1       NaN  
2       NaN  

Table 2 (Summary totals - headers only):
['TOTAL REVENUES', 0, 'TOTAL EXPENSES', -2031.6299999999999]


In [7]:

# Collect the detection output for the caller.
# An earlier cell is expected to have defined a module-level
# 'detected_tables' variable holding the results.

if 'detected_tables' not in globals():
    # Nothing was produced: report it and hand back an empty list so the
    # fallback path is triggered.
    print("No 'detected_tables' variable found - LLM may not have completed detection")
    detection_results = []
else:
    # Pass the detected tables through unchanged.
    detection_results = detected_tables
    print(f"Found {len(detection_results)} tables from LLM detection")

# Last expression of the cell: displays the results as the cell output.
detection_results


Found 2 tables from LLM detection


[{'table_id': 'table_1',
  'description': 'Expense transactions with 10 records',
  'start_row': 0,
  'end_row': 9,
  'start_col': 0,
  'end_col': 4,
  'confidence': 0.95,
  'table_type': 'DETAIL',
  'entity_type': 'expenses'},
 {'table_id': 'table_2',
  'description': 'Expense summary totals',
  'start_row': 0,
  'end_row': 0,
  'start_col': 5,
  'end_col': 8,
  'confidence': 0.85,
  'table_type': 'SUMMARY',
  'entity_type': 'summary'}]


# Table Detection Results

Detected 2 tables:


## Table 1: Expense transactions with 10 records
- Location: Rows 0-9, Columns 0-4
- Type: detail
- Entity: expenses
- Confidence: 0.95

## Table 2: Expense summary totals
- Location: Rows 0-0, Columns 5-8
- Type: summary
- Entity: summary
- Confidence: 0.85
