In [1]:

import pandas as pd
from pathlib import Path

excel_path = Path(r"/Users/cheickberthe/PycharmProjects/spreadsheet-analyzer/test_assets/collection/business-accounting/Business Accounting.xlsx")
df = pd.read_excel(excel_path, sheet_name=5)
print(f"Loaded sheet with shape: {df.shape}")

# Quick preview for LLM context
print("\nFirst 10 rows:")
print(df.head(10).to_string())
print("\nLast 5 rows:")
print(df.tail(5).to_string())

# Get basic info
sheet_name = "None"
sheet_dimensions = f"{df.shape[0]} rows x {df.shape[1]} columns"


Loaded sheet with shape: (47, 5)

First 10 rows:
                  Date                                       Item Amount (USD) Amount (CFA) Unnamed: 4
0  2023-01-29 00:00:00                              Western Union    82.817929        50000        NaN
1  2023-02-16 00:00:00                              Western Union       400.58       210000        NaN
2  2023-04-27 00:00:00                               Sent to Papa     17096.69     10000000        NaN
3  2023-05-30 00:00:00                   Sent to Yaya via Atchima       3414.2      2058000        NaN
4  2023-06-02 00:00:00          Sent to Yaya via XE/Ria in Guinea       518.46       304168        NaN
5  2023-06-06 00:00:00  Western Union to Abdoulaye Bamba for Yaya        48.41        28615        NaN
6  2023-06-13 00:00:00                   Sent to Yaya via Atchima         5000      2994000        NaN
7  2023-06-14 00:00:00                 Sent to Yaya via MoneyGram      1488.69       874000        NaN
8  2023-06-21 00:00:00  

In [2]:

excel_file_name = excel_path.name
sheet_dimensions = f"{df.shape[0]} rows x {df.shape[1]} columns"
sheet_name


'None'

In [3]:

# Table Detection Analysis
print("=== DATA STRUCTURE ANALYSIS ===")
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\n=== FIRST 10 ROWS ===")
print(df.head(10).to_string())
print("\n=== LAST 5 ROWS ===")
print(df.tail(5).to_string())

print("\n=== COLUMN ANALYSIS ===")
# Check for empty columns that might separate tables
for i, col in enumerate(df.columns):
    non_null_count = df[col].notna().sum()
    print(f"Column {i} ('{col}'): {non_null_count}/{len(df)} non-null values")

print("\n=== EMPTY ROW ANALYSIS ===")
# Check for empty rows that might separate tables
empty_rows = df.isnull().all(axis=1)
if empty_rows.any():
    empty_row_indices = empty_rows[empty_rows].index.tolist()
    print(f"Empty rows found at indices: {empty_row_indices[:10]}")
else:
    print("No completely empty rows found")

print("\n=== SIDE-BY-SIDE ANALYSIS ===")
# Check for potential side-by-side tables by examining column groups
left_cols = df.iloc[:, 0:7]  # First 7 columns
right_cols = df.iloc[:, 7:]  # Remaining columns
print(f"Left section shape: {left_cols.shape}")
print(f"Right section shape: {right_cols.shape}")
print(f"Left section non-null density: {left_cols.notna().sum().sum() / (left_cols.shape[0] * left_cols.shape[1]):.2f}")
print(f"Right section non-null density: {right_cols.notna().sum().sum() / (right_cols.shape[0] * right_cols.shape[1]):.2f}")


=== DATA STRUCTURE ANALYSIS ===
DataFrame shape: (47, 5)
Columns: ['Date', 'Item', 'Amount (USD)', 'Amount (CFA)', 'Unnamed: 4']

=== FIRST 10 ROWS ===
                  Date                                       Item Amount (USD) Amount (CFA) Unnamed: 4
0  2023-01-29 00:00:00                              Western Union    82.817929        50000        NaN
1  2023-02-16 00:00:00                              Western Union       400.58       210000        NaN
2  2023-04-27 00:00:00                               Sent to Papa     17096.69     10000000        NaN
3  2023-05-30 00:00:00                   Sent to Yaya via Atchima       3414.2      2058000        NaN
4  2023-06-02 00:00:00          Sent to Yaya via XE/Ria in Guinea       518.46       304168        NaN
5  2023-06-06 00:00:00  Western Union to Abdoulaye Bamba for Yaya        48.41        28615        NaN
6  2023-06-13 00:00:00                   Sent to Yaya via Atchima         5000      2994000        NaN
7  2023-06-14 00:00:00  

  print(f"Right section non-null density: {right_cols.notna().sum().sum() / (right_cols.shape[0] * right_cols.shape[1]):.2f}")


In [4]:
# Since the analysis results indicate a RuntimeWarning related to density calculation, it suggests that the data has some non-uniformity or missing values.
# Let's examine the structure of the DataFrame to identify potential tables.

# Check the shape, info, and a sample of the data
info = df.info()
head = df.head()
shape = df.shape

# Check for empty columns and rows to identify potential boundaries
empty_rows = df.isnull().all(axis=1)
empty_row_indices = empty_rows[empty_rows].index.tolist()
empty_cols = df.isnull().all(axis=0)
empty_col_indices = empty_cols[empty_cols].index.tolist()

# Check for column patterns to identify different entity types or tables
column_patterns = []
for col in df.columns:
    unique_vals = df[col].dropna().unique()
    column_patterns.append((col, len(unique_vals)))

# Based on the above, identify potential table boundaries
# For simplicity, assume tables are separated by empty rows or columns
# and look for clusters of non-empty data

# Find start and end rows of tables based on non-empty data blocks
import numpy as np
non_empty_rows = ~empty_rows
row_groups = []
start_idx = None
for idx, val in enumerate(non_empty_rows):
    if val and start_idx is None:
        start_idx = idx
    elif not val and start_idx is not None:
        row_groups.append((start_idx, idx - 1))
        start_idx = None
if start_idx is not None:
    row_groups.append((start_idx, len(non_empty_rows) - 1))

# Find start and end columns of tables based on non-empty columns
non_empty_cols = ~empty_cols
col_groups = []
start_col_idx = None
for idx, val in enumerate(non_empty_cols):
    if val and start_col_idx is None:
        start_col_idx = idx
    elif not val and start_col_idx is not None:
        col_groups.append((start_col_idx, idx - 1))
        start_col_idx = None
if start_col_idx is not None:
    col_groups.append((start_col_idx, len(non_empty_cols) - 1))

# Now, create table boundaries based on these groups
# For each row group and column group, define a table
# Filter out groups that are too small (less than 3 rows or columns)
detected_tables = []
for r_start, r_end in row_groups:
    for c_start, c_end in col_groups:
        if (r_end - r_start + 1) >= 3 and (c_end - c_start + 1) >= 3:
            detected_tables.append({
                "start_row": r_start,
                "end_row": r_end,
                "start_col": c_start,
                "end_col": c_end,
                "description": "Detected table based on non-empty data blocks",
                "entity_type": "business data",
                "confidence": 0.8,
                "table_type": "DETAIL"
            })

# Output the detected tables
detected_tables


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47 entries, 0 to 46
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Date          27 non-null     object
 1   Item          28 non-null     object
 2   Amount (USD)  27 non-null     object
 3   Amount (CFA)  28 non-null     object
 4   Unnamed: 4    9 non-null      object
dtypes: object(5)
memory usage: 2.0+ KB


[{'start_row': 0,
  'end_row': 18,
  'start_col': 0,
  'end_col': 4,
  'description': 'Detected table based on non-empty data blocks',
  'entity_type': 'business data',
  'confidence': 0.8,
  'table_type': 'DETAIL'},
 {'start_row': 38,
  'end_row': 46,
  'start_col': 0,
  'end_col': 4,
  'description': 'Detected table based on non-empty data blocks',
  'entity_type': 'business data',
  'confidence': 0.8,
  'table_type': 'DETAIL'}]

In [5]:

# Extract detection results created by LLM
# The detector should have created a 'detected_tables' variable with the results

# CLAUDE-TEST-WORKAROUND: Validate that detected_tables exists and is properly formatted
if 'detected_tables' in globals():
    # Validate it's a list
    if isinstance(detected_tables, list):
        detection_results = detected_tables
        print(f"✅ Found {len(detection_results)} tables from LLM detection")
        # Validate first table has required fields (if any tables exist)
        if detection_results:
            required_fields = ['table_id', 'description', 'start_row', 'end_row', 'start_col', 'end_col']
            first_table = detection_results[0]
            missing_fields = [f for f in required_fields if f not in first_table]
            if missing_fields:
                print(f"⚠️ Warning: First table missing fields: {missing_fields}")
    else:
        print(f"❌ Error: detected_tables is not a list, it's a {type(detected_tables)}")
        detection_results = []
else:
    # CLAUDE-GOTCHA: Gemini sometimes fails to create the variable even after multiple prompts
    print("❌ No 'detected_tables' variable found - LLM failed to complete detection")
    detection_results = []

detection_results


✅ Found 2 tables from LLM detection


[{'start_row': 0,
  'end_row': 18,
  'start_col': 0,
  'end_col': 4,
  'description': 'Detected table based on non-empty data blocks',
  'entity_type': 'business data',
  'confidence': 0.8,
  'table_type': 'DETAIL'},
 {'start_row': 38,
  'end_row': 46,
  'start_col': 0,
  'end_col': 4,
  'description': 'Detected table based on non-empty data blocks',
  'entity_type': 'business data',
  'confidence': 0.8,
  'table_type': 'DETAIL'}]


# Table Detection Results

Detected 2 tables:


## Table 1: Detected table based on non-empty data blocks
- Location: Rows 0-18, Columns 0-4
- Type: detail
- Entity: business data
- Confidence: 0.80

## Table 2: Detected table based on non-empty data blocks
- Location: Rows 38-46, Columns 0-4
- Type: detail
- Entity: business data
- Confidence: 0.80
