# Palm Case Study - ResortChain Financial Analysis



## Overview



## Data Loading



**Code Cell Description:**

This code cell loads all required libraries and datasets, configures plotting defaults, and performs initial data validation.


In [None]:
# CELL 1: SETUP & DATA LOADING
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import os

# Display options: show every column and cap printed frames at 100 rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Plotly defaults applied to plotly.express figures
px.defaults.template = 'plotly_white'
px.defaults.width = 1100
px.defaults.height = 500

# Configure dataset paths.
# The default is the original absolute location, which only exists on the
# author's machine. Set the PALM_DATA_DIR environment variable to point the
# notebook at another copy of the raw datasets without editing this cell.
_DEFAULT_DATA_DIR = "/Users/gianniskotsas/Documents/Side Projects/palm-case-study/scripts/datasets/raw"
base_dir = Path(os.environ.get("PALM_DATA_DIR", _DEFAULT_DATA_DIR)).expanduser()
transactions_path = base_dir / "transactions.csv"
balances_path = base_dir / "balances.csv"
system_forecasts_path = base_dir / "system_forecasts.csv"
user_forecasts_path = base_dir / "user_forecasts.csv"

# Helper: choose main date column name from available candidates
# IMPORTANT: value_date is prioritized over booking_date
DATE_CANDIDATES = [
    'value_date', 'booking_date', 'date', 'transaction_date', 'transactionDate'
]

def pick_date_column(df: pd.DataFrame) -> str | None:
    for c in DATE_CANDIDATES:
        if c in df.columns:
            return c
    # try to detect first column with datetime dtype
    for c in df.columns:
        if pd.api.types.is_datetime64_any_dtype(df[c]):
            return c
    # try to parse any column ending with 'date'
    for c in df.columns:
        if 'date' in c.lower():
            try:
                _ = pd.to_datetime(df[c], errors='raise')
                return c
            except Exception:
                continue
    return None

# Load Transactions: support CSV (expected) or Excel fallback
transactions = None
if transactions_path.suffix.lower() in {'.xlsx', '.xls'}:
    # Excel workbook: prefer a conventionally named sheet, otherwise take the
    # first one. The context manager closes the workbook handle after reading.
    with pd.ExcelFile(transactions_path) as xl:
        tx_sheet = next(
            (s for s in xl.sheet_names if s.lower() in {"transactions", "tx", "data"}),
            xl.sheet_names[0],
        )
        transactions = pd.read_excel(xl, sheet_name=tx_sheet)
else:
    # CSV (semicolon sep, comma decimal)
    # Read only the header first so that only date columns that really exist are parsed
    parse_cols = ['value_date', 'booking_date']
    header_columns = pd.read_csv(transactions_path, sep=';', nrows=0).columns
    parse_present = [c for c in parse_cols if c in header_columns]
    transactions = pd.read_csv(
        transactions_path,
        sep=';',
        decimal=',',
        # Keep identifiers and free-text fields as strings (no numeric coercion)
        dtype={
            'account_number': 'string',
            'currency': 'string',
            'credit_or_debit': 'string',
            'additional_info': 'string',
            'remittence_info': 'string',
            'bank_reference': 'string',
            'creditor_name': 'string',
            'debtor_name': 'string',
        },
        parse_dates=parse_present or None,
        dayfirst=True,
        # `infer_datetime_format` was dropped here: it is deprecated in
        # pandas 2.x, where strict format inference is the default behaviour.
        engine='python'
    )

# Optional datasets share the same CSV dialect: semicolon separator, comma decimal
_csv_dialect = {'sep': ';', 'decimal': ','}

# Balances: try to parse balance_date while reading; if that read fails,
# re-read without date parsing and coerce the column afterwards (bad values -> NaT)
balances = None
if balances_path.exists():
    try:
        balances = pd.read_csv(balances_path, parse_dates=['balance_date'], **_csv_dialect)
    except Exception:
        balances = pd.read_csv(balances_path, **_csv_dialect)
        if 'balance_date' in balances.columns:
            balances['balance_date'] = pd.to_datetime(balances['balance_date'], errors='coerce')

# Forecast files are loaded as-is; each stays None when its file is absent
system_forecasts = (
    pd.read_csv(system_forecasts_path, **_csv_dialect)
    if system_forecasts_path.exists()
    else None
)

user_forecasts = (
    pd.read_csv(user_forecasts_path, **_csv_dialect)
    if user_forecasts_path.exists()
    else None
)

# Resolve the main transaction date column and make sure it is datetime typed
if transactions is None:
    trx_date_col = None
else:
    trx_date_col = pick_date_column(transactions)
    if trx_date_col is not None:
        transactions[trx_date_col] = pd.to_datetime(transactions[trx_date_col], errors='coerce')

# Loading report: collect the lines first, then emit them in a single print
_rule = "=" * 60
_summary_lines = [_rule, "DATA LOADING SUMMARY", _rule]

_summary_lines.append(f"Transactions loaded: {transactions is not None}")
if transactions is not None:
    _summary_lines.append(f"  Shape: {transactions.shape}")
    _summary_lines.append(f"  📅 DATE COLUMN USED: {trx_date_col}")

_summary_lines.append(f"\nBalances loaded: {balances is not None}")
if balances is not None:
    _summary_lines.append(f"  Shape: {balances.shape}")

_summary_lines.append(f"\nSystem forecasts loaded: {system_forecasts is not None}")
if system_forecasts is not None:
    _summary_lines.append(f"  Shape: {system_forecasts.shape}")

_summary_lines.append(f"\nUser forecasts loaded: {user_forecasts is not None}")
if user_forecasts is not None:
    _summary_lines.append(f"  Shape: {user_forecasts.shape}")

_summary_lines.append(_rule)
print("\n".join(_summary_lines))

# Show the first rows of the transactions, or a placeholder when nothing was loaded
if transactions is None:
    display("transactions missing")
else:
    display(transactions.head(5))


## Monthly Expenses vs Revenue



**Code Cell Description:**

Generates the monthly expenses chart showing stacked bars for expense categories (Payroll, Tax, FX, Investment, Insurance, Maintenance) with hotel revenue as an overlaid line. Exports data in TypeScript-ready format.


In [None]:
# CHART 1: Monthly Expenses by Category (Stacked Bar) + Hotel Revenue (Line)

# Work on a copy of the loaded transactions and bucket every row by calendar month
tx = transactions.copy()
tx['month'] = pd.to_datetime(tx[trx_date_col]).dt.to_period('M')

# The category is the text in front of the first " - " inside additional_info
tx['category'] = tx['additional_info'].str.extract(r'(.*?) - ')[0]

# Debits (DBIT) that carry a category are expenses; every credit (CRDT) is revenue
_is_debit = tx['credit_or_debit'] == 'DBIT'
_is_credit = tx['credit_or_debit'] == 'CRDT'
expenses = tx[_is_debit & tx['category'].notna()]
revenue = tx[_is_credit]

# Monthly expense totals per category, with a string label for the x axis
monthly_expenses = expenses.groupby(['month', 'category'])['amount'].sum().reset_index()
monthly_expenses['month_str'] = monthly_expenses['month'].astype(str)

# Monthly revenue totals, labelled the same way
monthly_revenue = revenue.groupby('month')['amount'].sum().reset_index()
monthly_revenue['month_str'] = monthly_revenue['month'].astype(str)

fig1 = go.Figure()

# One bar trace per expense category, added in alphabetical order
categories = monthly_expenses['category'].unique()
for category in sorted(categories):
    cat_data = monthly_expenses.loc[monthly_expenses['category'] == category]
    _expense_bar = go.Bar(
        name=category,
        x=cat_data['month_str'],
        y=cat_data['amount'],
        hovertemplate='%{x}<br>%{fullData.name}: €%{y:,.2f}<extra></extra>',
    )
    fig1.add_trace(_expense_bar)

# Revenue is drawn as a line on the secondary (right-hand) y axis
_revenue_line = go.Scatter(
    name='Hotel Revenue',
    x=monthly_revenue['month_str'],
    y=monthly_revenue['amount'],
    mode='lines+markers',
    line={'color': 'darkgreen', 'width': 3},
    marker={'size': 8},
    yaxis='y2',
    hovertemplate='%{x}<br>Revenue: €%{y:,.2f}<extra></extra>',
)
fig1.add_trace(_revenue_line)

# Stack the bars, overlay the second axis and move the legend right of the plot
fig1.update_layout(
    title='Monthly Expenses by Category & Hotel Revenue',
    xaxis={'title': 'Month'},
    yaxis={'title': 'Expenses (EUR)', 'side': 'left'},
    yaxis2={'title': 'Revenue (EUR)', 'overlaying': 'y', 'side': 'right'},
    barmode='stack',
    legend={'orientation': 'v', 'yanchor': 'top', 'y': 1, 'xanchor': 'left', 'x': 1.15},
    hovermode='x unified',
    height=600,
    width=1200,
)

fig1.show()

# Share of each expense category in the overall expense total (whole period)
print("\n=== Average Percentage Share of Expense Categories ===")
total_expenses_by_category = expenses.groupby('category')['amount'].sum().reset_index()
total_expenses_overall = total_expenses_by_category['amount'].sum()
total_expenses_by_category['percentage'] = (
    total_expenses_by_category['amount'].div(total_expenses_overall).mul(100)
)
# Largest share first
total_expenses_by_category = total_expenses_by_category.sort_values('percentage', ascending=False)

for _name, _pct, _amount in zip(
    total_expenses_by_category['category'],
    total_expenses_by_category['percentage'],
    total_expenses_by_category['amount'],
):
    print(f"{_name}: {_pct:.2f}% (€{_amount:,.2f})")
print(f"\nTotal Expenses: €{total_expenses_overall:,.2f}")

# Build the wide export table: one row per month, one column per expense category

# Short column names expected by the consumer of the export
category_mapping = {
    'Payroll Payment': 'Payroll',
    'Tax Payment': 'Tax',
    'FX Transaction': 'FX',
    'Investment Purchase': 'Investment',
    'Insurance Premium': 'Insurance',
    'Maintenance Expenses': 'Maintenance'
}

# Pivot categories into columns, then apply the short names
monthly_expenses_pivot = (
    monthly_expenses
    .pivot(index='month', columns='category', values='amount')
    .reset_index()
    .rename(columns=category_mapping)
)

# Revenue side of the merge: months are rebuilt as monthly Periods from their string form
monthly_revenue_for_merge = monthly_revenue.copy()
monthly_revenue_for_merge['month'] = (
    monthly_revenue_for_merge['month']
    .astype(str)
    .map(lambda label: pd.Period(label, freq='M'))
)

# Left join keeps every expense month; months without revenue get NaN here
monthly_expenses_pivot = pd.merge(
    monthly_expenses_pivot,
    monthly_revenue_for_merge[['month', 'amount']],
    on='month',
    how='left',
).rename(columns={'amount': 'Revenue'})

# Replace the Period column by a plain "YYYY-MM" string column named 'date'
monthly_expenses_pivot['date'] = monthly_expenses_pivot['month'].astype(str)
monthly_expenses_pivot = monthly_expenses_pivot.drop(columns='month')

# Guarantee every expected value column exists (absent ones are all zero),
# then fix the column order and replace remaining NaN by 0
expected_columns = ['date', 'Maintenance', 'Insurance', 'Investment', 'FX', 'Tax', 'Payroll', 'Revenue']
for col in expected_columns:
    if col == 'date' or col in monthly_expenses_pivot.columns:
        continue
    monthly_expenses_pivot[col] = 0

monthly_expenses_pivot = monthly_expenses_pivot[expected_columns].fillna(0)

# Alias under the name used by the export cells
revenue_expenses_chart = monthly_expenses_pivot.copy()

print("\n✓ DataFrame 'revenue_expenses_chart' created for export")
print(f"Shape: {revenue_expenses_chart.shape}")
display(revenue_expenses_chart)

# Print the table as a JavaScript/TypeScript array literal, one object per month
_banner = "=" * 80
print("\n" + _banner)
print("COPY-PASTE READY FORMAT:")
print(_banner)
print("const data = [")

_value_columns = ['Maintenance', 'Insurance', 'Investment', 'FX', 'Tax', 'Payroll', 'Revenue']
for idx, row in revenue_expenses_chart.iterrows():
    # "2025-01" becomes "Jan 25"
    month_period = pd.Period(row['date'], freq='M')
    date_formatted = month_period.strftime('%b %y')

    _fields = ", ".join(f"{name}: {row[name]}" for name in _value_columns)
    print(f'  {{ date: "{date_formatted}", {_fields} }},')

print("];")
print(_banner)


**Code Cell Description:**

Exports the revenue and expenses data to CSV format for potential external use or archival.


In [None]:
import csv

# Write the revenue/expenses chart data to a CSV file in the working directory.
# The preferred source is `revenue_expenses_chart`; `monthly_expenses_pivot` is the fallback.

_namespace = locals()
for _candidate in ("revenue_expenses_chart", "monthly_expenses_pivot"):
    if _candidate in _namespace:
        export_df = _namespace[_candidate].copy()
        break
else:
    # Neither frame has been created yet
    raise NameError("No DataFrame named 'revenue_expenses_chart' or 'monthly_expenses_pivot' found. Please define it before exporting.")

# Every export column must exist; the list also fixes the column order in the file
csv_columns = ["date", "Maintenance", "Insurance", "Investment", "FX", "Tax", "Payroll", "Revenue"]
_available = set(export_df.columns)
missing_cols = [col for col in csv_columns if col not in _available]
if missing_cols:
    raise ValueError(f"Missing columns in export DataFrame: {missing_cols}")

# One dict per row, restricted to the export columns
export_data = export_df[csv_columns].to_dict(orient="records")

csv_filename = "revenue_expenses_chart_data.csv"

with open(csv_filename, mode="w", newline="", encoding="utf-8") as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(export_data)

print(f"Chart data exported to {csv_filename}")


## Cash Flow Drivers



**Code Cell Description:**

Calculates monthly averages, percentage shares, and trend data for all expense and revenue categories. Outputs TypeScript-formatted data for dashboard spark charts showing each category's contribution and monthly pattern.


In [None]:
# CASH FLOW DRIVERS: Calculate monthly values, averages, and percentage shares

# Categories reported on each side of the cash flow
expense_categories = ['Maintenance', 'Insurance', 'Investment', 'FX', 'Tax', 'Payroll']
revenue_categories = ['Resort Revenue', 'Investment Income', 'Tax Refund']

# This cell consumes two frames built by other cells. `monthly_income_pivot`
# is created by the monthly income export cell, which sits FURTHER DOWN the
# notebook, so a plain top-to-bottom run reaches this cell first. Fail with
# an actionable message instead of a bare NameError.
_missing_inputs = [
    name for name in ('revenue_expenses_chart', 'monthly_income_pivot')
    if name not in globals()
]
if _missing_inputs:
    raise NameError(
        f"Missing DataFrame(s) {_missing_inputs}: run the monthly expenses chart cell and the "
        "monthly income export cell before this one."
    )

# Revenue categories come straight from the data; a category with no
# transactions has no column. Report it and treat it as zero, mirroring how
# the expense columns are zero-filled when revenue_expenses_chart is built.
_absent_revenue = [c for c in revenue_categories if c not in monthly_income_pivot.columns]
if _absent_revenue:
    print(f"Warning: revenue categories absent from monthly_income_pivot, treated as 0: {_absent_revenue}")

# Monthly values per category (one row per month, 'date' as "YYYY-MM")
expenses_df = revenue_expenses_chart[['date'] + expense_categories].copy()
revenue_income_df = monthly_income_pivot.reindex(
    columns=['date'] + revenue_categories, fill_value=0
).copy()

# Per-month totals across categories
total_expenses_per_month = expenses_df[expense_categories].sum(axis=1)
total_revenue_per_month = revenue_income_df[revenue_categories].sum(axis=1)

# Grand totals over the whole period (denominators for the percentage shares)
total_expenses_all = expenses_df[expense_categories].sum().sum()
total_revenue_all = revenue_income_df[revenue_categories].sum().sum()

print("="*80)
print("CASH FLOW DRIVERS ANALYSIS")
print("="*80)

# Result container: per group, category -> {'avg', 'share', 'data'}
cash_flow_drivers = {
    'expenses': {},
    'revenue': {}
}

# Both groups are summarised the same way, so drive them from one table:
# (group key, section heading, source frame, category list, grand total)
_driver_groups = (
    ('expenses', "\n--- EXPENSE CATEGORIES ---\n", expenses_df, expense_categories, total_expenses_all),
    ('revenue', "\n--- REVENUE CATEGORIES ---\n", revenue_income_df, revenue_categories, total_revenue_all),
)

for _group, _heading, _frame, _names, _grand_total in _driver_groups:
    print(_heading)
    for category in _names:
        _series = _frame[category]
        monthly_values = _series.tolist()
        avg_monthly = _series.mean()
        total_category = _series.sum()
        # Share of this category in the group's grand total, in percent
        share_pct = (total_category / _grand_total) * 100
        _rounded_values = [round(v, 2) for v in monthly_values]

        cash_flow_drivers[_group][category] = {
            'avg': round(avg_monthly, 2),
            'share': round(share_pct, 2),
            'data': _rounded_values
        }

        print(f"{category}:")
        print(f"  Average: €{avg_monthly:,.2f}/month")
        print(f"  Share: {share_pct:.2f}% of total {_group}")
        print(f"  Monthly values: {_rounded_values}")
        print()

# Print the drivers as a TypeScript object literal
_banner = "=" * 80
print("\n" + _banner)
print("COPY-PASTE READY FORMAT FOR TYPESCRIPT:")
print(_banner)
print("const cashFlowDrivers = {")
print("  expenses: {")


def _share_of(item):
    """Sort key: the 'share' value of a (category, stats) pair."""
    return item[1]['share']


# Expenses ordered from largest to smallest share
sorted_expenses = sorted(cash_flow_drivers['expenses'].items(), key=_share_of, reverse=True)

_last = len(sorted_expenses) - 1
for i, (category, data) in enumerate(sorted_expenses):
    # Every entry but the last is followed by a comma
    comma = "" if i >= _last else ","
    data_str = ", ".join(map(str, data['data']))
    print(f"    '{category}': {{ avg: {data['avg']}, share: {data['share']}, data: [{data_str}] }}{comma}")

print("  },")
print("  revenue: {")

# Revenue ordered from largest to smallest share
sorted_revenue = sorted(cash_flow_drivers['revenue'].items(), key=_share_of, reverse=True)

_last = len(sorted_revenue) - 1
for i, (category, data) in enumerate(sorted_revenue):
    comma = "" if i >= _last else ","
    data_str = ", ".join(map(str, data['data']))
    print(f"    '{category}': {{ avg: {data['avg']}, share: {data['share']}, data: [{data_str}] }}{comma}")

print("  }")
print("};")
print(_banner)

# Totals for a quick sanity check
print(f"\nTotal Expenses: €{total_expenses_all:,.2f}")
print(f"Total Revenue: €{total_revenue_all:,.2f}")
print(f"Number of months: {len(expenses_df)}")
print(f"\n✓ Cash flow drivers data ready for dashboard spark charts")


## Investment Expenses & FX Fees



**Code Cell Description:**

Creates a grouped bar chart comparing monthly investment expenses and FX fees side-by-side, enabling analysis of these operational costs over time.


In [None]:
# CHART 6: Investment Expenses vs FX Fees by month (data preparation)

# Copy the loaded transactions and derive a datetime column plus a monthly period
tx_monthly = transactions.copy()
tx_monthly['date'] = pd.to_datetime(tx_monthly[trx_date_col])
tx_monthly['month'] = tx_monthly['date'].dt.to_period('M')

# Row selectors shared by the aggregations below
_is_debit = tx_monthly['credit_or_debit'] == 'DBIT'
_fx_mask = _is_debit & (tx_monthly['debtor_name'] == 'FX Bank')
_investment_mask = _is_debit & tx_monthly['additional_info'].str.contains('Investment', na=False)

# Monthly income: sum of all credits (CRDT)
monthly_income = (
    tx_monthly[tx_monthly['credit_or_debit'] == 'CRDT']
    .groupby('month')['amount']
    .sum()
    .rename('income')
    .reset_index()
)

# Monthly FX fees: debits whose debtor_name is 'FX Bank'
fx_expenses = (
    tx_monthly[_fx_mask]
    .groupby('month')['amount']
    .sum()
    .rename('fx_fees')
    .reset_index()
)

print(f"FX fees transactions found: {len(tx_monthly[_fx_mask])}")

# Monthly investment expenses: debits mentioning 'Investment' in additional_info
investment_expenses = (
    tx_monthly[_investment_mask]
    .groupby('month')['amount']
    .sum()
    .rename('investment_expenses')
    .reset_index()
)

# The income months form the spine; months without fees/investments become 0
chart6_data = (
    monthly_income
    .merge(fx_expenses, on='month', how='left')
    .merge(investment_expenses, on='month', how='left')
)
chart6_data['fx_fees'] = chart6_data['fx_fees'].fillna(0)
chart6_data['investment_expenses'] = chart6_data['investment_expenses'].fillna(0)

chart6_data['month_str'] = chart6_data['month'].astype(str)

# Grouped bars: investment expenses next to FX fees for each month
fig6 = go.Figure()

_investment_bar = go.Bar(
    name='Investment Expenses',
    x=chart6_data['month_str'],
    y=chart6_data['investment_expenses'],
    marker_color='lightcoral',
    hovertemplate='%{x}<br>Investment: €%{y:,.2f}<extra></extra>',
)
fig6.add_trace(_investment_bar)

_fx_bar = go.Bar(
    name='FX Fees',
    x=chart6_data['month_str'],
    y=chart6_data['fx_fees'],
    marker_color='steelblue',
    hovertemplate='%{x}<br>FX Fees: €%{y:,.2f}<extra></extra>',
)
fig6.add_trace(_fx_bar)

# barmode='group' places the two series side by side; the legend sits above the plot
fig6.update_layout(
    title='Investment Expenses & FX Fees by Month',
    xaxis={'title': 'Month'},
    yaxis={'title': 'Amount (EUR)'},
    barmode='group',
    hovermode='x unified',
    height=600,
    width=1200,
    legend={'orientation': 'h', 'yanchor': 'bottom', 'y': 1.02, 'xanchor': 'right', 'x': 1},
)

fig6.show()

# Tabular view of the plotted values
print("\nMonthly Investment & FX Fees Summary:")
print(chart6_data[['month_str', 'investment_expenses', 'fx_fees']])


## Revenue Patterns Analysis



**Code Cell Description:**

Generates a heatmap visualization showing resort revenue patterns by day of week and week of year. Reveals a business-focused clientele with mid-week peaks and seasonal trends.


In [None]:
# CHART 2: Revenue Heatmap (Day of Week vs Week of Year)

# Report which date column drives the heatmap
print(f"Using date column for heatmap: {trx_date_col}")

# Keep only credits whose additional_info mentions 'Resort Revenue'
_is_credit = transactions['credit_or_debit'] == 'CRDT'
_is_resort_revenue = transactions['additional_info'].str.contains('Resort Revenue', na=False)
revenue_data = transactions[_is_credit & _is_resort_revenue].copy()

revenue_data['date'] = pd.to_datetime(revenue_data[trx_date_col])
revenue_data['day_of_week'] = revenue_data['date'].dt.day_name()
# ISO week number as a plain integer.
# NOTE(review): the ISO week carries no year, so late-December dates that fall
# into ISO week 1, or data spanning several years, would share a cell - confirm
# the dataset stays within a single ISO year.
revenue_data['week_of_year'] = revenue_data['date'].dt.isocalendar().week.astype(int)

# Total revenue per (week, weekday) pair
heatmap_data = revenue_data.groupby(['week_of_year', 'day_of_week'], as_index=False)['amount'].sum()

# Debug output
print(f"Heatmap data shape: {heatmap_data.shape}")
print(f"Sample data:\n{heatmap_data.head(10)}")
print(f"Week range: {heatmap_data['week_of_year'].min()} to {heatmap_data['week_of_year'].max()}")

# Weekdays as rows (Monday first), weeks as columns; empty cells shown as 0
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
heatmap_pivot = (
    heatmap_data
    .pivot(index='day_of_week', columns='week_of_year', values='amount')
    .reindex(day_order)
    .fillna(0)
)

# Draw the heatmap: x = week number, y = weekday, colour = revenue
_heatmap_trace = go.Heatmap(
    z=heatmap_pivot.values,
    x=heatmap_pivot.columns.tolist(),
    y=heatmap_pivot.index.tolist(),
    colorscale='YlOrRd',
    hovertemplate='Week %{x}<br>%{y}<br>Revenue: €%{z:,.2f}<extra></extra>',
    colorbar={'title': 'Revenue (EUR)'},
    zmid=None,  # Auto-scale colors
)
fig2 = go.Figure(data=_heatmap_trace)

# Weeks are treated as categories so that each observed week gets its own column
fig2.update_layout(
    title='Hotel Revenue Heatmap: Day of Week vs Week of Year',
    xaxis={'title': 'Week of Year', 'type': 'category'},
    yaxis={'title': 'Day of Week'},
    height=500,
    width=1200,
)

fig2.show()


**Code Cell Description:**

Exports the heatmap data in TypeScript-ready format with complete week coverage (including zero values for days without revenue), suitable for frontend visualization components.


In [None]:
# Export heatmap data in copy-paste ready format for TypeScript/JavaScript

# Uses the heatmap_data DataFrame from the previous cell.
# Weekday order for the export (Sunday first, unlike the Monday-first chart)
day_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# First, analyse how completely the (weekday, week) grid is covered
print("\n" + "="*80)
print("DATA COVERAGE ANALYSIS:")
print("="*80)

# Range of weeks present in the data
all_weeks = sorted(heatmap_data['week_of_year'].unique())
min_week = heatmap_data['week_of_year'].min()
max_week = heatmap_data['week_of_year'].max()
print(f"Week range in data: {min_week} to {max_week}")
print(f"Total weeks with ANY data: {len(all_weeks)}")

# Complete grid: every week between the first and last observed, for every weekday
all_possible_weeks = range(min_week, max_week + 1)
total_possible_combinations = len(day_order) * len(all_possible_weeks)
actual_data_points = len(heatmap_data)

print(f"\nTotal possible day/week combinations: {total_possible_combinations}")
print(f"Actual data points with revenue > 0: {actual_data_points}")
print(f"Missing combinations: {total_possible_combinations - actual_data_points}")
print(f"Coverage: {(actual_data_points / total_possible_combinations * 100):.1f}%")

# Per-weekday breakdown of the missing weeks
print("\n" + "-"*80)
print("BREAKDOWN BY DAY OF WEEK:")
print("-"*80)
# Loop invariant: build the full week set once instead of once per weekday
_all_week_set = set(all_possible_weeks)
for day in day_order:
    day_data = heatmap_data[heatmap_data['day_of_week'] == day]
    weeks_with_data = set(day_data['week_of_year'].values)
    missing_weeks = _all_week_set - weeks_with_data

    print(f"\n{day}:")
    print(f"  Weeks with data: {len(weeks_with_data)} out of {len(all_possible_weeks)}")
    print(f"  Missing weeks: {len(missing_weeks)}")
    if 0 < len(missing_weeks) <= 10:
        print(f"  Missing week numbers: {sorted(missing_weeks)}")
    elif len(missing_weeks) > 10:
        # Sort BEFORE slicing: a set has no defined order, so slicing it first
        # would show an arbitrary 10 weeks rather than the first 10.
        print(f"  Missing week numbers: {sorted(missing_weeks)[:10]}... (showing first 10)")
# Cross-check against the raw revenue transactions, weekday by weekday
print("\n" + "="*80)
print("CHECKING RAW TRANSACTION DATA:")
print("="*80)

# Copy of the un-aggregated revenue rows, flagged where the amount is positive
revenue_check = revenue_data.copy()
revenue_check['has_revenue'] = revenue_check['amount'] > 0

# Number of transactions and total amount per weekday
print("\nTransactions by day of week:")
for day in day_order:
    _day_rows = revenue_check[revenue_check['day_of_week'] == day]
    day_count = len(_day_rows)
    day_revenue = _day_rows['amount'].sum()
    print(f"  {day}: {day_count} transactions, €{day_revenue:,.2f} total")

print("\n" + "="*80)
print("COPY-PASTE READY HEATMAP DATA (WITH COMPLETE WEEK COVERAGE):")
print("="*80)
print("const data = [")

for day in day_order:
    # Rows of this weekday only
    day_data = heatmap_data[heatmap_data['day_of_week'] == day].copy()

    # week number -> revenue, for constant-time lookup below
    week_revenue_map = dict(zip(day_data['week_of_year'], day_data['amount']))

    # One entry per week of the complete range; weeks without data get value 0.
    # NOTE(review): `index` is the constant 1 for every entry - confirm that
    # the consuming component expects this.
    data_entries = [
        f'{{ weekOfYear: {week}, index: 1, value: {round(week_revenue_map.get(week, 0), 2)} }}'
        for week in all_possible_weeks
    ]

    data_str = ', '.join(data_entries)

    print(f'  {{')
    print(f'    "name": "{day}",')
    print(f'    "data": [{data_str}]')
    print(f'  }},')

print("];")
print("="*80)
print(f"\nNote: Complete dataset includes ALL {len(all_possible_weeks)} weeks for each day.")
print(f"Missing week/day combinations now included with value: 0")

# Summary statistics over the aggregated (week, weekday) cells
print(f"\nSummary:")
print(f"- Total data points: {len(heatmap_data)}")
print(f"- Days with data: {heatmap_data['day_of_week'].nunique()}")
print(f"- Weeks with data: {heatmap_data['week_of_year'].nunique()}")
print(f"- Total revenue: €{heatmap_data['amount'].sum():,.2f}")
print(f"- Average daily revenue: €{heatmap_data['amount'].mean():,.2f}")


## Daily Revenue with Holiday Correlations



**Code Cell Description:**

Creates a bar chart of daily income with vertical lines marking French bank holidays. Helps identify correlations between holidays, major events (Monaco GP, Easter) and revenue spikes.


In [None]:
# CHART 5: Daily Income Transactions with French Holidays

# Bank holidays live next to the raw datasets; keep the French rows and parse their dates
holidays_path = base_dir.parent / "extra_datasets" / "bank_holidays_2025.csv"
holidays_df = pd.read_csv(holidays_path)
france_holidays = holidays_df.loc[holidays_df['Country'] == 'France'].copy()
france_holidays['Date'] = pd.to_datetime(france_holidays['Date'])

# Income = every credit (CRDT) in the loaded transactions
income_data = transactions.loc[transactions['credit_or_debit'] == 'CRDT'].copy()
income_data['date'] = pd.to_datetime(income_data[trx_date_col])

# Sum of income per distinct 'date' value, as a two-column frame (date, income)
daily_income = (
    income_data
    .groupby('date')['amount']
    .sum()
    .rename('income')
    .reset_index()
)

# Bar chart of daily income with one dashed marker per French holiday
fig5 = go.Figure()

_income_bars = go.Bar(
    name='Daily Income',
    x=daily_income['date'],
    y=daily_income['income'],
    marker_color='lightgreen',
    hovertemplate='%{x|%Y-%m-%d}<br>Income: €%{y:,.2f}<extra></extra>',
)
fig5.add_trace(_income_bars)

# Each holiday gets a full-height vertical line plus a rotated label above the plot
for _holiday_ts, _holiday_label in zip(france_holidays['Date'], france_holidays['Holiday']):
    # Plotly receives a standard datetime rather than a pandas Timestamp
    holiday_date = pd.Timestamp(_holiday_ts).to_pydatetime()

    # yref="paper" makes y0/y1 fractions of the plot height (0 = bottom, 1 = top)
    fig5.add_shape(
        type="line",
        x0=holiday_date,
        x1=holiday_date,
        y0=0,
        y1=1,
        yref="paper",
        line={'color': "red", 'width': 2, 'dash': "dash"},
    )

    fig5.add_annotation(
        x=holiday_date,
        y=1,
        yref="paper",
        text=_holiday_label,
        showarrow=False,
        textangle=-90,
        font={'size': 8, 'color': "red"},
        yanchor="bottom",
    )

fig5.update_layout(
    title='Daily Income Transactions with French Bank Holidays (2025)',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Income (EUR)'},
    hovermode='x unified',
    height=600,
    width=1200,
    showlegend=True,
)

fig5.show()

import json

# Aliases under the names used for export
daily_income_chart = daily_income.copy()
french_holidays_chart = france_holidays.copy()

print("\n✓ DataFrame 'daily_income_chart' created for export")
print(f"Shape: {daily_income_chart.shape}")
print(f"Date range: {daily_income_chart['date'].min()} to {daily_income_chart['date'].max()}")
print(f"Total income: €{daily_income_chart['income'].sum():,.2f}")

print("\n✓ DataFrame 'french_holidays_chart' created for export")
print(f"French holidays count: {len(french_holidays_chart)}")

# Daily income as a JavaScript/TypeScript array literal
print("\n" + "="*80)
print("COPY-PASTE READY FORMAT (DAILY INCOME):")
print("="*80)
print("const dailyIncomeData = [")

for _, row in daily_income_chart.iterrows():
    date_formatted = row['date'].strftime('%b %d')
    income_value = round(row['income'], 2)
    print(f"  {{ date: '{date_formatted}', income: {income_value} }},")

print("];")
print("="*80)

# French holidays as a JavaScript/TypeScript array literal
print("\n" + "="*80)
print("COPY-PASTE READY FORMAT (FRENCH HOLIDAYS):")
print("="*80)
print("const frenchHolidays = [")

for _, row in french_holidays_chart.iterrows():
    date_formatted = row['Date'].strftime('%b %d')
    # Holiday names can contain apostrophes (e.g. "New Year's Day"), which
    # would terminate a hand-written single-quoted literal. json.dumps emits
    # a correctly escaped double-quoted string, which is also valid JS/TS.
    holiday_name = json.dumps(str(row['Holiday']), ensure_ascii=False)
    print(f"  {{ date: '{date_formatted}', holiday: {holiday_name} }},")

print("];")
print("="*80)


**Code Cell Description:**

Exports monthly income data broken down by category (Resort Revenue, Investment Income, Tax Refund) in TypeScript format for dashboard integration.


In [None]:
# Export Monthly Income Data (one column per income category)

# Start from the credit transactions and bucket them by calendar month
monthly_income_breakdown = income_data.copy()
monthly_income_breakdown['month'] = monthly_income_breakdown['date'].dt.to_period('M')

# The category is the text in front of the first " - " inside additional_info
_extracted_category = monthly_income_breakdown['additional_info'].str.extract(r'(.*?) - ')[0]

# No row yields a category -> everything is labelled 'Revenue';
# otherwise only the rows without a category are labelled 'Other'
if _extracted_category.isna().all():
    monthly_income_breakdown['category'] = 'Revenue'
else:
    monthly_income_breakdown['category'] = _extracted_category.fillna('Other')

# Income total per (month, category)
monthly_income_by_category = (
    monthly_income_breakdown
    .groupby(['month', 'category'])['amount']
    .sum()
    .reset_index()
)

# Categories become columns; month/category pairs without income are 0
monthly_income_pivot = (
    monthly_income_by_category
    .pivot(index='month', columns='category', values='amount')
    .reset_index()
    .fillna(0)
)

# Replace the Period column by a "YYYY-MM" string column named 'date'
monthly_income_pivot['date'] = monthly_income_pivot['month'].astype(str)
monthly_income_pivot = monthly_income_pivot.drop(columns='month')

# 'date' first, category columns afterwards in their existing order
column_order = ['date'] + [col for col in monthly_income_pivot.columns if col != 'date']
monthly_income_pivot = monthly_income_pivot[column_order]

print("="*80)
print("MONTHLY INCOME BY CATEGORY:")
print("="*80)
display(monthly_income_pivot)

import json

# Monthly income as a JavaScript/TypeScript array literal, one object per month
print("\n" + "="*80)
print("COPY-PASTE READY FORMAT (MONTHLY INCOME - EXAMPLE STYLE):")
print("="*80)
print("const monthlyIncomeData = [")

# Every column except 'date' is an income category
_income_categories = [col for col in monthly_income_pivot.columns if col != 'date']

for _, row in monthly_income_pivot.iterrows():
    # "2025-01" becomes "Jan 25"
    month_period = pd.Period(row['date'], freq='M')
    date_formatted = month_period.strftime('%b %y')

    fields = [f"date: '{date_formatted}'"]
    for col in _income_categories:
        value = round(row[col], 2)
        # Category names such as "Resort Revenue" contain spaces and are not
        # valid bare object keys in JS/TS, so emit each key as a quoted,
        # escaped string literal.
        key = json.dumps(str(col), ensure_ascii=False)
        fields.append(f"{key}: {value}")

    print("  { " + ", ".join(fields) + " },")

print("];")
print("="*80)

# Closing summary of the monthly income table
_category_cols = [col for col in monthly_income_pivot.columns if col != 'date']
_income_total = monthly_income_breakdown['amount'].sum()

print("\n" + "="*80)
print("SUMMARY:")
print("="*80)
print(f"Total months: {len(monthly_income_pivot)}")
print(f"Categories found: {_category_cols}")
print(f"Total income: €{_income_total:,.2f}")

# The per-category breakdown is only printed when there is more than one column besides 'date'
if len(monthly_income_pivot.columns) > 2:
    print("\nBreakdown by category:")
    for col in _category_cols:
        total = monthly_income_pivot[col].sum()
        pct = (total / _income_total) * 100
        print(f"  {col}: €{total:,.2f} ({pct:.1f}%)")


## Net Cashflow Analysis



**Code Cell Description:**

Visualizes daily net cashflow as colored bars (green for positive, red for negative) with a cumulative cash position line starting from the initial balance. Shows the 30% decline in cash position.


In [None]:
# CHART 3: Net Cashflow (Bar) + Cumulative Cash (Line)

# balances.csv is loaded only when the file exists, so it may be missing here
if balances is None or balances.empty:
    raise ValueError(
        "balances.csv was not loaded (or is empty); a starting balance is required "
        "for the cumulative cash line."
    )

# Starting balance: the 'amount' of the earliest row by balance_date.
# NOTE(review): this takes a single row - if balances.csv holds several
# accounts per date, confirm whether their balances should be summed instead.
starting_balance = balances.sort_values('balance_date')['amount'].iloc[0]
print(f"Starting balance from balances.csv: €{starting_balance:,.2f}")

# Signed amounts: credits (CRDT) count positive, everything else negative
cashflow_data = transactions.copy()
cashflow_data['date'] = pd.to_datetime(cashflow_data[trx_date_col])
# Vectorised replacement of the former row-wise apply. Rows with a missing
# credit_or_debit are treated as non-credit, whereas the row-wise comparison
# raised on a missing value under the nullable string dtype.
_is_credit = cashflow_data['credit_or_debit'].eq('CRDT').fillna(False).astype(bool)
cashflow_data['amount_signed'] = cashflow_data['amount'].where(_is_credit, -cashflow_data['amount'])

# Net cashflow per distinct 'date' value
daily_cashflow = cashflow_data.groupby('date')['amount_signed'].sum().reset_index()
daily_cashflow.columns = ['date', 'net_cashflow']

# Running cash position: starting balance plus the cumulative net cashflow, in date order
daily_cashflow = daily_cashflow.sort_values('date')
daily_cashflow['cumulative_cash'] = starting_balance + daily_cashflow['net_cashflow'].cumsum()

# Figure with two y axes: bars on the primary axis, cumulative line on the secondary
fig3 = make_subplots(specs=[[{"secondary_y": True}]])

# Bar colour encodes the sign of the day's net cashflow
_bar_colors = ['red' if value < 0 else 'green' for value in daily_cashflow['net_cashflow']]
_cashflow_bars = go.Bar(
    name='Net Cashflow',
    x=daily_cashflow['date'],
    y=daily_cashflow['net_cashflow'],
    marker_color=_bar_colors,
    hovertemplate='%{x|%Y-%m-%d}<br>Net Cashflow: €%{y:,.2f}<extra></extra>',
)
fig3.add_trace(_cashflow_bars, secondary_y=False)

_cumulative_line = go.Scatter(
    name='Cumulative Cash',
    x=daily_cashflow['date'],
    y=daily_cashflow['cumulative_cash'],
    mode='lines',
    line={'color': 'blue', 'width': 2},
    hovertemplate='%{x|%Y-%m-%d}<br>Cumulative: €%{y:,.2f}<extra></extra>',
)
fig3.add_trace(_cumulative_line, secondary_y=True)

# Axis titles
fig3.update_xaxes(title_text='Date')
fig3.update_yaxes(title_text='Net Cashflow (EUR)', secondary_y=False)
fig3.update_yaxes(title_text='Cumulative Cash (EUR)', secondary_y=True)

# Overall layout; the legend is placed horizontally above the plot
fig3.update_layout(
    title='Daily Net Cashflow & Cumulative Cash Position',
    hovermode='x unified',
    height=600,
    width=1200,
    legend={'orientation': 'h', 'yanchor': 'bottom', 'y': 1.02, 'xanchor': 'right', 'x': 1},
)

fig3.show()


## Forecast Analysis



**Code Cell Description:**

Parses the system_forecasts.csv file to extract daily forecasts from JSON format. Creates a flattened dataframe with one row per forecast date, method, and category combination.


In [None]:
# FORECASTING ANALYSIS - STEP 1: Parse System Forecasts with Date Ranges

print("=" * 80)
print("STEP 1: PARSING SYSTEM FORECASTS")
print("=" * 80)

import json
from datetime import datetime

# system_forecasts.csv is loaded only when the file exists
if system_forecasts is None:
    raise FileNotFoundError(
        f"system_forecasts was not loaded; expected the file at {system_forecasts_path}"
    )

# Columns of the flattened table (one row per forecast date, method and category).
# Declaring them keeps the frame well-formed even when nothing could be parsed.
_FORECAST_COLUMNS = [
    'forecast_method', 'category', 'start_date', 'end_date',
    'forecast_date', 'forecast_amount',
]

# Parse system forecasts to create daily forecast lookup
forecasts_daily = []

for idx, row in system_forecasts.iterrows():
    # Entries are gathered per row and committed only when the whole row
    # parsed, so a row reported as skipped contributes nothing.
    row_entries = []
    try:
        start_date = pd.to_datetime(row['start_date'])
        end_date = pd.to_datetime(row['end_date'])
        method = row['forecast_method']
        category = row['category_id']

        # forecast_amounts holds a JSON list of {'datetime.date': ..., 'amount': ...} objects
        forecast_data = json.loads(row['forecast_amounts'])

        for entry in forecast_data:
            date_str = entry.get('datetime.date', '')
            amount = entry.get('amount', 0)

            # Skip entries without a date or with an explicit null amount
            if date_str and amount is not None:
                row_entries.append({
                    'forecast_method': method,
                    'category': category,
                    'start_date': start_date,
                    'end_date': end_date,
                    'forecast_date': pd.to_datetime(date_str),
                    'forecast_amount': amount
                })
    except (ValueError, TypeError, KeyError, AttributeError) as e:
        # ValueError covers malformed JSON (json.JSONDecodeError) and
        # unparseable dates; AttributeError covers entries that are not objects.
        print(f"Warning: Skipping row {idx} due to error: {e}")
        continue
    forecasts_daily.extend(row_entries)

forecasts_daily_df = pd.DataFrame(forecasts_daily, columns=_FORECAST_COLUMNS)

print(f"\n✓ Parsed {len(forecasts_daily_df):,} daily forecast entries")
print(f"  Forecast methods: {forecasts_daily_df['forecast_method'].unique().tolist()}")
print(f"  Categories: {forecasts_daily_df['category'].nunique()}")
print(f"  Date range: {forecasts_daily_df['forecast_date'].min()} to {forecasts_daily_df['forecast_date'].max()}")
print(f"\nSample:")
display(forecasts_daily_df.head())


**Code Cell Description:**

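Matches each actual transaction to the most recent applicable forecast for its date and category (the forecast method is taken from the matched forecast rather than used as a matching key). This enables point-in-time forecast accuracy analysis and counts the transactions that have no matching forecast.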


In [None]:
# FORECASTING ANALYSIS - STEP 2: Match Transactions to Forecasts

print("=" * 80)
print("STEP 2: MATCHING TRANSACTIONS TO FORECASTS")
print("=" * 80)

# Categorize transactions (reuse from previous analysis)
def categorize_transaction(row):
    """Map a transaction row to a forecast category label.

    The lower-cased 'remittence_info' text is scanned for keywords in a fixed
    priority order; for tax and investment keywords the 'credit_or_debit'
    value ('CRDT' / 'DBIT') decides the direction. Rows that match no rule
    are labelled 'other'.

    Args:
        row: mapping-like object exposing ``.get`` (dict or pandas Series).

    Returns:
        str: the category label.
    """
    text = str(row.get('remittence_info', '')).lower()
    direction = row.get('credit_or_debit', '')

    if 'resort revenue' in text:
        return 'cash_in_resort_revenue'

    if 'payroll' in text or 'salary' in text:
        return 'cash_out_payroll'

    # Tax rows split by direction; any other direction falls through
    if 'tax' in text:
        if direction == 'CRDT':
            return 'cash_in_tax_income'
        if direction == 'DBIT':
            return 'cash_out_tax_payments'

    # Investment "income" wins regardless of direction; otherwise only debits count
    if 'investment' in text:
        if 'income' in text:
            return 'cash_in_investments_income'
        if direction == 'DBIT':
            return 'cash_out_investments_outflow'

    # Remaining single-keyword rules, checked in priority order
    keyword_rules = (
        (('fx', 'foreign exchange'), 'cash_out_foreign_exchange_expenses'),
        (('insurance',), 'cash_out_insurance_costs'),
        (('maintenance',), 'cash_out_resort_maintenance_expenses'),
    )
    for keywords, label in keyword_rules:
        if any(keyword in text for keyword in keywords):
            return label

    return 'other'

transactions['category'] = transactions.apply(categorize_transaction, axis=1)

# Point-in-time matching: for each transaction, find the best forecast.
#
# A forecast row is a candidate for a transaction when its forecast_date and
# category equal the transaction's, and the forecast_date lies inside the
# forecast's [start_date, end_date] window. Among candidates the one with the
# latest start_date wins (first row on ties).
#
# The winner per (forecast_date, category) is computed once and joined to the
# transactions, instead of re-scanning the whole forecast frame per transaction.
#
# NOTE(review): the daily reconstruction step negates cash-out actuals and the
# unified forecast step divides forecast amounts by 100 (cents -> EUR). Neither
# adjustment is applied here, so `actual_amount` and `forecast_amount` may not
# share sign/scale - confirm before interpreting the accuracy metrics.
txn_keys = pd.DataFrame({
    'transaction_date': pd.to_datetime(transactions[trx_date_col]),
    'category': transactions['category'],
    'actual_amount': transactions['amount'],
})

# Forecast rows whose target date falls inside their own validity window
in_window = forecasts_daily_df[
    (forecasts_daily_df['start_date'] <= forecasts_daily_df['forecast_date']) &
    (forecasts_daily_df['end_date'] >= forecasts_daily_df['forecast_date'])
]

# idxmax keeps the first row holding the maximum start_date within each group
latest_idx = in_window.groupby(['forecast_date', 'category'])['start_date'].idxmax()
latest_forecasts = in_window.loc[
    latest_idx,
    ['forecast_date', 'category', 'forecast_method', 'forecast_amount', 'start_date']
].rename(columns={
    'forecast_date': 'transaction_date',
    'start_date': 'forecast_start_date',
})

# Inner join keeps only matched transactions, in their original order
matched_df = txn_keys.merge(
    latest_forecasts, on=['transaction_date', 'category'], how='inner'
)
matched_df['forecast_age'] = (
    matched_df['transaction_date'] - matched_df['forecast_start_date']
).dt.days
matched_df = matched_df[[
    'transaction_date', 'category', 'forecast_method', 'actual_amount',
    'forecast_amount', 'forecast_age', 'forecast_start_date'
]]

unmatched_count = len(transactions) - len(matched_df)

print(f"\n✓ Matched {len(matched_df):,} transactions to forecasts")
print(f"  Unmatched transactions: {unmatched_count}")
print(f"  Categories matched: {matched_df['category'].nunique()}")
print(f"  Forecast methods: {matched_df['forecast_method'].unique().tolist()}")
print(f"  Average forecast age: {matched_df['forecast_age'].mean():.1f} days")
print(f"\nSample matched data:")
display(matched_df.head(10))


**Code Cell Description:**

Calculates forecast accuracy metrics including MAPE (Mean Absolute Percentage Error), MAE (Mean Absolute Error), and Bias for each forecast method and category combination. Identifies which methods perform best.


In [None]:
# FORECASTING ANALYSIS - STEP 3: Calculate Aggregate Accuracy Metrics

for banner_line in ("=" * 80, "STEP 3: CALCULATING ACCURACY METRICS", "=" * 80):
    print(banner_line)

# Per-match errors: signed error is forecast minus actual, abs_error its magnitude
signed_error = matched_df['forecast_amount'] - matched_df['actual_amount']
matched_df['abs_error'] = signed_error.abs()
matched_df['error'] = signed_error
# Accuracy metrics for one (forecast_method, category) group
def calculate_metrics(group):
    """Summarise forecast accuracy for one group of matched rows.

    Args:
        group: DataFrame with 'actual_amount', 'abs_error', 'error' and
            'forecast_age' columns.

    Returns:
        pd.Series with MAE, Bias, Count, Avg_Forecast_Age, MAPE and Bias_Pct
        (in that order). MAPE and Bias_Pct are NaN when every actual is 0.
    """
    # Percentage metrics are only defined where the actual is non-zero
    scored = group.loc[group['actual_amount'] != 0]
    mean_error = group['error'].mean()

    summary = {
        'MAE': group['abs_error'].mean(),
        'Bias': mean_error,
        'Count': len(group),
        'Avg_Forecast_Age': group['forecast_age'].mean(),
    }

    if len(scored) == 0:
        summary['MAPE'] = np.nan
        summary['Bias_Pct'] = np.nan
        return pd.Series(summary)

    actual_magnitude = scored['actual_amount'].abs()
    pct_errors = (scored['abs_error'] / actual_magnitude) * 100
    summary['MAPE'] = pct_errors.mean()

    # Bias over all rows, expressed relative to the mean non-zero |actual|
    summary['Bias_Pct'] = (mean_error / actual_magnitude.mean()) * 100

    return pd.Series(summary)

# One row of metrics per (forecast_method, category) combination
accuracy_df = matched_df.groupby(['forecast_method', 'category']).apply(calculate_metrics).reset_index()

# Sort by MAPE (best to worst)
accuracy_df = accuracy_df.sort_values('MAPE').reset_index(drop=True)

print(f"\n✓ Calculated accuracy metrics for {len(accuracy_df)} (method, category) combinations")
print(f"\nAccuracy Summary (sorted by MAPE):")
print("="*100)

display(accuracy_df)


def _count_weighted_mean(values, weights):
    """Return the weighted mean of `values`, ignoring entries that are NaN.

    Rows whose value is NaN are dropped from BOTH the numerator and the
    denominator. Summing `values * weights` directly would skip the NaN
    products but still divide by the full weight total, pulling the result
    toward zero. Returns NaN when no weight remains.
    """
    usable = values.notna()
    total_weight = weights[usable].sum()
    if total_weight == 0:
        return np.nan
    return (values[usable] * weights[usable]).sum() / total_weight


# Best overall method (weighted by count)
method_summary = accuracy_df.groupby('forecast_method').apply(
    lambda g: pd.Series({
        'MAE': _count_weighted_mean(g['MAE'], g['Count']),
        'MAPE': _count_weighted_mean(g['MAPE'], g['Count']),
        'Bias_Pct': _count_weighted_mean(g['Bias_Pct'], g['Count']),
        'Count': g['Count'].sum(),
        'Avg_Forecast_Age': _count_weighted_mean(g['Avg_Forecast_Age'], g['Count'])
    })
).sort_values('MAPE')

print(f"\n\nMethod Summary (Weighted by Count):")
print("="*100)
display(method_summary)


**Code Cell Description:**

Reconstructs a complete daily time series (Jan-Aug 2025) for all categories and forecast methods. Uses most recent forecasts for each date and includes user forecast data. Creates foundation for unified forecast analysis.


In [None]:
# FORECASTING ANALYSIS - STEP 4: Reconstruct Daily Time Series (Jan-Aug 2025)
#
# Builds `daily_df` with one row per (date, category) holding the actual amount,
# the latest in-window forecast per method, and the summed active user forecast.
# A value of 0 is written whenever no transaction / forecast exists for a cell.

print("=" * 80)
print("STEP 4: RECONSTRUCTING DAILY TIME SERIES")
print("=" * 80)

# Prepare user forecasts (exclude dismissed/cancelled)
user_forecasts['value_date'] = pd.to_datetime(user_forecasts['value_date'], errors='coerce')
user_forecasts_active = user_forecasts[
    ~user_forecasts['status'].isin(['dismissed', 'cancelled'])
].copy()

# Generate complete date range
date_range = pd.date_range('2025-01-01', '2025-08-31', freq='D')
categories = transactions['category'].unique()
forecast_methods = ['ml_model', 'statistical_model', 'foundation_model', 'historical', 'static']

# --- Lookups built once, so the nested loop below only does dict reads -------

# Summed transaction amount per (date, category); dates are parsed a single time
trx_dates = pd.to_datetime(transactions[trx_date_col])
actual_lookup = (
    transactions['amount']
    .groupby([trx_dates, transactions['category']])
    .sum()
    .to_dict()
)

# Latest forecast (max start_date, first row on ties) per
# (forecast_date, category, method), restricted to rows whose forecast_date
# lies inside the forecast's own [start_date, end_date] window
forecast_keys = ['forecast_date', 'category', 'forecast_method']
in_window = forecasts_daily_df[
    (forecasts_daily_df['start_date'] <= forecasts_daily_df['forecast_date']) &
    (forecasts_daily_df['end_date'] >= forecasts_daily_df['forecast_date'])
]
latest_idx = in_window.groupby(forecast_keys)['start_date'].idxmax()
forecast_lookup = (
    in_window.loc[latest_idx]
    .set_index(forecast_keys)['forecast_amount']
    .to_dict()
)

# Summed active user forecast per (value_date, category_id)
user_lookup = (
    user_forecasts_active
    .groupby(['value_date', 'category_id'])['amount']
    .sum()
    .to_dict()
)

daily_reconstruction = []

print(f"\nProcessing {len(date_range)} days x {len(categories)} categories = {len(date_range) * len(categories):,} records")
print("Using MOST RECENT forecast (latest start_date) for each date...")

for date in date_range:
    for category in categories:
        # Get actual amount for this date and category
        actual = actual_lookup.get((date, category), 0)

        # Make cash_out categories negative
        if category.startswith('cash_out_'):
            actual = -actual

        row = {
            'date': date,
            'category': category,
            # Normalises a negated zero (-0.0) back to plain 0
            'actual': actual if actual != 0 else 0
        }

        # For each forecast method, use the MOST RECENT forecast (latest start_date)
        for method in forecast_methods:
            row[f'forecast_{method}'] = forecast_lookup.get((date, category, method), 0)

        # Add user forecast if exists
        user_forecast = user_lookup.get((date, category), 0)
        row['forecast_user'] = user_forecast if user_forecast != 0 else 0

        daily_reconstruction.append(row)

daily_df = pd.DataFrame(daily_reconstruction)

print(f"\n✓ Reconstructed {len(daily_df):,} daily records")
print(f"  Date range: {daily_df['date'].min()} to {daily_df['date'].max()}")
print(f"  Categories: {daily_df['category'].nunique()}")
print(f"  Days with actual transactions: {(daily_df['actual'] != 0).sum():,}")
print(f"\nSample daily reconstruction:")
display(daily_df[daily_df['actual'] != 0].head(10))


**Code Cell Description:**

Exports the daily reconstruction data to CSV format (forecast_daily_reconstruction.csv) for use by subsequent analysis steps and potential external tools.


In [None]:
# FORECASTING ANALYSIS - STEP 5: Export Daily Reconstruction
#
# Writes `daily_df` to forecast_daily_reconstruction.csv with the date column
# formatted as YYYY-MM-DD strings.

print("=" * 80)
print("STEP 5: EXPORTING DAILY RECONSTRUCTION")
print("=" * 80)

# Export daily reconstruction to forecast_analysis directory
forecast_dir = Path("/Users/gianniskotsas/Documents/Side Projects/palm-case-study/scripts/datasets/forecast_analysis")
# to_csv does not create missing folders, so make sure the target exists first
forecast_dir.mkdir(parents=True, exist_ok=True)

daily_export = daily_df.copy()
daily_export['date'] = daily_export['date'].dt.strftime('%Y-%m-%d')

daily_path = forecast_dir / "forecast_daily_reconstruction.csv"
daily_export.to_csv(daily_path, index=False)

print(f"\n✓ Exported daily reconstruction: {daily_path}")
print(f"  Records: {len(daily_export):,}")
print(f"  Date range: {daily_export['date'].min()} to {daily_export['date'].max()}")
print(f"  Categories: {daily_export['category'].nunique()}")

print("\n" + "="*80)
print("DAILY RECONSTRUCTION EXPORTED")
print("="*80)

**Code Cell Description:**

Creates a unified forecast by selecting the best available system forecast (priority: ML > Statistical > Foundation > Static) and applies user overrides where the finance team has provided verified or unverified forecasts. Converts forecast values from cents to EUR.


In [None]:
# CREATE UNIFIED FORECAST DATASET
#
# Reloads the exported daily reconstruction and rescales every forecast column
# from cents to EUR. The `actual` column is left untouched.

print("=" * 80)
print("CREATING UNIFIED FORECAST DATASET")
print("=" * 80)

# Load the forecast daily reconstruction
forecast_dir = Path("/Users/gianniskotsas/Documents/Side Projects/palm-case-study/scripts/datasets/forecast_analysis")
recon_path = forecast_dir / "forecast_daily_reconstruction.csv"
daily_recon = pd.read_csv(recon_path, parse_dates=['date'])

print(f"\nLoaded {len(daily_recon):,} records from forecast_daily_reconstruction.csv")

# Convert forecast values from cents to EUR (divide by 100).
# forecast_historical is included so that no forecast column of daily_recon is
# left in cents next to EUR-valued ones.
forecast_columns = ['forecast_ml_model', 'forecast_statistical_model', 'forecast_foundation_model',
                   'forecast_historical', 'forecast_static', 'forecast_user']
daily_recon[forecast_columns] = daily_recon[forecast_columns] / 100

print("✓ Converted forecast values from cents to EUR")

# Unified forecast: first usable value in the order ML > Statistical > Foundation > Static
def get_unified_forecast(row):
    """Return the first forecast that is neither 0 nor missing, else 0.

    Args:
        row: mapping-like object (dict or pandas Series) indexed by the
            'forecast_*' column names.

    Returns:
        The selected forecast value, or 0 when no method supplies one.
    """
    priority = (
        'forecast_ml_model',
        'forecast_statistical_model',
        'forecast_foundation_model',
        'forecast_static',
    )
    values_in_priority = (row[column] for column in priority)
    return next(
        (value for value in values_in_priority if value != 0 and pd.notna(value)),
        0,
    )

daily_recon['unified_forecast'] = daily_recon.apply(get_unified_forecast, axis=1)

# Create unified forecast with user overrides
# Load user forecasts and filter for verified/unverified only
user_forecasts_override = user_forecasts[
    user_forecasts['status'].isin(['verified', 'unverified'])
].copy()
user_forecasts_override['value_date'] = pd.to_datetime(user_forecasts_override['value_date'])

print(f"User forecasts to override: {len(user_forecasts_override)}")

# Start with unified forecast, then override with user forecasts
daily_recon['unified_forecast_with_user'] = daily_recon['unified_forecast'].copy()

# Apply user overrides (convert from cents to EUR).
# The user amounts are summed once per (value_date, category_id) and looked up
# per row, instead of filtering the whole user-forecast frame for every row.
# A zero total means "no override", so those keys are dropped from the lookup.
user_totals = (
    user_forecasts_override.groupby(['value_date', 'category_id'])['amount'].sum() / 100
)
override_lookup = user_totals[user_totals != 0].to_dict()

overrides = pd.Series(
    [
        override_lookup.get(key)
        for key in zip(daily_recon['date'], daily_recon['category'])
    ],
    index=daily_recon.index,
    dtype='float64',
)
has_override = overrides.notna()
daily_recon.loc[has_override, 'unified_forecast_with_user'] = overrides[has_override]

# Create export dataset with selected columns
unified_export = daily_recon[['date', 'category', 'actual', 'unified_forecast', 'unified_forecast_with_user']].copy()
unified_export['date'] = unified_export['date'].dt.strftime('%Y-%m-%d')

# Export to forecast_analysis directory
unified_path = forecast_dir / "unified_forecast_daily.csv"
unified_export.to_csv(unified_path, index=False)

print(f"\n✓ Created unified forecast dataset: {unified_path}")
print(f"  Records: {len(unified_export):,}")
print(f"  Date range: {unified_export['date'].min()} to {unified_export['date'].max()}")
print(f"  Categories: {unified_export['category'].nunique()}")
print(f"  NOTE: All forecast values converted from cents to EUR (divided by 100)")
print(f"\nSample unified forecasts:")
display(unified_export[unified_export['actual'] != 0].head(10))


**Code Cell Description:**

Calculates monthly MAPE (Mean Absolute Percentage Error) metrics for both unified forecast types, enabling month-by-month accuracy comparison and identification of which months are most/least predictable.


In [None]:
# CALCULATE MONTHLY MAPE METRICS FOR UNIFIED FORECASTS
#
# For both unified forecast variants, computes the mean absolute percentage
# error per (month, category) and per month, exports the result to
# mape_metrics.csv and displays summary tables.

print("=" * 80)
print("CALCULATING MONTHLY MAPE METRICS (JAN 1 - AUG 31, 2025)")
print("=" * 80)
print("NOTE: Using EUR values (forecasts already converted from cents)\n")

# Keep Jan 1 - Aug 31, 2025 (both bounds inclusive) and tag each row with its month
in_period = (daily_recon['date'] >= '2025-01-01') & (daily_recon['date'] <= '2025-08-31')
daily_recon_filtered = daily_recon.loc[in_period].copy()
daily_recon_filtered['month'] = daily_recon_filtered['date'].dt.to_period('M')

print(f"\nFiltered to {len(daily_recon_filtered):,} records for Jan 1 - Aug 31, 2025")

# One result record per (forecast type, month, category)
mape_results = []

for forecast_type in ['unified_forecast', 'unified_forecast_with_user']:
    print(f"\n--- {forecast_type.replace('_', ' ').title()} ---")

    # Rows with a zero actual are excluded: the percentage error is undefined there
    data_with_actuals = daily_recon_filtered.loc[daily_recon_filtered['actual'] != 0].copy()

    # Absolute percentage error of this forecast against the actual
    deviation = abs(data_with_actuals['actual'] - data_with_actuals[forecast_type])
    data_with_actuals['abs_pct_error'] = (deviation / abs(data_with_actuals['actual'])) * 100

    # Per-month, per-category MAPE and number of data points
    monthly_category_mape = (
        data_with_actuals.groupby(['month', 'category'])
        .agg(mape=('abs_pct_error', 'mean'), count=('actual', 'count'))
        .reset_index()
    )
    monthly_category_mape['forecast_type'] = forecast_type

    # Per-month MAPE over all data points, each weighted equally
    monthly_total_mape = (
        data_with_actuals.groupby('month')['abs_pct_error']
        .mean()
        .rename('total_mape')
        .reset_index()
    )

    # Attach the monthly total to every category row of that month
    monthly_category_mape = monthly_category_mape.merge(monthly_total_mape, on='month')

    print(f"  Categories analyzed: {monthly_category_mape['category'].nunique()}")
    print(f"  Months analyzed: {monthly_category_mape['month'].nunique()}")
    print(f"  Data points: {len(data_with_actuals):,}")

    # Collect the records; the month period is stored as its string form
    mape_results.extend(
        {
            'month': str(record['month']),
            'category': record['category'],
            'forecast_type': forecast_type,
            'mape': record['mape'],
            'count': record['count'],
            'total_mape': record['total_mape'],
        }
        for _, record in monthly_category_mape.iterrows()
    )

# Assemble and export
mape_df = pd.DataFrame(mape_results)

mape_path = forecast_dir / "mape_metrics.csv"
mape_df.to_csv(mape_path, index=False)

print(f"\n✓ Exported monthly MAPE metrics: {mape_path}")
print(f"  Records: {len(mape_df)}")
print(f"\nMonthly MAPE Summary by Forecast Type:")
print("=" * 80)

# Summary per month and forecast type
monthly_summary = (
    mape_df.groupby(['month', 'forecast_type'])
    .agg({'mape': 'mean', 'total_mape': 'first', 'count': 'sum'})
    .reset_index()
)
monthly_summary.columns = ['Month', 'Forecast Type', 'Avg Category MAPE', 'Total MAPE', 'Total Data Points']
display(monthly_summary)

# Best / worst (month, category) cells for the plain unified forecast
ranking_columns = ['month', 'category', 'mape', 'count']

print(f"\nTop 5 Best Categories (Unified Forecast):")
best_unified = mape_df[mape_df['forecast_type'] == 'unified_forecast'].nsmallest(5, 'mape')[ranking_columns]
display(best_unified)

print(f"\nTop 5 Worst Categories (Unified Forecast):")
worst_unified = mape_df[mape_df['forecast_type'] == 'unified_forecast'].nlargest(5, 'mape')[ranking_columns]
display(worst_unified)


**Code Cell Description:**

Creates a detailed line chart showing daily resort revenue (actual in black) compared to the ML, statistical, foundation, static, and user forecasts (the historical method is not plotted). Helps visualize which methods track actual revenue most closely and where gaps occur.


In [None]:
# CHART: RESORT REVENUE - ACTUAL VS ALL FORECASTS
#
# Daily line chart of actual resort revenue against each forecast column.

print("=" * 80)
print("CREATING RESORT REVENUE CHART")
print("=" * 80)

# Resort revenue rows only, in date order
is_revenue = daily_recon['category'] == 'cash_in_resort_revenue'
revenue_data = daily_recon[is_revenue].copy().sort_values('date')

print(f"\nRevenue data points: {len(revenue_data)}")

# (column, legend label, line colour) for every forecast series
forecast_lines = [
    ('forecast_ml_model', 'ML Model', '#1f77b4'),
    ('forecast_statistical_model', 'Statistical Model', '#ff7f0e'),
    ('forecast_foundation_model', 'Foundation Model', '#2ca02c'),
    ('forecast_static', 'Static', '#d62728'),
    ('forecast_user', 'User Forecasts', '#9467bd')
]

fig = go.Figure()

# Actuals: thick black line
actual_trace = go.Scatter(
    x=revenue_data['date'],
    y=revenue_data['actual'],
    mode='lines',
    name='Actual',
    line={'color': 'black', 'width': 3},
    hovertemplate='%{y:,.2f}<extra></extra>',
)
fig.add_trace(actual_trace)

# Forecasts: zeros become NaN so the line shows a gap there; the user
# forecast is drawn dashed, all others solid
for col, name, color in forecast_lines:
    y_data = revenue_data[col].replace(0, np.nan)
    dash_style = 'dash' if 'user' in col.lower() else 'solid'

    fig.add_trace(go.Scatter(
        x=revenue_data['date'],
        y=y_data,
        mode='lines',
        name=name,
        line={'color': color, 'width': 2, 'dash': dash_style},
        hovertemplate='%{y:,.2f}<extra></extra>',
    ))

# Layout: horizontal legend above the plot
fig.update_layout(
    title='Resort Revenue: Actual vs All Forecasts (EUR)',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Amount (EUR)'},
    height=600,
    width=1200,
    hovermode='x unified',
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1.02,
        'xanchor': 'right',
        'x': 1,
    },
)

fig.show()

print("\n✓ Resort revenue chart displayed")
print("  NOTE: Forecast values displayed in EUR (converted from cents)")


**Code Cell Description:**

Creates a detailed line chart showing daily payroll expenses (actual in black) compared to the ML, statistical, foundation, static, and user forecasts (the historical method is not plotted). Reveals forecasting challenges for this significant expense category.


In [None]:
# CHART: PAYROLL EXPENSES - ACTUAL VS ALL FORECASTS
#
# Daily line chart of actual payroll expenses against each forecast column.

print("=" * 80)
print("CREATING PAYROLL EXPENSES CHART")
print("=" * 80)

# Payroll rows only, in date order
is_payroll = daily_recon['category'] == 'cash_out_payroll'
payroll_data = daily_recon[is_payroll].copy().sort_values('date')

print(f"\nPayroll data points: {len(payroll_data)}")

# (column, legend label, line colour) for every forecast series
forecast_lines = [
    ('forecast_ml_model', 'ML Model', '#1f77b4'),
    ('forecast_statistical_model', 'Statistical Model', '#ff7f0e'),
    ('forecast_foundation_model', 'Foundation Model', '#2ca02c'),
    ('forecast_static', 'Static', '#d62728'),
    ('forecast_user', 'User Forecasts', '#9467bd')
]

fig = go.Figure()

# Actuals: thick black line
actual_trace = go.Scatter(
    x=payroll_data['date'],
    y=payroll_data['actual'],
    mode='lines',
    name='Actual',
    line={'color': 'black', 'width': 3},
    hovertemplate='%{y:,.2f}<extra></extra>',
)
fig.add_trace(actual_trace)

# Forecasts: zeros become NaN so the line shows a gap there; the user
# forecast is drawn dashed, all others solid
for col, name, color in forecast_lines:
    y_data = payroll_data[col].replace(0, np.nan)
    dash_style = 'dash' if 'user' in col.lower() else 'solid'

    fig.add_trace(go.Scatter(
        x=payroll_data['date'],
        y=y_data,
        mode='lines',
        name=name,
        line={'color': color, 'width': 2, 'dash': dash_style},
        hovertemplate='%{y:,.2f}<extra></extra>',
    ))

# Layout: horizontal legend above the plot
fig.update_layout(
    title='Payroll Expenses: Actual vs All Forecasts (EUR)',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Amount (EUR)'},
    height=600,
    width=1200,
    hovermode='x unified',
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1.02,
        'xanchor': 'right',
        'x': 1,
    },
)

fig.show()

print("\n✓ Payroll expenses chart displayed")
print("  NOTE: Forecast values displayed in EUR (converted from cents)")


**Code Cell Description:**

Generates monthly aggregated chart comparing actual resort revenue to unified forecast and unified forecast with user overrides. Shows overall monthly accuracy at a higher level than daily comparisons.


In [None]:
# CHART: RESORT REVENUE - ACTUAL VS UNIFIED FORECASTS (MONTHLY)
#
# Sums daily resort revenue per calendar month and plots the actual total
# against both unified forecast variants.

print("=" * 80)
print("CREATING MONTHLY RESORT REVENUE UNIFIED FORECAST CHART")
print("=" * 80)

# Resort revenue rows in date order, tagged with their month
revenue_unified = daily_recon[daily_recon['category'] == 'cash_in_resort_revenue'].copy()
revenue_unified = revenue_unified.sort_values('date')
revenue_unified['month'] = revenue_unified['date'].dt.to_period('M')

# Monthly totals of the three plotted series
value_columns = ['actual', 'unified_forecast', 'unified_forecast_with_user']
monthly_revenue = revenue_unified.groupby('month')[value_columns].sum().reset_index()

# Axis label such as "Jan 2025"
monthly_revenue['month_display'] = monthly_revenue['month'].apply(lambda period: period.strftime('%b %Y'))

print(f"\nMonthly revenue data points: {len(monthly_revenue)}")

# (column, legend label, line style, marker size) in drawing order
series_specs = [
    ('actual', 'Actual', {'color': 'black', 'width': 3}, 8),
    ('unified_forecast', 'Unified Forecast', {'color': '#2196F3', 'width': 2, 'dash': 'dash'}, 6),
    ('unified_forecast_with_user', 'Unified Forecast + User', {'color': '#FF9800', 'width': 2, 'dash': 'dot'}, 6),
]

fig = go.Figure()
for column, label, line_style, marker_size in series_specs:
    fig.add_trace(go.Scatter(
        x=monthly_revenue['month_display'],
        y=monthly_revenue[column],
        mode='lines+markers',
        name=label,
        line=line_style,
        marker={'size': marker_size},
        hovertemplate='€%{y:,.2f}<extra></extra>',
    ))

# Layout: horizontal legend above the plot
fig.update_layout(
    title='Monthly Resort Revenue: Actual vs Unified Forecasts (EUR)',
    xaxis={'title': 'Month'},
    yaxis={'title': 'Amount (EUR)'},
    height=600,
    width=1200,
    hovermode='x unified',
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1.02,
        'xanchor': 'right',
        'x': 1,
    },
)

fig.show()

print("\n✓ Monthly resort revenue unified forecast chart displayed")


**Code Cell Description:**

Generates monthly aggregated chart comparing actual payroll expenses to unified forecast and unified forecast with user overrides. Enables assessment of monthly payroll predictability.


In [None]:
# CHART: PAYROLL EXPENSES - ACTUAL VS UNIFIED FORECASTS (MONTHLY)
#
# Sums daily payroll expenses per calendar month and plots the actual total
# against both unified forecast variants.

print("=" * 80)
print("CREATING MONTHLY PAYROLL EXPENSES UNIFIED FORECAST CHART")
print("=" * 80)

# Payroll rows in date order, tagged with their month
payroll_unified = daily_recon[daily_recon['category'] == 'cash_out_payroll'].copy()
payroll_unified = payroll_unified.sort_values('date')
payroll_unified['month'] = payroll_unified['date'].dt.to_period('M')

# Monthly totals of the three plotted series
value_columns = ['actual', 'unified_forecast', 'unified_forecast_with_user']
monthly_payroll = payroll_unified.groupby('month')[value_columns].sum().reset_index()

# Axis label such as "Jan 2025"
monthly_payroll['month_display'] = monthly_payroll['month'].apply(lambda period: period.strftime('%b %Y'))

print(f"\nMonthly payroll data points: {len(monthly_payroll)}")

# (column, legend label, line style, marker size) in drawing order
series_specs = [
    ('actual', 'Actual', {'color': 'black', 'width': 3}, 8),
    ('unified_forecast', 'Unified Forecast', {'color': '#2196F3', 'width': 2, 'dash': 'dash'}, 6),
    ('unified_forecast_with_user', 'Unified Forecast + User', {'color': '#FF9800', 'width': 2, 'dash': 'dot'}, 6),
]

fig = go.Figure()
for column, label, line_style, marker_size in series_specs:
    fig.add_trace(go.Scatter(
        x=monthly_payroll['month_display'],
        y=monthly_payroll[column],
        mode='lines+markers',
        name=label,
        line=line_style,
        marker={'size': marker_size},
        hovertemplate='€%{y:,.2f}<extra></extra>',
    ))

# Layout: horizontal legend above the plot
fig.update_layout(
    title='Monthly Payroll Expenses: Actual vs Unified Forecasts (EUR)',
    xaxis={'title': 'Month'},
    yaxis={'title': 'Amount (EUR)'},
    height=600,
    width=1200,
    hovermode='x unified',
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1.02,
        'xanchor': 'right',
        'x': 1,
    },
)

fig.show()

print("\n✓ Monthly payroll expenses unified forecast chart displayed")


**Code Cell Description:**

Creates comparison chart and statistical summary showing monthly MAPE for unified forecast vs unified forecast with user overrides. Quantifies whether user interventions improve or degrade forecast accuracy. Includes detailed monthly breakdowns and overall statistics.


In [None]:
# CHART: MONTHLY MAPE COMPARISON - UNIFIED FORECAST vs UNIFIED FORECAST WITH USER
#
# Reads mape_metrics.csv, averages the per-category MAPE per month for each
# forecast type, plots both series and prints per-month and overall statistics.

print("=" * 80)
print("CREATING MONTHLY MAPE COMPARISON CHART")
print("=" * 80)

# Load the monthly MAPE data
mape_df = pd.read_csv(forecast_dir / "mape_metrics.csv")

# Mean category MAPE per (month, forecast type) ...
monthly_mape_summary = mape_df.groupby(['month', 'forecast_type'])['mape'].mean().reset_index()

# ... reshaped so that each forecast type is a column
monthly_mape_pivot = monthly_mape_summary.pivot(
    index='month', columns='forecast_type', values='mape'
).reset_index()

# Axis label such as "Jan 2025"
monthly_mape_pivot['month_display'] = monthly_mape_pivot['month'].apply(
    lambda month_text: pd.Period(month_text).strftime('%b %Y')
)

print(f"\nMonthly MAPE data points: {len(monthly_mape_pivot)}")
print(f"Months: {monthly_mape_pivot['month_display'].tolist()}")

# (column, legend label, colour, hover template) for the two plotted series
mape_series = [
    (
        'unified_forecast',
        'Unified Forecast',
        '#2196F3',
        '%{x}<br>Unified Forecast MAPE: %{y:.2f}%<extra></extra>',
    ),
    (
        'unified_forecast_with_user',
        'Unified Forecast + User',
        '#FF9800',
        '%{x}<br>Unified Forecast + User MAPE: %{y:.2f}%<extra></extra>',
    ),
]

fig = go.Figure()
for column, label, color, hover in mape_series:
    fig.add_trace(go.Scatter(
        x=monthly_mape_pivot['month_display'],
        y=monthly_mape_pivot[column],
        mode='lines+markers',
        name=label,
        line={'color': color, 'width': 3},
        marker={'size': 8},
        hovertemplate=hover,
    ))

# Layout: horizontal legend above the plot
fig.update_layout(
    title='Monthly MAPE Comparison: Unified Forecast vs Unified Forecast + User',
    xaxis={'title': 'Month'},
    yaxis={'title': 'MAPE (%)'},
    height=600,
    width=1200,
    hovermode='x unified',
    legend={
        'orientation': 'h',
        'yanchor': 'bottom',
        'y': 1.02,
        'xanchor': 'right',
        'x': 1,
    },
)

fig.show()

print("\n✓ Monthly MAPE comparison chart displayed")

# Display summary statistics
print("\n" + "=" * 80)
print("MONTHLY MAPE SUMMARY STATISTICS")
print("=" * 80)

# Improvement = MAPE reduction achieved by the user overrides (positive is
# better), in percentage points and relative to the plain unified MAPE
base_mape = monthly_mape_pivot['unified_forecast']
user_mape = monthly_mape_pivot['unified_forecast_with_user']
monthly_mape_pivot['improvement'] = base_mape - user_mape
monthly_mape_pivot['improvement_pct'] = (monthly_mape_pivot['improvement'] / base_mape) * 100

print("\nMonthly MAPE Values:")
for idx, row in monthly_mape_pivot.iterrows():
    print(f"{row['month_display']}:")
    print(f"  Unified Forecast: {row['unified_forecast']:.2f}%")
    print(f"  Unified + User:   {row['unified_forecast_with_user']:.2f}%")
    print(f"  Improvement:      {row['improvement']:.2f}% ({row['improvement_pct']:.1f}%)")
    print()

# Overall statistics: unweighted means of the monthly values
avg_unified = monthly_mape_pivot['unified_forecast'].mean()
avg_unified_user = monthly_mape_pivot['unified_forecast_with_user'].mean()
overall_improvement = avg_unified - avg_unified_user
overall_improvement_pct = (overall_improvement / avg_unified) * 100

print(f"Overall Average MAPE:")
print(f"  Unified Forecast: {avg_unified:.2f}%")
print(f"  Unified + User:   {avg_unified_user:.2f}%")
print(f"  Average Improvement: {overall_improvement:.2f}% ({overall_improvement_pct:.1f}%)")

# Best and worst months, judged on the forecast that includes user overrides
best_month_idx = monthly_mape_pivot['unified_forecast_with_user'].idxmin()
worst_month_idx = monthly_mape_pivot['unified_forecast_with_user'].idxmax()

print(f"\nBest Month (Lowest MAPE): {monthly_mape_pivot.loc[best_month_idx, 'month_display']} ({monthly_mape_pivot.loc[best_month_idx, 'unified_forecast_with_user']:.2f}%)")
print(f"Worst Month (Highest MAPE): {monthly_mape_pivot.loc[worst_month_idx, 'month_display']} ({monthly_mape_pivot.loc[worst_month_idx, 'unified_forecast_with_user']:.2f}%)")


## Daily Forecast Comparison by Category



**Code Cell Description:**

Loads the previously generated forecast reconstruction data to prepare for category-by-category visualization.


In [None]:
# Load forecast reconstruction data.
# This re-reads the CSV written by the export step, so the forecast columns in
# `forecast_df` are the raw exported values (not the /100-converted ones held
# in `daily_recon`).
forecast_recon_path = Path("/Users/gianniskotsas/Documents/Side Projects/palm-case-study/scripts/datasets/forecast_analysis/forecast_daily_reconstruction.csv")
forecast_df = pd.read_csv(forecast_recon_path, parse_dates=['date'])

# Get unique categories (rebinds the notebook-level `categories` name)
categories = forecast_df['category'].unique()
print(f"Categories found: {len(categories)}")
print(categories)


**Code Cell Description:**

Generates individual line charts for each transaction category showing actual values vs all forecast methods. Provides detailed visual analysis of forecast performance per category with color-coded forecast lines.


In [None]:
# Generate line charts for each category: one figure per category with the
# actual series (solid) and every forecast column (dotted, divided by 100)
forecast_columns = [
    'forecast_ml_model',
    'forecast_statistical_model',
    'forecast_foundation_model',
    'forecast_historical',
    'forecast_static',
    'forecast_user'
]

# Line colour per forecast column
forecast_colors = {
    'forecast_ml_model': '#FF6B6B',
    'forecast_statistical_model': '#4ECDC4',
    'forecast_foundation_model': '#45B7D1',
    'forecast_historical': '#FFA07A',
    'forecast_static': '#98D8C8',
    'forecast_user': '#C7CEEA'
}

for category in categories:
    # Rows of this category in date order
    category_df = forecast_df[forecast_df['category'] == category].sort_values('date')
    dates = category_df['date']

    fig = go.Figure()

    # Actual values: solid dark line
    fig.add_trace(go.Scatter(
        x=dates,
        y=category_df['actual'],
        mode='lines',
        name='Actual',
        line={'color': '#2C3E50', 'width': 2, 'dash': 'solid'},
        hovertemplate='<b>Actual</b><br>Date: %{x}<br>Value: €%{y:,.2f}<extra></extra>',
    ))

    # Forecasts: dotted lines; the legend name is derived from the column name
    for forecast_col in forecast_columns:
        forecast_name = forecast_col.replace('forecast_', '').replace('_', ' ').title()
        scaled_values = category_df[forecast_col]/100
        fig.add_trace(go.Scatter(
            x=dates,
            y=scaled_values,
            mode='lines',
            name=forecast_name,
            line={'color': forecast_colors[forecast_col], 'width': 1.5, 'dash': 'dot'},
            hovertemplate=f'<b>{forecast_name}</b><br>Date: %{{x}}<br>Value: €%{{y:,.2f}}<extra></extra>',
        ))

    # Layout: vertical legend to the right of the plot, with margin reserved for it
    category_title = category.replace('_', ' ').title()
    fig.update_layout(
        title={
            'text': f'Daily Forecast vs Actual: {category_title}',
            'font': {'size': 16, 'color': '#2C3E50'},
        },
        xaxis_title='Date',
        yaxis_title='Amount (€)',
        hovermode='x unified',
        legend={
            'orientation': 'v',
            'yanchor': 'top',
            'y': 1,
            'xanchor': 'left',
            'x': 1.02,
        },
        width=1200,
        height=500,
        margin={'r': 200},
    )

    fig.show()


## Data Quality Assessment



**Code Cell Description:**

Comprehensive data quality analysis that generates JSON files for four key areas: 1) Balance Reconciliation (transaction vs snapshot comparison), 2) User Forecast Quality (status breakdown and verification rates), 3) System Forecast Coverage (gaps by method and category), 4) Transaction Data Quality (completeness and anomalies). All outputs feed the data quality dashboard section.


In [None]:
# DATA QUALITY ANALYSIS - Generate JSON files for frontend

import json
from pathlib import Path

# Output location: a 'data_quality' folder that sits next to the raw datasets
# directory. Creating it is a no-op when the folder already exists.
output_dir = base_dir.parent / 'data_quality'
output_dir.mkdir(exist_ok=True)

# Section banner
print("=" * 80, "DATA QUALITY ANALYSIS", "=" * 80, sep="\n")

# The system forecast coverage section reads forecasts_daily_df; report up
# front whether that DataFrame is available in this session.
if 'forecasts_daily_df' in locals():
    print(f"✓ Using forecasts_daily_df with {len(forecasts_daily_df):,} daily forecast records")
else:
    print("⚠️  Warning: forecasts_daily_df not found. Please run the forecasting analysis cells first.")
    print("   This analysis requires the parsed system forecasts from earlier cells.")

# ============================================================================
# 1. BALANCE RECONCILIATION ANALYSIS
# ============================================================================
# For each calendar month this section compares the last balance snapshot with
# the balance expected from the first snapshot plus the month's net signed
# transaction flow, then writes the comparison to JSON.
print("\n1. BALANCE RECONCILIATION ANALYSIS")
print("-" * 80)

# Prepare balances data
bal = balances.copy()
bal['date'] = pd.to_datetime(bal['balance_date'])
# 'first'/'last' below are positional, so order the snapshots chronologically
# first. mergesort is stable: rows sharing a date keep their original order.
bal = bal.sort_values('date', kind='mergesort')
bal['month'] = bal['date'].dt.to_period('M')

# Get first and last balance per month from snapshots
monthly_bal = bal.groupby('month').agg({
    'amount': ['first', 'last'],
    'date': ['min', 'max']
}).reset_index()
monthly_bal.columns = ['month', 'start_balance_snapshot', 'end_balance_snapshot', 'start_date', 'end_date']

# Calculate expected balances based on transactions
tx_calc = transactions.copy()
tx_calc['date'] = pd.to_datetime(tx_calc[trx_date_col])
tx_calc['month'] = tx_calc['date'].dt.to_period('M')
# Credits ('CRDT') keep their amount; every other direction value is negated.
tx_calc['amount_signed'] = tx_calc['amount'].where(
    tx_calc['credit_or_debit'] == 'CRDT', -tx_calc['amount']
)

# Calculate monthly net change from transactions
monthly_tx_change = tx_calc.groupby('month')['amount_signed'].sum().reset_index()
monthly_tx_change.columns = ['month', 'tx_net_change']

# Merge with balance data; months without any transaction get a net change of 0
balance_comparison = monthly_bal.merge(monthly_tx_change, on='month', how='left')
balance_comparison['tx_net_change'] = balance_comparison['tx_net_change'].fillna(0)

# Calculate expected end balance and deviation
balance_comparison['expected_end_balance'] = balance_comparison['start_balance_snapshot'] + balance_comparison['tx_net_change']
balance_comparison['deviation'] = balance_comparison['end_balance_snapshot'] - balance_comparison['expected_end_balance']
# A zero end balance has no meaningful percentage; mask it so the result is
# missing rather than +/-inf.
end_balance = balance_comparison['end_balance_snapshot']
balance_comparison['deviation_pct'] = (
    balance_comparison['deviation'] / end_balance.where(end_balance != 0) * 100
).round(2)
balance_comparison['month_str'] = balance_comparison['month'].astype(str)

# Find largest discrepancy (by absolute deviation)
largest_discrepancy_idx = balance_comparison['deviation'].abs().idxmax()
largest_discrepancy = balance_comparison.loc[largest_discrepancy_idx]
largest_deviation_pct = largest_discrepancy['deviation_pct']

# json.dump writes missing values as the bare token NaN, which is not valid
# JSON for strict parsers, so emit them as null instead.
report_columns = [
    'month_str', 'start_balance_snapshot', 'end_balance_snapshot',
    'expected_end_balance', 'deviation', 'deviation_pct'
]
monthly_deviation_records = [
    {key: (None if pd.isna(value) else value) for key, value in record.items()}
    for record in balance_comparison[report_columns].to_dict('records')
]

balance_reconciliation_data = {
    "monthly_deviations": monthly_deviation_records,
    "total_deviation": float(balance_comparison['deviation'].sum()),
    "avg_deviation": float(balance_comparison['deviation'].mean()),
    "largest_discrepancy": {
        "month": str(largest_discrepancy['month_str']),
        "deviation": float(largest_discrepancy['deviation']),
        "deviation_pct": None if pd.isna(largest_deviation_pct) else float(largest_deviation_pct),
        "expected": float(largest_discrepancy['expected_end_balance']),
        "actual": float(largest_discrepancy['end_balance_snapshot'])
    },
    "months_analyzed": len(balance_comparison)
}

# Save to JSON
with open(output_dir / 'data_quality_balance_reconciliation.json', 'w') as f:
    json.dump(balance_reconciliation_data, f, indent=2)

print(f"✓ Total Deviation: €{balance_reconciliation_data['total_deviation']:,.2f}")
print(f"✓ Average Deviation: €{balance_reconciliation_data['avg_deviation']:,.2f}")
print(f"✓ Largest Discrepancy: {largest_discrepancy['month_str']} (€{largest_discrepancy['deviation']:,.2f})")

# ============================================================================
# 2. USER FORECAST QUALITY ANALYSIS
# ============================================================================
# Summarises user_forecasts by status, source and category, counts missing or
# zero amounts, and writes the summary to JSON.
print("\n2. USER FORECAST QUALITY ANALYSIS")
print("-" * 80)

# Status breakdown
status_counts = user_forecasts['status'].value_counts().to_dict()
total_user_forecasts = len(user_forecasts)
status_percentages = {k: round(v / total_user_forecasts * 100, 2) for k, v in status_counts.items()}

# Forecasts with missing/zero amounts
missing_amounts = int(user_forecasts['amount'].isna().sum())
zero_amounts = int((user_forecasts['amount'] == 0).sum())

# By forecast source
source_counts = user_forecasts['forecast_source'].value_counts().to_dict()

# By category (10 most frequent)
category_counts = user_forecasts['category_id'].value_counts().head(10).to_dict()

# Quality score = share of forecasts whose status is 'verified'. With no
# forecasts at all there is nothing to divide by, so report 0.0 instead of
# raising ZeroDivisionError.
verified_count = status_counts.get('verified', 0)
if total_user_forecasts:
    user_forecast_quality_score = round((verified_count / total_user_forecasts) * 100, 1)
else:
    user_forecast_quality_score = 0.0

user_forecast_data = {
    "total_forecasts": total_user_forecasts,
    "status_breakdown": {
        "counts": status_counts,
        "percentages": status_percentages
    },
    "missing_data": {
        "missing_amounts": int(missing_amounts),
        "zero_amounts": int(zero_amounts),
        "total_problematic": int(missing_amounts + zero_amounts)
    },
    "forecast_sources": source_counts,
    "top_categories": category_counts,
    "quality_score": user_forecast_quality_score
}

# Save to JSON
with open(output_dir / 'data_quality_user_forecasts.json', 'w') as f:
    json.dump(user_forecast_data, f, indent=2)

print(f"✓ Total User Forecasts: {total_user_forecasts:,}")
print(f"✓ Verified: {status_counts.get('verified', 0)} ({status_percentages.get('verified', 0)}%)")
print(f"✓ Dismissed: {status_counts.get('dismissed', 0)} ({status_percentages.get('dismissed', 0)}%)")
print(f"✓ Cancelled: {status_counts.get('cancelled', 0)} ({status_percentages.get('cancelled', 0)}%)")
print(f"✓ Quality Score: {user_forecast_data['quality_score']}%")

# ============================================================================
# 3. SYSTEM FORECAST COVERAGE ANALYSIS
# ============================================================================
# For every forecast method and transaction category, counts the distinct days
# inside the coverage window that have a forecast, and writes the result to
# JSON. When forecasts_daily_df is unavailable an empty placeholder is written.
print("\n3. SYSTEM FORECAST COVERAGE ANALYSIS")
print("-" * 80)

# Coverage window, defined once so the placeholder, the computation, the JSON
# payload and the printed summary cannot drift apart.
coverage_start = '2025-01-01'
coverage_end = '2025-08-31'
date_range = pd.date_range(coverage_start, coverage_end, freq='D')
total_days = len(date_range)

# Check if forecasts_daily_df exists
if 'forecasts_daily_df' not in locals():
    print("⚠️  Skipping system forecast coverage - forecasts_daily_df not found")
    print("   Please run the forecasting analysis cells first")

    # Create empty placeholder data
    system_forecast_coverage_data = {
        "coverage_by_method": {},
        "overall_coverage": {},
        "gaps_by_category": {},
        "date_range": {"start": coverage_start, "end": coverage_end, "total_days": int(total_days)},
        "categories_analyzed": 0,
        "methods_analyzed": 0
    }
else:
    # Get unique categories from transactions. Missing categories are dropped:
    # they can never match a forecast row and would end up as a NaN JSON key.
    categories = transactions['category'].dropna().unique()
    forecast_methods = ['ml_model', 'statistical_model', 'foundation_model', 'static']

    coverage_by_method = {}
    gaps_by_category = {}

    # Work on a copy with parsed dates so the comparison against the Timestamp
    # window bounds also works if forecast_date was left as text.
    forecasts_for_coverage = forecasts_daily_df[['forecast_method', 'category', 'forecast_date']].copy()
    forecasts_for_coverage['forecast_date'] = pd.to_datetime(forecasts_for_coverage['forecast_date'])

    for method in forecast_methods:
        method_forecasts = forecasts_for_coverage[forecasts_for_coverage['forecast_method'] == method]

        # For each category, calculate coverage
        category_coverage = {}
        for category in categories:
            category_forecasts = method_forecasts[method_forecasts['category'] == category]

            # Count how many distinct days have forecasts (within our date range)
            forecast_dates = category_forecasts['forecast_date']
            forecast_dates_in_range = forecast_dates[(forecast_dates >= date_range[0]) & (forecast_dates <= date_range[-1])]
            coverage_days = len(forecast_dates_in_range.unique())
            coverage_pct = (coverage_days / total_days) * 100

            category_coverage[category] = {
                "covered_days": int(coverage_days),
                "total_days": int(total_days),
                "coverage_pct": round(coverage_pct, 2),
                "missing_days": int(total_days - coverage_days)
            }

            # Track gaps (missing days) per category and method
            gaps_by_category.setdefault(category, {})[method] = int(total_days - coverage_days)

        coverage_by_method[method] = category_coverage

    # Calculate overall coverage per method
    overall_coverage = {}
    total_possible = len(categories) * total_days
    for method in forecast_methods:
        total_covered = sum(coverage_by_method[method][cat]['covered_days'] for cat in categories)
        overall_coverage[method] = {
            # No categories means nothing to cover; avoid dividing by zero.
            "coverage_pct": round((total_covered / total_possible) * 100, 2) if total_possible else 0.0,
            "covered": int(total_covered),
            "total": int(total_possible)
        }

    system_forecast_coverage_data = {
        "coverage_by_method": coverage_by_method,
        "overall_coverage": overall_coverage,
        "gaps_by_category": gaps_by_category,
        "date_range": {
            "start": coverage_start,
            "end": coverage_end,
            "total_days": int(total_days)
        },
        "categories_analyzed": len(categories),
        "methods_analyzed": len(forecast_methods)
    }

    print(f"✓ Categories Analyzed: {len(categories)}")
    print(f"✓ Date Range: {coverage_start} to {coverage_end} ({total_days} days)")
    for method, cov in overall_coverage.items():
        print(f"  - {method}: {cov['coverage_pct']}% coverage")

# Save to JSON
with open(output_dir / 'data_quality_system_forecast_gaps.json', 'w') as f:
    json.dump(system_forecast_coverage_data, f, indent=2)

# ============================================================================
# 4. TRANSACTION DATA QUALITY
# ============================================================================
# Collects basic completeness figures for the transactions table (date span,
# per-category counts, potential duplicates, missing fields) and writes them
# to JSON.
print("\n4. TRANSACTION DATA QUALITY")
print("-" * 80)

# Date range. Parse the column the same way the balance reconciliation section
# does, so .date() below works even if the column has not been converted yet.
tx_dates = pd.to_datetime(transactions[trx_date_col])
tx_date_min = tx_dates.min()
tx_date_max = tx_dates.max()
total_transactions = len(transactions)

# Count by category (the column is optional, as the missing-field check below
# already assumes)
has_category = 'category' in transactions.columns
transactions_by_category = transactions['category'].value_counts().to_dict() if has_category else {}

# Potential duplicates: same date and amount, plus bank_reference when present.
# keep=False counts every member of a duplicate group.
dup_cols = [trx_date_col, 'amount']
if 'bank_reference' in transactions.columns:
    dup_cols.append('bank_reference')
potential_duplicates = transactions.duplicated(subset=dup_cols, keep=False).sum()

# Missing critical fields
missing_dates = transactions[trx_date_col].isna().sum()
missing_amounts = transactions['amount'].isna().sum()
missing_categories = transactions['category'].isna().sum() if has_category else 0

# Credit/Debit distribution
credit_debit_counts = transactions['credit_or_debit'].value_counts().to_dict()

# NOTE(review): the score subtracts duplicates, missing dates and missing
# amounts but not missing categories, although total_issues counts all four.
# Left unchanged here; confirm which definition the dashboard expects.
if total_transactions:
    transaction_quality_score = round(((total_transactions - potential_duplicates - missing_dates - missing_amounts) / total_transactions) * 100, 1)
else:
    # Nothing to score; avoid dividing by zero.
    transaction_quality_score = 0.0

transaction_quality_data = {
    "total_transactions": int(total_transactions),
    "date_range": {
        "min_date": str(tx_date_min.date()),
        "max_date": str(tx_date_max.date()),
        "days_span": int((tx_date_max - tx_date_min).days)
    },
    "transactions_by_category": transactions_by_category,
    "credit_debit_distribution": credit_debit_counts,
    "data_quality_flags": {
        "potential_duplicates": int(potential_duplicates),
        "missing_dates": int(missing_dates),
        "missing_amounts": int(missing_amounts),
        "missing_categories": int(missing_categories),
        "total_issues": int(potential_duplicates + missing_dates + missing_amounts + missing_categories)
    },
    "quality_score": transaction_quality_score
}

# Save to JSON
with open(output_dir / 'data_quality_transactions.json', 'w') as f:
    json.dump(transaction_quality_data, f, indent=2)

print(f"✓ Total Transactions: {total_transactions:,}")
print(f"✓ Date Range: {tx_date_min.date()} to {tx_date_max.date()}")
print(f"✓ Potential Duplicates: {potential_duplicates}")
print(f"✓ Missing Data: {missing_dates + missing_amounts + missing_categories} records")
print(f"✓ Quality Score: {transaction_quality_data['quality_score']}%")

# Closing summary: list the generated files and where they were written.
print("\n" + "=" * 80)
print("✅ ALL DATA QUALITY FILES GENERATED SUCCESSFULLY")
print("=" * 80)

# Prefer a short path relative to the parent of the working directory.
# relative_to raises ValueError when output_dir is not located under that
# directory; fall back to the full path so a cosmetic message cannot fail the
# cell after every file has already been written.
try:
    saved_location = output_dir.relative_to(Path.cwd().parent)
except ValueError:
    saved_location = output_dir
print(f"\nFiles saved to: {saved_location}")
print("  - data_quality_balance_reconciliation.json")
print("  - data_quality_user_forecasts.json")
print("  - data_quality_system_forecast_gaps.json")
print("  - data_quality_transactions.json")
print("\n💡 Next steps:")
print("  1. Files are ready for the Next.js frontend")
print("  2. API routes will read from: scripts/datasets/data_quality/")
print("  3. Refresh your browser to see the data quality section populated")


### Balance Reconciliation



**Code Cell Description:**

Creates a bar chart showing monthly balance deviations between actual balance snapshots and expected balances calculated from transactions. Identifies months with the largest discrepancies indicating potential data issues.


In [None]:
# CHART 4: Monthly Balance Deviations
# Bar chart of (last snapshot of the month) minus (first snapshot of the month
# plus the month's net signed transaction flow), one bar per month.

# Confirm which date column is being used
print(f"Using date column: {trx_date_col}")

# Prepare balances data - use loaded balances
bal = balances.copy()
bal['date'] = pd.to_datetime(bal['balance_date'])
# 'first'/'last' below are positional, so order the snapshots chronologically
# first. mergesort is stable: rows sharing a date keep their original order.
bal = bal.sort_values('date', kind='mergesort')
bal['month'] = bal['date'].dt.to_period('M')

# Get first and last balance per month from snapshots
monthly_bal = bal.groupby('month').agg({
    'amount': ['first', 'last'],
    'date': ['min', 'max']
}).reset_index()
monthly_bal.columns = ['month', 'start_balance_snapshot', 'end_balance_snapshot', 'start_date', 'end_date']

# Calculate expected balances based on transactions - use loaded transactions
tx_calc = transactions.copy()
tx_calc['date'] = pd.to_datetime(tx_calc[trx_date_col])
tx_calc['month'] = tx_calc['date'].dt.to_period('M')
# Credits ('CRDT') keep their amount; every other direction value is negated.
tx_calc['amount_signed'] = tx_calc['amount'].where(
    tx_calc['credit_or_debit'] == 'CRDT', -tx_calc['amount']
)

# Calculate monthly net change from transactions
monthly_tx_change = tx_calc.groupby('month')['amount_signed'].sum().reset_index()
monthly_tx_change.columns = ['month', 'tx_net_change']

# Merge with balance data; months without any transaction get a net change of 0
balance_comparison = monthly_bal.merge(monthly_tx_change, on='month', how='left')
balance_comparison['tx_net_change'] = balance_comparison['tx_net_change'].fillna(0)

# Calculate expected end balance
balance_comparison['expected_end_balance'] = balance_comparison['start_balance_snapshot'] + balance_comparison['tx_net_change']

# Calculate deviation
balance_comparison['deviation'] = balance_comparison['end_balance_snapshot'] - balance_comparison['expected_end_balance']
balance_comparison['month_str'] = balance_comparison['month'].astype(str)

# Create chart - only showing deviation
fig4 = go.Figure()

# Add deviation bars only (red below zero, green otherwise)
fig4.add_trace(go.Bar(
    x=balance_comparison['month_str'],
    y=balance_comparison['deviation'],
    name='Deviation',
    marker_color=['red' if x < 0 else 'green' for x in balance_comparison['deviation']],
    hovertemplate='%{x}<br>Deviation: €%{y:,.2f}<extra></extra>'
))

# Add a zero reference line
fig4.add_hline(y=0, line_dash="dash", line_color="gray", annotation_text="Zero Deviation")

fig4.update_layout(
    title='Monthly Balance Deviations: Actual vs Expected (from Transactions)',
    xaxis=dict(title='Month'),
    yaxis=dict(title='Deviation (EUR)', zeroline=True),
    hovermode='x unified',
    height=600,
    width=1200,
    showlegend=False
)

fig4.show()

# Display deviation summary
print("\nBalance Deviation Summary:")
print(balance_comparison[['month_str', 'start_balance_snapshot', 'tx_net_change', 
                           'expected_end_balance', 'end_balance_snapshot', 'deviation']])


**Code Cell Description:**

Performs comprehensive data exploration including: column analysis, missing value assessment, date range validation, duplicate detection, sign consistency checks, and overall data quality scoring. Provides foundation for understanding data reliability.


In [None]:
# CELL 2: DATA EXPLORATION & QUALITY ASSESSMENT
from typing import Optional

if transactions is None:
    display("Transactions dataset not found. Please ensure 'transactions.csv' exists under datasets/raw.")
else:
    # Helpers to detect amount column and coerce numerics
    AMOUNT_CANDIDATES = [
        'amount', 'booking_amount', 'transaction_amount', 'value', 'amt'
    ]

    def pick_amount_column(df: pd.DataFrame) -> Optional[str]:
        """Return the name of the amount column in df, or None if none is found.

        Exact matches from AMOUNT_CANDIDATES win, in list order; otherwise the
        first column whose name contains 'amount' or 'value' is returned.
        """
        for c in AMOUNT_CANDIDATES:
            if c in df.columns:
                return c
        # fallback: first column whose name looks like an amount (dtype is not checked)
        for c in df.columns:
            if 'amount' in c.lower() or 'value' in c.lower():
                return c
        return None

    def coerce_amount(series: pd.Series) -> pd.Series:
        """Convert series to numeric values; unparsable entries become NaN.

        Text values may use a decimal comma and contain spaces.
        """
        if series.dtype == 'object' or pd.api.types.is_string_dtype(series):
            # Cast to str first: on an object column that mixes text and real
            # numbers, the .str accessor returns NaN for the non-text entries,
            # which would silently discard valid amounts.
            as_text = series.astype(str)
            cleaned = as_text.str.replace(',', '.', regex=False).str.replace(' ', '', regex=False)
            return pd.to_numeric(cleaned, errors='coerce')
        return pd.to_numeric(series, errors='coerce')

    amount_col = pick_amount_column(transactions)
    if amount_col is None:
        # No amount-like column exists: create a zero-filled 'amount' column so
        # the statistics below still run.
        amount_col = 'amount'
        transactions[amount_col] = pd.to_numeric(0)

    # Coerce date and amount columns
    if trx_date_col is not None:
        transactions[trx_date_col] = pd.to_datetime(transactions[trx_date_col], errors='coerce')
    transactions[amount_col] = coerce_amount(transactions[amount_col])

    # Preview head/tail
    display(transactions.head(5))
    display(transactions.tail(5))

    # Column names and dtypes
    display(pd.DataFrame({'dtype': transactions.dtypes.astype(str)}))

    # Missingness
    missing_counts = transactions.isna().sum().sort_values(ascending=False)
    missing_pct = (transactions.isna().mean() * 100).round(2)
    dq_missing = pd.concat([missing_counts.rename('missing_count'), missing_pct.rename('missing_%')], axis=1)
    display(dq_missing)

    # Date range
    date_min = transactions[trx_date_col].min() if trx_date_col else None
    date_max = transactions[trx_date_col].max() if trx_date_col else None
    print({'date_min': date_min, 'date_max': date_max})

    # Unique transaction categories/types if present
    category_like_cols = [c for c in transactions.columns if 'category' in c.lower() or 'type' in c.lower()]
    unique_info = {}
    for c in category_like_cols:
        nun = transactions[c].nunique(dropna=True)
        sample_vals = transactions[c].dropna().astype(str).value_counts().head(10)
        unique_info[c] = {'nunique': nun, 'top_values': sample_vals.to_dict()}
    display(unique_info)

    # Basic amount statistics
    amount_stats = transactions[amount_col].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9]).to_frame('amount_stats')
    display(amount_stats)

    # Obvious data quality issues
    # 1) Duplicates (same date and amount, plus bank_reference when present)
    dup_cols = [trx_date_col, amount_col, 'bank_reference'] if 'bank_reference' in transactions.columns else [trx_date_col, amount_col]
    potential_dups = transactions.duplicated(subset=[c for c in dup_cols if c is not None], keep=False).sum()

    # 2) Non-parsable dates or amounts (NaN/NaT after the coercion above)
    non_parsable_dates = transactions[trx_date_col].isna().sum() if trx_date_col else None
    non_parsable_amounts = transactions[amount_col].isna().sum()

    # 3) Sign consistency vs credit_or_debit
    sign_inconsistencies = None
    if 'credit_or_debit' in transactions.columns:
        cod = transactions['credit_or_debit'].astype(str).str.upper()
        signed_amounts = transactions[amount_col]
        # The check is only meaningful when amounts carry a sign. If no amount
        # is negative the direction lives solely in credit_or_debit, and every
        # debit would be reported as inconsistent; leave the metric as None.
        if (signed_amounts < 0).any():
            sign = np.sign(signed_amounts.fillna(0))
            credit_with_negative = (cod == 'CRDT') & (sign < 0)
            debit_with_positive = cod.str.startswith(('DB', 'DR')) & (sign > 0)
            sign_inconsistencies = (credit_with_negative | debit_with_positive).sum()

    dq_summary = pd.DataFrame([
        {'metric': 'rows', 'value': len(transactions)},
        {'metric': 'columns', 'value': transactions.shape[1]},
        {'metric': 'date_min', 'value': date_min},
        {'metric': 'date_max', 'value': date_max},
        {'metric': 'potential_duplicates', 'value': int(potential_dups)},
        {'metric': 'non_parsable_dates', 'value': None if non_parsable_dates is None else int(non_parsable_dates)},
        {'metric': 'non_parsable_amounts', 'value': int(non_parsable_amounts)},
        {'metric': 'sign_inconsistencies', 'value': None if sign_inconsistencies is None else int(sign_inconsistencies)},
    ])
    display(dq_summary)

    # Balances presence and date coverage
    if balances is not None:
        bmin = balances['balance_date'].min() if 'balance_date' in balances.columns else None
        bmax = balances['balance_date'].max() if 'balance_date' in balances.columns else None
        print({'balances_date_min': bmin, 'balances_date_max': bmax, 'balances_rows': len(balances)})

    # Forecast datasets presence
    print({'system_forecasts_present': system_forecasts is not None, 'user_forecasts_present': user_forecasts is not None})



### Data Quality: Category Direction & Outliers



**Code Cell Description:**

Comprehensive outlier analysis using two methods: 1) IQR (Interquartile Range) method and 2) Z-Score method. Identifies transactions that deviate significantly from normal patterns by category, validates transaction sign consistency (cash_in vs cash_out), and generates visualizations including box plots, scatter plots, and distribution histograms.


In [None]:
# CELL: DATA QUALITY & OUTLIER ANALYSIS

print("="*80)
print("DATA QUALITY ANALYSIS: CATEGORY DIRECTION VALIDATION")
print("="*80)

# Derive a 'category' column from the remittance text when the data has none.
if 'category' not in transactions.columns:
    print("\n⚠️  'category' column not found - creating it now...")

    def categorize_transaction(row):
        """Map one transaction row to a category label.

        The lower-cased 'remittence_info' text is matched against an ordered
        rule list; the first rule whose keyword occurs in the text (and whose
        credit/debit requirement, if any, equals 'credit_or_debit') wins.
        Rows matching no rule are labelled 'other'.
        """
        # (keywords, required credit_or_debit value or None, resulting label)
        rules = (
            (('resort revenue',), None, 'cash_in_resort_revenue'),
            (('payroll', 'salary'), None, 'cash_out_payroll'),
            (('tax payment',), None, 'cash_out_tax_payments'),
            (('insurance',), None, 'cash_out_insurance_costs'),
            (('maintenance',), None, 'cash_out_resort_maintenance_expenses'),
            (('investment',), 'DBIT', 'cash_out_investments_outflow'),
            (('investment',), 'CRDT', 'cash_in_investments_income'),
            (('foreign exchange', 'fx fee'), None, 'cash_out_foreign_exchange_expenses'),
            (('tax',), 'CRDT', 'cash_in_tax_income'),
        )
        text = str(row.get('remittence_info', '')).lower()
        direction = row.get('credit_or_debit', '')

        for keywords, required_direction, label in rules:
            if required_direction is not None and direction != required_direction:
                continue
            if any(keyword in text for keyword in keywords):
                return label
        return 'other'

    transactions['category'] = transactions.apply(categorize_transaction, axis=1)
    print("✓ Category column created")

# Get unique categories and split them by direction prefix. Only string labels
# are considered: a missing category (NaN) is a float and has no startswith().
all_categories_unique = transactions['category'].unique()
cash_in_categories = [cat for cat in all_categories_unique if isinstance(cat, str) and cat.startswith('cash_in_')]
cash_out_categories = [cat for cat in all_categories_unique if isinstance(cat, str) and cat.startswith('cash_out_')]

# Row count per category, computed once instead of one full scan per category
category_sizes = transactions['category'].value_counts()

print(f"\nFound {len(all_categories_unique)} unique transaction categories:")
print(f"  Cash IN categories: {len(cash_in_categories)}")
for cat in cash_in_categories:
    print(f"    - {cat}: {category_sizes[cat]} transactions")
print(f"\n  Cash OUT categories: {len(cash_out_categories)}")
for cat in cash_out_categories:
    print(f"    - {cat}: {category_sizes[cat]} transactions")

# Check for negative values in cash_in categories.
# Builds cash_in_issues: one dict per cash_in category containing at least one
# negative amount, and prints up to five sample rows for each.
print("\n" + "="*80)
print("1. CASH_IN CATEGORIES - Checking for negative amounts")
print("="*80)

cash_in_issues = []
for category in cash_in_categories:
    cat_data = transactions[transactions['category'] == category]
    negative_rows = cat_data[cat_data['amount'] < 0]
    negative_count = len(negative_rows)
    
    if negative_count > 0:
        negative_sum = negative_rows['amount'].sum()
        # Use the notebook-wide date column (trx_date_col) rather than a
        # hard-coded 'value_date', matching the outlier table in this cell.
        negative_samples = negative_rows[[trx_date_col, 'amount', 'remittence_info']].head()
        
        cash_in_issues.append({
            'category': category,
            'negative_count': negative_count,
            'negative_sum': negative_sum,
            'total_in_category': len(cat_data),
            'percentage': (negative_count / len(cat_data)) * 100
        })
        
        print(f"\n⚠️  {category}:")
        print(f"    - Found {negative_count} negative values ({(negative_count / len(cat_data)) * 100:.2f}% of this category)")
        print(f"    - Total negative amount: €{negative_sum:,.2f}")
        print(f"    - Sample transactions:")
        for idx, row in negative_samples.iterrows():
            # str() guards against a missing description (NaN is a float and
            # cannot be sliced).
            print(f"      {row[trx_date_col]}: €{row['amount']:,.2f} - {str(row['remittence_info'])[:50]}")
    else:
        print(f"\n✓  {category}: All {len(cat_data)} transactions have positive amounts")

# Check for positive values in cash_out categories.
# Builds cash_out_issues: one dict per cash_out category containing at least
# one positive amount, and prints up to five sample rows for each.
print("\n" + "="*80)
print("2. CASH_OUT CATEGORIES - Checking for positive amounts")
print("="*80)

cash_out_issues = []
for category in cash_out_categories:
    cat_data = transactions[transactions['category'] == category]
    positive_rows = cat_data[cat_data['amount'] > 0]
    positive_count = len(positive_rows)
    
    if positive_count > 0:
        positive_sum = positive_rows['amount'].sum()
        # Use the notebook-wide date column (trx_date_col) rather than a
        # hard-coded 'value_date', matching the outlier table in this cell.
        positive_samples = positive_rows[[trx_date_col, 'amount', 'remittence_info']].head()
        
        cash_out_issues.append({
            'category': category,
            'positive_count': positive_count,
            'positive_sum': positive_sum,
            'total_in_category': len(cat_data),
            'percentage': (positive_count / len(cat_data)) * 100
        })
        
        print(f"\n⚠️  {category}:")
        print(f"    - Found {positive_count} positive values ({(positive_count / len(cat_data)) * 100:.2f}% of this category)")
        print(f"    - Total positive amount: €{positive_sum:,.2f}")
        print(f"    - Sample transactions:")
        for idx, row in positive_samples.iterrows():
            # str() guards against a missing description (NaN is a float and
            # cannot be sliced).
            print(f"      {row[trx_date_col]}: €{row['amount']:,.2f} - {str(row['remittence_info'])[:50]}")
    else:
        print(f"\n✓  {category}: All {len(cat_data)} transactions have negative amounts")

# Summary of the two direction checks: one line per offending category, or a
# confirmation line when a direction has no offenders.
print("\n" + "="*80)
print("CATEGORY DIRECTION VALIDATION SUMMARY")
print("="*80)

if not cash_in_issues:
    print("\n✓ All cash_in categories have correct (positive) values")
else:
    print(f"\n⚠️  Found {len(cash_in_issues)} cash_in categories with negative values:")
    for entry in cash_in_issues:
        print(f"   - {entry['category']}: {entry['negative_count']} issues ({entry['percentage']:.2f}%)")

if not cash_out_issues:
    print("\n✓ All cash_out categories have correct (negative) values")
else:
    print(f"\n⚠️  Found {len(cash_out_issues)} cash_out categories with positive values:")
    for entry in cash_out_issues:
        print(f"   - {entry['category']}: {entry['positive_count']} issues ({entry['percentage']:.2f}%)")

# ============================================================================
# OUTLIER ANALYSIS
# ============================================================================
# Builds outlier_df: one row per transaction with a non-missing, non-zero
# amount in an analysed category, holding the absolute amount used for outlier
# detection alongside the original signed value.

print("\n\n" + "="*80)
print("OUTLIER ANALYSIS: ALL TRANSACTIONS")
print("="*80)

# Filter out 'other' category for outlier analysis. Non-string labels (a
# missing category is NaN) are skipped too: they cannot be joined into the
# printed list below.
analysis_categories = [cat for cat in all_categories_unique if isinstance(cat, str) and cat != 'other']

print(f"\nAnalyzing {len(analysis_categories)} categories across {len(transactions)} transactions")
print(f"Categories: {', '.join(analysis_categories)}")

# Select eligible rows in one vectorized pass instead of iterating row by row
tx_amounts = transactions['amount']
eligible_rows = (
    transactions['category'].isin(analysis_categories)
    & tx_amounts.notna()
    & (tx_amounts != 0)
)
eligible_tx = transactions.loc[eligible_rows]

# Description is optional; a missing value becomes '' so it can be truncated
if 'remittence_info' in eligible_tx.columns:
    descriptions = eligible_tx['remittence_info'].fillna('').astype(str).str[:50]
else:
    descriptions = pd.Series('', index=eligible_tx.index)

# Create a dataframe with all transaction amounts and their categories.
# Plain arrays are passed so the result gets a fresh 0..n-1 index; the source
# row label is kept in the 'index' column.
outlier_df = pd.DataFrame({
    'index': eligible_tx.index.to_numpy(),
    'category': eligible_tx['category'].to_numpy(),
    'amount': eligible_tx['amount'].abs().to_numpy(),  # Use absolute value for outlier detection
    'original_value': eligible_tx['amount'].to_numpy(),
    'date': eligible_tx[trx_date_col].to_numpy(),
    'description': descriptions.to_numpy(),
})

if len(outlier_df) > 0:
    print(f"\nTotal non-zero transactions analyzed: {len(outlier_df):,}")
    print(f"Amount range: €{outlier_df['amount'].min():,.2f} to €{outlier_df['amount'].max():,.2f}")
    print(f"Mean: €{outlier_df['amount'].mean():,.2f}, Median: €{outlier_df['amount'].median():,.2f}")
    
    # Method 1: IQR Method (Interquartile Range)
    print("\n" + "-"*80)
    print("METHOD 1: IQR (Interquartile Range) Method")
    print("-"*80)
    
    Q1 = outlier_df['amount'].quantile(0.25)
    Q3 = outlier_df['amount'].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Identify outliers
    iqr_outliers = outlier_df[(outlier_df['amount'] < lower_bound) | (outlier_df['amount'] > upper_bound)].copy()
    iqr_outliers = iqr_outliers.sort_values('amount', ascending=False)
    
    print(f"\nStatistics:")
    print(f"  Q1 (25th percentile): €{Q1:,.2f}")
    print(f"  Q3 (75th percentile): €{Q3:,.2f}")
    print(f"  IQR: €{IQR:,.2f}")
    print(f"  Lower bound: €{lower_bound:,.2f}")
    print(f"  Upper bound: €{upper_bound:,.2f}")
    print(f"\n🎯 Outliers found: {len(iqr_outliers)} ({(len(iqr_outliers)/len(outlier_df)*100):.2f}%)")
    
    if len(iqr_outliers) > 0:
        print(f"\nTop 25 IQR Outliers (by absolute amount):")
        print("-" * 100)
        print(f"{'Date':<12} {'Category':<40} {'Amount':>15} {'Description':<30}")
        print("-" * 100)
        for i, row in iqr_outliers.head(25).iterrows():
            date_str = row['date'].strftime('%Y-%m-%d') if pd.notna(row['date']) else 'N/A'
            cat_short = row['category'].replace('cash_in_', 'IN:').replace('cash_out_', 'OUT:')
            print(f"{date_str:<12} {cat_short:<40} €{row['original_value']:>13,.2f} {row['description'][:28]:<30}")
    
    # Method 2: Z-Score Method
    print("\n" + "-"*80)
    print("METHOD 2: Z-Score Method (|z| > 3)")
    print("-"*80)
    
    mean_amount = outlier_df['amount'].mean()
    std_amount = outlier_df['amount'].std()
    
    outlier_df['z_score'] = (outlier_df['amount'] - mean_amount) / std_amount
    z_outliers = outlier_df[abs(outlier_df['z_score']) > 3].copy()
    z_outliers = z_outliers.sort_values('z_score', ascending=False)
    
    print(f"\nStatistics:")
    print(f"  Mean: €{mean_amount:,.2f}")
    print(f"  Std Dev: €{std_amount:,.2f}")
    print(f"  Threshold: |z| > 3 (99.7% confidence)")
    print(f"\n🎯 Outliers found: {len(z_outliers)} ({(len(z_outliers)/len(outlier_df)*100):.2f}%)")
    
    if len(z_outliers) > 0:
        print(f"\nTop 25 Z-Score Outliers:")
        print("-" * 110)
        print(f"{'Date':<12} {'Category':<40} {'Amount':>15} {'Z-Score':>8} {'Description':<25}")
        print("-" * 110)
        for i, row in z_outliers.head(25).iterrows():
            date_str = row['date'].strftime('%Y-%m-%d') if pd.notna(row['date']) else 'N/A'
            cat_short = row['category'].replace('cash_in_', 'IN:').replace('cash_out_', 'OUT:')
            print(f"{date_str:<12} {cat_short:<40} €{row['original_value']:>13,.2f} {row['z_score']:>8.2f} {row['description'][:23]:<25}")
    
    # Category-wise outlier analysis
    # Applies the 1.5 * IQR fences separately inside each category of
    # analysis_categories and tabulates how many rows fall outside them.
    print("\n" + "-"*80)
    print("CATEGORY-WISE OUTLIER SUMMARY (IQR Method)")
    print("-"*80)

    # Fixed column list so summary_df has the expected columns even when no
    # category contributed a row (otherwise sort_values raises KeyError).
    summary_columns = [
        'category', 'total_txns', 'outliers', 'outlier_pct',
        'median', 'mean', 'max', 'upper_bound',
    ]

    category_outlier_summary = []
    for category in analysis_categories:
        cat_data = outlier_df[outlier_df['category'] == category]
        if len(cat_data) == 0:
            continue  # nothing to summarize for this category

        cat_Q1 = cat_data['amount'].quantile(0.25)
        cat_Q3 = cat_data['amount'].quantile(0.75)
        cat_IQR = cat_Q3 - cat_Q1
        cat_lower = cat_Q1 - 1.5 * cat_IQR
        cat_upper = cat_Q3 + 1.5 * cat_IQR

        cat_outliers = cat_data[(cat_data['amount'] < cat_lower) | (cat_data['amount'] > cat_upper)]

        category_outlier_summary.append({
            'category': category,
            'total_txns': len(cat_data),
            'outliers': len(cat_outliers),
            'outlier_pct': (len(cat_outliers) / len(cat_data)) * 100,
            'median': cat_data['amount'].median(),
            'mean': cat_data['amount'].mean(),
            'max': cat_data['amount'].max(),
            'upper_bound': cat_upper
        })

    # Categories with the most outliers come first
    summary_df = pd.DataFrame(
        category_outlier_summary, columns=summary_columns
    ).sort_values('outliers', ascending=False)

    print(f"\n{'Category':<40} {'Total':>8} {'Outliers':>10} {'%':>7} {'Median':>12} {'Mean':>12} {'Max':>12}")
    print("-" * 115)
    if summary_df.empty:
        print("(no transactions found in any analysis category)")
    for _, row in summary_df.iterrows():
        cat_short = row['category'].replace('cash_in_', 'IN:').replace('cash_out_', 'OUT:')
        print(f"{cat_short:<40} {row['total_txns']:>8,} {row['outliers']:>10,} {row['outlier_pct']:>6.1f}% €{row['median']:>10,.0f} €{row['mean']:>10,.0f} €{row['max']:>10,.0f}")
    
    # Visualization: Box plot of all categories
    rule = "=" * 80
    print("\n" + rule)
    print("GENERATING OUTLIER VISUALIZATIONS")
    print(rule)

    # One box trace per category, added in sorted category order.
    # Categories starting with 'cash_in_' are light blue, all others light coral.
    fig = go.Figure()

    for category in sorted(analysis_categories):
        is_inflow = category.startswith('cash_in_')
        cat_data = outlier_df.loc[outlier_df['category'] == category, 'amount']
        cat_short = category.replace('cash_in_', 'IN:').replace('cash_out_', 'OUT:')
        box_trace = go.Box(
            y=cat_data,
            name=cat_short,
            boxmean='sd',
            marker_color='lightblue' if is_inflow else 'lightcoral',
        )
        fig.add_trace(box_trace)

    # Layout options collected in one mapping and applied in a single call
    box_layout = {
        'title': 'Transaction Amount Distribution by Category (Absolute Values, Log Scale)',
        'yaxis_title': 'Amount (€)',
        'xaxis_title': 'Category',
        'height': 600,
        'width': 1400,
        'showlegend': True,
        'yaxis_type': 'log',
        'xaxis': {'tickangle': -45},
    }
    fig.update_layout(**box_layout)

    fig.show()
    
    # Create scatter plot of outliers over time
    if len(z_outliers) > 0:
        print("\nGenerating outliers over time visualization...")

        # Rows without a date cannot be placed on the time axis
        outliers_with_dates = z_outliers[z_outliers['date'].notna()].copy()

        if len(outliers_with_dates) > 0:
            # z_outliers is selected on |z| > 3, so it may hold z-scores below -3.
            # Marker sizes must be non-negative, so size by magnitude instead of
            # the signed value (which would make plotly raise ValueError).
            outliers_with_dates['abs_z_score'] = outliers_with_dates['z_score'].abs()

            fig2 = px.scatter(
                outliers_with_dates,
                x='date',
                y='amount',
                color='category',
                size='abs_z_score',
                hover_data={
                    'original_value': ':,.2f',
                    'z_score': ':.2f',
                    'description': True,
                    'amount': ':,.2f',
                    # helper column only drives marker size; keep it out of the hover box
                    'abs_z_score': False,
                },
                title=f'Significant Outliers Over Time (|Z-Score| > 3) - {len(outliers_with_dates)} transactions',
                labels={'amount': 'Amount (€, absolute)', 'date': 'Date'},
                height=600,
                width=1400
            )

            fig2.update_layout(
                yaxis_type='log',
                xaxis_title='Date',
                yaxis_title='Amount (€, log scale)'
            )

            fig2.show()
            
    # Distribution histogram
    print("\nGenerating amount distribution histogram...")

    # Histogram of outlier_df['amount'], stacked by category, with a log count axis.
    # NOTE(review): nbins=50 is requested on the raw amounts while the x-axis is
    # switched to log afterwards - confirm the rendered bar widths are as intended.
    histogram_options = {
        'x': 'amount',
        'color': 'category',
        'nbins': 50,
        'title': 'Transaction Amount Distribution (All Categories)',
        'labels': {'amount': 'Amount (€, absolute)', 'count': 'Number of Transactions'},
        'height': 500,
        'width': 1400,
        'log_y': True,
    }
    fig3 = px.histogram(outlier_df, **histogram_options)

    # Axis titles and the log x-axis are applied after the figure is built
    histogram_layout = {
        'xaxis_type': 'log',
        'xaxis_title': 'Amount (€, log scale)',
        'yaxis_title': 'Count (log scale)',
    }
    fig3.update_layout(**histogram_layout)

    fig3.show()

else:
    print("\n⚠️  No transaction data available for outlier analysis")

# Closing banner: blank line, rule, message, rule - printed one item per line
print("\n" + "=" * 80, "ANALYSIS COMPLETE", "=" * 80, sep="\n")


## Summary & Instructions

