# 01 - Data Exploration

This notebook explores the collected property sales data and prepares it for analysis.

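## Overview
1. Load data (synthetic sample data for now)
2. Summary statistics by municipality
3. Key findings, then save the unified dataset for the next notebook

**TODO:** data quality checks and data cleaning/preparation are planned but are not yet implemented in this notebook.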


In [None]:
# Setup
import sys
sys.path.insert(0, '../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from config import MUNICIPALITIES, EASTCHESTER_AREA
from data_collection.data_loader import DataLoader
from models.property import SaleDataset

# Display settings
pd.set_option('display.max_columns', None)
# display.float_format is applied to every float pandas renders, so keep it
# unit-neutral: a '$' prefix here would also label non-currency columns
# (e.g. sqft, effective_tax_rate) as dollar amounts.
pd.set_option('display.float_format', '{:,.2f}'.format)
sns.set_theme(style='whitegrid')

print("Setup complete!")


## 1. Load Data

We start with synthetic sample data produced by `loader.generate_sample_data(...)`. When real sales data is available, replace that call with `dataset = loader.load_all_sales()`.


In [None]:
# Municipalities to include in the demonstration dataset
sample_municipalities = [
    'bronxville', 'eastchester_unincorp', 'tuckahoe',
    'scarsdale', 'larchmont', 'mamaroneck_village',
]

# Create the loader and generate sample records for demonstration.
# For real data, replace with: dataset = loader.load_all_sales()
loader = DataLoader()
dataset = loader.generate_sample_data(
    municipalities=sample_municipalities,
    samples_per_muni=30,
)

record_count = len(dataset)
print(f"Loaded {record_count} total sales records")

# DataFrame view of the dataset, used by the exploration cells below
df = dataset.to_dataframe()
print(f"DataFrame shape: {df.shape}")
df.head()


## 2. Summary Statistics by Municipality


In [None]:
# Statistics to compute per municipality, keyed by source column
agg_spec = {
    'sale_price': ['count', 'mean', 'median', 'min', 'max'],
    'sqft': ['mean', 'median'],
    'price_per_sqft': ['mean', 'median', 'std'],
    'annual_taxes': ['mean', 'median'],
    'effective_tax_rate': ['mean', 'median'],
}
by_municipality = df.groupby('municipality')
summary = by_municipality.agg(agg_spec).round(2)

# Collapse the (column, statistic) pairs into flat '<column>_<statistic>' names
summary.columns = [f"{column}_{stat}".strip() for column, stat in summary.columns]

# Rank municipalities from highest to lowest median price per square foot
summary = summary.sort_values('price_per_sqft_median', ascending=False)

print("Summary Statistics by Municipality:")
summary


In [None]:
# Price distribution by municipality
# Boxes are ordered from highest to lowest median price per square foot.
median_ppsf = df.groupby('municipality')['price_per_sqft'].median()
order = median_ppsf.sort_values(ascending=False).index

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='municipality', y='price_per_sqft', order=order, ax=ax)
ax.set(
    title='Price per Square Foot Distribution by Municipality',
    xlabel='Municipality',
    ylabel='Price ($/sqft)',
)
# Rotate and right-align the x tick labels (municipality names)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()


## 3. Key Findings


In [None]:
# Key findings
# Prints the municipalities with the highest and lowest median price per
# square foot (taken from `summary`), the relative premium between them,
# and then saves the dataset for the next notebook.
print("=" * 60)
print("KEY FINDINGS")
print("=" * 60)

# Highest value
highest_value = summary['price_per_sqft_median'].idxmax()
highest_val = summary.loc[highest_value, 'price_per_sqft_median']
print(f"\n📈 Highest Value: {highest_value} at ${highest_val:,.0f}/sqft")

# Lowest value
lowest_value = summary['price_per_sqft_median'].idxmin()
lowest_val = summary.loc[lowest_value, 'price_per_sqft_median']
print(f"📉 Lowest Value: {lowest_value} at ${lowest_val:,.0f}/sqft")

# Premium calculation: percentage by which the highest median exceeds the lowest.
# Guard against a zero/negative/NaN denominator so the line never prints 'inf%' or 'nan%'.
if lowest_val > 0:
    premium_pct = ((highest_val / lowest_val) - 1) * 100
    print(f"💰 Premium: {premium_pct:.0f}% difference between highest and lowest")
else:
    premium_pct = float('nan')
    print("💰 Premium: n/a (lowest median price per sqft is not positive)")

# Save for next notebook
loader.save_unified_dataset(dataset, 'unified_sales.parquet')
print("\n✅ Data saved for next analysis!")
