# Comprehensive Property Use Analysis
## Florida Parcels Database - 9.1M Properties

This notebook analyzes all unique property uses across the 9.1M properties in the florida_parcels table to ensure complete categorization for filter buttons.

In [None]:
# Import required libraries
# Standard library
import json
import os
import warnings
from datetime import datetime
from urllib.parse import quote, urlsplit

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from dotenv import load_dotenv
from plotly.subplots import make_subplots

# Database imports
from sqlalchemy import create_engine

warnings.filterwarnings('ignore')

# Load environment variables
# Reads the key/value pairs stored in the local `.env.mcp` file so that the
# SUPABASE_* settings are available through os.getenv in the connection cell.
load_dotenv('.env.mcp')

# Set up plotting
# Apply the 'seaborn-v0_8' matplotlib style sheet and seaborn's "husl" palette
# to the figures created later in the notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
# Initialize database connection
# Build a PostgreSQL URL from the Supabase settings loaded from the environment.
supabase_url = os.getenv('SUPABASE_URL')
supabase_key = os.getenv('SUPABASE_SERVICE_ROLE_KEY')
db_password = os.getenv('SUPABASE_DB_PASSWORD', '')

# Fail with an actionable message instead of an AttributeError on None.
if not supabase_url:
    raise RuntimeError("SUPABASE_URL is not set; check the .env.mcp file")

# urlsplit drops any trailing slash, path or port that a plain split('//')
# would have left inside the host part of the connection string.
db_host = urlsplit(supabase_url).hostname
if not db_host:
    raise RuntimeError(f"SUPABASE_URL has no host component: {supabase_url!r}")
project_ref = db_host.split('.')[0]

# Percent-encode the password so characters such as '@', '/', ':' or '+' cannot
# corrupt the URL.
# NOTE(review): user "postgres.<ref>" on host "<ref>.supabase.co" is kept from the
# original cell - confirm it matches the project's actual connection string.
db_url = f"postgresql://postgres.{project_ref}:{quote(db_password, safe='')}@{db_host}/postgres"

# create_engine is lazy: the first real connection is opened by the first query.
engine = create_engine(db_url, pool_size=10, max_overflow=20)
print("Database connection established")

## 1. Database Overview and Statistics

In [None]:
# Get basic database statistics
# One-row summary of florida_parcels: row count, distinct uses and counties,
# rows with a missing/empty property_use, and the range of the `year` column.
stats_query = """
SELECT 
    COUNT(*) as total_properties,
    COUNT(DISTINCT property_use) as unique_property_uses,
    COUNT(DISTINCT county) as total_counties,
    COUNT(CASE WHEN property_use IS NULL OR property_use = '' THEN 1 END) as missing_property_use,
    MIN(year) as min_year,
    MAX(year) as max_year
FROM florida_parcels
"""

db_stats = pd.read_sql(stats_query, engine)

# Years are identifiers, not quantities: print them without a thousands
# separator (otherwise 2024 is shown as "2,024").
year_columns = {'min_year', 'max_year'}

print("=== FLORIDA PARCELS DATABASE STATISTICS ===")
for col in db_stats.columns:
    label = col.replace('_', ' ').title()
    value = db_stats[col].iloc[0]
    if pd.isna(value):
        # MIN/MAX return NULL when there is nothing to aggregate; formatting
        # None with ':,' would raise a TypeError.
        print(f"{label}: n/a")
    elif col in year_columns:
        print(f"{label}: {value}")
    else:
        print(f"{label}: {value:,}")

## 2. Comprehensive Property Use Analysis

In [None]:
# Query all unique property uses with comprehensive statistics
# One row per non-empty property_use: parcel count, share of all counted parcels,
# the counties it occurs in, and average/min/max values computed over
# positive amounts only. Rows come back ordered by count, largest first.
property_uses_query = """
SELECT
    property_use,
    COUNT(*) as count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 4) as percentage,
    COUNT(DISTINCT county) as county_count,
    ARRAY_AGG(DISTINCT county ORDER BY county) as counties,
    AVG(CASE WHEN just_value > 0 THEN just_value END) as avg_just_value,
    AVG(CASE WHEN building_value > 0 THEN building_value END) as avg_building_value,
    AVG(CASE WHEN land_value > 0 THEN land_value END) as avg_land_value,
    MIN(CASE WHEN just_value > 0 THEN just_value END) as min_just_value,
    MAX(CASE WHEN just_value > 0 THEN just_value END) as max_just_value
FROM florida_parcels
WHERE property_use IS NOT NULL
    AND property_use != ''
GROUP BY property_use
ORDER BY count DESC
"""

print("Querying all unique property uses...")
property_uses_df = pd.read_sql(property_uses_query, engine)
print(f"Found {len(property_uses_df)} unique property use types")

# Display top 20 property uses as a ranked, fixed-width text table
print("\n=== TOP 20 PROPERTY USES ===")
top_20 = property_uses_df.head(20)
for rank, record in enumerate(top_20.to_dict('records'), start=1):
    use_name = record['property_use']
    parcels = record['count']
    share = record['percentage']
    print(f"{rank:2d}. {use_name:<40} {parcels:>8,} ({share:>6.2f}%)")

In [None]:
# Create visualization of top property uses (2x2 grid of static charts)
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
(ax_counts, ax_shares), (ax_counties, ax_values) = axes

top_20 = property_uses_df.head(20)
tick_positions = range(len(top_20))

# Top-left: horizontal bars of parcel counts, largest at the top
ax_counts.barh(tick_positions, top_20['count'])
ax_counts.set_yticks(tick_positions)
ax_counts.set_yticklabels(top_20['property_use'], fontsize=8)
ax_counts.set(xlabel='Number of Properties',
              title='Top 20 Property Use Types by Count')
ax_counts.invert_yaxis()

# Top-right: the same 20 uses expressed as a percentage of all parcels
ax_shares.bar(tick_positions, top_20['percentage'])
ax_shares.set_xticks(tick_positions)
ax_shares.set_xticklabels(top_20['property_use'], rotation=45, ha='right', fontsize=8)
ax_shares.set(ylabel='Percentage of Total Properties',
              title='Top 20 Property Use Types by Percentage')

# Bottom-left: how many use types occur in exactly N counties
county_dist = property_uses_df['county_count'].value_counts().sort_index()
ax_counties.bar(county_dist.index, county_dist.values)
ax_counties.set(xlabel='Number of Counties',
                ylabel='Number of Property Use Types',
                title='Distribution of Property Uses Across Counties')

# Bottom-right: parcel count against average just value on log-log axes;
# uses without an average value are left out
value_data = property_uses_df.dropna(subset=['avg_just_value'])
ax_values.scatter(value_data['count'], value_data['avg_just_value'], alpha=0.6)
ax_values.set(xlabel='Number of Properties',
              ylabel='Average Just Value',
              title='Property Count vs Average Value')
ax_values.set_yscale('log')
ax_values.set_xscale('log')

plt.tight_layout()
plt.show()

## 3. Current Filter Categories Analysis

In [None]:
# Current filter categories from the frontend
# Note: "Vacant/Special" and "Tax Deed Sales" have no entry in mapping_rules
# below, so no property use can be assigned to them by keyword.
current_categories = [
    "Residential", "Commercial", "Industrial", "Agricultural",
    "Vacant Land", "Government", "Conservation", "Religious",
    "Vacant/Special", "Tax Deed Sales"
]

# Define comprehensive mapping rules
# Maps a category name to the keywords that identify it. Later cells test each
# keyword as a plain substring of the upper-cased property_use text, so a short
# keyword such as "LOT" or "STATE" also matches inside longer words.
# The insertion order of this dict matters: assign_category returns the FIRST
# category that has a matching keyword.
# "Infrastructure", "Institutional", "Recreation" and "Special Use" are not part
# of current_categories above.
mapping_rules = {
    "Residential": [
        "SINGLE FAMILY", "CONDO", "MOBILE", "TOWNHOUSE", "DUPLEX",
        "APARTMENT", "RESIDENTIAL", "HOME", "HOUSE", "DWELLING",
        "MULTI FAMILY", "TRIPLEX", "FOURPLEX", "COOPERATIVE"
    ],
    "Commercial": [
        "RETAIL", "OFFICE", "STORE", "SHOPPING", "COMMERCIAL", "BUSINESS",
        "RESTAURANT", "HOTEL", "MOTEL", "WAREHOUSE", "PARKING",
        "SERVICE", "BANK", "GAS STATION", "AUTO", "MIXED USE"
    ],
    "Industrial": [
        "INDUSTRIAL", "MANUFACTURING", "FACTORY", "PLANT", "DISTRIBUTION",
        "PROCESSING", "PRODUCTION", "UTILITY", "MINING", "QUARRY"
    ],
    "Agricultural": [
        "AGRICULTURAL", "FARM", "RANCH", "GROVE", "PASTURE", "CROP",
        "LIVESTOCK", "DAIRY", "POULTRY", "TIMBER", "ORCHARD", "VINEYARD"
    ],
    "Vacant Land": [
        "VACANT", "UNDEVELOPED", "RAW LAND", "IMPROVED VACANT",
        "ACREAGE", "LOT", "UNIMPROVED"
    ],
    "Government": [
        "GOVERNMENT", "PUBLIC", "MUNICIPAL", "COUNTY", "STATE", "FEDERAL",
        "SCHOOL", "LIBRARY", "FIRE", "POLICE", "MILITARY", "COURTHOUSE"
    ],
    "Conservation": [
        "CONSERVATION", "PRESERVE", "ENVIRONMENTAL", "WETLAND",
        "NATURAL", "PARK", "RECREATION", "FOREST", "WILDLIFE"
    ],
    "Religious": [
        "CHURCH", "RELIGIOUS", "TEMPLE", "MOSQUE", "SYNAGOGUE",
        "MONASTERY", "CONVENT", "CHAPEL"
    ],
    "Infrastructure": [
        "ROAD", "BRIDGE", "CANAL", "DRAINAGE", "RIGHT OF WAY",
        "EASEMENT", "RAILROAD", "HIGHWAY", "PIPELINE"
    ],
    "Institutional": [
        "HOSPITAL", "NURSING", "MEDICAL", "HEALTH", "CLINIC",
        "UNIVERSITY", "COLLEGE", "EDUCATIONAL"
    ],
    "Recreation": [
        "GOLF", "CLUB", "STADIUM", "ARENA", "THEATER", "ENTERTAINMENT",
        "SPORTS", "MARINA", "BEACH", "RESORT"
    ],
    "Special Use": [
        "CEMETERY", "AIRPORT", "LANDFILL", "WASTE", "COMMUNICATION",
        "TOWER", "SUBSTATION"
    ]
}

print("Mapping property uses to categories...")

In [None]:
# Create category mappings
# Every category collects the property uses whose upper-cased name contains at
# least one of its keywords. Categories are evaluated independently, so a use
# that matches several categories is listed (and counted) under each of them.
category_mappings = {}
matched_uses = set()

# Pull the columns out once as plain Python lists (native ints/floats).
use_names = property_uses_df['property_use'].tolist()
use_totals = property_uses_df['count'].tolist()
use_shares = property_uses_df['percentage'].tolist()
total_properties = property_uses_df['count'].sum()

for category, keywords in mapping_rules.items():
    hits = [
        (name, parcels)
        for name, parcels in zip(use_names, use_totals)
        if any(keyword in str(name).upper() for keyword in keywords)
    ]
    if not hits:
        # Categories without a single matching use are not recorded at all.
        continue

    category_uses = [name for name, _ in hits]
    category_count = sum(parcels for _, parcels in hits)
    matched_uses.update(category_uses)

    category_mappings[category] = {
        'uses': category_uses,
        'count': category_count,
        'percentage': (category_count / total_properties) * 100,
        'unique_uses': len(category_uses)
    }

# Find unmatched property uses: everything no category picked up
unmatched_uses = [
    {'property_use': name, 'count': parcels, 'percentage': share}
    for name, parcels, share in zip(use_names, use_totals, use_shares)
    if name not in matched_uses
]

print("\n=== CATEGORY MAPPING RESULTS ===")
print(f"Total property uses mapped: {len(matched_uses)}")
print(f"Total property uses unmatched: {len(unmatched_uses)}")
print(f"Categories created: {len(category_mappings)}")

# Display category statistics, largest category first
print("\n=== CATEGORY STATISTICS ===")
ranked_categories = sorted(
    category_mappings.items(),
    key=lambda item: item[1]['count'],
    reverse=True,
)
for category, data in ranked_categories:
    print(f"{category:<15} {data['count']:>10,} properties ({data['percentage']:>6.2f}%) - {data['unique_uses']:>3} use types")

In [None]:
# Visualize category distribution (2x2 grid of static charts)
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
(ax_totals, ax_pie), (ax_unique, ax_unmatched) = axes

# Per-category series, all in the iteration order of category_mappings
categories = list(category_mappings.keys())
counts = [category_mappings[cat]['count'] for cat in categories]
percentages = [category_mappings[cat]['percentage'] for cat in categories]
unique_counts = [category_mappings[cat]['unique_uses'] for cat in categories]

# Top-left: number of parcels per category
ax_totals.bar(categories, counts)
ax_totals.set(xlabel='Category', ylabel='Number of Properties',
              title='Properties by Category')
ax_totals.tick_params(axis='x', rotation=45)

# Top-right: category shares plus one extra slice for everything unmatched
unmatched_percentage = sum(use['percentage'] for use in unmatched_uses)
pie_data = percentages + [unmatched_percentage]
pie_labels = categories + ['Unmatched']
ax_pie.pie(pie_data, labels=pie_labels, autopct='%1.1f%%', startangle=90)
ax_pie.set_title('Property Distribution by Category')

# Bottom-left: number of distinct use types grouped under each category
ax_unique.bar(categories, unique_counts)
ax_unique.set(xlabel='Category', ylabel='Number of Unique Use Types',
              title='Unique Use Types per Category')
ax_unique.tick_params(axis='x', rotation=45)

# Bottom-right: the 15 largest unmatched uses (panel stays empty if none exist)
top_unmatched = sorted(unmatched_uses, key=lambda use: use['count'], reverse=True)[:15]
if top_unmatched:
    # Labels are cut to 20 characters to keep the axis readable
    unmatched_names = [use['property_use'][:20] for use in top_unmatched]
    unmatched_counts = [use['count'] for use in top_unmatched]
    bar_positions = range(len(unmatched_names))
    ax_unmatched.barh(bar_positions, unmatched_counts)
    ax_unmatched.set_yticks(bar_positions)
    ax_unmatched.set_yticklabels(unmatched_names, fontsize=8)
    ax_unmatched.set(xlabel='Number of Properties',
                     title='Top 15 Unmatched Property Uses')
    ax_unmatched.invert_yaxis()

plt.tight_layout()
plt.show()

## 4. Interactive Plotly Visualizations

In [None]:
# Create interactive treemap of property uses
# Only the first 50 rows are plotted; property_uses_df is ordered by count, descending.
top_50 = property_uses_df.iloc[:50]

# Assign categories for color coding
def assign_category(use):
    """Return the first category in ``mapping_rules`` that matches ``use``.

    ``use`` is converted with ``str()`` and upper-cased, then each category's
    keywords are tested as plain substrings. Categories are tried in the
    iteration order of ``mapping_rules``; ``'Unmatched'`` is returned when no
    keyword of any category occurs in the text.
    """
    text = str(use).upper()
    return next(
        (
            category
            for category, keywords in mapping_rules.items()
            if any(keyword in text for keyword in keywords)
        ),
        'Unmatched',
    )

# Tag every use with its category for colour coding. `assign` returns a new
# frame, so the column is never written into a slice of property_uses_df; the
# in-place form triggers pandas' SettingWithCopyWarning, which this notebook
# hides through its global warnings filter.
top_50 = top_50.assign(category=top_50['property_use'].apply(assign_category))

# One tile per property use, sized by parcel count and coloured by category
fig = px.treemap(
    top_50,
    values='count',
    names='property_use',
    color='category',
    title='Top 50 Property Uses - Interactive Treemap',
    width=1200,
    height=800
)

fig.show()

In [None]:
# Create sunburst chart showing category hierarchy
# Inner ring: one node per category. Outer ring: that category's five largest uses.
sunburst_data = []

# Look counts up through a dict built once instead of scanning the whole
# DataFrame with a boolean mask for every single use.
count_by_use = dict(zip(property_uses_df['property_use'], property_uses_df['count']))

for category, data in category_mappings.items():
    # Add category level (root nodes have an empty parent)
    sunburst_data.append({
        'ids': category,
        'labels': category,
        'parents': '',
        'values': data['count']
    })

    # Add top 5 uses for each category
    category_uses = sorted(
        ((use, count_by_use[use]) for use in data['uses']),
        key=lambda pair: pair[1],
        reverse=True,
    )

    for use, count in category_uses[:5]:  # Top 5 only
        sunburst_data.append({
            # Prefix with the category so the id stays unique when one use
            # is listed under several categories
            'ids': f"{category} - {use}",
            'labels': use[:20] + ('...' if len(use) > 20 else ''),
            'parents': category,
            'values': count
        })

sunburst_df = pd.DataFrame(sunburst_data)

# branchvalues="total": a category's value is its own total, not the sum of the
# (at most five) children drawn beneath it
fig = go.Figure(go.Sunburst(
    ids=sunburst_df['ids'],
    labels=sunburst_df['labels'],
    parents=sunburst_df['parents'],
    values=sunburst_df['values'],
    branchvalues="total"
))

fig.update_layout(
    title="Property Use Categories - Sunburst Chart",
    width=800,
    height=800
)

fig.show()

## 5. Geographic Distribution Analysis

In [None]:
# Analyze property use distribution by county
# One row per (county, property_use) pair with its parcel count and its share
# of that county's counted parcels; rows are ordered by county, then by count
# with the largest first.
county_analysis_query = """
SELECT
    county,
    property_use,
    COUNT(*) as count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(PARTITION BY county), 2) as county_percentage
FROM florida_parcels
WHERE property_use IS NOT NULL AND property_use != ''
GROUP BY county, property_use
ORDER BY county, count DESC
"""

print("Analyzing property use distribution by county...")
county_df = pd.read_sql(county_analysis_query, engine)
print(f"Loaded {len(county_df)} county-property use combinations")

# Get top property use per county: because of the query ordering, the first
# row inside each county group is that county's most frequent use
top_use_per_county = county_df.groupby('county').first().reset_index()
print("\n=== TOP PROPERTY USE PER COUNTY ===")
for record in top_use_per_county.head(20).to_dict('records'):
    county_name = record['county']
    leading_use = record['property_use']
    print(f"{county_name:<15} {leading_use:<30} {record['count']:>8,} ({record['county_percentage']:>6.2f}%)")

In [None]:
# Create heatmap of county vs category
# Categorise every (county, property_use) row in one vectorised pass instead of
# looping over the frame with iterrows and rebuilding it from a list of dicts.
county_category_df = (
    county_df
    .assign(category=county_df['property_use'].map(assign_category))
    .loc[:, ['county', 'category', 'count']]
)

# Counties as rows, categories as columns, parcel counts as cells
# (combinations that never occur are filled with 0)
county_category_summary = (
    county_category_df
    .groupby(['county', 'category'])['count']
    .sum()
    .unstack(fill_value=0)
)

# Create heatmap for top 20 counties, ranked by total parcel count
total_by_county = county_category_summary.sum(axis=1).sort_values(ascending=False)
top_counties = total_by_county.head(20).index

fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(county_category_summary.loc[top_counties],
            annot=False,
            cmap='YlOrRd',
            cbar_kws={'label': 'Number of Properties'},
            ax=ax)
ax.set_title('Property Category Distribution by County (Top 20 Counties)')
ax.set_xlabel('Property Category')
ax.set_ylabel('County')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

## 6. Value Analysis by Property Type

In [None]:
# Analyze property values by category
def _weighted_mean(values, weights):
    """Return the mean of `values` weighted by `weights`, skipping missing values.

    Rows whose value is missing contribute to neither the numerator nor the
    denominator. Returns NaN when no row has a value or the usable weights
    sum to zero.
    """
    values = pd.to_numeric(values, errors='coerce')
    usable = values.notna()
    weight_total = weights[usable].sum()
    if weight_total == 0:
        return np.nan
    return (values[usable] * weights[usable]).sum() / weight_total


value_columns = ['avg_just_value', 'avg_building_value', 'avg_land_value']
value_analysis = []

for category, data in category_mappings.items():
    category_values = property_uses_df[property_uses_df['property_use'].isin(data['uses'])]

    if not category_values.empty:
        # Combine the per-use averages into one figure per category, weighted
        # by each use's parcel count. Dividing by the count of ALL uses (as
        # before) pulled the result towards zero whenever a use had no average.
        # NOTE(review): the weights are total parcel counts; the SQL averages
        # only cover parcels with a positive value, so this is an approximation.
        weights = category_values['count']
        record = {'category': category, 'count': weights.sum()}
        for column in value_columns:
            record[column] = _weighted_mean(category_values[column], weights)
        value_analysis.append(record)

# Passing the column list keeps the frame usable (sortable) even when no
# category produced a row.
value_df = pd.DataFrame(value_analysis, columns=['category', 'count'] + value_columns)
# Drop categories for which any of the three averages is unavailable
value_df = value_df.dropna()

print("=== AVERAGE VALUES BY CATEGORY ===")
for _, row in value_df.sort_values('avg_just_value', ascending=False).iterrows():
    print(f"{row['category']:<15} Just: ${row['avg_just_value']:>10,.0f} Building: ${row['avg_building_value']:>10,.0f} Land: ${row['avg_land_value']:>10,.0f}")

In [None]:
# Create value comparison charts (2x2 grid built from value_df)
fig, axes = plt.subplots(2, 2, figsize=(20, 12))
(ax_rank, ax_volume), (ax_split, ax_stack) = axes
label_style = dict(xytext=(5, 5), textcoords='offset points', fontsize=8)

# Top-left: categories ranked by average just value
value_sorted = value_df.sort_values('avg_just_value', ascending=True)
ax_rank.barh(value_sorted['category'], value_sorted['avg_just_value'])
ax_rank.set(xlabel='Average Just Value ($)', title='Average Just Value by Category')

# Top-right: parcel count against average just value, log-log, one label per point
ax_volume.scatter(value_df['count'], value_df['avg_just_value'], s=100, alpha=0.7)
for name, parcels, just_value in zip(value_df['category'], value_df['count'], value_df['avg_just_value']):
    ax_volume.annotate(name, (parcels, just_value), **label_style)
ax_volume.set(xlabel='Number of Properties',
              ylabel='Average Just Value ($)',
              title='Property Count vs Average Value by Category')
ax_volume.set_xscale('log')
ax_volume.set_yscale('log')

# Bottom-left: average land value against average building value
ax_split.scatter(value_df['avg_land_value'], value_df['avg_building_value'], s=100, alpha=0.7)
for name, land, building in zip(value_df['category'], value_df['avg_land_value'], value_df['avg_building_value']):
    ax_split.annotate(name, (land, building), **label_style)
ax_split.set(xlabel='Average Land Value ($)',
             ylabel='Average Building Value ($)',
             title='Land vs Building Value by Category')

# Bottom-right: land and building averages stacked into one bar per category
categories = value_df['category']
land_values = value_df['avg_land_value']
building_values = value_df['avg_building_value']

ax_stack.bar(categories, land_values, label='Land Value')
ax_stack.bar(categories, building_values, bottom=land_values, label='Building Value')
ax_stack.set(ylabel='Average Value ($)', title='Value Composition by Category')
ax_stack.legend()
ax_stack.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 7. Recommendations and Gap Analysis

In [None]:
# Analyze unmatched uses for potential new categories
unmatched_property_total = sum(use['count'] for use in unmatched_uses)
unmatched_share = sum(use['percentage'] for use in unmatched_uses)

print("=== GAP ANALYSIS AND RECOMMENDATIONS ===")
print(f"\nTotal unmatched property uses: {len(unmatched_uses)}")
print(f"Total properties in unmatched uses: {unmatched_property_total:,}")
print(f"Percentage of properties unmatched: {unmatched_share:.2f}%")

# High-volume unmatched uses (>1000 properties), sorted once so that every
# later "top N" slice really is the N largest
high_volume_unmatched = sorted(
    (use for use in unmatched_uses if use['count'] > 1000),
    key=lambda use: use['count'],
    reverse=True,
)
print(f"\n=== HIGH-VOLUME UNMATCHED USES (>1,000 properties) ===")
for use in high_volume_unmatched:
    print(f"{use['property_use']:<50} {use['count']:>8,} ({use['percentage']:>6.2f}%)")

# Categories missing from current filters. Sorted so the printed and exported
# order is the same on every run (plain set iteration order is not).
current_filter_categories = set(current_categories)
identified_categories = set(category_mappings.keys())
missing_categories = sorted(identified_categories - current_filter_categories)

print(f"\n=== MISSING FILTER CATEGORIES ===")
for category in missing_categories:
    data = category_mappings[category]
    print(f"{category:<15} {data['count']:>10,} properties ({data['percentage']:>6.2f}%) - {data['unique_uses']:>3} use types")

# Coverage counts every parcel once. Summing the per-category counts would
# count a use several times when it matches keywords of more than one category
# and could report a coverage above 100%. Every use is either matched or
# unmatched, so the matched parcels are the total minus the unmatched ones.
total_uses = len(property_uses_df)
total_properties = int(property_uses_df['count'].sum())  # native int for JSON export
mapped_properties = total_properties - unmatched_property_total

# Generate recommendations
recommendations = {
    "add_filter_categories": list(missing_categories),
    "high_volume_unmatched": high_volume_unmatched[:10],  # Top 10
    "mapping_completeness": {
        "mapped_uses": len(matched_uses),
        "total_uses": total_uses,
        "mapping_percentage": (len(matched_uses) / total_uses) * 100 if total_uses else 0.0
    },
    "category_coverage": {
        "mapped_properties": mapped_properties,
        "total_properties": total_properties,
        "coverage_percentage": (mapped_properties / total_properties) * 100 if total_properties else 0.0
    }
}

print(f"\n=== MAPPING COMPLETENESS ===")
print(f"Mapped property uses: {recommendations['mapping_completeness']['mapped_uses']:,} / {recommendations['mapping_completeness']['total_uses']:,} ({recommendations['mapping_completeness']['mapping_percentage']:.1f}%)")
print(f"Mapped properties: {recommendations['category_coverage']['mapped_properties']:,} / {recommendations['category_coverage']['total_properties']:,} ({recommendations['category_coverage']['coverage_percentage']:.1f}%)")

## 8. Export Results and Generate Report

In [None]:
# Create comprehensive report
def _json_safe(obj):
    """Return a copy of `obj` in which non-finite floats (NaN, +/-inf) are None.

    Walks dicts, lists and tuples recursively (tuples come back as lists) and
    returns every other object unchanged. json.dump would otherwise emit bare
    NaN/Infinity tokens, which strict JSON parsers reject.
    """
    if isinstance(obj, dict):
        return {key: _json_safe(value) for key, value in obj.items()}
    if isinstance(obj, (list, tuple)):
        return [_json_safe(value) for value in obj]
    if isinstance(obj, float) and not np.isfinite(obj):
        return None
    return obj


# Take the clock reading once so the metadata and the file names agree
run_time = datetime.now()
timestamp = run_time.strftime("%Y%m%d_%H%M%S")

comprehensive_report = _json_safe({
    "analysis_metadata": {
        "timestamp": run_time.isoformat(),
        "total_properties": int(property_uses_df['count'].sum()),
        "unique_property_uses": len(property_uses_df),
        "analysis_method": "Jupyter Notebook + SQLAlchemy + Pandas"
    },
    "database_statistics": db_stats.to_dict('records')[0],
    "property_uses": property_uses_df.to_dict('records'),
    "category_mappings": category_mappings,
    "unmatched_uses": unmatched_uses,
    "value_analysis": value_df.to_dict('records'),
    "current_filter_categories": current_categories,
    "recommendations": recommendations
})

# Save comprehensive report. Anything json cannot encode natively falls back to
# str(); the explicit encoding keeps the file independent of the OS locale.
report_filename = f"comprehensive_property_use_analysis_{timestamp}.json"
with open(report_filename, 'w', encoding='utf-8') as f:
    json.dump(comprehensive_report, f, indent=2, default=str)

# Save individual CSV files
property_uses_df.to_csv(f"all_property_uses_{timestamp}.csv", index=False)
value_df.to_csv(f"value_analysis_by_category_{timestamp}.csv", index=False)
county_category_summary.to_csv(f"county_category_distribution_{timestamp}.csv")

# Create summary for unmatched uses (skipped when everything was matched)
unmatched_df = pd.DataFrame(unmatched_uses)
if not unmatched_df.empty:
    unmatched_df.to_csv(f"unmatched_property_uses_{timestamp}.csv", index=False)

print(f"\n=== ANALYSIS COMPLETE ===")
print(f"Comprehensive report saved to: {report_filename}")
print(f"CSV exports created with timestamp: {timestamp}")
print(f"\nFiles created:")
print(f"- {report_filename}")
print(f"- all_property_uses_{timestamp}.csv")
print(f"- value_analysis_by_category_{timestamp}.csv")
print(f"- county_category_distribution_{timestamp}.csv")
if not unmatched_df.empty:
    print(f"- unmatched_property_uses_{timestamp}.csv")

## 9. Summary and Next Steps

In [None]:
# Assemble the closing summary as a list of lines and print it in one go.
banner = "=" * 80
coverage_pct = recommendations['category_coverage']['coverage_percentage']

summary_lines = [
    banner,
    "COMPREHENSIVE PROPERTY USE ANALYSIS - SUMMARY",
    banner,
    f"Database: Florida Parcels ({db_stats['total_properties'].iloc[0]:,} properties)",
    f"Unique Property Uses: {len(property_uses_df):,}",
    f"Categories Identified: {len(category_mappings)}",
    f"Mapping Coverage: {coverage_pct:.1f}% of properties",
    "",
    "CURRENT FILTER CATEGORIES:",
]
summary_lines.extend(
    f"{position:2d}. {cat}" for position, cat in enumerate(current_categories, 1)
)

# Categories found by the keyword rules that the frontend does not offer yet
summary_lines += ["", "RECOMMENDED NEW FILTER CATEGORIES:"]
for position, cat in enumerate(missing_categories, 1):
    data = category_mappings[cat]
    summary_lines.append(
        f"{position:2d}. {cat} ({data['count']:,} properties, {data['percentage']:.2f}%)"
    )

summary_lines += ["", "HIGH-VOLUME UNMATCHED USES (Top 5):"]
summary_lines.extend(
    f"{position:2d}. {use['property_use']} ({use['count']:,} properties)"
    for position, use in enumerate(high_volume_unmatched[:5], 1)
)
summary_lines.append(banner)

# Final recommendations
summary_lines += [
    "\nRECOMMENDATIONS:",
    "1. Add missing filter categories to improve coverage",
    "2. Review high-volume unmatched uses for potential new categories",
    "3. Update frontend filter buttons to include all identified categories",
    "4. Consider creating subcategories for large categories like Residential",
    "5. Implement dynamic category detection for future property types",
]

print("\n".join(summary_lines))