# DOR Use Code Assignment Analysis
## Comprehensive Analysis of 9.1M Florida Properties

This notebook provides:
- Current use code coverage analysis
- Distribution visualizations
- Assignment validation
- Quality metrics
- County-level breakdowns

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine, text
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide presentation settings.
# pandas: show every column, cap printed rows at 100, render floats with two decimals.
display_settings = {
    'display.max_columns': None,
    'display.max_rows': 100,
    'display.float_format': '{:.2f}'.format,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

# Plot styling: seaborn grid theme and a 14x8 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

print("✅ Libraries imported successfully")

In [None]:
# Connect to Supabase
# Credentials are read from the environment so they never live in the notebook.
SUPABASE_URL = os.getenv('SUPABASE_URL')
SUPABASE_KEY = os.getenv('SUPABASE_SERVICE_ROLE_KEY')

if not SUPABASE_URL or not SUPABASE_KEY:
    raise Exception("Missing Supabase credentials. Set SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY")

# Drop the scheme and any trailing slash so the remainder can be used as a host name.
db_url = SUPABASE_URL.replace('https://', '').rstrip('/')

# NOTE(review): split('.')[1] takes the SECOND dot-separated label of the host.
# For a URL shaped like https://<project-ref>.supabase.co that is "supabase",
# not the project ref — confirm the expected SUPABASE_URL format, and confirm
# that the service-role key is really the database password for this role.
connection_string = f"postgresql://postgres.{db_url.split('.')[1]}:{SUPABASE_KEY}@{db_url}:5432/postgres"

engine = create_engine(connection_string, pool_size=10)

# create_engine() does not open a connection by itself; run a trivial query so
# a bad host or bad credentials fail here instead of in the first analysis cell.
with engine.connect() as conn:
    conn.execute(text("SELECT 1"))

print("✅ Connected to Supabase database")

## 1. Overall Status Analysis

In [None]:
# Overall coverage: how many 2025 parcels have a non-empty DOR use code.
query = """
SELECT
    COUNT(*) as total_properties,
    COUNT(CASE WHEN dor_uc IS NOT NULL AND dor_uc != '' THEN 1 END) as with_code,
    COUNT(CASE WHEN dor_uc IS NULL OR dor_uc = '' THEN 1 END) as without_code,
    ROUND(COUNT(CASE WHEN dor_uc IS NOT NULL AND dor_uc != '' THEN 1 END)::numeric / COUNT(*) * 100, 2) as coverage_pct
FROM florida_parcels
WHERE year = 2025
"""

overall_stats = pd.read_sql(query, engine)

# The query returns a single row; read each figure from its own column so the
# integer counts keep their integer formatting.
divider = "=" * 60
count_lines = (
    ("Total Properties", 'total_properties'),
    ("With DOR Code", 'with_code'),
    ("Without DOR Code", 'without_code'),
)

print("📊 OVERALL DOR USE CODE STATUS")
print(divider)
for caption, column in count_lines:
    print(f"{caption}: {overall_stats[column].iloc[0]:,}")
print(f"Coverage: {overall_stats['coverage_pct'].iloc[0]:.2f}%")
print(divider)

In [None]:
# Visualize coverage: share of parcels with/without a code (pie) and raw counts (bars).
fig, ax = plt.subplots(1, 2, figsize=(14, 6))
pie_ax, bar_ax = ax

labels = ['With DOR Code', 'Without DOR Code']
sizes = [overall_stats[column].iloc[0] for column in ('with_code', 'without_code')]
colors = ['#4CAF50', '#FF5252']
explode = (0.05, 0)  # pull the "with code" wedge slightly out of the pie

# Left panel: proportions
pie_ax.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
)
pie_ax.set_title('DOR Use Code Coverage', fontsize=14, fontweight='bold')

# Right panel: absolute counts, with plain (non-scientific) tick labels
bar_ax.bar(labels, sizes, color=colors)
bar_ax.set_ylabel('Number of Properties', fontsize=12)
bar_ax.set_title('Properties by Assignment Status', fontsize=14, fontweight='bold')
bar_ax.ticklabel_format(style='plain', axis='y')

# Print the exact count above each bar
for position, value in enumerate(sizes):
    bar_ax.text(position, value, f'{value:,}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

## 2. Use Code Distribution Analysis

In [None]:
# The 30 most frequent use codes for 2025, with description/category from the
# lookup table and count, share of all 2025 parcels, and just-value aggregates.
query = """
SELECT
    fp.dor_uc,
    duc.use_description,
    duc.category,
    COUNT(*) as count,
    ROUND(COUNT(*)::numeric / (SELECT COUNT(*) FROM florida_parcels WHERE year = 2025) * 100, 2) as percentage,
    ROUND(AVG(fp.just_value)::numeric, 2) as avg_value,
    SUM(fp.just_value) as total_value
FROM florida_parcels fp
LEFT JOIN dor_use_codes duc ON fp.dor_uc = duc.use_code
WHERE fp.year = 2025 AND fp.dor_uc IS NOT NULL AND fp.dor_uc != ''
GROUP BY fp.dor_uc, duc.use_description, duc.category
ORDER BY count DESC
LIMIT 30
"""

use_code_dist = pd.read_sql(query, engine)

# Heading followed by the full table, without the DataFrame index
print("\n📊 TOP 30 USE CODES", use_code_dist.to_string(index=False), sep="\n")

In [None]:
# Visualize the 20 most common use codes as a horizontal bar chart.
fig, ax = plt.subplots(figsize=(14, 8))

top_20 = use_code_dist.head(20)
colors = plt.cm.viridis(np.linspace(0, 1, len(top_20)))

# use_description comes from a LEFT JOIN, so a code that is missing from
# dor_use_codes arrives as None/NaN. Slicing that would raise a TypeError, so
# substitute a placeholder before truncating to 30 characters.
descriptions = top_20['use_description'].fillna('Unknown').astype(str).str[:30]
tick_labels = [f"{code} - {description}"
               for code, description in zip(top_20['dor_uc'], descriptions)]

bars = ax.barh(range(len(top_20)), top_20['count'], color=colors)
ax.set_yticks(range(len(top_20)))
ax.set_yticklabels(tick_labels, fontsize=9)
ax.set_xlabel('Number of Properties', fontsize=12)
ax.set_title('Top 20 DOR Use Codes by Property Count', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # largest count at the top

# Add value labels at the end of each bar
for position, count in enumerate(top_20['count']):
    ax.text(count, position, f' {count:,}', va='center', fontsize=8)

plt.tight_layout()
plt.show()

## 3. Category Analysis

In [None]:
# Parcel count, share of all 2025 parcels, and just-value aggregates for each
# property_use_category (rows with a NULL category form their own group).
query = """
SELECT
    property_use_category,
    COUNT(*) as count,
    ROUND(COUNT(*)::numeric / (SELECT COUNT(*) FROM florida_parcels WHERE year = 2025) * 100, 2) as percentage,
    ROUND(AVG(just_value)::numeric, 2) as avg_value,
    SUM(just_value) as total_value
FROM florida_parcels
WHERE year = 2025
GROUP BY property_use_category
ORDER BY count DESC
"""

category_dist = pd.read_sql(query, engine)

# Heading followed by the full table, without the DataFrame index
print("\n📊 PROPERTY USE CATEGORIES", category_dist.to_string(index=False), sep="\n")

In [None]:
# Visualize categories: share of parcels (pie) and total value (bars).
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
share_ax, value_ax = ax

# Rows without a category have no label to plot, so leave them out
category_clean = category_dist.dropna(subset=['property_use_category'])
category_names = category_clean['property_use_category']
positions = range(len(category_clean))
colors = plt.cm.Set3(np.linspace(0, 1, len(category_clean)))

# Left panel: parcel counts per category
share_ax.pie(
    category_clean['count'],
    labels=category_names,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
share_ax.set_title('Properties by Category', fontsize=14, fontweight='bold')

# Right panel: summed value per category, with plain (non-scientific) ticks
bars = value_ax.bar(positions, category_clean['total_value'], color=colors)
value_ax.set_xticks(positions)
value_ax.set_xticklabels(category_names, rotation=45, ha='right')
value_ax.set_ylabel('Total Property Value ($)', fontsize=12)
value_ax.set_title('Total Value by Category', fontsize=14, fontweight='bold')
value_ax.ticklabel_format(style='plain', axis='y')

plt.tight_layout()
plt.show()

## 4. County-Level Analysis

In [None]:
# Use-code coverage for the 20 counties with the most 2025 parcels.
query = """
SELECT
    county,
    COUNT(*) as total,
    COUNT(CASE WHEN dor_uc IS NOT NULL AND dor_uc != '' THEN 1 END) as with_code,
    COUNT(CASE WHEN dor_uc IS NULL OR dor_uc = '' THEN 1 END) as without_code,
    ROUND(COUNT(CASE WHEN dor_uc IS NOT NULL AND dor_uc != '' THEN 1 END)::numeric / COUNT(*) * 100, 2) as coverage_pct
FROM florida_parcels
WHERE year = 2025
GROUP BY county
ORDER BY total DESC
LIMIT 20
"""

county_coverage = pd.read_sql(query, engine)

# Heading followed by the full table, without the DataFrame index
print("\n📊 TOP 20 COUNTIES BY PROPERTY COUNT", county_coverage.to_string(index=False), sep="\n")

In [None]:
# Visualize county coverage as one bar per county, in query order.
fig, ax = plt.subplots(figsize=(14, 8))

x = range(len(county_coverage))
ax.bar(x, county_coverage['coverage_pct'], color='#2196F3')
ax.set_xticks(x)
ax.set_xticklabels(county_coverage['county'], rotation=45, ha='right')
ax.set_ylabel('Coverage Percentage', fontsize=12)
ax.set_title('DOR Use Code Coverage by County (Top 20)', fontsize=14, fontweight='bold')

# Dashed reference lines at full coverage and at the 80% mark
reference_lines = (
    (100, 'green', '100% Coverage'),
    (80, 'orange', '80% Coverage'),
)
for level, line_color, line_label in reference_lines:
    ax.axhline(y=level, color=line_color, linestyle='--', label=line_label)
ax.legend()

plt.tight_layout()
plt.show()

## 5. Value Analysis

In [None]:
# Min / average / max / total just value per use code, restricted to parcels
# with a positive just_value; the 20 codes with the largest total are kept.
query = """
SELECT
    fp.dor_uc,
    duc.use_description,
    COUNT(*) as count,
    ROUND(MIN(fp.just_value)::numeric, 2) as min_value,
    ROUND(AVG(fp.just_value)::numeric, 2) as avg_value,
    ROUND(MAX(fp.just_value)::numeric, 2) as max_value,
    SUM(fp.just_value) as total_value
FROM florida_parcels fp
LEFT JOIN dor_use_codes duc ON fp.dor_uc = duc.use_code
WHERE fp.year = 2025
AND fp.dor_uc IS NOT NULL
AND fp.dor_uc != ''
AND fp.just_value > 0
GROUP BY fp.dor_uc, duc.use_description
ORDER BY total_value DESC
LIMIT 20
"""

value_analysis = pd.read_sql(query, engine)

# Heading followed by the full table, without the DataFrame index
print("\n💰 VALUE ANALYSIS BY USE CODE (Top 20)", value_analysis.to_string(index=False), sep="\n")

In [None]:
# Visualize average values for the first 15 rows of value_analysis
# (which the query orders by total value, descending).
fig, ax = plt.subplots(figsize=(14, 8))

top_15 = value_analysis.head(15)
colors = plt.cm.plasma(np.linspace(0, 1, len(top_15)))

# use_description comes from a LEFT JOIN, so a code that is missing from
# dor_use_codes arrives as None/NaN. Slicing that would raise a TypeError, so
# substitute a placeholder before truncating to 25 characters.
descriptions = top_15['use_description'].fillna('Unknown').astype(str).str[:25]
tick_labels = [f"{code} - {description}"
               for code, description in zip(top_15['dor_uc'], descriptions)]

bars = ax.barh(range(len(top_15)), top_15['avg_value'], color=colors)
ax.set_yticks(range(len(top_15)))
ax.set_yticklabels(tick_labels, fontsize=9)
ax.set_xlabel('Average Property Value ($)', fontsize=12)
ax.set_title('Average Property Value by Use Code', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # first row at the top

plt.tight_layout()
plt.show()

## 6. Quality Validation

In [None]:
# Quality check: list every distinct 2025 use code that has no matching row in
# the dor_use_codes lookup table.
query = """
SELECT DISTINCT fp.dor_uc
FROM florida_parcels fp
WHERE fp.year = 2025
AND fp.dor_uc IS NOT NULL
AND fp.dor_uc != ''
AND NOT EXISTS (
    SELECT 1 FROM dor_use_codes duc
    WHERE duc.use_code = fp.dor_uc
)
ORDER BY fp.dor_uc
"""

invalid_codes = pd.read_sql(query, engine)
invalid_count = len(invalid_codes)

print("\n⚠️ INVALID USE CODES")
if invalid_count:
    # Report how many codes are unmatched, then list them
    print(f"❌ Found {invalid_count} invalid use codes:")
    print(invalid_codes.to_string(index=False))
else:
    print("✅ No invalid use codes found!")

## 7. Summary Report

In [None]:
# Generate final summary report from the results of the earlier cells.
banner = "=" * 80

print("\n" + banner)
print("📋 DOR USE CODE ASSIGNMENT SUMMARY REPORT")
print(banner)
print(f"\nGenerated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

print("\n1. OVERALL STATUS:")
print(f"   Total Properties: {overall_stats['total_properties'].iloc[0]:,}")
print(f"   Coverage: {overall_stats['coverage_pct'].iloc[0]:.2f}%")
print(f"   Remaining: {overall_stats['without_code'].iloc[0]:,}")

# use_code_dist is capped at 30 rows by its LIMIT clause, so its length is not
# the number of distinct codes in use. Count them directly, using the same
# "non-empty code, year 2025" filter as the distribution query.
unique_codes_query = """
SELECT COUNT(DISTINCT dor_uc) AS unique_codes
FROM florida_parcels
WHERE year = 2025 AND dor_uc IS NOT NULL AND dor_uc != ''
"""
unique_code_count = pd.read_sql(unique_codes_query, engine)['unique_codes'].iloc[0]

print("\n2. USE CODE DIVERSITY:")
print(f"   Unique Use Codes: {unique_code_count}")
if use_code_dist.empty:
    # No parcel has a use code yet, so there is no "most common" row to read.
    print("   Most Common: n/a")
else:
    top_code = use_code_dist.iloc[0]
    # The description is NULL when the code is missing from the lookup table.
    top_description = top_code['use_description'] if pd.notna(top_code['use_description']) else 'Unknown'
    print(f"   Most Common: {top_code['dor_uc']} - {top_description}")
    print(f"   Properties in Top Code: {top_code['count']:,}")

# Derive the named categories from category_dist here, so the report does not
# depend on the plotting cell having been run first.
named_categories = category_dist[category_dist['property_use_category'].notna()]

print("\n3. CATEGORIES:")
for _, row in named_categories.iterrows():
    print(f"   {row['property_use_category']}: {row['count']:,} ({row['percentage']:.1f}%)")

print("\n4. DATA QUALITY:")
print(f"   Invalid Codes: {len(invalid_codes)}")
print(f"   Quality Status: {'✅ PASS' if len(invalid_codes) == 0 else '⚠️ NEEDS ATTENTION'}")

print("\n" + banner)
print("✅ Analysis Complete")
print(banner)

## 8. Export Results

In [None]:
# Export each result table to a CSV file in the working directory. All files
# share one timestamp suffix so a single run's outputs can be matched up.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

exports = {
    'dor_use_code_distribution': use_code_dist,
    'dor_category_distribution': category_dist,
    'dor_county_coverage': county_coverage,
    'dor_value_analysis': value_analysis,
}
for file_prefix, frame in exports.items():
    frame.to_csv(f'{file_prefix}_{timestamp}.csv', index=False)

print(f"✅ Results exported with timestamp: {timestamp}")