In [None]:
# AI Data Journalist Analysis: Pedestrian Ramp Complaints
# Following systematic analysis rules for story discovery

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silence every warning category for the rest of the session so library
# notices do not interleave with the printed report.
warnings.filterwarnings('ignore')

# Widen pandas' console rendering: no cap on the number of columns or on the
# line width, and text cells are cut off at 100 characters.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)

# Report banner, stamped with the local wall-clock time of this run.
run_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print("=== AI DATA JOURNALIST ANALYSIS ===")
print("Dataset: Pedestrian Ramp Complaints")
print("Analysis Date:", run_timestamp)
print("="*50)


In [None]:
# STEP 1: INITIAL DATA ASSESSMENT
# Loads the complaints CSV into `df` and prints a structural overview:
# column names/dtypes, missing-value counts, a 3-row preview, memory footprint
# and the number of fully duplicated rows.
print("STEP 1: INITIAL DATA ASSESSMENT")
print("="*40)

# Load and validate the dataset.
try:
    df = pd.read_csv('Pedestrian_Ramp_Complaints_20250708.csv')
except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    # Re-raise: every statement below needs `df`, so continuing would only
    # replace the real cause with an unrelated NameError.
    raise
else:
    print("✓ Dataset loaded successfully")
    print(f"Dataset shape: {df.shape[0]} rows, {df.shape[1]} columns")

print("\n" + "="*40)
print("COLUMN ANALYSIS")
print("="*40)

# Identify all columns and their data types
print("Column Names and Data Types:")
for i, (col, dtype) in enumerate(zip(df.columns, df.dtypes), 1):
    print(f"{i:2d}. {col:<30} | {dtype}")

print(f"\nTotal columns: {len(df.columns)}")

# Check for missing values
print("\n" + "="*40)
print("MISSING VALUES ANALYSIS")
print("="*40)

missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

# One row per column, worst offenders first.
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': missing_data.values,
    'Missing_Percent': missing_percent.values
}).sort_values('Missing_Count', ascending=False)

has_missing = missing_summary['Missing_Count'] > 0
if has_missing.any():
    print("Missing values by column:")
    print(missing_summary[has_missing])
else:
    # Avoid printing an "Empty DataFrame" placeholder when nothing is missing.
    print("✓ No missing values found in dataset")

# Basic dataset info
print("\n" + "="*40)
print("DATASET OVERVIEW")
print("="*40)
print("First few rows:")
print(df.head(3))

print("\nDataset Info:")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print(f"Duplicate rows: {df.duplicated().sum()}")


In [None]:
# NUMERICAL COLUMNS ANALYSIS
# Splits the columns of `df` into numeric vs. text/category groups (both lists
# are reused by later cells) and prints descriptive statistics for the
# numeric ones.
section_rule = "="*40
print("\n" + section_rule)
print("NUMERICAL STATISTICS")
print(section_rule)

# Identify numerical columns
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)

print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")
print(f"Categorical columns ({len(categorical_cols)}): {categorical_cols}")

if not numerical_cols:
    print("No numerical columns found for statistical analysis.")
else:
    print("\nBasic Statistics for Numerical Columns:")
    print(df[numerical_cols].describe())

    # Spread and shape measures that describe() does not report.
    print("\nAdditional Statistics:")
    for col in numerical_cols:
        numeric_series = df[col]
        print(f"\n{col}:")
        print(f"  Range: {numeric_series.min()} to {numeric_series.max()}")
        shape_measures = (
            ("Std Dev", numeric_series.std()),
            ("Variance", numeric_series.var()),
            ("Skewness", numeric_series.skew()),
            ("Kurtosis", numeric_series.kurtosis()),
        )
        for measure_label, measure_value in shape_measures:
            print(f"  {measure_label}: {measure_value:.2f}")

# Check for date columns
# Detects date-like columns by name, parses each into a sibling
# `<column>_parsed` datetime column on `df`, and prints the covered range.
print("\n" + "="*40)
print("TEMPORAL COVERAGE ANALYSIS")
print("="*40)

# Look for potential date columns.
# Two exclusions keep the cell re-runnable and the results meaningful:
#   * `*_parsed` columns are this cell's own output; re-detecting them on a
#     second run would create `*_parsed_parsed` duplicates.
#   * numeric columns would be read by pd.to_datetime as nanosecond offsets
#     from 1970-01-01 and reported as fully "valid" dates.
date_keywords = ('date', 'time', 'created', 'updated', 'submitted')
date_cols = [
    col for col in df.columns
    if any(keyword in col.lower() for keyword in date_keywords)
    and not col.endswith('_parsed')
    and not pd.api.types.is_numeric_dtype(df[col])
]

print(f"Potential date columns found: {date_cols}")

# Try to parse dates and analyze temporal coverage
for col in date_cols:
    parsed_col = f'{col}_parsed'
    try:
        # Unparseable entries become NaT instead of raising.
        df[parsed_col] = pd.to_datetime(df[col], errors='coerce')
        valid_dates = df[parsed_col].dropna()

        if valid_dates.empty:
            print(f"  No valid dates found in {col}")
            continue

        print(f"\n{col} temporal analysis:")
        print(f"  Date range: {valid_dates.min()} to {valid_dates.max()}")
        print(f"  Time span: {(valid_dates.max() - valid_dates.min()).days} days")
        print(f"  Valid dates: {len(valid_dates)}/{len(df)} ({len(valid_dates)/len(df)*100:.1f}%)")

        # Yearly breakdown
        year_distribution = valid_dates.dt.year.value_counts().sort_index()
        print(f"  Years covered: {year_distribution.index.min()} - {year_distribution.index.max()}")
        print("  Yearly distribution:")
        for year, count in year_distribution.items():
            print(f"    {year}: {count} records")
    except Exception as e:
        # Best effort: one unparseable column must not stop the others.
        print(f"  Error parsing {col}: {e}")


In [None]:
# STEP 2: OVERALL TREND ANALYSIS
# Chooses a parsed date column, then reports yearly, month-of-year and weekday
# complaint patterns. Leaves `main_date_col`, `monthly_counts`, `yearly_counts`
# and `monthly_avg` defined at notebook level whenever a date column is found.
print("\n" + "="*50)
print("STEP 2: OVERALL TREND ANALYSIS")
print("="*50)

# Use the first parsed date column in which more than half of all rows hold a
# valid timestamp.
main_date_col = None
for col in date_cols:
    if f'{col}_parsed' in df.columns:
        valid_dates = df[f'{col}_parsed'].dropna()
        if len(valid_dates) > len(df) * 0.5:  # Use column with >50% valid dates
            main_date_col = f'{col}_parsed'
            break

if main_date_col:
    print(f"Using {main_date_col.replace('_parsed', '')} for trend analysis")

    # Create time series analysis
    df_with_dates = df[df[main_date_col].notna()].copy()
    df_with_dates['year'] = df_with_dates[main_date_col].dt.year
    df_with_dates['month'] = df_with_dates[main_date_col].dt.month
    df_with_dates['quarter'] = df_with_dates[main_date_col].dt.quarter
    df_with_dates['weekday'] = df_with_dates[main_date_col].dt.day_name()

    # Complaints per calendar month (PeriodIndex) and per calendar year.
    monthly_counts = df_with_dates.groupby(df_with_dates[main_date_col].dt.to_period('M')).size()
    yearly_counts = df_with_dates.groupby('year').size()

    print("\n📈 YEARLY TRENDS:")
    print("="*30)

    # NOTE(review): the first and last calendar year may be only partially
    # covered by the extract, which would distort their year-over-year
    # change - confirm coverage before quoting these figures.
    if len(yearly_counts) > 1:
        for i, (year, count) in enumerate(yearly_counts.items()):
            if i > 0:
                prev_count = yearly_counts.iloc[i-1]
                change = count - prev_count
                pct_change = (change / prev_count) * 100
                trend_indicator = "📈" if change > 0 else "📉" if change < 0 else "➡️"
                print(f"  {year}: {count:,} complaints {trend_indicator} ({change:+,} = {pct_change:+.1f}%)")
            else:
                print(f"  {year}: {count:,} complaints (baseline)")

    # Monthly seasonal patterns
    print("\n📅 SEASONAL PATTERNS:")
    print("="*30)

    # True per-month averages: mean complaints for each month-of-year across
    # the calendar months actually spanned by the data. Summing over all years
    # instead would favour months that occur in more (partially covered)
    # years. Months inside the span with no complaints count as zero.
    covered_months = pd.period_range(
        start=monthly_counts.index.min(), end=monthly_counts.index.max(), freq='M'
    )
    monthly_filled = monthly_counts.reindex(covered_months, fill_value=0)
    monthly_avg = monthly_filled.groupby(monthly_filled.index.month).mean()
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    print("Average complaints by month:")
    overall_monthly_avg = monthly_avg.mean()
    for month, avg_count in monthly_avg.items():
        month_name = month_names[month-1]
        deviation = ((avg_count - overall_monthly_avg) / overall_monthly_avg) * 100
        # Flag months more than 20% away from the across-month mean.
        trend_indicator = "🔥" if deviation > 20 else "❄️" if deviation < -20 else "🌡️"
        print(f"  {month_name}: {avg_count:.1f} {trend_indicator} ({deviation:+.1f}% vs avg)")

    # Weekday patterns
    print("\n📊 WEEKDAY PATTERNS:")
    print("="*30)
    weekday_counts = df_with_dates['weekday'].value_counts()
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    weekday_avg = weekday_counts.mean()

    for day in weekday_order:
        if day in weekday_counts:
            count = weekday_counts[day]
            deviation = ((count - weekday_avg) / weekday_avg) * 100
            business_indicator = "💼" if day in ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] else "🏡"
            print(f"  {day}: {count} {business_indicator} ({deviation:+.1f}% vs avg)")

else:
    print("❌ No suitable date column found for trend analysis")
    print("Performing categorical frequency analysis instead...")

    # Alternative analysis when no dates available
    for col in categorical_cols:
        if df[col].nunique() < 20:  # Only analyze columns with reasonable number of unique values
            print(f"\n📊 {col.upper()} FREQUENCY ANALYSIS:")
            value_counts = df[col].value_counts().head(10)
            total = len(df)
            for value, count in value_counts.items():
                percentage = (count / total) * 100
                print(f"  {str(value)[:50]}: {count:,} ({percentage:.1f}%)")

# Growth rate calculations
# Prints the percentage change between each pair of consecutive years in
# `yearly_counts`, then classifies the arithmetic mean of those changes.
# `growth_rates` and `avg_growth` stay defined at notebook level.
if main_date_col and len(yearly_counts) > 1:
    print("\n📈 GROWTH RATE ANALYSIS:")
    print("="*30)

    # Walk consecutive (earlier, later) year pairs.
    growth_rates = []
    year_labels = yearly_counts.index
    for position in range(len(yearly_counts) - 1):
        earlier_total = yearly_counts.iloc[position]
        later_total = yearly_counts.iloc[position + 1]

        yoy_growth = ((later_total - earlier_total) / earlier_total) * 100
        growth_rates.append(yoy_growth)

        print(f"  {year_labels[position]} → {year_labels[position + 1]}: {yoy_growth:+.1f}% growth")

    if growth_rates:
        avg_growth = np.mean(growth_rates)
        print(f"\n  Average annual growth rate: {avg_growth:.1f}%")

        # +/-10% is the cut-off between a "stable" and a notable trend.
        if avg_growth > 10:
            trend_verdict = "  🚨 HIGH GROWTH TREND detected!"
        elif avg_growth < -10:
            trend_verdict = "  📉 DECLINING TREND detected!"
        else:
            trend_verdict = "  ➡️ STABLE TREND detected"
        print(trend_verdict)


In [None]:
# STEP 3: EXTREME VALUE ANALYSIS
# Section banner: blank line, 50-character rule, title, rule.
step3_rule = "="*50
print("\n" + step3_rule)
print("STEP 3: EXTREME VALUE ANALYSIS")
print(step3_rule)

# Function to find and analyze extreme values
def analyze_extreme_values(df, column, analysis_type="both"):
    """Print and return the extreme value(s) of one column.

    Non-numeric columns (object, string, category, datetime, ...) are
    summarised by frequency; numeric columns by their minimum/maximum relative
    to the mean and standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``column``.
    column : str
        Name of the column to analyse.
    analysis_type : {"both", "max", "min"}
        Which extremes to print. Any other value prints no extremes.

    Returns
    -------
    tuple
        * non-numeric column: ``(most_frequent_value, its_count)``
        * numeric column: ``(min_value, its_index_label)`` when
          ``analysis_type == "min"``, otherwise ``(max_value, its_index_label)``
        * ``(None, None)`` when the column holds no non-null values.
    """
    series = df[column]

    # Nothing to rank in an all-null column, whatever its dtype.
    if not series.notna().any():
        return None, None

    show_max = analysis_type in ("both", "max")
    show_min = analysis_type in ("both", "min")

    # Branch on "is numeric" rather than "is object" so that category and
    # dedicated string dtypes are treated as labels instead of being sent to
    # the arithmetic branch, where .mean() would raise.
    if not pd.api.types.is_numeric_dtype(series):
        value_counts = series.value_counts()
        total_rows = len(df)

        print(f"\n🔍 EXTREME VALUES IN {column.upper()}:")
        print("="*40)

        if show_max:
            print("📊 MOST FREQUENT VALUES:")
            for i, (value, count) in enumerate(value_counts.head(5).items()):
                percentage = (count / total_rows) * 100
                print(f"  {i+1}. {str(value)[:50]}: {count:,} occurrences ({percentage:.1f}%)")

        # Only list the tail when it does not simply repeat the head.
        if show_min and len(value_counts) > 5:
            print("\n📊 LEAST FREQUENT VALUES:")
            for i, (value, count) in enumerate(value_counts.tail(5).items()):
                percentage = (count / total_rows) * 100
                print(f"  {i+1}. {str(value)[:50]}: {count:,} occurrences ({percentage:.1f}%)")

        return value_counts.index[0], value_counts.iloc[0]

    # Numerical data
    col_data = series.dropna()
    mean_val = col_data.mean()
    std_val = col_data.std()

    # Computed up front so the return value is defined for every
    # analysis_type, including unrecognised ones.
    max_val, max_idx = col_data.max(), col_data.idxmax()
    min_val, min_idx = col_data.min(), col_data.idxmin()

    def _pct_of_mean(distance):
        """Format ``distance`` as a percentage of the mean; guard a zero mean."""
        if mean_val == 0:
            return "n/a (mean is 0)"
        return f"{(distance / mean_val * 100):.1f}%"

    print(f"\n🔍 EXTREME VALUES IN {column.upper()}:")
    print("="*40)
    print(f"Mean: {mean_val:.2f}, Std Dev: {std_val:.2f}")

    if show_max:
        # std is NaN for a single value and 0 for constant data; report 0 then.
        std_from_mean = (max_val - mean_val) / std_val if std_val > 0 else 0

        print(f"\n📈 MAXIMUM VALUE: {max_val}")
        print(f"  Standard deviations from mean: {std_from_mean:.2f}")
        print(f"  Percentage above mean: {_pct_of_mean(max_val - mean_val)}")

    if show_min:
        std_from_mean = (mean_val - min_val) / std_val if std_val > 0 else 0

        print(f"\n📉 MINIMUM VALUE: {min_val}")
        print(f"  Standard deviations from mean: {std_from_mean:.2f}")
        print(f"  Percentage below mean: {_pct_of_mean(mean_val - min_val)}")

    if analysis_type == "min":
        return min_val, min_idx
    return max_val, max_idx

# Look for location-related columns
# A column counts as location-related when its lower-cased name contains any
# of these substrings.
location_keywords = ('location', 'address', 'street', 'neighborhood',
                     'district', 'area', 'zone', 'community', 'precinct',
                     'council', 'ward', 'borough', 'city', 'zip', 'postal')
location_columns = [
    col for col in df.columns
    if any(keyword in col.lower() for keyword in location_keywords)
]

print("🗺️ LOCATION COLUMNS IDENTIFIED:")
print(f"Found {len(location_columns)} location-related columns: {location_columns}")

# Analyze each location column for extremes.
# Maps column name -> most frequent value and how often it occurs.
extreme_locations = {}

for col in location_columns:
    # Skip columns with no data at all; there is nothing to rank.
    if not df[col].notna().any():
        continue

    # Location identifiers are labels even when stored as numbers (ZIP codes,
    # district numbers). Casting to object makes analyze_extreme_values take
    # its frequency branch, so the second return value really is a count
    # rather than the row index of the numeric maximum.
    labels_only = df[[col]].astype(object)
    extreme_value, extreme_count = analyze_extreme_values(labels_only, col, "max")
    if extreme_value is not None:
        extreme_locations[col] = {
            'value': extreme_value,
            'count': extreme_count,
            'column': col
        }

# Find outliers using IQR method for numerical columns
# A value is an outlier when it lies more than 1.5 * IQR below the first
# quartile or above the third quartile of its column. `outlier_indices`
# collects the index labels of every row flagged in at least one column.
print("\n🎯 STATISTICAL OUTLIERS ANALYSIS:")
print("="*40)

# Upper bound on values echoed per column; identifier-like or zero-IQR columns
# can flag thousands of rows and would otherwise flood the output.
max_outliers_shown = 10

outlier_indices = set()
for col in numerical_cols:
    col_data = df[col].dropna()
    if col_data.empty:
        continue

    q1 = col_data.quantile(0.25)
    q3 = col_data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    outliers = col_data[(col_data < lower_bound) | (col_data > upper_bound)]
    if outliers.empty:
        continue

    print(f"\n{col}: {len(outliers)} outliers found")
    print(f"  Normal range: {lower_bound:.2f} to {upper_bound:.2f}")

    # tolist() yields plain Python numbers, which print without numpy wrappers.
    ordered_values = outliers.sort_values().tolist()
    if len(ordered_values) <= max_outliers_shown:
        print(f"  Outlier values: {ordered_values}")
    else:
        half = max_outliers_shown // 2
        print(f"  Outlier values (lowest {half} and highest {half} of {len(ordered_values):,}): "
              f"{ordered_values[:half]} ... {ordered_values[-half:]}")

    outlier_indices.update(outliers.index)

if outlier_indices:
    print(f"\nTotal records with outliers: {len(outlier_indices)}")
else:
    print("No statistical outliers found in numerical columns.")

# Analyze all categorical columns for frequency-based extremes
# For every text/category column whose name looks location-like, record its
# most frequent value; the single highest count across those columns is then
# published as EXTREME_LOCATION / EXTREME_COLUMN / EXTREME_COUNT.
print("\n📍 COMPREHENSIVE LOCATION ANALYSIS:")
print("="*50)

location_name_hints = ('location', 'address', 'street', 'neighborhood', 'area', 'district')
all_location_analysis = {}

for col in categorical_cols:
    lowered_name = col.lower()
    if not any(hint in lowered_name for hint in location_name_hints):
        continue

    value_counts = df[col].value_counts()
    if len(value_counts) == 0:
        continue

    # value_counts() is sorted by descending frequency, so row 0 is the mode.
    most_frequent = value_counts.index[0]
    frequency = value_counts.iloc[0]
    unique_locations = len(value_counts)

    all_location_analysis[col] = {
        'most_frequent_location': most_frequent,
        'complaint_count': frequency,
        'percentage': (frequency / len(df)) * 100,
        'total_unique_locations': unique_locations
    }

    print(f"\n🏆 {col.upper()} - TOP COMPLAINT LOCATION:")
    print(f"  Location: {most_frequent}")
    print(f"  Complaints: {frequency:,}")
    print(f"  Percentage of total: {(frequency / len(df)) * 100:.1f}%")
    print(f"  Total unique locations: {unique_locations:,}")

    # Show top 5 locations
    print(f"  Top 5 locations in {col}:")
    rank = 0
    for location, count in value_counts.head(5).items():
        rank += 1
        pct = (count / len(df)) * 100
        print(f"    {rank}. {str(location)[:60]}: {count:,} ({pct:.1f}%)")

# Store the extreme location for detailed analysis
if not all_location_analysis:
    print("❌ No clear location columns found for extreme analysis")
    EXTREME_LOCATION = None
    EXTREME_COLUMN = None
    EXTREME_COUNT = 0
else:
    # Winner = the column whose top value has the highest raw count.
    # NOTE(review): columns of very different granularity compete on raw count
    # here, so a coarse column with few distinct values can win over a
    # street-level one - confirm this is the intended notion of "extreme".
    extreme_column = max(
        all_location_analysis,
        key=lambda name: all_location_analysis[name]['complaint_count'],
    )
    extreme_data = all_location_analysis[extreme_column]

    print(f"\n🚨 EXTREME LOCATION IDENTIFIED:")
    print("="*40)
    print(f"Column: {extreme_column}")
    print(f"Location: {extreme_data['most_frequent_location']}")
    print(f"Complaint Count: {extreme_data['complaint_count']:,}")
    print(f"Percentage: {extreme_data['percentage']:.1f}%")

    # Store for detailed analysis in next step
    EXTREME_LOCATION = extreme_data['most_frequent_location']
    EXTREME_COLUMN = extreme_column
    EXTREME_COUNT = extreme_data['complaint_count']

    print(f"\n✅ Will perform detailed analysis on: {EXTREME_LOCATION}")


In [None]:
# STEP 4: DETAILED EXTREME LOCATION ANALYSIS
# Deep dive into the rows whose EXTREME_COLUMN equals EXTREME_LOCATION: how
# unusual the count is, how it compares with the runners-up, a preview of the
# raw records, its yearly trend, and patterns in the remaining columns.
print("\n" + "="*60)
print("STEP 4: DETAILED EXTREME LOCATION ANALYSIS")
print("="*60)

if EXTREME_LOCATION and EXTREME_COLUMN:
    print(f"🎯 DEEP DIVE ANALYSIS: {EXTREME_LOCATION}")
    print(f"📊 Column: {EXTREME_COLUMN}")
    print("="*60)

    # Filter data for the extreme location
    extreme_location_data = df[df[EXTREME_COLUMN] == EXTREME_LOCATION].copy()

    print(f"📈 BASIC STATISTICS:")
    print(f"  Total complaints at this location: {len(extreme_location_data):,}")
    print(f"  Percentage of all complaints: {(len(extreme_location_data) / len(df)) * 100:.2f}%")

    # Calculate how extreme this is, relative to the per-location counts.
    all_location_counts = df[EXTREME_COLUMN].value_counts()
    mean_complaints = all_location_counts.mean()
    std_complaints = all_location_counts.std()

    # std is NaN with a single location and 0 when all counts are equal; the
    # z-score is undefined in both cases and is skipped.
    if std_complaints > 0:
        std_deviations = (EXTREME_COUNT - mean_complaints) / std_complaints
        print(f"  Standard deviations above mean: {std_deviations:.2f}")

        if std_deviations > 3:
            print("  🚨 EXTREMELY UNUSUAL - Beyond 3 standard deviations!")
        elif std_deviations > 2:
            print("  ⚠️  VERY UNUSUAL - Beyond 2 standard deviations!")
        elif std_deviations > 1:
            print("  📊 MODERATELY UNUSUAL - Beyond 1 standard deviation")

    print(f"  Average complaints per location: {mean_complaints:.1f}")
    print(f"  This location vs average: {(EXTREME_COUNT / mean_complaints):.1f}x higher")

    # Show comparison with next closest locations
    print(f"\n📊 COMPARISON WITH OTHER TOP LOCATIONS:")
    print("="*50)
    top_5_locations = all_location_counts.head(5)
    for i, (location, count) in enumerate(top_5_locations.items(), 1):
        percentage = (count / len(df)) * 100
        is_extreme = "🎯" if location == EXTREME_LOCATION else "  "
        print(f"{is_extreme} {i}. {str(location)[:50]}")
        print(f"     Complaints: {count:,} ({percentage:.1f}%)")
        if i == 1 and len(top_5_locations) > 1:
            diff_to_second = count - top_5_locations.iloc[1]
            print(f"     Gap to #2: {diff_to_second:,} complaints ({(diff_to_second/top_5_locations.iloc[1]*100):.1f}% more)")

    # Record preview: at most the first 10 rows are printed in full.
    display_count = min(10, len(extreme_location_data))
    print(f"\n📋 COMPLETE RECORDS FOR {EXTREME_LOCATION}:")
    print("="*60)
    # State what is actually shown instead of claiming "all" records.
    print(f"Showing first {display_count} of {len(extreme_location_data)} records:")

    for i in range(display_count):
        record = extreme_location_data.iloc[i]
        print(f"\n🔍 RECORD {i+1}:")
        for col, value in record.items():
            if pd.notna(value):
                print(f"  {col}: {value}")

    if len(extreme_location_data) > display_count:
        print(f"\n... and {len(extreme_location_data) - display_count} more records")

    # Temporal analysis for this location if dates available
    if main_date_col and main_date_col in extreme_location_data.columns:
        extreme_with_dates = extreme_location_data[extreme_location_data[main_date_col].notna()]

        if len(extreme_with_dates) > 0:
            print(f"\n📅 TEMPORAL ANALYSIS FOR {EXTREME_LOCATION}:")
            print("="*50)

            date_range = extreme_with_dates[main_date_col]
            print(f"Date range: {date_range.min()} to {date_range.max()}")
            print(f"Time span: {(date_range.max() - date_range.min()).days} days")

            # Yearly breakdown for this location
            yearly_breakdown = extreme_with_dates.groupby(extreme_with_dates[main_date_col].dt.year).size()
            if len(yearly_breakdown) > 0:
                print("\nComplaints by year:")
                for year, count in yearly_breakdown.items():
                    print(f"  {year}: {count:,} complaints")

                # Trend = change between the first and last year present.
                if len(yearly_breakdown) > 1:
                    years = list(yearly_breakdown.index)
                    counts = list(yearly_breakdown.values)
                    trend_change = counts[-1] - counts[0]
                    trend_pct = (trend_change / counts[0]) * 100 if counts[0] > 0 else 0

                    print(f"\nTrend: {trend_change:+,} complaints ({trend_pct:+.1f}%) from {years[0]} to {years[-1]}")

                    if trend_pct > 50:
                        print("  🚨 DRAMATIC INCREASE at this location!")
                    elif trend_pct > 20:
                        print("  📈 SIGNIFICANT INCREASE at this location")
                    elif trend_pct < -50:
                        print("  📉 DRAMATIC DECREASE at this location")
                    elif trend_pct < -20:
                        print("  📉 SIGNIFICANT DECREASE at this location")

    # Analyze patterns in other columns for this extreme location
    print(f"\n🔍 PATTERN ANALYSIS FOR {EXTREME_LOCATION}:")
    print("="*50)

    for col in df.columns:
        # Columns that are constant within this location carry no pattern.
        if col == EXTREME_COLUMN or extreme_location_data[col].nunique() <= 1:
            continue

        col_dtype = df[col].dtype

        # Datetime columns are not summarised in this section.
        if pd.api.types.is_datetime64_any_dtype(col_dtype):
            continue

        # Dtype-family checks (instead of literal 'object'/'int64'/'float64'
        # names) so that category, string, int32 and nullable columns are
        # summarised too. Booleans are treated as labels: describe() on them
        # yields no mean/min/max.
        if pd.api.types.is_numeric_dtype(col_dtype) and not pd.api.types.is_bool_dtype(col_dtype):
            col_stats = extreme_location_data[col].describe()
            print(f"\n{col} statistics:")
            print(f"  Mean: {col_stats['mean']:.2f}")
            print(f"  Range: {col_stats['min']:.2f} to {col_stats['max']:.2f}")
        else:
            value_counts = extreme_location_data[col].value_counts().head(5)
            if len(value_counts) > 0:
                print(f"\n{col} patterns:")
                for value, count in value_counts.items():
                    pct = (count / len(extreme_location_data)) * 100
                    print(f"  {str(value)[:40]}: {count} ({pct:.1f}%)")

    # Potential causes analysis (static editorial prompts, not derived from data)
    print(f"\n💡 POTENTIAL STORY ANGLES FOR {EXTREME_LOCATION}:")
    print("="*60)

    print("🔍 Possible explanations for high complaint volume:")
    print("  • High foot traffic area (business district, transit hub)")
    print("  • Aging infrastructure requiring frequent repairs")
    print("  • Accessibility-focused community advocacy")
    print("  • Poor initial construction requiring repeated fixes")
    print("  • Area with high disability/senior population")
    print("  • Tourist/visitor area with higher visibility")
    print("  • Recent development or construction impacts")

    print(f"\n📰 JOURNALISTIC ANGLES TO EXPLORE:")
    print("  • Interview residents/business owners about accessibility challenges")
    print("  • Check city budget allocated to this area for ramp repairs")
    print("  • Compare complaint resolution times to other areas")
    print("  • Investigate if complaints correlate with specific events/seasons")
    print("  • Examine demographic data for disability/accessibility needs")
    print("  • Follow up on completed vs pending complaints")

    print(f"\n📊 RECOMMENDED FOLLOW-UP DATA:")
    print("  • City budget data for infrastructure repairs")
    print("  • Demographic data (age, disability status) for this area")
    print("  • Complaint resolution times and completion rates")
    print("  • Construction/development permits in the area")
    print("  • Public transit usage data")
    print("  • Similar data from other cities for comparison")

else:
    print("❌ No extreme location identified for detailed analysis")
    print("This could mean:")
    print("  • No clear location columns found in the dataset")
    print("  • Complaints are evenly distributed across locations")
    print("  • Location data may need cleaning or standardization")

In [None]:
# STEP 5: STORY INSIGHTS SUMMARY
# Turns the findings of the earlier steps into a list of candidate stories.
# Each entry of `story_insights` is a dict with the keys: headline, type,
# key_finding, primary_metric, comparison, significance, angle.
print("\n" + "="*60)
print("STEP 5: NEWS STORY INSIGHTS SUMMARY")
print("="*60)

# Collect all potential stories
story_insights = []

# Story 1: Extreme Location Story
if EXTREME_LOCATION and EXTREME_COLUMN:
    extreme_percentage = (EXTREME_COUNT / len(df)) * 100

    # Recompute the z-score from the data instead of probing for a variable
    # left behind by an earlier cell, which could be missing or stale.
    per_location_counts = df[EXTREME_COLUMN].value_counts()
    location_mean = per_location_counts.mean()
    location_std = per_location_counts.std()
    if location_std > 0:
        z_score = (EXTREME_COUNT - location_mean) / location_std
        extreme_significance = f"{z_score:.1f} standard deviations above average"
    else:
        # std is NaN (one location) or 0 (all equal): no z-score to quote.
        extreme_significance = "Highly unusual concentration"

    story_insights.append({
        'headline': f"Single Location Accounts for {extreme_percentage:.1f}% of All Pedestrian Ramp Complaints",
        'type': 'Extreme',
        'key_finding': f"{EXTREME_LOCATION} has generated {EXTREME_COUNT:,} complaints, far exceeding other locations",
        'primary_metric': EXTREME_COUNT,
        'comparison': f"Average location has {location_mean:.1f} complaints",
        'significance': extreme_significance,
        'angle': "Infrastructure inequality and accessibility hotspots in the city"
    })

# Story 2: Temporal Trends
# `yearly_counts` / `avg_growth` exist only when the trend cell found a usable
# date column with more than one year, hence the existence checks.
if (main_date_col
        and 'yearly_counts' in locals() and len(yearly_counts) > 1
        and 'avg_growth' in locals() and abs(avg_growth) > 10):
    trend_type = "surge" if avg_growth > 0 else "decline"
    story_insights.append({
        'headline': f"Pedestrian Ramp Complaints Show {avg_growth:+.1f}% Annual {trend_type.title()}",
        'type': 'Trend',
        'key_finding': f"Complaints have been {'increasing' if avg_growth > 0 else 'decreasing'} by an average of {abs(avg_growth):.1f}% per year",
        'primary_metric': f"{avg_growth:+.1f}% annual growth",
        'comparison': f"Total change from {yearly_counts.index[0]} to {yearly_counts.index[-1]}: {yearly_counts.iloc[-1] - yearly_counts.iloc[0]:+,} complaints",
        'significance': "Major trend requiring attention",
        'angle': "City infrastructure maintenance and accessibility planning effectiveness"
    })

# Story 3: Distribution Inequality
if len(location_columns) > 0 and EXTREME_COLUMN:
    location_counts = df[EXTREME_COLUMN].value_counts()
    # Number of locations that make up the top 10%; zero when the column has
    # fewer than 10 distinct values, in which case the metric is undefined.
    top_decile_size = int(len(location_counts) * 0.1)

    if top_decile_size >= 1:
        top_10_pct = location_counts.head(top_decile_size).sum()
        total_complaints = len(df)
        top_10_pct_share = (top_10_pct / total_complaints) * 100

        if top_10_pct_share > 50:  # If top 10% of locations account for >50% of complaints
            story_insights.append({
                'headline': f"Top 10% of Locations Account for {top_10_pct_share:.1f}% of All Ramp Complaints",
                'type': 'Inequality',
                'key_finding': f"Complaints are heavily concentrated in a small number of locations",
                'primary_metric': f"{top_10_pct_share:.1f}% concentration",
                'comparison': f"Should be ~10% if evenly distributed",
                'significance': "Major geographic inequality in accessibility issues",
                'angle': "Systematic infrastructure neglect in specific areas"
            })

# Story 4: Seasonal Patterns
if main_date_col and 'monthly_avg' in locals():
    max_month = monthly_avg.idxmax()
    min_month = monthly_avg.idxmin()
    peak_level = monthly_avg.max()
    low_level = monthly_avg.min()

    # Peak-to-trough spread relative to the mean month: used as the
    # "is there seasonality at all" threshold and as the variation metric.
    seasonal_difference = ((peak_level - low_level) / monthly_avg.mean()) * 100
    # Drop from the peak month to the low month, relative to the peak. This is
    # what a "drops X%" headline claims, and it differs from the spread above.
    peak_to_low_drop = ((peak_level - low_level) / peak_level) * 100 if peak_level > 0 else 0

    if seasonal_difference > 30:  # Significant seasonal variation
        month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                       'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
        peak_month_name = month_names[max_month - 1]
        low_month_name = month_names[min_month - 1]

        story_insights.append({
            'headline': f"Pedestrian Ramp Complaints Peak in {peak_month_name}, Drop {peak_to_low_drop:.1f}% in {low_month_name}",
            'type': 'Seasonal',
            'key_finding': f"Clear seasonal pattern with {peak_month_name} seeing most complaints and {low_month_name} the fewest",
            'primary_metric': f"{seasonal_difference:.1f}% seasonal variation",
            'comparison': f"{peak_month_name}: {peak_level:.1f} avg vs {low_month_name}: {low_level:.1f} avg",
            'significance': "Strong seasonal correlation suggests weather or usage pattern factors",
            'angle': "Weather impacts on accessibility infrastructure and complaint filing patterns"
        })

# Print all story insights in journalism format
# One fixed-layout card per entry of `story_insights`, followed by a short
# data-quality recap and a closing summary.
print("📰 POTENTIAL NEWS STORIES IDENTIFIED:")
print("="*50)

story_number = 0
for story in story_insights:
    story_number += 1
    card_lines = [
        f"\n🗞️  STORY INSIGHT #{story_number}:",
        f"HEADLINE: {story['headline']}",
        f"Type: {story['type']}",
        f"Key Finding: {story['key_finding']}",
        "Supporting Data:",
        f"  - Primary metric: {story['primary_metric']}",
    ]
    # The time period line is optional and shown only when the story has one.
    if 'time_period' in story:
        card_lines.append(f"  - Time period: {story['time_period']}")
    card_lines.extend([
        f"  - Comparison: {story['comparison']}",
        f"Statistical Significance: {story['significance']}",
        f"Potential Angle: {story['angle']}",
        "-" * 50,
    ])
    print("\n".join(card_lines))

# Overall Data Quality Assessment
temporal_status = 'Possible' if main_date_col else 'Limited'
print("\n".join([
    f"\n📊 DATA QUALITY ASSESSMENT:",
    "="*40,
    f"✅ Records analyzed: {len(df):,}",
    f"✅ Columns analyzed: {len(df.columns)}",
    f"✅ Location columns identified: {len(location_columns)}",
    f"✅ Temporal analysis: {temporal_status}",
]))

# Explain an empty result instead of ending on a bare "0 stories".
if not story_insights:
    print("\n".join([
        "\n⚠️  LIMITED STORY POTENTIAL:",
        "Possible reasons:",
        "  • Data may be too evenly distributed",
        "  • Time period covered may be too short",
        "  • Additional context data needed",
        "  • May require data cleaning/preprocessing",
    ]))

print(f"\n🎯 ANALYSIS COMPLETE!")
print(f"Found {len(story_insights)} potential news stories")
print("Ready for further investigation and follow-up reporting")

# Summary figure: a bar chart of the 10 most frequent values of
# EXTREME_COLUMN and, when a monthly series exists, a line chart of complaints
# over time. Any failure while plotting is reported and skipped so the
# notebook still reaches its closing banner.
try:
    if EXTREME_LOCATION and EXTREME_COLUMN:
        plt.figure(figsize=(12, 8))

        # Top 10 locations bar chart
        top_locations = df[EXTREME_COLUMN].value_counts().head(10)
        bar_positions = range(len(top_locations))

        # Tick labels longer than 20 characters are truncated with an ellipsis.
        tick_labels = []
        for location in top_locations.index:
            label_text = str(location)
            if len(label_text) > 20:
                label_text = label_text[:20] + '...'
            tick_labels.append(label_text)

        plt.subplot(2, 1, 1)
        bars = plt.bar(bar_positions, top_locations.values)
        # value_counts() sorts by descending frequency, so bar 0 is the top value.
        bars[0].set_color('red')  # Highlight the extreme location
        plt.title(f'Top 10 Locations by Complaint Count\n(Red bar shows extreme location: {EXTREME_LOCATION})')
        plt.ylabel('Number of Complaints')
        plt.xticks(bar_positions, tick_labels, rotation=45, ha='right')

        # Time series if available
        has_monthly_series = main_date_col and 'monthly_counts' in locals()
        if has_monthly_series:
            plt.subplot(2, 1, 2)
            monthly_counts.plot(kind='line', marker='o')
            plt.title('Complaints Over Time')
            plt.ylabel('Number of Complaints')
            plt.xlabel('Time Period')
            plt.xticks(rotation=45)

        plt.tight_layout()
        plt.show()

        print("\n📈 Visualization generated above")

except Exception as e:
    print(f"\n📊 Visualization skipped: {e}")

closing_rule = "="*60
print("\n" + closing_rule)
print("AI DATA JOURNALIST ANALYSIS COMPLETE")
print(closing_rule)
