# COGS 128: Visualizing for Trust or Trickery


## 1. Prior Analysis: Exploring the Dataset

In [1]:
!pip install pandas altair

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import altair as alt
from pathlib import Path

# Read one incident CSV per year (2019 through 2024) and stack them.
# Rows the parser cannot handle are skipped instead of aborting the load.
dfs = []
for year in range(2019, 2025):
    print(f"Loading {year}...")
    csv_name = f'fd_incidents_{year}_datasd.csv'
    df = pd.read_csv(
        csv_name,
        encoding='utf-8',
        engine='python',       # the slower but more tolerant parser
        on_bad_lines='skip',   # drop malformed rows
    )
    print(f"  Loaded {len(df):,} rows")
    dfs.append(df)

# Single frame covering every year, with a fresh 0..n-1 index
df_all = pd.concat(dfs, ignore_index=True)

n_incidents = len(df_all)
print(f"\nTotal incidents loaded: {n_incidents:,}")
print(f"Dataset shape: {df_all.shape}")
print(f"Columns: {df_all.columns.tolist()}")

# How many incidents fall under each call category
banner = "=" * 50
print("\n" + banner)
print("CALL CATEGORY BREAKDOWN")
print(banner)
print(df_all['call_category'].value_counts())

# Keep only rows whose category is exactly 'FIRE'; the copy lets later cells
# modify df_fire without touching df_all
is_fire = df_all['call_category'] == 'FIRE'
df_fire = df_all[is_fire].copy()

fire_share = len(df_fire) / len(df_all) * 100
print(f"\n{banner}")
print(f"Total FIRE incidents (2019-2024): {len(df_fire):,}")
print(f"Percentage of all calls: {fire_share:.2f}%")

# Clean zip codes: render them as text without a float-style ".0" suffix and
# mark missing values as NaN.
# (Presumably the column is parsed as float when it contains blanks, which is
# where values like "92101.0" come from -- confirm against the raw CSVs.)
zip_raw = df_fire['address_zip']

# Anchor the pattern so only a trailing ".0" is removed; a plain substring
# replace would also eat a ".0" occurring in the middle of a value.
zip_text = zip_raw.astype(str).str.replace(r'\.0$', '', regex=True)

# Blank out missing entries with an explicit mask. Series.replace('nan', None)
# is avoided because older pandas versions treat value=None as "not given" and
# forward-fill the previous row's value instead of inserting a missing value.
df_fire['address_zip'] = zip_text.mask(zip_raw.isna() | (zip_text == 'nan'))

# Check for missing/invalid zip codes
print(f"\n{'='*50}")
print("ZIP CODE DATA")

Loading 2019...
  Loaded 160,861 rows
Loading 2020...
  Loaded 154,556 rows
Loading 2021...
  Loaded 171,740 rows
Loading 2022...
  Loaded 179,825 rows
Loading 2023...
  Loaded 182,830 rows
Loading 2024...
  Loaded 187,484 rows

Total incidents loaded: 1,037,296
Dataset shape: (1037296, 11)
Columns: ['agency_type', 'call_category', 'address_city', 'jurisdiction', 'problem', 'date_response', 'address_state', 'address_zip', 'day_response', 'month_response', 'year_response']

CALL CATEGORY BREAKDOWN
call_category
Life-Threatening Emergency Response    442796
Emergency Medical Response             218657
Non-Life-Threatening Response           95885
Urgent Response                         88337
HAZARD                                  75945
FIRE                                    35052
Urgent Medical Response                 30762
Non-Emergency Medical Response          28030
RESCUE                                   8511
OTHER                                    7221
SERVICE                 

In [3]:
# Coverage of the zip-code column among fire incidents
zip_col = df_fire['address_zip']
print(f"Missing zip codes: {zip_col.isna().sum()}")
print(f"Unique zip codes: {zip_col.nunique()}")
print("\nTop 15 zip codes by fire incidents:")
print(zip_col.value_counts().head(15))

# Keep only incidents that have a zip code; copy so df_fire stays as it was
df_fire_clean = df_fire[zip_col.notna()].copy()
print(f"\nRecords after removing missing zips: {len(df_fire_clean):,}")

Missing zip codes: 93
Unique zip codes: 53

Top 15 zip codes by fire incidents:
address_zip
92101    4469
92105    2479
92102    2334
92113    2324
92109    1995
92154    1791
92110    1450
92115    1428
92108    1418
92173    1253
92114    1252
92103    1218
92111    1161
92104    1029
92037     828
Name: count, dtype: int64

Records after removing missing zips: 34,959


In [4]:
# VISUALIZATION 1: Top Zip Codes
# Twenty most frequent zip codes as a two-column frame (zip_code, count)
top_zips = (
    df_fire_clean['address_zip']
    .value_counts()
    .head(20)
    .rename_axis('zip_code')
    .reset_index(name='count')
)

# Horizontal bars, longest first, shaded by incident count
alt.Chart(top_zips).mark_bar().encode(
    alt.X('count:Q', title='Number of Fire Incidents'),
    alt.Y('zip_code:N', title='Zip Code', sort='-x'),
    alt.Color('count:Q', scale=alt.Scale(scheme='reds'), legend=None),
    tooltip=['zip_code:N', 'count:Q'],
).properties(
    title='Top 20 Zip Codes by Fire Incidents (2019-2024)',
    width=600,
    height=400,
)

In [5]:
# VISUALIZATION 2: Fires by Year
# One row per response year with the number of fire incidents in that year
yearly_fires = (
    df_fire_clean
    .groupby('year_response')
    .size()
    .reset_index(name='count')
)

# One bar per year; each year gets its own colour, no legend
alt.Chart(yearly_fires).mark_bar().encode(
    alt.X('year_response:O', title='Year'),
    alt.Y('count:Q', title='Number of Fire Incidents'),
    alt.Color('year_response:O', legend=None),
    tooltip=['year_response:O', 'count:Q'],
).properties(
    title='Fire Incidents by Year',
    width=500,
    height=300,
)

In [6]:
# VISUALIZATION 3: Seasonal Pattern
# Incidents per calendar month, summed over every year in the data
monthly_pattern = (
    df_fire_clean
    .groupby('month_response')
    .size()
    .reset_index(name='count')
)

# One bar per month, shaded by incident count
alt.Chart(monthly_pattern).mark_bar().encode(
    alt.X('month_response:O', title='Month'),
    alt.Y('count:Q', title='Total Fire Incidents'),
    alt.Color('count:Q', scale=alt.Scale(scheme='oranges'), legend=None),
    tooltip=['month_response:O', 'count:Q'],
).properties(
    title='Fire Incidents by Month (Aggregated 2019-2024)',
    width=600,
    height=300,
)

## 2. Main Visualization Set for TRUST

In [7]:
# Prepare the per-zip-code totals used by the trust-oriented charts below
import pandas as pd
import altair as alt

# df_fire_clean comes from the exploration section above.
# Count fire incidents per zip code and order the zip codes from most to
# fewest incidents.
zip_counts = (
    df_fire_clean
    .groupby('address_zip')
    .size()
    .reset_index(name='fire_count')
    .sort_values('fire_count', ascending=False)
)

print("Data prepared for trust-based visualizations")
print(f"Total zip codes with fire incidents: {len(zip_counts)}")

Data prepared for trust-based visualizations
Total zip codes with fire incidents: 53


In [8]:
# VISUALIZATION 1: Enhanced Bar Chart with Context
# Horizontal bars for the zip codes with the most fire incidents. Each bar is
# labelled with its count and the tooltip adds the zip code's share of all
# fire incidents.

top_n = 15  # Number of zip codes shown; the chart title follows this value
# zip_counts is already sorted by fire_count (descending), so head() is the top N
top_zips_context = zip_counts.head(top_n).copy()

# Share of ALL fire incidents (not only the displayed zip codes), in percent
total_fires = zip_counts['fire_count'].sum()
top_zips_context['percentage'] = (top_zips_context['fire_count'] / total_fires * 100).round(1)

# Encodings shared by the bar layer and the label layer.
# The x domain gets 10% headroom so the count labels fit right of the longest bar.
base = alt.Chart(top_zips_context).encode(
    x=alt.X('fire_count:Q',
            title='Number of Fire Incidents (2019-2024)',
            scale=alt.Scale(domain=[0, top_zips_context['fire_count'].max() * 1.1])),
    y=alt.Y('address_zip:N',
            title='Zip Code',
            sort='-x'),
)

# Bars in a blue ramp (instead of the reds used in the exploratory chart)
bars = base.mark_bar().encode(
    color=alt.Color('fire_count:Q',
                    scale=alt.Scale(scheme='blues'),
                    legend=None),
    tooltip=[
        alt.Tooltip('address_zip:N', title='Zip Code'),
        alt.Tooltip('fire_count:Q', title='Fire Incidents', format=','),
        alt.Tooltip('percentage:Q', title='% of Total', format='.1f')
    ]
)

# Count labels placed just past the end of each bar
text = base.mark_text(align='left', dx=3).encode(
    text=alt.Text('fire_count:Q', format=',')
)

chart1 = (bars + text).properties(
    width=600,
    height=400,
    title={
        # Built from top_n so the heading cannot drift from the data shown
        "text": f"Fire Incidents by Zip Code: Top {top_n} Neighborhoods",
        "subtitle": f"San Diego Fire Rescue Data (2019-2024) | Total Incidents: {total_fires:,}",
        "fontSize": 16,
        "subtitleFontSize": 12
    }
)

chart1

In [9]:
# VISUALIZATION 2: Year-over-Year Stability
# Bars of fire incidents per year with a dashed line at the across-year mean.
# The label on that line and the subtitle are computed from the same data, so
# the text always agrees with what is drawn.

yearly_data = df_fire_clean.groupby('year_response').size().reset_index(name='count')

# Figures used by the reference line, its label and the subtitle
avg_fires = yearly_data['count'].mean()
n_years = len(yearly_data)
first_year = int(yearly_data['year_response'].min())
last_year = int(yearly_data['year_response'].max())
avg_label = f'{n_years}-year average: {avg_fires:,.0f}'
approx_avg = round(float(avg_fires), -2)  # nearest hundred, for the "~" figure

# Keep the zero-based axis up to at least 7,000 and grow it (10% headroom)
# if any year exceeds that, so no bar runs off the plot.
y_max = max(7000, float(yearly_data['count'].max()) * 1.1)

# Create the bars
bars = alt.Chart(yearly_data).mark_bar(color='steelblue').encode(
    x=alt.X('year_response:O', title='Year', axis=alt.Axis(labelAngle=0)),
    y=alt.Y('count:Q', title='Number of Fire Incidents', scale=alt.Scale(domain=[0, y_max])),
    tooltip=['year_response:O', alt.Tooltip('count:Q', format=',')]
)

# Add average reference line
avg_line = alt.Chart(pd.DataFrame({'y': [avg_fires]})).mark_rule(
    color='gray',
    strokeDash=[5, 5],
    size=2
).encode(
    y='y:Q'
)

# Add text annotation for average
avg_text = alt.Chart(pd.DataFrame({'y': [avg_fires], 'label': [avg_label]})).mark_text(
    align='left',
    dx=5,
    dy=-10,
    color='gray'
).encode(
    x=alt.value(450),  # x position in pixels
    y='y:Q',
    text='label:N'
)

chart2 = (bars + avg_line + avg_text).properties(
    width=600,
    height=350,
    title={
        "text": "Fire Incidents Remain Stable Over Time",
        "subtitle": (
            "Year-over-year comparison shows no alarming increase "
            f"({first_year}-{last_year} average: ~{approx_avg:,.0f}/year)"
        ),
        "fontSize": 16,
        "subtitleFontSize": 12
    }
)

chart2

In [10]:
# VISUALIZATION 3: Seasonal Pattern with Context
# Area chart of fire incidents per calendar month (all years combined) with a
# text annotation placed over July.

# Month number -> short name; used for the tooltip column and the axis labels
MONTH_NAMES = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
}

monthly_data = df_fire_clean.groupby('month_response').size().reset_index(name='count')
monthly_data['month_name'] = monthly_data['month_response'].map(MONTH_NAMES)

# Axis label expression: a chain of "datum.value == N ? 'Name' : ..." tests,
# generated from MONTH_NAMES so the names are written down only once.
month_label_expr = ' : '.join(
    f"datum.value == {num} ? '{name}'"
    for num, name in MONTH_NAMES.items()
    if num < 12
) + f" : '{MONTH_NAMES[12]}'"

# NOTE(review): the y-axis runs from 2,200 to 3,400 rather than starting at 0,
# which visually enlarges month-to-month differences on an area chart --
# confirm this is intended for a trust-oriented figure.
area = alt.Chart(monthly_data).mark_area(
    color='steelblue',
    opacity=0.7,
    line={'color': 'darkblue'}
).encode(
    x=alt.X('month_response:O',
            title='Month',
            axis=alt.Axis(
                values=list(range(1, 13)),
                labelExpr=month_label_expr
            )),
    y=alt.Y('count:Q',
            title='Total Fire Incidents',
            scale=alt.Scale(domain=[2200, 3400])),
    tooltip=['month_name:N', alt.Tooltip('count:Q', format=',')]
)

# Add annotation for summer peak
annotation = alt.Chart(pd.DataFrame({
    'x': [7],
    'y': [3300],
    'text': ['Summer peak\n(dry season)']
})).mark_text(
    align='center',
    fontSize=11,
    color='darkblue',
    fontWeight='bold',
    lineBreak='\n'  # split the label into two lines at the newline
).encode(
    # Ordinal, like the area layer's month axis, so both layers share one
    # x scale without relying on scale-type conflict resolution
    x='x:O',
    y='y:Q',
    text='text:N'
)

chart3 = (area + annotation).properties(
    width=700,
    height=350,
    title={
        "text": "Fire Incidents Peak During Summer Months",
        "subtitle": "Consistent seasonal pattern across 2019-2024 | Peak in July correlates with San Diego's dry season",
        "fontSize": 16,
        "subtitleFontSize": 12
    }
)

chart3

In [11]:
# VISUALIZATION 4: Distribution Overview
# Histogram-style bar chart: how many zip codes fall into each band of total
# fire incidents.

# Band labels, in display order (also used to order the x axis)
range_labels = ['0-500', '501-1,000', '1,001-1,500', '1,501-2,000', '2,001-2,500', '2,501+']

# Bands are right-inclusive: (0, 500], (500, 1000], ...
# The last edge is infinity so the "2,501+" band really is open-ended; a finite
# upper edge would turn larger counts into NaN and drop them from the chart.
zip_counts['fire_range'] = pd.cut(
    zip_counts['fire_count'],
    bins=[0, 500, 1000, 1500, 2000, 2500, float('inf')],
    labels=range_labels
)

# Number of zip codes per band (bands with no zip codes are omitted)
distribution = zip_counts.groupby('fire_range', observed=True).size().reset_index(name='num_zips')

bars = alt.Chart(distribution).mark_bar(color='steelblue').encode(
    x=alt.X('fire_range:N',
            title='Fire Incidents per Zip Code',
            sort=range_labels),
    y=alt.Y('num_zips:Q', title='Number of Zip Codes'),
    tooltip=['fire_range:N', 'num_zips:Q']
)

# Count labels just above each bar
text = bars.mark_text(dy=-5).encode(
    text='num_zips:Q'
)

chart4 = (bars + text).properties(
    width=600,
    height=350,
    title={
        "text": "Most San Diego Neighborhoods Have Relatively Few Fires",
        # The data spans 2019-2024, i.e. six calendar years
        "subtitle": f"Distribution of fire incidents across {len(zip_counts)} zip codes | Majority have fewer than 1,000 incidents over 6 years",
        "fontSize": 16,
        "subtitleFontSize": 12
    }
)

chart4

In [12]:
# BONUS: combined dashboard
# Stacks the four charts: zip-code ranking on top, the yearly and monthly
# charts side by side in the middle, the distribution chart at the bottom.

chart_top = chart1
chart_middle = alt.hconcat(chart2, chart3)
chart_bottom = chart4

dashboard_title = {
    "text": "San Diego Fire Incidents: A Transparent Analysis (2019-2024)",
    "subtitle": "Multiple perspectives reveal stable patterns and geographic concentration",
    "fontSize": 18,
    "subtitleFontSize": 14
}

stacked = alt.vconcat(chart_top, chart_middle, chart_bottom)

# Shared look: no view border, uniform axis fonts, left-anchored titles
final_dashboard = (
    stacked
    .properties(title=dashboard_title)
    .configure_view(strokeWidth=0)
    .configure_axis(labelFontSize=11, titleFontSize=12)
    .configure_title(anchor='start', fontSize=18)
)

final_dashboard