# Finding Story Angles in Census Data

This notebook demonstrates techniques for discovering newsworthy stories in Census data, including:
- Identifying outliers and unusual patterns
- Finding significant demographic shifts
- Discovering correlations between different metrics
- Comparing local data to state/national averages

In [None]:
# Import required libraries
import os
from getpass import getpass

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from census import Census
from scipy import stats

# Set up plotting
plt.style.use('seaborn')
%matplotlib inline

# Initialize Census API
api_key = os.getenv('CENSUS_API_KEY')
if not api_key:
    api_key = input('Enter your Census API key: ')
c = Census(api_key)

## 1. Finding Statistical Outliers

Look for counties or states that are significantly different from the average.

In [None]:
# Request state-level demographic estimates (2019 vintage)
variables = (
    'NAME',
    'B01003_001E',  # Total population
    'B19013_001E',  # Median household income
    'B01002_001E',  # Median age
    'B17001_002E'   # People in poverty
)

# Census variable code -> readable column name
column_labels = {
    'B01003_001E': 'population',
    'B19013_001E': 'median_income',
    'B01002_001E': 'median_age',
    'B17001_002E': 'poverty_count',
}

state_data = pd.DataFrame(c.acs5.state(variables, '*', year=2019)).rename(columns=column_labels)

# Coerce the estimate columns to numbers before doing any statistics
numeric_cols = list(column_labels.values())
state_data[numeric_cols] = state_data[numeric_cols].apply(pd.to_numeric)

# One z-score column per metric, named '<metric>_zscore'
for metric in numeric_cols:
    state_data[f'{metric}_zscore'] = stats.zscore(state_data[metric])

# Report rows whose z-score is more than 2 away from zero in either direction
print("Significant outliers (z-score > 2 or < -2):")
for metric in numeric_cols:
    z_col = f'{metric}_zscore'
    flagged = state_data[state_data[z_col].abs() > 2]
    if flagged.empty:
        continue
    print(f"\n{metric.title()} outliers:")
    print(flagged[['NAME', metric, z_col]])

## 2. Identifying Demographic Shifts

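Look for significant changes in demographics over time that could indicate newsworthy trends. Note that the 2015 and 2019 ACS 5-year estimates cover 2011–2015 and 2015–2019, which overlap in 2015, and that median income is reported in each period's own dollars (not adjusted to a common year), so treat the changes below as leads to verify rather than precise growth figures.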

In [None]:
def get_yearly_data(year):
    """Fetch state population and median household income for one year.

    Args:
        year: Value forwarded to the Census client as ``year=``.

    Returns:
        DataFrame built from the rows returned by ``c.acs5.state`` for all
        states ('*'), with a ``year`` column added and the two estimate
        columns renamed to ``population`` and ``median_income``.
    """
    # Census variable code -> readable column name
    column_names = {
        'B01003_001E': 'population',
        'B19013_001E': 'median_income',
    }
    rows = c.acs5.state(('NAME', *column_names), '*', year=year)
    return pd.DataFrame(rows).assign(year=year).rename(columns=column_names)

# Pull the two years being compared
data_2015 = get_yearly_data(2015)
data_2019 = get_yearly_data(2019)

# Join the two years on state name; shared columns get _2015 / _2019 suffixes
merged_data = data_2015.merge(data_2019, on='NAME', suffixes=('_2015', '_2019'))

# Make sure the estimate columns are numeric before doing arithmetic on them
value_cols = ['population_2015', 'population_2019', 'median_income_2015', 'median_income_2019']
merged_data[value_cols] = merged_data[value_cols].apply(pd.to_numeric)

# Percent change from 2015 to 2019, relative to the 2015 value
for change_col, stem in (('population_change_pct', 'population'),
                         ('income_change_pct', 'median_income')):
    start = merged_data[f'{stem}_2015']
    end = merged_data[f'{stem}_2019']
    merged_data[change_col] = (end - start) / start * 100

# Show the five largest increases for each measure
for heading, change_col in (('Population', 'population_change_pct'),
                            ('Income', 'income_change_pct')):
    print(f"\nTop 5 States by {heading} Growth (2015-2019):")
    print(merged_data.nlargest(5, change_col)[['NAME', change_col]])

## 3. Finding Correlations

Discover relationships between different demographic factors that might suggest story angles.

In [None]:
# Variables for the correlation analysis (section 1's set plus education)
variables = (
    'NAME',
    'B01003_001E',  # Total population
    'B19013_001E',  # Median household income
    'B01002_001E',  # Median age
    'B17001_002E',  # People in poverty
    'B15003_022E'   # Bachelor's degree holders
)

# Census variable code -> readable column name
readable_names = {
    'B01003_001E': 'population',
    'B19013_001E': 'median_income',
    'B01002_001E': 'median_age',
    'B17001_002E': 'poverty_count',
    'B15003_022E': 'bachelors_degree',
}
corr_data = pd.DataFrame(c.acs5.state(variables, '*', year=2019)).rename(columns=readable_names)

# Coerce the estimate columns to numbers
numeric_cols = ['population', 'median_income', 'median_age', 'poverty_count', 'bachelors_degree']
corr_data[numeric_cols] = corr_data[numeric_cols].apply(pd.to_numeric)

# Pairwise correlations between the numeric columns
# NOTE(review): population, poverty_count and bachelors_degree are raw counts,
# so they presumably correlate strongly just because of state size - consider
# using rates before drawing conclusions.
correlation_matrix = corr_data[numeric_cols].corr()

# Heatmap of the full matrix on a fixed -1..1 colour scale
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix of Demographic Factors')
fig.tight_layout()
plt.show()

# Flatten the upper triangle into one record per distinct pair of variables
print("\nStrongest correlations:")
correlations = [
    {
        'variables': f"{first} vs {second}",
        'correlation': correlation_matrix.loc[first, second],
    }
    for pos, first in enumerate(numeric_cols)
    for second in numeric_cols[pos + 1:]
]

correlations_df = pd.DataFrame(correlations)
print("\nTop positive correlations:")
print(correlations_df.nlargest(3, 'correlation'))
print("\nTop negative correlations:")
print(correlations_df.nsmallest(3, 'correlation'))

## 4. Local vs. National Comparisons

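Compare local areas to state and national averages to find interesting divergences. The example below compares each state to the national figures; the same approach works for comparing counties to their state.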

In [None]:
# Get national statistics (same variables as the state-level correlation data)
us_stats = pd.DataFrame(c.acs5.us(variables, year=2019))
us_stats = us_stats.rename(columns={
    'B01003_001E': 'population',
    'B19013_001E': 'median_income',
    'B01002_001E': 'median_age',
    'B17001_002E': 'poverty_count',
    'B15003_022E': 'bachelors_degree'
})

# Convert to numeric, as is done for every state-level frame, so the rate
# arithmetic below does not depend on the client having already cast values
estimate_cols = ['population', 'median_income', 'median_age', 'poverty_count', 'bachelors_degree']
us_stats[estimate_cols] = us_stats[estimate_cols].apply(pd.to_numeric)

# Calculate per capita metrics for comparison
# Work on a separately named copy: rebinding `state_data` here would discard
# the z-score columns from section 1, which the export cell writes to the
# 'State_Outliers' sheet.
state_rates = corr_data.copy()
us_data = us_stats.copy()

for frame in [state_rates, us_data]:
    # Rates are per 100 of total population
    frame['poverty_rate'] = frame['poverty_count'] / frame['population'] * 100
    frame['bachelors_rate'] = frame['bachelors_degree'] / frame['population'] * 100

# National figures: first row of the national frame
national_avg = us_data.iloc[0]

# Compare states to national figures: percent difference relative to the US value
comparisons = state_rates.copy()
for metric in ['poverty_rate', 'bachelors_rate', 'median_income', 'median_age']:
    comparisons[f'{metric}_vs_national'] = (
        (comparisons[metric] - national_avg[metric]) / national_avg[metric] * 100
    )

# Show states with biggest divergences from national averages
metrics_vs_national = [col for col in comparisons.columns if col.endswith('_vs_national')]
for metric in metrics_vs_national:
    print(f"\nStates with largest divergence in {metric.replace('_vs_national', '')}:")
    print("Above national average:")
    print(comparisons.nlargest(3, metric)[['NAME', metric]])
    print("\nBelow national average:")
    print(comparisons.nsmallest(3, metric)[['NAME', metric]])

## 5. Story Ideas and Angles

Based on the analysis above, here are some potential story angles to explore:

1. States with extreme demographic outliers
2. Fastest changing states (population and income)
3. Strong correlations between different factors
4. Places that differ significantly from national averages
5. Regional patterns and clusters

Remember to:
- Verify findings with additional sources
- Contact local experts for context
- Consider historical context
- Look for human impact angles

In [None]:
# Export findings for further investigation
output_dir = '../output'
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# avoiding the separate exists() check and its check-then-create race
os.makedirs(output_dir, exist_ok=True)

output_path = f'{output_dir}/story_angles_analysis.xlsx'

# Sheet name -> frame produced by the earlier analysis cells
sheets = {
    'State_Outliers': state_data,
    'Demographic_Changes': merged_data,
    'Correlations': correlations_df,
    'National_Comparisons': comparisons,
}

# Save all our analysis results, one worksheet per analysis
with pd.ExcelWriter(output_path) as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name)

print(f"Analysis results exported to {output_path}")