In [1]:
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('../data/classification_data_raw.csv')

# Configuration
BASE_YEAR = 2020
TARGET_YEAR = 2035
INCOME_GROWTH = 0.025  # 2.5% per year
EDUCATION_GROWTH = 0.012  # 1.2% per year
RENT_GROWTH = 0.032  # 3.2% per year

# Clean column names (remove spaces)
df.columns = df.columns.str.strip()

# Find zipcode column (try common variations)
zipcode_col = None
for col in ['zipcode', 'zip', 'ZIP', 'Zipcode']:
    if col in df.columns:
        zipcode_col = col
        break

if zipcode_col is None:
    print("Available columns:", df.columns.tolist())
    zipcode_col = input("Enter zipcode column name: ")

# Create forecasted data
forecasts = []

for _, row in df.iterrows():
    zipcode = row[zipcode_col]
    base_income = row['income']
    base_education = row['per_college_educated_x']
    base_rent = row['median_contract_rent']
    
    # Skip rows with missing data
    if pd.isna(base_income) or pd.isna(base_education) or pd.isna(base_rent):
        continue
    
    # Generate forecasts for each year
    for year in range(BASE_YEAR + 1, TARGET_YEAR + 1):
        years_diff = year - BASE_YEAR
        
        # Compound growth formula
        forecast_income = base_income * (1 + INCOME_GROWTH) ** years_diff
        forecast_education = min(100, base_education * (1 + EDUCATION_GROWTH) ** years_diff)
        forecast_rent = base_rent * (1 + RENT_GROWTH) ** years_diff
        
        forecasts.append({
            'zipcode': zipcode,
            'year': year,
            'forecasted_income': round(forecast_income),
            'forecasted_per_college_educated': round(forecast_education, 1),
            'forecasted_median_contract_rent': round(forecast_rent)
        })

# Create DataFrame and save
forecast_df = pd.DataFrame(forecasts)
forecast_df.to_csv('forecasted_data_2035.csv', index=False)

print(f"Created forecasted_data_2035.csv with {len(forecast_df)} rows")
print(f"Forecasted {len(df)} zipcodes from {BASE_YEAR + 1} to {TARGET_YEAR}")
print("\nSample output:")
print(forecast_df.head())

print(forecast_df['zipcode'].value_counts())

Available columns: ['zip_code', 'msa_name', 'income', 'gentrified_income', 'year', 'per_college_educated_x', 'gentrified_college', 'median_contract_rent', 'gentrified_rent', 'gentrified', 'per_college_educated_y', 'at_risk_college', 'per_low_income', 'at_risk_income', 'percent_nonwhite', 'region_nonwhite', 'at_risk_min', 'at_risk_overall', 'first_gentrified_year']
Created forecasted_data_2035.csv with 2991150 rows
Forecasted 199410 zipcodes from 2021 to 2035

Sample output:
   zipcode  year  forecasted_income  forecasted_per_college_educated  \
0      602  2021              15484                             20.3   
1      602  2022              15871                             20.6   
2      602  2023              16268                             20.8   
3      602  2024              16674                             21.1   
4      602  2025              17091                             21.3   

   forecasted_median_contract_rent  
0                             2961  
1             