In [None]:
import pandas as pd

# Build the 2035 forecast table keyed by ZIP code and attach each ZIP's MSA
# name, then write the result to '../data/forecasted_data_2035_true.csv'.

# Load the forecasted demographic/economic data
df = pd.read_csv('../data/forecasted_data_2035.csv')

# Source column -> output column. Renaming by name (rather than assigning
# `df.columns` positionally) keeps the labels correct even if the selection
# order is ever changed.
column_renames = {
    'zip_code': 'zip_code',
    'forecasted_income': 'income_2035',
    'forecasted_median_contract_rent': 'rent_2035',
    'forecasted_per_college_educated': 'college_2035',
}

# Keep only the 2035 rows and the relevant columns in a single `.loc` step.
# 'year' is simply not selected, since it is redundant after the filter.
# `rename` returns a new DataFrame, so the ZIP-code assignment below writes
# to an independent frame rather than to a slice of the raw data (which is
# what triggers pandas' SettingWithCopyWarning).
df = df.loc[df['year'] == 2035, list(column_renames)].rename(columns=column_renames)

# Ensure all ZIP codes are 5-digit strings with leading zeros where necessary
df['zip_code'] = df['zip_code'].astype(str).str.zfill(5)

# Load the ZIP-to-MSA mapping file
df_msa = pd.read_csv('../data/msa-by-zip.csv')

# Remove entries with invalid ZIP codes (e.g., special-use ZIPs below 601).
# `.copy()` detaches the filtered rows so the column assignment below does
# not operate on a slice of the frame that was read from disk.
df_msa = df_msa[df_msa['ZIP CODE'] >= 601].copy()

# Convert ZIP codes in the MSA dataset to 5-digit strings to match the format in `df`
df_msa['ZIP CODE'] = df_msa['ZIP CODE'].astype(str).str.zfill(5)

# Standardize column names: lowercase and replace spaces with underscores
df_msa.columns = [col.lower().replace(" ", "_") for col in df_msa.columns]

# Extract only ZIP code and MSA name for merging
df_subset = df_msa[['zip_code', 'msa_name']]

# Merge forecasted data with MSA info based on ZIP code. The lookup is reduced
# to its first row per ZIP code for the merge only (`df_subset` itself is left
# as-is), so a ZIP listed more than once in the mapping file does not
# multiply forecast rows. The first listed MSA wins, which is the same row
# the post-merge de-duplication below would have kept.
df_merged = pd.merge(
    left=df,
    right=df_subset.drop_duplicates(subset='zip_code'),
    on='zip_code',
    how='left',
)

# Remove duplicate ZIP codes, keeping the first occurrence (still needed for
# ZIP codes that are repeated in the forecast data itself)
df_merged = df_merged.drop_duplicates(subset=['zip_code'])

# Export the cleaned and merged dataset to a new CSV file
df_merged.to_csv('../data/forecasted_data_2035_true.csv', index=False)
