# Example of age calibration on real CPS data for one district and national targets

### Install packages

In [1]:
from microcalibrate.calibration import Calibration
import logging
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from policyengine_core.data import Dataset

logging.basicConfig(
    level=logging.INFO,
)

  from .autonotebook import tqdm as notebook_tqdm


## Change original sample dataset to 2024 small enhanced CPS dataset

#### More imports for data import

In [19]:
from microcalibrate.data import get_dataset 


### Load the dataframe
##### I chose to include household ID and then drop it after to ensure each individual corresponded to a household, given we are doing a household level analysis. I also loaded the weights with the column data to ensure they match.

#### TODO: Wide to long conversion
My current plan: 
1. I would need to add a column to df_age_nworth that indicates the district the household belongs to. 
2. Then, I imagine that I would need to copy the dataset and paste all households once for each district, just changing the value of the district column.
   1. Should I do all districts in virginia to start? or one district in virginia and leave the rest of ALL national districts uncalibrated, to start. 
3. Then make the weights more than just a single array. Add several levels of targets.

In [20]:
# Load the data 
df_age_nworth = get_dataset()

# Remove the household_id column
df_age_nworth.drop(columns=["household_id/2024"], inplace=True)

# Extract the weights for calibration
df_init_weights = df_age_nworth.pop("household_weight/2024")

# Convert weights to an array (just in case??)
weights_array = df_init_weights.values

# Asserting the dataframes have the same length, as an extra check that the weights line up with the data.
assert(len(df_age_nworth) == len(weights_array))

## Bring in Ben's age target .csv.

### Starting with national calibration, then adding district level (maybe then state)

In [28]:
# Calculate target values: total income for age groups 20-30 and 40-50 (as an example) or employ existing targets
# Should this be reverted back to age values, as I'm just calibrating age here?? I'm confused because it's too late in the day.
income_matrix = pd.DataFrame({
    "income_aged_20_30": ((df_age_nworth["age/2024"] >= 25) & (df_age_nworth["age/2024"] <= 29)).astype(float) * df_age_nworth["net_worth/2024"],
    "income_aged_40_50": ((df_age_nworth["age/2024"] >= 40) & (df_age_nworth["age/2024"] <= 44)).astype(float) * df_age_nworth["net_worth/2024"],
    "income_aged_71" : (df_age_nworth["age/2024"] >= 85).astype(float) * df_age_nworth["net_worth/2024"],
})

age_matrix = pd.DataFrame({
    "income_aged_25_29": ((df_age_nworth["age/2024"] >= 25) & (df_age_nworth["age/2024"] <= 29)).astype(float),
    "income_aged_40_44": ((df_age_nworth["age/2024"] >= 40) & (df_age_nworth["age/2024"] <= 44)).astype(float),
    "income_aged_85_more" : (df_age_nworth["age/2024"] >= 85).astype(float),
})

In [29]:
## Replace with Ben's (national) targets. This might not change the dataframe much, but it's a starting place.

targets_df = pd.read_csv("National_Age_Demographics.csv")

# question: Are these age values being multiplied by the right weights?? I'm doubting if these are the right targets.
targets = np.array([
    (targets_df["25-29"] * df_init_weights).sum(), 
    (targets_df["40-44"] * df_init_weights).sum(), 
    (targets_df["85+"] * df_init_weights).sum()
])


In [30]:
# Initialize the Calibration object
calibrator = Calibration(
    # I'm a little confused between loss_matrix and targets (which is an array)
    loss_matrix=age_matrix,
    weights=df_init_weights, 
    # Check that I used the right number of levels of targets. 
    targets=targets,
    noise_level=0.05,
    epochs=528, # 
    learning_rate=0.01,
    dropout_rate=0,
    subsample_every=0,
)

# Perform the calibration
performance_df = calibrator.calibrate()

print(f"Original dataset size: {len(targets_matrix)}")
print(f"Calibrated dataset size: {len(calibrator.loss_matrix)}")
print(f"Number of calibrated weights: {len(calibrator.weights)}")

INFO:microcalibrate.calibration:Performing basic target assessment...
INFO:microcalibrate.reweight:Starting calibration process for targets ['income_aged_25_29' 'income_aged_40_44' 'income_aged_85_more']: [3.27757866e+12 3.31046097e+12 9.15955117e+11]
INFO:microcalibrate.reweight:Original weights - mean: 223306.4062, std: 175018.2344
INFO:microcalibrate.reweight:Initial weights after noise - mean: 223306.4688, std: 175018.2344
Reweighting progress:   0%|          | 0/528 [00:00<?, ?epoch/s, loss=1, count_observations=670, weights_mean=2.23e+5, weights_std=1.75e+5, weights_min=1.5e+5]INFO:microcalibrate.reweight:Within 10% from targets: 0.00% 

Reweighting progress:   0%|          | 0/528 [00:00<?, ?epoch/s, loss=1, count_observations=670, weights_mean=2.26e+5, weights_std=1.77e+5, weights_min=1.5e+5]INFO:microcalibrate.reweight:Within 10% from targets: 0.00% 

INFO:microcalibrate.reweight:Epoch   10: Loss = 0.999993, Change = 0.000001 (improving)
Reweighting progress:   2%|▏         | 

Original dataset size: 670
Calibrated dataset size: 670
Number of calibrated weights: 670


In [31]:
# Calculate final weighted totals
final_totals = targets_matrix.mul(calibrator.weights, axis=0).sum().values

print(f"Target totals: {targets}")
print(f"Final calibrated totals: {final_totals}")
print(f"Difference: {final_totals - targets}")
print(f"Relative error: {(final_totals - targets) / targets * 100}")

Target totals: [3.27757866e+12 3.31046097e+12 9.15955117e+11]
Final calibrated totals: [5.58919295e+18 2.37749161e+18 9.84521001e+09]
Difference: [ 5.58918967e+18  2.37748830e+18 -9.06109907e+11]
Relative error: [ 1.70528010e+08  7.18174393e+07 -9.89251427e+01]


Failed to pass tolerance level boundary.

In [32]:
np.testing.assert_allclose(
        final_totals,
        targets,
        rtol=0.01,  # relative tolerance
        err_msg="Calibrated totals do not match target values",
    )

AssertionError: 
Not equal to tolerance rtol=0.01, atol=0
Calibrated totals do not match target values
Mismatched elements: 3 / 3 (100%)
Max absolute difference: 5.58918967e+18
Max relative difference: 1705280.10025668
 x: array([5.589193e+18, 2.377492e+18, 9.845210e+09])
 y: array([3.277579e+12, 3.310461e+12, 9.159551e+11])

In [33]:
performance_df.head()

Unnamed: 0,epoch,loss,target_name,target,estimate,error,abs_error,rel_abs_error
0,0,0.999994,income_aged_25_29,3277579000000.0,8079228.5,-3277570000000.0,3277570000000.0,0.999998
1,0,0.999994,income_aged_40_44,3310461000000.0,7181536.0,-3310454000000.0,3310454000000.0,0.999998
2,0,0.999994,income_aged_85_more,915955100000.0,4189229.5,-915950900000.0,915950900000.0,0.999995
3,10,0.999993,income_aged_25_29,3277579000000.0,8770566.0,-3277570000000.0,3277570000000.0,0.999997
4,10,0.999993,income_aged_40_44,3310461000000.0,7796380.5,-3310453000000.0,3310453000000.0,0.999998
