# Example of age calibration on real CPS data for one district and national targets

### Install packages

In [1]:
from microcalibrate.calibration import Calibration
import logging
import numpy as np
import pandas as pd
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from policyengine_core.data import Dataset

logging.basicConfig(
    level=logging.INFO,
)

  from .autonotebook import tqdm as notebook_tqdm


## Change original sample dataset to 2024 small enhanced CPS dataset

#### More imports for data import

In [2]:
from microcalibrate.data import get_dataset 


### Load the dataframe
##### I chose to include household ID and then drop it after to ensure each individual corresponded to a household, given we are doing a household level analysis. I also loaded the weights with the column data to ensure they match.

#### TODO: Wide to long conversion
My current plan: 
1. I would need to add a column to df_age_nworth that indicates the district the household belongs to. 
2. Then, I imagine that I would need to copy the dataset and paste all households once for each district, just changing the value of the district column.
   1. Should I do all districts in virginia to start? or one district in virginia and leave the rest of ALL national districts uncalibrated, to start. 
3. Then make the weights more than just a single array. Add several levels of targets.

In [None]:
# Load the data 
df_age_nworth = get_dataset()

# Remove the household_id column
df_age_nworth.drop(columns=["household_id/2024"], inplace=True)

# Extract the weights for calibration
df_init_weights = df_age_nworth.pop("household_weight/2024")
# TODO: Convert weights to an array.

# Asserting the dataframes have the same length, as an extra check that the weights line up with the data.
assert(len(df_age_nworth) == len(df_init_weights))

## Bring in Ben's age target .csv.

### Starting with national calibration, then adding district level (maybe then state)

In [9]:
# Calculate target values: total income for age groups 20-30 and 40-50 (as an example) or employ existing targets
targets_matrix = pd.DataFrame({
    "income_aged_20_30": ((df_age_nworth["age/2024"] >= 20) & (df_age_nworth["age/2024"] <= 30)).astype(float) * df_age_nworth["net_worth/2024"],
    "income_aged_40_50": ((df_age_nworth["age/2024"] >= 40) & (df_age_nworth["age/2024"] <= 50)).astype(float) * df_age_nworth["net_worth/2024"],
    "income_aged_71" : (df_age_nworth["age/2024"] == 71).astype(float) * df_age_nworth["net_worth/2024"],
})

In [None]:
## Replace with Ben's (national) targets. This might not change the dataframe much, but it's a starting place.

targets = np.array([
    (targets_matrix["income_aged_20_30"] * weights * 1000).sum(), 
    (targets_matrix["income_aged_40_50"] * weights * 1.15).sum(), 
    (targets_matrix["income_aged_71"] * weights * 1.15).sum()
])

print(f"Original weights: {weights}")
print(f"Original targets: {targets}")

In [None]:
# Initialize the Calibration object
calibrator = Calibration(
    # I'm a little confused between loss_matrix and targets (which is an array)
    loss_matrix=targets_matrix,
    weights=weights, 
    # Check that I used the right number of levels of targets. 
    targets=targets,
    noise_level=0.05,
    epochs=528,
    learning_rate=0.01,
    dropout_rate=0,
    subsample_every=0,
)

# Perform the calibration
performance_df = calibrator.calibrate()

print(f"Original dataset size: {len(targets_matrix)}")
print(f"Calibrated dataset size: {len(calibrator.loss_matrix)}")
print(f"Number of calibrated weights: {len(calibrator.weights)}")

INFO:microcalibrate.calibration:Performing basic target assessment...
INFO:microcalibrate.reweight:Starting calibration process for targets ['income_aged_20_30' 'income_aged_40_50' 'income_aged_71']: [7.37032429e+08 9.76779350e+05 4.36479914e+04]
INFO:microcalibrate.reweight:Original weights - mean: 1.0000, std: 0.0000
INFO:microcalibrate.reweight:Initial weights after noise - mean: 1.0252, std: 0.0143
Reweighting progress:   0%|          | 0/528 [00:00<?, ?epoch/s, loss=0.342, count_observations=121, weights_mean=1.03, weights_std=0.0143, weights_min=1]INFO:microcalibrate.reweight:Within 10% from targets: 0.00% 

Reweighting progress:   0%|          | 0/528 [00:00<?, ?epoch/s, loss=0.333, count_observations=121, weights_mean=1.06, weights_std=0.0514, weights_min=1]INFO:microcalibrate.reweight:Within 10% from targets: 66.67% 

INFO:microcalibrate.reweight:Epoch   10: Loss = 0.333290, Change = 0.008943 (improving)
Reweighting progress:   2%|▏         | 13/528 [00:00<00:04, 125.70epoch/s

Original dataset size: 121
Calibrated dataset size: 121
Number of calibrated weights: 121
