## A colab for the American Community Survey Public Use Microdata Sample

Copyright 2025 The Google Research Authors

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
```
 http://www.apache.org/licenses/LICENSE-2.0
```
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

## Experiments with the American Community Survey (ACS) Public Use Microdata Sample (PUMS)

In [None]:
import os
import numpy as np
import pandas as pd
import scipy
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder

from folktables import ACSDataSource, BasicProblem
from folktables.acs import adult_filter, public_coverage_filter
from causal_evaluation import utils

In [None]:
# Directory holding the raw ACS PUMS files (passed to ACSDataSource as root_dir).
data_root = './../../data/acs_pums/raw'  # @param
# Directory where the prediction parquet files are written.
preds_data_path = './../../data/acs_pums/' # @param
# Survey year, horizon and state that select the ACS sample to load.
year = 2018  # @param
horizon = '5-Year'  # @param
state = 'CA'  # @param

In [None]:
# Make sure both the raw-data and the predictions directories exist.
for directory in (data_root, preds_data_path):
  os.makedirs(directory, exist_ok=True)

In [None]:
# Person-level ACS PUMS source for the configured year and horizon; files live
# under data_root.
data_source = ACSDataSource(
    survey_year=str(year), horizon=horizon, survey='person', root_dir=data_root
)
# Load the sample for the configured state. download=True presumably fetches
# the files when they are not available locally -- confirm against folktables.
data_df = data_source.get_data(states=[state], download=True)

In [None]:
# Definitions table of the survey; used below to look up the RAC1P code names.
definitions = data_source.get_definitions()

In [None]:
# Get names for race field
# In the definitions table, column 1 holds the variable name; columns 5 and 6
# are used as the RAC1P code and its name. Rows without a code are dropped.
# The steps are chained so that no column is ever assigned on a filtered
# slice, which would trigger pandas' SettingWithCopyWarning.
race_df = (
    definitions.loc[definitions[1] == 'RAC1P', [5, 6]]
    .rename(columns={5: 'RAC1P', 6: 'RAC1P_NAME'})
    .dropna(subset=['RAC1P'])
    .astype({'RAC1P': int})
)

In [None]:
def map_race_eth(df, race_names_df=None):
  """Maps race and ethnicity data to a single variable.

  The logic is as follows:

    * If ethnicity is Hispanic (`HISP` != 1), then the combined variable is
    Hispanic.
    * If ethnicity is not Hispanic, then the combined variable is the value of
    the race variable.
    * After mapping the ethnicity variable, rare categories are combined with
    the "Other" category. The result is that American Indian, Alaska Native,
    Pacific Islander, and "Some Other Race" groups are mapped to "Other".

  Args:
    df: A pd.DataFrame with `HISP` and `RAC1P` columns. It is not modified.
    race_names_df: Optional pd.DataFrame with `RAC1P` and `RAC1P_NAME` columns
      used to look up race names. Defaults to the module-level `race_df`.

  Returns:
    A new pd.DataFrame holding the rows of `df` left-merged with the race
    names, plus the columns `HISP_binary`, `RACE_ETH_NAME`, and `RACE_ETH`.
    `RACE_ETH` is the integer category code of `RACE_ETH_NAME`; rows whose name
    is not covered by the mapping get a missing name and the code -1.
  """
  if race_names_df is None:
    race_names_df = race_df
  # Merge first: the merge returns a new frame, so the columns added below
  # never touch the caller's DataFrame.
  df = df.merge(race_names_df, how='left')
  df['HISP_binary'] = (df['HISP'] != 1).map(
      {False: 'Non-Hispanic', True: 'Hispanic'}
  )
  # Hispanic ethnicity takes precedence over the race name.
  df['RACE_ETH_NAME'] = df['HISP_binary'].where(
      df['HISP_binary'] == 'Hispanic', df['RAC1P_NAME']
  )
  race_eth_mapping = {
      'White alone': 'White',
      'Hispanic': 'Hispanic',
      'Asian alone': 'Asian',
      'Black or African American alone': 'Black',
      'Two or More Races': 'Multiracial',
      'Native Hawaiian and Other Pacific Islander alone': 'Other',
      'Some Other Race alone': 'Other',
      'American Indian and Alaska Native tribes specified; or American Indian or Alaska Native, not specified and no other races': (
          'Other'
      ),
      'Alaska Native alone': 'Other',
      'American Indian alone': 'Other',
  }
  df['RACE_ETH_NAME'] = df['RACE_ETH_NAME'].map(race_eth_mapping)
  df['RACE_ETH'] = pd.Categorical(df['RACE_ETH_NAME']).codes
  return df


# Attach the combined race/ethnicity columns (RACE_ETH, RACE_ETH_NAME) to the data.
data_df = map_race_eth(data_df)

In [None]:
# Show how the raw race codes map onto the combined race/ethnicity groups.
# NOTE(review): `display` is not imported in the cells shown here; this
# presumably relies on `from IPython import display` -- confirm.
for count_columns in (
    ['RAC1P', 'RAC1P_NAME'],
    ['RAC1P', 'RAC1P_NAME', 'RACE_ETH', 'RACE_ETH_NAME'],
    ['RACE_ETH', 'RACE_ETH_NAME'],
):
  display.display(data_df[count_columns].value_counts())

In [None]:
# Prediction tasks, keyed by task name. Each entry holds:
#   * 'task_spec': the folktables BasicProblem (features, target, group, filter).
#   * 'categorical_cols' / 'numeric_cols': a split of the feature list into
#     categorical and numeric columns, consumed by the preprocessing below.
# RACE_ETH serves both as a feature and as the group variable in every task.
tasks = {
    'ACSIncome': {
        'task_spec': BasicProblem(
            features=[
                'AGEP',
                'COW',
                'SCHL',
                'MAR',
                'OCCP',
                'POBP',
                'RELP',
                'WKHP',
                'SEX',
                'RACE_ETH',
            ],
            target='PINCP',
            # Binary label: True where PINCP exceeds 50000.
            target_transform=lambda x: x > 50000,
            group='RACE_ETH',
            preprocess=adult_filter,
        ),
        'categorical_cols': [
            'COW',
            'SCHL',
            'MAR',
            'OCCP',
            'POBP',
            'RELP',
            'SEX',
            'RACE_ETH',
        ],
        'numeric_cols': ['WKHP', 'AGEP'],
    },
    'ACSPublicCoverage': {
        'task_spec': BasicProblem(
            features=[
                'AGEP',
                'SCHL',
                'MAR',
                'SEX',
                'DIS',
                'ESP',
                'CIT',
                'MIG',
                'MIL',
                'ANC',
                'NATIVITY',
                'DEAR',
                'DEYE',
                'DREM',
                'PINCP',
                'ESR',
                'ST',
                'FER',
                'RACE_ETH',
            ],
            target='PUBCOV',
            # Binary label: True where PUBCOV equals 1.
            target_transform=lambda x: x == 1,
            group='RACE_ETH',
            preprocess=public_coverage_filter,
        ),
        'categorical_cols': [
            'SCHL',
            'MAR',
            'SEX',
            'DIS',
            'ESP',
            'CIT',
            'MIG',
            'MIL',
            'ANC',
            'NATIVITY',
            'DEAR',
            'DEYE',
            'DREM',
            'ESR',
            'ST',
            'FER',
            'RACE_ETH',
        ],
        'numeric_cols': ['AGEP', 'PINCP'],
    },
}

In [None]:
# Lookup table between the integer RACE_ETH code and its readable name.
group_id_map_df = data_df[['RACE_ETH', 'RACE_ETH_NAME']].drop_duplicates()

In [None]:
# Extract the features, labels and group membership of every task.
data_dict = {}
for task, task_config in tasks.items():
  features_df, labels_df, group_df = task_config['task_spec'].df_to_pandas(
      data_df
  )
  # Look up the readable group name for each row's group code.
  group_name_df = group_df.merge(group_id_map_df, how='left')
  data_dict[task] = {
      'features': features_df,
      'labels': labels_df.values.squeeze(),
      'group': group_df.values.squeeze(),
      'group_name': group_name_df['RACE_ETH_NAME'].values,
  }

In [None]:
# Check missing values
for task, task_data in data_dict.items():
  print(task)
  # Missing feature values are acceptable, so they are only reported.
  print(task_data['features'].isna().sum())
  # Labels, on the other hand, must be complete.
  assert not np.isnan(task_data['labels']).any()

In [None]:
# Stack the labels and group codes of all tasks into one long table, then
# attach the readable group names.
per_task_frames = [
    pd.DataFrame(
        {'task': key, 'labels': value['labels'], 'group': value['group']}
    )
    for key, value in data_dict.items()
]
group_labels_df = pd.concat(per_task_frames).merge(
    group_id_map_df, left_on='group', right_on='RACE_ETH'
)

In [None]:
# Get a table with the counts and prevalence of the outcome across groups for all tasks
count_prevalence_df = (
    group_labels_df.groupby(['task', 'RACE_ETH_NAME'])[['labels']]
    .agg(count=('labels', 'count'), prevalence=('labels', 'mean'))
    .reset_index()
)
count_prevalence_df_long = count_prevalence_df.melt(
    id_vars=['task', 'RACE_ETH_NAME']
)
count_prevalence_df_wide = count_prevalence_df_long.pivot(
    index=['RACE_ETH_NAME'], columns=['task', 'variable']
).sort_index(level=0, axis=1)
# Render each statistic as text; the last column level names the statistic.
value_formatters = {
    'prevalence': '{:.3f}'.format,
    'count': '{:,.0f}'.format,
}
for col in count_prevalence_df_wide.columns:
  formatter = value_formatters.get(col[-1])
  if formatter is not None:
    count_prevalence_df_wide[col] = count_prevalence_df_wide[col].map(
        formatter
    )
# NOTE(review): `display` is not imported in the cells shown here -- confirm.
display.display(count_prevalence_df_wide)

## Prepare the data for model fitting

In [None]:
def map_rare_categories_df(x, max_categories=255):
  """Collapses the rare categories of high-cardinality columns of a pd.DataFrame.

  Columns with at most max_categories distinct counted values are left as they
  are. In every other column the (max_categories - 1) most frequent values are
  kept and all remaining values are replaced by a single new category.

  The new category is column.max() + 1 for numeric data and the string
  'RARE_CATEGORY' otherwise.

  This preprocessing is necessary for use of
  sklearn.HistGradientBoostingClassifier.

  Args:
    x: The pd.DataFrame to process. It is not modified.
    max_categories: The largest number of distinct values allowed per column.

  Returns:
    A copy of x in which the rare categories have been replaced.
  """
  result = x.copy()
  for name in result.columns:
    frequencies = result[name].value_counts()
    if len(frequencies) <= max_categories:
      continue
    # value_counts sorts by decreasing frequency, so the tail is the rarest.
    rare_values = frequencies.index[(max_categories - 1) :]
    if pd.api.types.is_numeric_dtype(result[name].dtype):
      fill_value = result[name].max() + 1
    else:
      fill_value = 'RARE_CATEGORY'
    result[name] = result[name].replace(
        dict.fromkeys(rare_values, fill_value)
    )
  return result

In [None]:
# Define preprocessors to map rare categories
for task, task_data in data_dict.items():
  # Numeric columns pass through unchanged.
  numerical_step = (
      'numerical',
      FunctionTransformer(lambda x: x),
      tasks[task]['numeric_cols'],
  )
  # Categorical columns get their rare categories collapsed.
  categorical_step = (
      'categorical',
      FunctionTransformer(map_rare_categories_df),
      tasks[task]['categorical_cols'],
  )
  preprocessor = ColumnTransformer(
      transformers=[numerical_step, categorical_step]
  )
  task_data['preprocessor'] = preprocessor
  task_data['features_processed'] = preprocessor.fit_transform(
      task_data['features']
  )

In [None]:
# Record the categorical indices
for task, task_data in data_dict.items():
  # Column order of the processed features: numeric first, then categorical.
  processed_columns = (
      tasks[task]['numeric_cols'] + tasks[task]['categorical_cols']
  )
  task_data['categorical_indices'] = [
      processed_columns.index(col) for col in tasks[task]['categorical_cols']
  ]

In [None]:
# Split the data
for task, task_data in data_dict.items():
  # Stratify jointly on group and label, encoded as a '<group>-<label>' string.
  # NOTE(review): the concatenation aligns on index, so it assumes that the
  # features frame carries a default RangeIndex -- confirm.
  group_as_str = task_data['features']['RACE_ETH'].astype(str)
  label_as_str = pd.Series(task_data['labels']).astype(str)
  stratify_col = group_as_str + '-' + label_as_str
  split_arrays = train_test_split(
      task_data['features_processed'],
      task_data['labels'],
      task_data['group'],
      test_size=0.2,
      stratify=stratify_col,
      random_state=14,
  )
  # train_test_split returns a train/test pair for each input, in input order.
  split_keys = (
      'features_train',
      'features_test',
      'labels_train',
      'labels_test',
      'group_train',
      'group_test',
  )
  task_data.update(zip(split_keys, split_arrays))

In [None]:
# Get a separate feature array after removing RACE_ETH from the feature set
def delete_and_shift(x, delete_value):
  """Removes delete_value from x and decrements every larger element by one.

  This re-bases a list of column indices after the column at position
  delete_value has been dropped.

  Args:
    x: An iterable of values comparable with delete_value.
    delete_value: The value to remove.

  Returns:
    A new list with the elements below delete_value unchanged, the elements
    above it reduced by one, and all other elements left out.
  """
  return [
      value if value < delete_value else value - 1
      for value in x
      if value < delete_value or value > delete_value
  ]


for task, task_data in data_dict.items():
  print(task)
  # Locate RACE_ETH in the *processed* arrays. The ColumnTransformer emits the
  # numeric columns followed by the categorical columns, which need not match
  # the column order of the raw features frame, so the position is looked up in
  # the same ordering that 'categorical_indices' is based on.
  processed_columns = (
      tasks[task]['numeric_cols'] + tasks[task]['categorical_cols']
  )
  race_eth_feature_index = processed_columns.index('RACE_ETH')
  # Create new feature arrays without the RACE_ETH column
  task_data['features_population_train'] = np.delete(
      task_data['features_train'], race_eth_feature_index, axis=1
  )
  task_data['features_population_test'] = np.delete(
      task_data['features_test'], race_eth_feature_index, axis=1
  )
  # Re-base the categorical column indices for the reduced arrays.
  task_data['categorical_indices_population'] = delete_and_shift(
      task_data['categorical_indices'], race_eth_feature_index
  )

In [None]:
# Create a LabelEncoder for group membership
for task, task_data in data_dict.items():
  # The encoder is fit on the training groups only and then applied to both
  # splits.
  group_encoder = LabelEncoder().fit(task_data['group_train'])
  task_data['group_encoder'] = group_encoder
  for split in ('train', 'test'):
    task_data[f'group_encoded_{split}'] = group_encoder.transform(
        task_data[f'group_{split}']
    )

## Fit models

In [None]:
# Initialize a dictionary to hold predictions
# Keyed by task name; each value maps a prediction name to its test-set output.
preds_dict = {}

In [None]:
# Model family handed to the utils fitting helpers; also part of the output
# file name.
model_type = 'gradient_boosting'

In [None]:
for task, task_data in data_dict.items():
  print(task)

  task_preds = preds_dict[task] = {}
  labels_train = task_data['labels_train']
  population_train = task_data['features_population_train']
  population_test = task_data['features_population_test']
  categorical_population = task_data['categorical_indices_population']

  # Population model: P(Y | X), fit on the features without RACE_ETH.
  model_population = utils.fit_model(
      population_train,
      labels_train,
      model_type=model_type,
      model_kwarg_dict={'categorical_features': categorical_population},
      model_cross_val=True,
  )
  task_preds['pred_probs_y_x'] = model_population.predict_proba(
      population_test,
  )
  # The last column of predict_proba is kept as the positive-class score.
  task_preds['pred_probs_y1_x'] = task_preds['pred_probs_y_x'][:, -1]

  # XA model: P(Y | X, A), fit on all features including RACE_ETH.
  model = utils.fit_model(
      task_data['features_train'],
      labels_train,
      model_type=model_type,
      model_kwarg_dict={
          'categorical_features': task_data['categorical_indices']
      },
      model_cross_val=True,
  )
  task_preds['pred_probs_y_xa'] = model.predict_proba(
      task_data['features_test'],
  )
  task_preds['pred_probs_y1_xa'] = task_preds['pred_probs_y_xa'][:, -1]

  # Stratified models: fit on the population features with the group passed
  # separately to utils.fit_model_stratified.
  model_dict = utils.fit_model_stratified(
      population_train,
      labels_train,
      group=task_data['group_train'],
      model_type=model_type,
      model_cross_val=True,
      model_kwarg_dict={'categorical_features': categorical_population},
  )

  # Predict y in the eval data
  task_preds['pred_probs_y_xa_stratified'] = utils.array_to_series(
      utils.predict_proba_stratified(
          population_test,
          model_dict,
          group=task_data['group_test'],
      )
  )
  task_preds['pred_probs_y1_xa_stratified'] = task_preds[
      'pred_probs_y_xa_stratified'
  ].map(lambda x: x[-1])

  # Fit model to predict P(A | X)
  model_group_x = utils.fit_model(
      population_train,
      task_data['group_encoded_train'],
      model_type=model_type,
      model_kwarg_dict={'categorical_features': categorical_population},
      model_cross_val=True,
  )
  # Apply P(A | X) model to the test data
  task_preds['pred_probs_group_x'] = model_group_x.predict_proba(
      population_test,
  )

  # Fit model to predict P(A | Y)
  model_group_y = utils.fit_model(
      labels_train.reshape(-1, 1),
      task_data['group_encoded_train'],
      model_type=model_type,
      model_cross_val=True,
  )
  # Apply P(A | Y) model to the test data
  task_preds['pred_probs_group_y'] = model_group_y.predict_proba(
      task_data['labels_test'].reshape(-1, 1)
  )

  # P(A | R): predict the encoded test group from the logit of each
  # positive-class score, on the test data, via utils.fit_cross_val_predict.
  risk_scores = (
      ('pred_probs_group_r_x', task_preds['pred_probs_y1_x']),
      ('pred_probs_group_r_xa', task_preds['pred_probs_y1_xa']),
      (
          'pred_probs_group_r_xa_stratified',
          task_preds['pred_probs_y1_xa_stratified'].values,
      ),
  )
  for preds_key, risk_score in risk_scores:
    task_preds[preds_key] = utils.fit_cross_val_predict(
        scipy.special.logit(risk_score).reshape(-1, 1),
        task_data['group_encoded_test'],
        model_type=model_type,
        model_cross_val=True,
    )

In [None]:
# Rename the lookup columns so that they line up with the 'group' column of
# the prediction tables.
group_id_map_df = group_id_map_df.rename(
    columns={'RACE_ETH': 'group', 'RACE_ETH_NAME': 'group_name'}
)

In [None]:
# Write one parquet file of test-set predictions per task.
for task, task_preds in preds_dict.items():
  print(task)
  preds_file_name = f'preds_{task}_{horizon}_{year}_{model_type}.parquet'
  preds_df = pd.DataFrame(
      {key: utils.array_to_series(value) for key, value in task_preds.items()}
  )
  # Multiplying by 1 stores the labels as numbers.
  preds_df['labels'] = data_dict[task]['labels_test'] * 1
  preds_df['group'] = data_dict[task]['group_test']
  # Attach the readable group name next to the group code.
  preds_df = preds_df.merge(group_id_map_df, how='left')
  output_path = os.path.join(preds_data_path, preds_file_name)
  preds_df.to_parquet(output_path, index=False)