<a href="https://colab.research.google.com/github/ecreager/Adult-Confounded/blob/main/generate_adult_confounded.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Note: contents of this notebook were adapted from notebooks provided by [the ARL codebase](https://github.com/google-research/google-research/tree/master/group_agnostic_fairness/data_utils)

In [1]:
"""Import packages."""
from __future__ import division
import pandas as pd
import numpy as np
import json
import os,sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
"""Download original dataset."""
!mkdir -p ./data/adult_orig
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data -O ./data/adult_orig/adult.data
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test -O ./data/adult_orig/adult.test
# remove whitespaces from csv
!sed -i 's/ //g' ./data/adult_orig/adult.data
!sed -i 's/ //g' ./data/adult_orig/adult.test
# remove trailing periods from test set (not sure why those are there...)
!sed -i 's/\.//g' ./data/adult_orig/adult.test
# # remove (empty) final lines
!head -n -1 ./data/adult_orig/adult.data > /tmp/adult.data
!mv /tmp/adult.data ./data/adult_orig/adult.data
!head -n -1 ./data/adult_orig/adult.test > /tmp/adult.test
!mv /tmp/adult.test ./data/adult_orig/adult.test

--2021-08-24 13:57:24--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3974305 (3.8M) [application/x-httpd-php]
Saving to: ‘./data/adult_orig/adult.data’


2021-08-24 13:57:25 (8.11 MB/s) - ‘./data/adult_orig/adult.data’ saved [3974305/3974305]

--2021-08-24 13:57:25--  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2003153 (1.9M) [application/x-httpd-php]
Saving to: ‘./data/adult_orig/adult.test’


2021-08-24 13:57:26 (4.67 MB/s) - ‘./data/adult_orig/adult.test’ saved [2003153/2003153]



In [3]:

"""Catalog of subsampling functions (select one in the next cell)."""

def corr_sg02_anti_sg13(df):
  """Y=1 correlated with membership in subgroups {0,2} and anti-correlated with 
  membership in subgroups {1,3}.
  """
  df_subgroups = [
    df[(df.race != 'Black') & (df.sex != 'Female')],  # 00: White Male
    df[(df.race != 'Black') & (df.sex == 'Female')],  # 01: White Female
    df[(df.race == 'Black') & (df.sex != 'Female')],  # 10: Black Male
    df[(df.race == 'Black') & (df.sex == 'Female')],  # 11: Black Female
  ]
  # observed probabilities
  p = dict()
  p['y=1|sg=0'] = (df_subgroups[0].income == '>50K').mean().item()
  p['y=1|sg=1'] = (df_subgroups[1].income == '>50K').mean().item()
  p['y=1|sg=2'] = (df_subgroups[2].income == '>50K').mean().item()
  p['y=1|sg=3'] = (df_subgroups[3].income == '>50K').mean().item()
  # target probabilities
  q = dict()
  q['y=1|sg=3'] = p['y=1|sg=3']  # SG3 observed Y=1 rate sets Y=1 rate for modified SG1 and SG3...
  q['y=1|sg=1'] = q['y=1|sg=3']
  q['y=1|sg=0'] = 1. - q['y=1|sg=3']  # ...and Y=0 rate for SG0 and SG2.
  q['y=1|sg=2'] = 1. - q['y=1|sg=3']
  modified_df = []
  for i, dfs in enumerate(df_subgroups):
    if i == 3:  # don't resample SG3 b/c it is so small
      df_ = dfs
    else:
      k = 'y=1|sg=%d' % i
      # importance weights
      weights = (dfs.income == '>50K') * (q[k] / p[k]) \
        + (dfs.income != '>50K') * ((1. - q[k]) / (1. - p[k]))
      df_ = dfs.sample(n=len(dfs), replace=True, weights=weights).reset_index(drop=True)
    modified_df.append(df_)
  print('df_subgroups | size | base rates')
  print('--------------------------------')
  for i, dfs in enumerate(df_subgroups):
    print(i, '%18s' % len(dfs), ' ' + '%.3f' % (dfs['income'] == '>50K').mean())
  print()
  print('modified_df  | size | base rates')
  print('--------------------------------')
  for i, mdf in enumerate(modified_df):
    print(i, '%18s' % len(mdf), ' ' + '%.3f' % (mdf.income == '>50K').mean())
  print()
  new_df = pd.concat(modified_df)  # merge all subgroups
  new_df = new_df.sample(frac=1.).reset_index(drop=True)  # shuffle rows
  return new_df

def corr_sg13_anti_sg02(df):
  """Y=1 correlated with membership in subgroups {1,3} and anti-correlated with 
  membership in subgroups {0,2}.
  """
  df_subgroups = [
    df[(df.race != 'Black') & (df.sex != 'Female')],  # 00: White Male
    df[(df.race != 'Black') & (df.sex == 'Female')],  # 01: White Female
    df[(df.race == 'Black') & (df.sex != 'Female')],  # 10: Black Male
    df[(df.race == 'Black') & (df.sex == 'Female')],  # 11: Black Female
  ]
  # observed probabilities
  p = dict()
  p['y=1|sg=0'] = (df_subgroups[0].income == '>50K').mean().item()
  p['y=1|sg=1'] = (df_subgroups[1].income == '>50K').mean().item()
  p['y=1|sg=2'] = (df_subgroups[2].income == '>50K').mean().item()
  p['y=1|sg=3'] = (df_subgroups[3].income == '>50K').mean().item()
  # target probabilities
  q = dict()
  q['y=1|sg=3'] = 1. - p['y=1|sg=3']  # SG3 observed Y=1 rate sets Y=0 rate for modified SG1 and SG3...
  q['y=1|sg=1'] = q['y=1|sg=3']
  q['y=1|sg=0'] = 1. - q['y=1|sg=3']  # ...and Y=1 rate for SG0 and SG2.
  q['y=1|sg=2'] = 1. - q['y=1|sg=3']
  modified_df = []
  for i, dfs in enumerate(df_subgroups):
    k = 'y=1|sg=%d' % i
    # importance weights
    weights = (dfs.income == '>50K') * (q[k] / p[k]) \
      + (dfs.income != '>50K') * ((1. - q[k]) / (1. - p[k]))
    df_ = dfs.sample(n=len(dfs), replace=True, weights=weights).reset_index(drop=True)
    modified_df.append(df_)
  print('df_subgroups | size | base rates')
  print('--------------------------------')
  for i, dfs in enumerate(df_subgroups):
    print(i, '%18s' % len(dfs), ' ' + '%.3f' % (dfs['income'] == '>50K').mean())
  print()
  print('modified_df  | size | base rates')
  print('--------------------------------')
  for i, mdf in enumerate(modified_df):
    print(i, '%18s' % len(mdf), ' ' + '%.3f' % (mdf.income == '>50K').mean())
  print()
  new_df = pd.concat(modified_df)  # merge all subgroups
  new_df = new_df.sample(frac=1.).reset_index(drop=True)  # shuffle rows
  return new_df


In [4]:
"""Specify the subgroup here by selecting from catalog of functions above."""
CONFOUND_FN = corr_sg02_anti_sg13 # uncomment this to generate dist'n used for training
# CONFOUND_FN = corr_sg13_anti_sg02 # uncomment this to generate dist'n used for testing


In [5]:
"""Print input/output dirs."""
pd.options.display.float_format = '{:,.2f}'.format
input_base_dir = './data/adult_orig/'
output_base_dir = './data/adult_conf/%s' % CONFOUND_FN.__name__
if not os.path.exists(output_base_dir):
  os.makedirs(output_base_dir)

print('%10s' % 'input dir', input_base_dir)
print('%10s' % 'output dir', output_base_dir)

 input dir ./data/adult_orig/
output dir ./data/adult_conf/corr_sg02_anti_sg13


In [6]:
"""Load original dataset"""
def convert_object_type_to_category(df):
  """Converts columns of type object to category."""
  df = pd.concat([df.select_dtypes(include=[], exclude=['object']),
                  df.select_dtypes(['object']).apply(pd.Series.astype, dtype='category')
                  ], axis=1).reindex(df.columns, axis=1)
  return df



TRAIN_FILE = os.path.join(input_base_dir,'adult.data')
TEST_FILE = os.path.join(input_base_dir,'adult.test')

columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

target_variable = "income"
target_value = ">50K"

with open(TRAIN_FILE, "r") as TRAIN_FILE:
  # train_df = pd.read_csv(TRAIN_FILE,sep=',',names=columns)
  train_df = pd.read_csv(TRAIN_FILE, names=columns)

with open(TEST_FILE, "r") as TEST_FILE:
  # test_df = pd.read_csv(TEST_FILE,sep=',',names=columns)
  test_df = pd.read_csv(TEST_FILE, names=columns)


# Convert columns of type ``object`` to ``category`` 
train_df = convert_object_type_to_category(train_df)
test_df = convert_object_type_to_category(test_df)
test_df = test_df.dropna()  # handle the weird "|1x3 Cross validator" row

In [7]:
train_df[:10]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [8]:
test_df[:10]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
5,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K
6,34,Private,198693.0,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States,<=50K
7,29,?,227026.0,HS-grad,9.0,Never-married,?,Unmarried,Black,Male,0.0,0.0,40.0,United-States,<=50K
8,63,Self-emp-not-inc,104626.0,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103.0,0.0,32.0,United-States,>50K
9,24,Private,369667.0,Some-college,10.0,Never-married,Other-service,Unmarried,White,Female,0.0,0.0,40.0,United-States,<=50K
10,55,Private,104996.0,7th-8th,4.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,10.0,United-States,<=50K


In [9]:
"""Confound via importance-weighted resampling."""
print(CONFOUND_FN.__name__)
print()
train_df = CONFOUND_FN(train_df)
test_df = CONFOUND_FN(test_df)

corr_sg02_anti_sg13

df_subgroups | size | base rates
--------------------------------
0              20221  0.315
1               9216  0.118
2               1569  0.189
3               1555  0.058

modified_df  | size | base rates
--------------------------------
0              20221  0.940
1               9216  0.054
2               1569  0.943
3               1555  0.058

df_subgroups | size | base rates
--------------------------------
0              10052  0.310
1               4668  0.117
2                808  0.170
3                753  0.056

modified_df  | size | base rates
--------------------------------
0              10052  0.943
1               4668  0.061
2                808  0.948
3                753  0.056



In [10]:
"""Print some of the confounded data."""
train_df[:20]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,43,Private,104660,Masters,14,Widowed,Exec-managerial,Unmarried,White,Male,4934,0,40,United-States,>50K
1,47,Private,481987,HS-grad,9,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,United-States,>50K
2,42,Private,79586,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,Asian-Pac-Islander,Male,0,0,40,United-States,>50K
3,59,Private,258579,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,3103,0,35,United-States,>50K
4,46,Self-emp-not-inc,28281,Bachelors,13,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,40,United-States,>50K
5,36,Private,171676,Bachelors,13,Never-married,Sales,Not-in-family,White,Female,0,1741,40,United-States,<=50K
6,25,Private,137645,Bachelors,13,Never-married,Sales,Not-in-family,Black,Female,0,0,40,United-States,<=50K
7,33,Private,141841,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Black,Male,5178,0,40,United-States,>50K
8,44,State-gov,691903,Masters,14,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,60,United-States,>50K
9,41,Private,188615,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,>50K


In [11]:
"""Optionally write to disk."""

# legacy from the ARL codebase - these imporance weights could be used for an
# importance-weighted baseline

EPS = 1e-3
IPS_example_weights_without_label = {
  0: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex != 'Female')]) + EPS), # 00: White Male
  1: (len(train_df))/(len(train_df[(train_df.race != 'Black') & (train_df.sex == 'Female')]) + EPS), # 01: White Female
  2: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex != 'Female')]) + EPS), # 10: Black Male
  3: (len(train_df))/(len(train_df[(train_df.race == 'Black') & (train_df.sex == 'Female')]) + EPS)  # 11: Black Female
}
  
output_file_path = os.path.join(output_base_dir, 'IPS_example_weights_without_label.json')
# write
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(IPS_example_weights_without_label))
    output_file.close()

print(IPS_example_weights_without_label)

# Confirm that IPS specify a normalized joint dist'n over subgroups.
Z = 0.
for k, v in IPS_example_weights_without_label.items():
  Z += 1./v
print('this should be about 1.0:', Z)


IPS_example_weights_with_label = {
0: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')]) + EPS), # 000: Negative White Male
1: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')]) + EPS), # 001: Negative White Female
2: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')]) + EPS), # 010: Negative Black Male
3: (len(train_df))/(len(train_df[(train_df[target_variable] != target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')]) + EPS), # 011: Negative Black Female
4: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex != 'Female')]) + EPS), # 100: Positive White Male
5: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race != 'Black') & (train_df.sex == 'Female')]) + EPS), # 101: Positive White Female
6: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex != 'Female')]) + EPS), # 110: Positive Black Male
7: (len(train_df))/(len(train_df[(train_df[target_variable] == target_value) & (train_df.race == 'Black') & (train_df.sex == 'Female')]) + EPS), # 111: Positive Black Female
}
  
output_file_path = os.path.join(output_base_dir,'IPS_example_weights_with_label.json')
# write
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(IPS_example_weights_with_label))
    output_file.close()

print(IPS_example_weights_with_label)


# construct and write vocab
cat_cols = train_df.select_dtypes(include='category').columns
vocab_dict = {}
for col in cat_cols:
  vocab_dict[col] = list(set(train_df[col].cat.categories)-{"?"})
  
output_file_path = os.path.join(output_base_dir, 'vocabulary.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(vocab_dict))
    output_file.close()
print(vocab_dict)

# write means/stds
temp_dict = train_df.describe().to_dict()
mean_std_dict = {}
for key, value in temp_dict.items():
  mean_std_dict[key] = [value['mean'],value['std']]

output_file_path = os.path.join(output_base_dir,'mean_std.json')
with open(output_file_path, mode="w") as output_file:
    output_file.write(json.dumps(mean_std_dict))
    output_file.close()
print(mean_std_dict)

# write csv
output_file_path_train = os.path.join(output_base_dir, 'train.csv')
with open(output_file_path_train, mode="w") as output_file:
    train_df.to_csv(output_file,
                    index=False,
                    header=False)

output_file_path_test = os.path.join(output_base_dir, 'test.csv')
with open(output_file_path_test, mode="w") as output_file:
    test_df.to_csv(output_file,
                  index=False,
                  header=False)


{0: 1.6102565842314136, 1: 3.533094234690296, 2: 20.752695504974184, 3: 20.939536373288504}
this should be about 1.0: 1.0000001228463498
{0: 27.043997471762896, 1: 3.7366302803958824, 2: 361.78486905701044, 3: 22.225923395274133, 4: 1.7122047792919608, 5: 64.86242059278767, 6: 22.01553616258542, 7: 361.78486905701044}
{'workclass': ['State-gov', 'Self-emp-not-inc', 'Without-pay', 'Private', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'Never-worked'], 'education': ['7th-8th', '1st-4th', 'Some-college', 'Assoc-acdm', 'Masters', '10th', 'Assoc-voc', 'Bachelors', 'Prof-school', 'HS-grad', '12th', 'Doctorate', 'Preschool', '5th-6th', '9th', '11th'], 'marital-status': ['Separated', 'Married-civ-spouse', 'Widowed', 'Never-married', 'Married-AF-spouse', 'Divorced', 'Married-spouse-absent'], 'occupation': ['Sales', 'Handlers-cleaners', 'Prof-specialty', 'Farming-fishing', 'Adm-clerical', 'Tech-support', 'Armed-Forces', 'Other-service', 'Priv-house-serv', 'Protective-serv', 'Craft-repair', 'Exec