In [15]:
# Run code on Anaconda Env.
# Construct data source by downloading 2018 data
import folktables
from folktables import ACSDataSource, ACSIncome
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

# Describe the survey to read: 2018, 1-Year horizon, 'person' survey.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)

# Fetch the rows for the "CA" state code. download=True is needed whenever the
# data is not already available locally.
acs_data = data_source.get_data(
    states=["CA"],
    download=True,
)

# Convert the loaded frame into the three arrays used by the ACSIncome task:
# features, labels, and group membership.
features, label, group = ACSIncome.df_to_numpy(acs_data)



In [16]:
def mixRaceRates(data, target_bhn_ratio):
    """Subsample rows so the share of BHN rows approximates a target ratio.

    A row counts as BHN when its RAC1P value is 2 or 3. RAC1P is read from
    column 9, its position in the ACSIncome task's feature layout. Rows are
    only ever removed, never duplicated: BHN rows are dropped when the current
    share is above the target, non-BHN rows when it is below.

    Args:
        data: 2-D numpy array of training rows with RAC1P in column 9.
        target_bhn_ratio: desired fraction of BHN rows in the result.

    Returns:
        ``data`` itself when it is empty, when its BHN ratio is already within
        0.01 of the target, or when one of the two groups is absent (deleting
        rows could then only empty the dataset). Otherwise a new array with
        randomly chosen rows of the over-represented group removed. The rows
        to drop are chosen with NumPy's global random state.
    """
    rac1p_values = data[:, 9]
    total_count = len(rac1p_values)
    if total_count == 0:
        # nothing to rebalance, and the ratio below would divide by zero
        return data

    # count occurrences of BHN in the RAC1P column
    is_bhn = (rac1p_values == 2) | (rac1p_values == 3)
    bhn_count = int(np.count_nonzero(is_bhn))
    non_bhn_count = total_count - bhn_count

    # if the current ratio is within 0.01 of the desired ratio, return the dataset
    current_ratio = bhn_count / total_count
    if abs(current_ratio - target_bhn_ratio) < 0.01:
        return data

    # with only one group present, deleting rows cannot move the ratio
    if bhn_count == 0 or non_bhn_count == 0:
        return data

    if current_ratio > target_bhn_ratio:
        # too many BHN rows: keep k of them so that k / (k + non_bhn) == target.
        # The denominator must be the size of the dataset *after* deletion,
        # which gives k = target * non_bhn / (1 - target).
        # target < current_ratio <= 1 here, so (1 - target) is never zero.
        target_indicies = np.where(is_bhn)[0]
        keep_count = int(round(
            target_bhn_ratio * non_bhn_count / (1 - target_bhn_ratio)))
    else:
        # too few BHN rows: keep m non-BHN rows so that bhn / (bhn + m) == target,
        # which gives m = bhn * (1 - target) / target.
        # target > current_ratio >= 0 here, so target is never zero.
        target_indicies = np.where(~is_bhn)[0]
        keep_count = int(round(
            bhn_count * (1 - target_bhn_ratio) / target_bhn_ratio))

    # clamp so an out-of-range target can never produce a negative slice bound
    # (which would silently delete from the wrong end of the index list)
    remove_count = min(max(len(target_indicies) - keep_count, 0), len(target_indicies))

    # pick the rows to drop uniformly at random from the over-represented group
    np.random.shuffle(target_indicies)
    return np.delete(data, target_indicies[:remove_count], axis=0)


In [17]:
def confusion(pred, label):
    """Compute accuracy and the four confusion-matrix rates for binary labels.

    Args:
        pred: predicted labels, encoded as 0/1 (or boolean).
        label: ground-truth labels, encoded as 0/1 (or boolean), aligned with
            ``pred``.

    Returns:
        ``[acc, tpr, fnr, fpr, tnr]``. A rate whose denominator is zero (for
        example TPR when ``label`` contains no positives) is NaN.
    """
    # Pin the label set so the matrix is always 2x2. Without it, inputs that
    # contain a single class yield a 1x1 matrix and the unpacking below fails.
    tn, fp, fn, tp = confusion_matrix(label, pred, labels=[0, 1]).ravel()

    def _rate(numerator, denominator):
        # explicit NaN instead of relying on a 0/0 runtime warning
        return numerator / denominator if denominator else float("nan")

    positives = tp + fn  # rows whose true label is 1
    negatives = fp + tn  # rows whose true label is 0

    acc = _rate(tp + tn, positives + negatives)
    tpr = _rate(tp, positives)
    fnr = _rate(fn, positives)
    fpr = _rate(fp, negatives)
    tnr = _rate(tn, negatives)
    return [acc, tpr, fnr, fpr, tnr]

def evaluateRatio(features, label, group, bhn_ratio, random_state=None):
    """Train on a BHN-ratio-adjusted training set and score two test groups.

    The data is split 80/20 into train and test. Only the training portion is
    passed through ``mixRaceRates``; the test portion is left untouched. A
    StandardScaler + RandomForestClassifier pipeline is fitted on the adjusted
    training data and evaluated separately on the WA test rows (group 1 or 6)
    and the BHN test rows (group 2 or 3).

    Args:
        features: 2-D feature array.
        label: 1-D label array aligned with ``features``.
        group: 1-D group-code array aligned with ``features``.
        bhn_ratio: target BHN fraction for the training set.
        random_state: optional seed forwarded to ``train_test_split`` and to
            ``RandomForestClassifier``. The default ``None`` keeps the previous
            unseeded behaviour. The row subsampling inside ``mixRaceRates`` is
            not controlled by this argument.

    Returns:
        ``(wa_data, bhn_data)``, each the list produced by ``confusion`` for
        the corresponding test group.
    """
    # randomly split the data into training and testing (80/20);
    # the training-side group codes are not needed afterwards
    X_train, X_test, y_train, y_test, _group_train, group_test = train_test_split(
        features, label, group, test_size=0.2, random_state=random_state)

    # append y_train as the last column so features and labels are filtered together
    train_data = np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1)

    # filter training data to satisfy desired BHN ratio
    modified_train_data = mixRaceRates(train_data, bhn_ratio)

    new_X_train = modified_train_data[:, :-1]  # all cols except the last one
    new_y_train = modified_train_data[:, -1]   # only the last col (the labels)

    # pipeline: standardize the features, then classify with a random forest
    model = make_pipeline(
        StandardScaler(),
        RandomForestClassifier(
            max_depth=16, min_samples_leaf=3, random_state=random_state),
    )
    model.fit(new_X_train, new_y_train)

    # make predictions on the untouched test set
    yhat = model.predict(X_test)

    # group codes: 1 (white) or 6 (asian) -> WA; 2 (black) or 3 (american indian) -> BHN
    wa_mask = (group_test == 1) | (group_test == 6)
    bhn_mask = (group_test == 2) | (group_test == 3)

    # get the acc, tpr, fnr, fpr, tnr data for WA and BHN groups
    wa_data = confusion(yhat[wa_mask], y_test[wa_mask])
    bhn_data = confusion(yhat[bhn_mask], y_test[bhn_mask])

    return wa_data, bhn_data

In [None]:
# Seed NumPy's global random state so the experiment gives the same numbers on
# every Restart & Run All. The shuffle in mixRaceRates draws from this state,
# and so do scikit-learn's splitter and forest when their random_state is None.
np.random.seed(0)

target_bhn_ratios = [0.05, 0.04, 0.03, 0.02, 0.01, 0]
# map: target fraction of BHN rows in training set -> [ACC, TPR, FNR, FPR, TNR]
wa_map = {}
bhn_map = {}

for ratio in target_bhn_ratios:
    wa_data, bhn_data = evaluateRatio(features, label, group, ratio)
    wa_map[ratio] = wa_data
    bhn_map[ratio] = bhn_data


