**Task: Create FPR/FNR/Accuracy graphs for different mixing rates on the Folktables dataset**


Custom Prediction Task: ACSIncomeSexAllFeatures: predict whether income (PINCP) is greater than $50k, with SEX as the group attribute, from ACS 2018 data for CA

**CA 2018, 2015, 2017**

CA 2018

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import folktables
from folktables import ACSDataSource

def fetch_data_for_state_year(state, year, prediction_task):
    """Load one state's ACS 1-Year person survey and convert it for a task.

    Args:
        state: State identifier, passed to ``get_data`` as a one-element list.
        year: Survey year handed to ``ACSDataSource`` as ``survey_year``.
        prediction_task: Object whose ``df_to_numpy`` turns the raw survey
            frame into arrays.

    Returns:
        The ``(features, label, group)`` triple produced by
        ``prediction_task.df_to_numpy``.
    """
    # download=True fetches the data if needed; join_household=True is
    # requested so household-level columns are available to the task.
    source = ACSDataSource(survey='person', horizon='1-Year', survey_year=year)
    raw_frame = source.get_data(join_household=True, download=True, states=[state])

    task_features, task_label, task_group = prediction_task.df_to_numpy(raw_frame)
    return task_features, task_label, task_group

def calculate_metrics(y_true, y_pred):
    """Compute false-positive rate, false-negative rate and accuracy.

    Args:
        y_true: Array of ground-truth binary labels.
        y_pred: Array of predicted binary labels, aligned with ``y_true``.

    Returns:
        ``(fpr, fnr, acc)``. All three are ``np.nan`` when ``y_true`` lacks
        either the positive or the negative class (including when it is
        empty), because the rates are undefined in that case.
    """
    has_positive = np.any(y_true == 1)
    has_negative = np.any(y_true == 0)
    if not (has_positive and has_negative):
        # At least one class is absent from the ground truth.
        return np.nan, np.nan, np.nan

    # Both classes are present, so the matrix is 2x2: [[tn, fp], [fn, tp]].
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    false_positive_rate = fp / (fp + tn)
    false_negative_rate = fn / (fn + tp)
    return false_positive_rate, false_negative_rate, accuracy_score(y_true, y_pred)

# Custom folktables task: predict whether total person income (PINCP) exceeds
# $50k, grouped by SEX, using a wide set of person- and household-level ACS
# columns (household columns require the data to be fetched with
# join_household=True).
#
# Changes relative to the earlier version of this list:
#   * 'MAR' and 'RELP' were each listed twice, which produced duplicate feature
#     columns; the second occurrences are removed (marked in place below).
#   * postprocess used np.nan_to_num(x, -1); the second positional parameter of
#     np.nan_to_num is `copy`, so NaNs were replaced with 0.0 rather than -1.
#     The fill value is now passed by keyword.
#
# NOTE(review): several retained features are income-derived (e.g. 'PERNP',
# 'OIP', 'PAP', 'RETP', 'FINCP', 'HINCP', 'POVPIP') while the target is a
# threshold on 'PINCP' -- confirm this is intended and not target leakage.
ACSIncomeSexAllFeatures = folktables.BasicProblem(
    features=[ # ** --- BASED ON 2018 DATA --- **
        'AGEP', # Age (1-99)
        'COW', # Class of Worker (1-9)
        'SCHL', # Education (1-24)
        'MAR', # Marital status (1-5)
        'OCCP', # Occupation 0000-9920
        'POBP', # Place of birth 000-554
        'RELP', # Relationship (00-17)
        'WKHP', # Hours worked per week (0-99)
        'SEX', # Sex (1,2)
        'RAC1P', # Race (1-9)
        'HISP', # Recoded detailed Hispanic origin
        # Original ACSIncomeFeatures, removed non-integer values
        'DIVISION', # Division code based on 2010 Census definitions
        'SPORDER', # Person number
        'PUMA', # Public use microdata area code, population >100k
        'REGION', # Region code based on 2010 Census definitions
        'ST', # State code based on 2010 Census definitions
        'ADJINC', # Adjustment factor for income and earnings dollar amounts
        'PWGTP', # Person's weight
        'CIT', # Citizenship status
        'CITWP', # Year of naturalization write-in.
        'DDRS', # Self-care difficulty
        'DEAR', # Hearing difficulty
        'DEYE', # Vision difficulty
        'DOUT', # Independent living difficulty
        'DPHY', # Ambulatory difficulty
        'DRAT', # Veteran service connected disability rating (percentage)
        'DRATX', # Veteran service connected disability rating (checkbox)
        'DREM', # Cognitive difficulty
        'ENG', # Ability to speak English

        'FER', # Gave birth to child within the past 12 months
        'GCL', # Grandparents living with grandchildren
        'GCM', # Length of time responsible for grandchildren
        'GCR', # Grandparents responsible for grandchildren
        'HINS1', # Insurance through a current or former employer or union
        'HINS2', # Insurance purchased directly from an insurance company
        'HINS3', # Medicare, for people 65 or older, or people with certain disabilities
        'HINS4', # Medicaid, medical assistance, or any kind of government-assistance plan
        # NOTE(review): 'HINS5' (TRICARE or other military health care) is not
        # listed although its allocation flags are -- confirm this is intended.
        'HINS6', # VA (including those who have ever used or enrolled for VA health care)
        'HINS7', # Indian health service
#         'INTP', # Interest, dividends, and net rental income past 12 months (signed, use ADJINC to adjust to constant dollars)
#         'JWMNP', # Travel time to work
        'JWRIP', # Vehicle occupancy
        'JWTR', # Means of transportation to work
        'LANX', # Language other than English spoken at home
        # ('MAR' was repeated here; it is already listed above.)
        'MARHD', # Divorced in the past 12 months
        'MARHM', # Married in the past 12 months
        'MARHW', # Widowed in the past 12 months
#         'MARHYP', # Year last married
        'MIG', # Mobility status
        'MIL', # Military service
        'MLPA', # Served September 2001 or later
        'MLPB', # Served August 1990 - August 2001 (including Persian Gulf War)
        'MLPCD', # Served May 1975 - July 1990
        'MLPE', # Served Vietnam era (August 1964 - April 1975)
        'MLPFG', # Served February 1955 - July 1964
        'MLPH', # Served Korean War (July 1950 - January 1955)
        'MLPI', # Served January 1947 - June 1950
        'MLPJ', # Served World War II (December 1941 - December 1946)

        'MLPK', # Served November 1941 or earlier
        'NWAB', # Temporary absence from work
        'NWAV', # Available for work
        'NWLA', # On layoff from work
        'NWLK', # Looking for work
        'NWRE', # Informed of recall
        'OIP', # All other income past 12 months
        'PAP', # Public assistance income past 12 months
        # ('RELP' was repeated here; it is already listed above.)
        'RETP', # Retirement income past 12 months
        'SCH', # School enrollment
        'SCHG', # Grade level attending
#         'SEMP', # Self-employment income past 12 months
#         'SSIP', # Supplementary Security Income past 12 months
#         'SSP', # Social security income past 12 months
#         'WAGP', # Wages or salary income past 12 months
        'WKL', # When last worked
        'WKW', # Weeks worked during past 12 months
        'WRK', # Worked last week
#         'YOEP', # Year of entry
        'ANC', # Ancestry recode


#         'ANC1P', # Recoded Detailed Ancestry - first entry
#         'ANC2P', # Recoded Detailed Ancestry - second entry
#         'DECADE', # Decade of entry
        'DIS', # Disability recode
        'DRIVESP', # Number of vehicles calculated from JWRIP
        'ESP', # Employment status of parents
        'ESR', # Employment status recode
#         'FOD1P', # Recoded field of degree - first entry
#         'FOD2P', # Recoded field of degree - second entry
        'HICOV', # Health insurance coverage recode
#         'INDP', # Industry recode for 2018 and later based on 2017 IND codes
        'JWAP', # Time of arrival at work - hour and minute

        'JWDP', # Time of departure for work - hour and minute
        'LANP', # Language spoken at home
        'MIGPUMA', # Migration PUMA based on 2010 Census definition
        'MIGSP', # Migration recode - State or foreign country code
        'MSP', # Married, spouse present / spouse absent
#         'NAICSP', # North American Industry Classification System (NAICS) recode for 2018 codes
        'NATIVITY', # Nativity (native or foreign born)
        'NOP', # Nativity of parent
        'OC', # Own child


        'PAOC', # Presence and age of own children
        'PERNP', # Total person's earnings
        'POVPIP', # Income-to-poverty ratio recode
        'POWPUMA', # Place of work PUMA based on 2010 Census definitions
#         'POWSP', # Place of work - State or foreign country recode
        'PRIVCOV', # Private health insurance coverage recode
        'PUBCOV', # Public health coverage recode
        'QTRBIR', # Quarter of birth

        'RAC2P', # Recoded detailed race code
        'RAC3P', # Recoded detailed race code
        'RACAIAN', # American Indian or Alaska Native recode
        'RACASN', # Asian recode
        'RACBLK', # Black or African American recode
        'RACNH', # Native Hawaiian recode
        'RACNUM', # Number of major race groups represented
        'RACPI', # Other Pacific Islander recode
        'RACSOR', # Some other race recode (Some other race alone or in combination with one or more other races)
        'RACWHT', # White recode
        'RC', # Related child


        'SCIENGP', # Field of Degree Science and Engineering Flag - NSF Definition
#         'SCIENGRLP', # Field of Degree Science and Engineering Related Flag - NSF Definition
        'SFN', # Subfamily number
        'SFR', # Subfamily relationship
#         'SOCP', # Standard Occupational Classification (SOC) codes for 2018 and later
        'VPS', # Veteran period of service
        'WAOB', # World area of birth
        'FAGEP', # Age allocation flag
        'FANCP', # Ancestry allocation flag


        'FCITP', # Citizenship allocation flag
        'FCITWP', # Year of naturalization write-in allocation flag
        'FCOWP', # Class of worker allocation flag
        'FDDRSP', # Self-care difficulty allocation flag
        'FDEARP', # Hearing difficulty allocation flag
        'FDEYEP', # Vision difficulty allocation flag
        'FDISP', # Disability recode allocation flag
        'FDOUTP', # Independent living difficulty allocation flag
        'FDPHYP', # Ambulatory difficulty allocation flag


        'FDRATP', # Disability rating percentage allocation flag
#         'FDRATXP', # Disability rating checkbox allocation flag
        'FDREMP', # Cognitive difficulty allocation flag
        'FENGP', # Ability to speak English allocation flag
        'FESRP', # Employment status recode allocation flag
        'FFERP', # Gave birth to child within the past 12 months allocation flag
        'FFODP', # Field of Degree allocation flag


        'FGCLP', # Grandparents living with grandchildren allocation flag
        'FGCMP', # Length of time responsible for grandchildren allocation flag
        'FGCRP', # Grandparents responsible for grandchildren allocation flag
        'FHICOVP', # Health insurance coverage recode allocation flag
        'FHINS1P', # Insurance through a current or former employer or union allocation flag
        'FHINS2P', # Insurance purchased directly from an insurance company
        'FHINS3C', # Medicare coverage given through the eligibility coverage edit
        'FHINS3P', # Medicare, for people 65 years or older, or people with certain disabilities allocation flag
        'FHINS4C', # Medicaid coverage given through the eligibility coverage edit
        'FHINS4P', # Medicaid, medical assistance, or any kind of government-assistance plan
        'FHINS5C', # TRICARE coverage given through the eligibility coverage edit
        'FHINS5P', # TRICARE or other military health care allocation flag

        'FHINS6P', # VA (including those who have ever used or enrolled for VA health care) allocation flag
        'FHINS7P', # Indian health service allocation flag
        'FHISP', # Detailed Hispanic origin allocation flag
        'FINDP', # Industry allocation flag
        'FINTP', # Interest, dividend, and net rental income allocation flag
        'FJWDP', # Time of departure to work allocation flag
        'FJWMNP', # Travel time to work allocation flag
        'FJWTRP', # Means of transportation to work allocation flag
        'FLANP', # Language spoken at home allocation flag
        'FLANXP', # Language other than English allocation flag

        'FMARP', # Marital status allocation flag
        'FMARHDP', # Divorced in the past 12 months allocation flag
        'FMARHMP', # Married in the past 12 months allocation flag
        'FMARHTP', # Times married allocation flag
        'FMARHWP', # Widowed in the past 12 months allocation flag
        'FMARHYP', # Year last married allocation flag
        'FMIGP', # Mobility status allocation flag
        'FMIGSP', # Migration state allocation flag
        'FMILPP', # Military periods of service allocation flag
        'FMILSP', # Military service allocation flag
        'FOCCP', # Occupation allocation flag
        'FOIP', # All other income allocation flag
        'FPAP', # Public assistance income allocation flag

        'FPERNP', # Total person's earnings allocation flag
        'FPINCP', # Total person's income (signed) allocation flag
        'FPOWSP', # Place of work state allocation flag
        'FPRIVCOVP', # Private health insurance coverage recode allocation flag
        'FPUBCOVP', # Public health coverage recode allocation flag
        'FRACP', # Detailed race allocation flag
        'FRELP', # Relationship allocation flag
        'FRETP', # Retirement income allocation flag
        'FSCHGP', # Grade attending allocation flag
        'FSCHLP', # Highest education allocation flag
        'FSCHP', # School enrollment allocation flag
        'FSEMP', # Self-employment income allocation flag

        'FSEXP', # Sex allocation flag
        'FSSIP', # Supplementary Security Income allocation flag
        'FSSP', # Social security income allocation flag
        'FWAGP', # Wages and salary income allocation flag
        'FWKLP', # Last worked allocation flag
        'FWKWP', # Weeks worked past 12 months allocation flag
        'FWRKP', # Worked last week allocation flag
        'FYOEP', # Year of entry allocation flag


        # *** Housing **** # (More relevant for ACSMobility or ACSTravelTime prediction task)
        'ACCESS', # Access to the Internet
        'ACR', # Lot size (number of acres for house)
        'AGS', # Sales of agricultural products

        'BDSP', # Number of bedrooms
        'BLD', # Units in Structure
        'BROADBND', # Cellular data plan for smartphone or other mobile device

        'COMPOTHX', # Other computer equipment
        'CONP', # Condo fee
        'DIALUP', # Dialup service
        'ELEFP', # Electricity cost flag variable
        'ELEP', # Electricity cost
        'FS', # Yearly Food stamp recipiency
        'FULFP', # Fuel cost flag variable
        'FULP', # Fuel cost
        'GASFP', # Gas cost flag variable
        'GASP', # Gas cost
        'HFL', # House heating fuel
        'HISPEED', # Broadband (high speed) internet service

        'INSP', # Fire/hazard/flood insurance
        'LAPTOP', # Laptop or desktop
        'MHP', # Mobile home costs
        'MRGI', # First mortgage payment (includes insurance)
        'MRGP', # First mortgage payment (amount)
        'MRGT', # First mortgage payment includes real estate taxes
        'MRGX', # First mortgage status
        'OTHSVCEX', # Other Internet service
        'REFR', # Refrigerator


        'RMSP', # Number of rooms
        'RNTM', # Meals included in rent
        'RNTP', # Monthly rent
        'RWAT', # Hot and cold running water
        'RWATPR', # Running water
        'SATELLITE', # Satellite Internet Service
        'SINK', # Sink with a faucet
        'SMARTPHONE', # Smartphone

        'SMP', # Total payment on all second and junior mortgages and home equity loans (monthly)
        'TABLET', # Tablet or other wireless computer
        'TEL', # Telephone service
        'TEN', # Tenure
        'TYPE', # Type of unit
        'VACS', # Vacancy status
        'VALP', # Property value
        'VEH', # Vehicles (1 ton or less) available
        'WATFP', # Water cost flag variable
        'WATP', # Water cost (yearly cost)
        'YBL', # When structure first built

        'FES', # Family type and employment status
        'FINCP', # Family income
        'FPARC', # Family presence and age of related children
        'GRNTP', # Gross rent (monthly amount)
        'GRPIP', # Gross rent as a percentage of household income past 12 months
        'HHL', # Household language
        'HHLANP', # Detailed household language
        'HHT', # Household / family type

        'HINCP', # Household income (past 12 months)
        'HUGCL', # Household with grandparent
        'HUPAC', # HH presence and age of children
        'HUPAOC', # HH presence and age of own children
        'HUPARC', # HH presence and age of related children
        'KIT', # Complete kitchen facilities

        'LNGI', # Limited English speaking household
        'MULTG', # Multigenerational household
        'MV', # When moved into this house or apartment
        'NOC', # Number of own children in household (unweighted)
        'NP', # Number of persons in this household
        'NPF', # Number of persons in family (unweighted)
        'NPP', # Grandparent headed household with no parent present
        'NR', # Presence of nonrelative in household
        'NRC', # Number of related children in household (unweighted)
        'OCPIP', # Selected monthly owner costs as a percentage of household income during the past 12 months

        'PARTNER', # Unmarried partner household
        'PLM', # Complete plumbing facilities
        'PSF', # Presence of subfamilies in household
        'R18', # Presence of persons under 18 years in household
        'R60', # Presence of persons 60 years and over in household (unweighted)
        'R65', # Presence of persons 65 years and over in household (unweighted)
        'RESMODE', # Response mode
        'SMOCP', # Selected monthly owner costs
        'SMX', # Second or junior mortgage or home equity loan status
        'SRNT', # Specified rental unit
        'SSMC', # Same-sex married couple thresholds
        'SVAL', # Specified owner unit
        'TAXAMT', # Property taxes (yearly real estate taxes)
        'WGTP', # Housing Unit Weight
        'WIF', # Workers in family during the past 12 months
        'WKEXREL', # Work experience of householder and spouse
        'WORKSTAT', # Work status of householder or spouse in family households
        'FACCESSP', # Access to the Internet allocation flag
        'FACRP', # Lot size allocation flag
        'FAGSP', # Sales of Agricultural Products allocation flag

        'FBDSP', # Number of bedrooms allocation flag
        'FBLDP', # Units in structure allocation flag
        'FBROADBNDP', # Cellular data plan for a smartphone or other mobile device allocation flag
        'FCOMPOTHXP', # Other computer equipment allocation flag
        'FCONP', # Condominium fee allocation flag
        'FDIALUPP', # Dial-up service allocation flag
        'FELEP', # Electricity (monthly cost allocation flag)
        'FFINCP', # Family income (past 12 months) allocation flag
        'FFSP', # Yearly food stamp recipiency allocation flag
        'FFULP', # Fuel cost (yearly, fuels other than gas / electricity) allocation flag
        'FGASP', # Gas (monthly cost) allocation flag
        'FGRNTP', # Gross rent (monthly amount) allocation flag
        'FHFLP', # House heating fuel allocation flag
        'FHINCP', # Household income (past 12 months) allocation flag
        'FHISPEEDP', # Broadband (high speed) Internet service such as cable allocation flag
        'FINSP', # Fire, hazard, flood insurance (yearly amount) allocation flag
        'FKITP', # Complete kitchen facilities allocation flag
        'FLAPTOPP', # Laptop or desktop allocation flag

        'FMHP', # Mobile home costs (yearly amount) allocation flag
        'FMRGP', # First mortgage payment (monthly amount) allocation flag
        'FMRGTP', # First mortgage payment includes real estate taxes allocation flag
        'FMRGXP', # First mortgage status allocation flag
        'FMVP', # When moved into this house or apartment allocation
        'FOTHSVCEXP', # Other Internet service allocation flag
        'FPLMP', # Complete plumbing facilities allocation flag
        'FREFRP', # Refrigerator allocation flag
        'FRNTMP', # Meals included in rent allocation flag
        'FRNTP', # Monthly rent allocation flag
        'FRWATP', # Hot and cold running water allocation flag
        'FSATELLITEP', # Satellite internet service allocation flag
        'FSMARTPHONP', # Smartphone allocation flag
        'FSMOCP', # Selected monthly owner cost allocation flag
        'FSMP', # Total payment on second and junior mortgages and home equity loans (monthly payment) allocation flag
        'FSMXHP', # Home equity loan status allocation flag
        'FSMXSP', # Second mortgage status allocation flag
        'FTABLETP', # Tablet or other portable wireless computer allocation flag
        'FTAXP', # Property taxes (yearly amount) allocation flag
        'FTELP', # Telephone service allocation flag
        'FTENP', # Tenure allocation flag
        'FVACSP', # Vacancy status allocation flag
        'FVEHP', # Vehicles available allocation flag
        'FWATP', # Water (yearly cost) allocation flag

    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,
    group='SEX',
    preprocess=folktables.adult_filter,
    # Fill value must be passed by keyword: the second positional parameter
    # of np.nan_to_num is `copy`, not `nan`.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)


# Compact income task: predict whether total person income (PINCP) exceeds
# $50k, grouped by SEX, from eleven person-level features.
ACSIncomeSex = folktables.BasicProblem(
    features=[
        'AGEP', # Age (1-99)
        'COW', # Class of Worker (1-9)
        'SCHL', # Education (1-24)
        'MAR', # Marital status (1-5)
        'OCCP', # Occupation 0000-9920
        'POBP', # Place of birth 000-554
        'RELP', # Relationship (00-17)
        'WKHP', # Hours worked per week (0-99)
        'SEX', # Sex (1,2)
        'RAC1P', # Race (1-9)
        'HISP', # Recoded detailed Hispanic origin
    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,
    group='SEX',
    preprocess=folktables.adult_filter,
    # Fill value must be passed by keyword: np.nan_to_num(x, -1) binds -1 to
    # the `copy` parameter and silently replaces NaN with 0.0 instead of -1.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)



# Evaluation function for different mixing rates and modified sizes
def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups, prediction_task):
    """Train and evaluate a classifier on a male/female mix set by ``mixing_rate``.

    A pool of ``0.5 * (number of female rows)`` samples is drawn without
    replacement (capped at the number of male rows so that a pure-male pool
    can always be drawn). A ``mixing_rate`` fraction of the pool comes from
    rows with ``groups == 2`` (female) and the rest from rows with
    ``groups == 1`` (male). The pool is optionally subsampled to
    ``modified_size`` rows, split 80/20 into train/test, and a
    ``StandardScaler`` + ``RandomForestClassifier`` pipeline is fitted and
    scored per group on the test split.

    Previously ``mixing_rate`` never influenced the sampled data (the set was
    always "all males + half of the females"), so every mixing rate measured
    the same composition.

    NOTE(review): the test split is drawn from the same mixed pool, so at
    mixing rates near 0 or 1 one group has few or no test rows and its
    metrics come back as NaN. Consider a fixed held-out test set if metrics
    for both groups are needed at every mixing rate.

    Args:
        mixing_rate: Fraction of the pool drawn from the female rows; must be
            within ``[0, 1]``.
        modified_size: Maximum number of rows to keep from the pool, or
            ``None`` to keep the whole pool.
        features: Feature array indexed by row.
        labels: Label array aligned with ``features``.
        groups: Group array aligned with ``features`` (1 = male, 2 = female).
        prediction_task: Unused; kept so existing callers keep working.

    Returns:
        An 11-tuple ``(initial_size, modified_size, num_males_original,
        num_females_original, final_pool_size, fpr_male, fnr_male, acc_male,
        fpr_female, fnr_female, acc_female)``.

    Raises:
        ValueError: If ``mixing_rate`` is outside ``[0, 1]``.
    """
    if not 0.0 <= mixing_rate <= 1.0:
        raise ValueError(f"mixing_rate must be in [0, 1], got {mixing_rate!r}")

    initial_size = len(features)
    num_males_original = np.sum(groups == 1)
    num_females_original = np.sum(groups == 2)

    female_indices = np.where(groups == 2)[0]
    male_indices = np.where(groups == 1)[0]

    # Pool size is constant across mixing rates so only the composition varies.
    pool_size = min(int(len(female_indices) * 0.5), len(male_indices))
    num_female_to_sample = int(round(pool_size * mixing_rate))
    num_male_to_sample = pool_size - num_female_to_sample

    subsampled_female_indices = np.random.choice(female_indices, size=num_female_to_sample, replace=False)
    subsampled_male_indices = np.random.choice(male_indices, size=num_male_to_sample, replace=False)
    selected_indices = np.concatenate([subsampled_male_indices, subsampled_female_indices])

    modified_features = features[selected_indices]
    modified_labels = labels[selected_indices]
    modified_groups = groups[selected_indices]

    # Optionally shrink the pool; a uniform subsample keeps the mix in expectation.
    if modified_size is not None and int(modified_size) < len(modified_features):
        subsample_indices = np.random.choice(len(modified_features), size=int(modified_size), replace=False)
        modified_features = modified_features[subsample_indices]
        modified_labels = modified_labels[subsample_indices]
        modified_groups = modified_groups[subsample_indices]

    X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
        modified_features, modified_labels, modified_groups, test_size=0.2, random_state=0)

    model = make_pipeline(StandardScaler(), RandomForestClassifier(max_depth=16, min_samples_leaf=3))
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Per-group metrics on the held-out 20%.
    is_male_test = group_test == 1
    is_female_test = group_test == 2
    fpr_male, fnr_male, acc_male = calculate_metrics(y_test[is_male_test], y_pred[is_male_test])
    fpr_female, fnr_female, acc_female = calculate_metrics(y_test[is_female_test], y_pred[is_female_test])

    return initial_size, modified_size, num_males_original, num_females_original, len(modified_features), \
           fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female

# CA 2018 experiment: evaluate the model at each mixing rate, average the
# per-group FPR / FNR / accuracy over several repetitions, and plot the means
# with a +/- one standard deviation band.

# Fetch data for California
ca_features, ca_label, ca_group = fetch_data_for_state_year('CA', '2018', ACSIncomeSexAllFeatures)

mixing_rates = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Group sizes in the full dataset (group code 2 = female, 1 = male)
total_females = len(np.where(ca_group == 2)[0])
total_males = len(np.where(ca_group == 1)[0])

print( "CA 2018 Data")

# Fixed sample size: half of the females, rounded to the nearest hundred
fixed_train_size = int(round(0.5 * total_females, -2))
print("Fixed training set size: ", fixed_train_size)

metric_columns = ['FPR Male', 'FNR Male', 'Accuracy Male',
                  'FPR Female', 'FNR Female', 'Accuracy Female']

results_subset_list = []
num_iterations = 2 # 10 or 30
for iteration in range(num_iterations):
    results_subset = {'Mixing Rate': [], **{column: [] for column in metric_columns}}
    for mixing_rate in mixing_rates:
        (initial_size, modified_size, num_males_original, num_females_original,
         len_modified_features, fpr_male, fnr_male, acc_male,
         fpr_female, fnr_female, acc_female) = evaluate_model_with_mixing_rate_and_size(
            mixing_rate, fixed_train_size, ca_features, ca_label, ca_group, ACSIncomeSexAllFeatures)

        results_subset['Mixing Rate'].append(mixing_rate)
        run_metrics = (fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
        for column, value in zip(metric_columns, run_metrics):
            results_subset[column].append(value)

    results_subset_list.append(results_subset)

# Mean and standard deviation across repetitions, per mixing rate
average_results_subset = {
    'Mixing Rate': mixing_rates,
    **{column: np.mean([result[column] for result in results_subset_list], axis=0)
       for column in metric_columns},
}
average_results_subset_df = pd.DataFrame(average_results_subset)

std_results_subset = {
    column: np.std([result[column] for result in results_subset_list], axis=0)
    for column in metric_columns
}

print(f"Original training set size: {initial_size}")
print(f"Original number of females in train set: {num_females_original}")
print(f"Percentage of total females that's being currently graphed: {fixed_train_size / total_females * 100}%")
print(f"Original number of males in train set: {num_males_original}")
print(f"Percentage of total males that's being currently graphed: {fixed_train_size / total_males * 100}%")

fig, axs = plt.subplots(1, 3, figsize=(14, 5))
# plt.title() would attach to the last axes and then be overwritten by that
# axes' own set_title(), so the dataset name goes into the figure title.
fig.suptitle(f'CA 2018: ACSIncome MALE / FEMALE | Training Set Size: {fixed_train_size}')

# (metric prefix, y-axis label, y-axis limits) for each of the three panels
panels = [
    ('FPR', 'False Positive Rate', (0, 0.5)),
    ('FNR', 'False Negative Rate', (0, 0.5)),
    ('Accuracy', 'Accuracy', (0.5, 1.0)),
]
group_colors = [('Male', 'blue'), ('Female', 'red')]
mixing_rate_axis = average_results_subset_df['Mixing Rate']

for ax, (metric, y_label, y_limits) in zip(axs, panels):
    for group_name, color in group_colors:
        ax.plot(mixing_rate_axis, average_results_subset_df[f'{metric} {group_name}'],
                label=group_name, color=color)
    for group_name, color in group_colors:
        column = f'{metric} {group_name}'
        mean_curve = average_results_subset_df[column]
        spread = std_results_subset[column]
        ax.fill_between(mixing_rate_axis, mean_curve - spread, mean_curve + spread,
                        color=color, alpha=0.2)
    ax.set_xlabel('Mixing Rate')
    ax.set_ylabel(y_label)
    ax.set_title(f'{metric} vs Mixing Rate')
    ax.legend()
    ax.set_ylim(*y_limits)

plt.tight_layout()
plt.show()


CA 2015

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import folktables
from folktables import ACSDataSource

def fetch_data_for_state(state, year='2015', prediction_task=None):
    """Download one state's ACS person records and convert them to arrays.

    Args:
        state: State code passed to ``ACSDataSource.get_data`` (e.g. ``'CA'``).
        year: Survey year of the 1-Year ACS release. Defaults to ``'2015'``,
            the year this cell analyses, so existing one-argument calls are
            unchanged.
        prediction_task: Object whose ``df_to_numpy`` converts the raw frame.
            Defaults to the module-level ``ACSIncomeNew`` task.

    Returns:
        The ``(features, label, group)`` triple produced by ``df_to_numpy``.
    """
    if prediction_task is None:
        # Resolved at call time, so the task may be defined after this function.
        prediction_task = ACSIncomeNew
    data_source = ACSDataSource(survey_year=year, horizon='1-Year', survey='person')
    acs_data = data_source.get_data(states=[state], download=True)
    features, label, group = prediction_task.df_to_numpy(acs_data)
    return features, label, group

def calculate_metrics(y_true, y_pred):
    """Return ``(fpr, fnr, accuracy)`` for binary labels and predictions.

    Labels are interpreted by truthiness (``True``/``1`` is the positive
    class). Each metric is guarded on its own denominator, so a group that
    lacks one class still reports the metrics that remain well defined:

    * FPR is NaN only when there are no true negatives/false positives
      (no negative samples).
    * FNR is NaN only when there are no positive samples.
    * All three are NaN for empty input.

    Args:
        y_true: Array-like of ground-truth binary labels.
        y_pred: Array-like of predicted binary labels, same length as y_true.

    Returns:
        Tuple ``(fpr, fnr, acc)`` of floats (NaN where undefined).
    """
    y_true = np.asarray(y_true).astype(bool)
    y_pred = np.asarray(y_pred).astype(bool)

    if y_true.size == 0:
        return np.nan, np.nan, np.nan

    # Count the four confusion-matrix cells directly; this does not depend on
    # which label values happen to be present in the data.
    tp = np.sum(y_true & y_pred)
    tn = np.sum(~y_true & ~y_pred)
    fp = np.sum(~y_true & y_pred)
    fn = np.sum(y_true & ~y_pred)

    fpr = fp / (fp + tn) if (fp + tn) > 0 else np.nan
    fnr = fn / (fn + tp) if (fn + tp) > 0 else np.nan
    acc = (tp + tn) / y_true.size
    return fpr, fnr, acc

# Custom folktables task: binary target "PINCP > 50000" with SEX as the group
# attribute. SEX is also included among the input features.
# Keep the task definitions in the other cells of this notebook identical to
# this one so that results stay comparable across states and years.
ACSIncomeNew = folktables.BasicProblem(
    features=[
        'AGEP',   # Age (1-99)
        'COW',    # Class of Worker (1-9)
        'SCHL',   # Education (1-24)
        'MAR',    # Marital status (1-5)
        'OCCP',   # Occupation 0000-9920
        'POBP',   # Place of birth 000-554
        'RELP',   # Relationship (00-17)
        'WKHP',   # Hours worked per week (0-99)
        'SEX',    # Sex (1, 2)
        'RAC1P',  # Race (1-9)
    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,
    group='SEX',
    preprocess=folktables.adult_filter,
    # The second positional parameter of np.nan_to_num is `copy`, not the fill
    # value, so `np.nan_to_num(x, -1)` silently replaces NaN with 0.0.
    # Pass `nan=` by keyword to actually fill missing values with -1.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)



# Evaluation function for different mixing rates and modified sizes
def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups):
    """Fit and score a logistic-regression model on a female-subsampled dataset.

    Every row with ``groups == 1`` is kept; a random ``mixing_rate`` fraction
    of the rows with ``groups == 2`` is kept (sampled without replacement).
    If ``modified_size`` is given and smaller than the mixed dataset, the
    mixed dataset is randomly cut down to that many rows. The result is split
    80/20 (fixed ``random_state=0``), a StandardScaler + LogisticRegression
    pipeline is fitted on the 80% part, and per-group metrics are computed on
    the 20% part via ``calculate_metrics``.

    Returns:
        An 11-tuple: ``(initial_size, modified_size, num_males_original,
        num_females_original, final_dataset_size, fpr_male, fnr_male,
        acc_male, fpr_female, fnr_female, acc_female)``.
    """
    is_male = groups == 1
    is_female = groups == 2

    # Bookkeeping about the untouched input.
    initial_size = len(features)
    num_males_original = np.sum(is_male)
    num_females_original = np.sum(is_female)

    # First random draw: which female rows survive the mixing rate.
    female_rows = np.where(is_female)[0]
    kept_female_rows = np.random.choice(
        female_rows, size=int(len(female_rows) * mixing_rate), replace=False)

    mixed_X = np.concatenate([features[is_male], features[kept_female_rows]])
    mixed_y = np.concatenate([labels[is_male], labels[kept_female_rows]])
    mixed_g = np.concatenate([groups[is_male], groups[kept_female_rows]])

    # Second random draw (only when needed): shrink to the requested size.
    needs_downsampling = modified_size is not None and modified_size < len(mixed_X)
    if needs_downsampling:
        keep = np.random.choice(np.arange(len(mixed_X)), size=modified_size, replace=False)
        mixed_X, mixed_y, mixed_g = mixed_X[keep], mixed_y[keep], mixed_g[keep]

    X_fit, X_eval, y_fit, y_eval, _, g_eval = train_test_split(
        mixed_X, mixed_y, mixed_g, test_size=0.2, random_state=0)

    classifier = make_pipeline(StandardScaler(), LogisticRegression())
    classifier.fit(X_fit, y_fit)
    predictions = classifier.predict(X_eval)

    # Score each group separately on the held-out part.
    eval_male = g_eval == 1
    eval_female = g_eval == 2
    fpr_male, fnr_male, acc_male = calculate_metrics(y_eval[eval_male], predictions[eval_male])
    fpr_female, fnr_female, acc_female = calculate_metrics(y_eval[eval_female], predictions[eval_female])

    return (
        initial_size, modified_size, num_males_original, num_females_original, len(mixed_X),
        fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female,
    )

# Fetch data for California
ca_features, ca_label, ca_group = fetch_data_for_state('CA')

# Reference 70/30 split of the full, unmodified dataset. It is only used for
# the "original" size statistics printed inside the loop below.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    ca_features, ca_label, ca_group, test_size=0.3, random_state=0)

# Baseline model on the unmodified data. The mixing-rate experiment does not
# use it: evaluate_model_with_mixing_rate_and_size fits its own model per call.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

mixing_rates = [0.07, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Total females in the dataset
total_females = len(np.where(ca_group == 2)[0])

print("CA 2015 Data")

# Metric columns, in the order they appear in the results table.
metric_columns = ['FPR Male', 'FNR Male', 'Accuracy Male',
                  'FPR Female', 'FNR Female', 'Accuracy Female']
# One subplot per metric: (column prefix, y-axis label, y-axis limits).
plot_panels = [('FPR', 'False Positive Rate', (0, 0.5)),
               ('FNR', 'False Negative Rate', (0, 0.5)),
               ('Accuracy', 'Accuracy', (0.5, 1.0))]
# (legend label and column suffix, colour) for each group.
group_styles = [('Male', 'blue'), ('Female', 'red')]

# Iterate over different fixed dataset sizes: 10%, 30% and 50% of the female
# count, each rounded to the nearest hundred.
fixed_train_sizes = [int(round(0.1 * total_females, -2)), int(round(0.3 * total_females, -2)), int(round(0.5 * total_females, -2))]
for fixed_train_size in fixed_train_sizes:
    print("Fixed training set size: ", fixed_train_size)

    # Repeat the whole mixing-rate sweep to estimate mean and spread.
    results_subset_list = []
    num_iterations = 30
    for _ in range(num_iterations):
        results_subset = {'Mixing Rate': [], **{column: [] for column in metric_columns}}
        for mixing_rate in mixing_rates:
            initial_size, _, num_males_original, num_females_original, _, \
            fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female = \
                evaluate_model_with_mixing_rate_and_size(
                    mixing_rate, fixed_train_size, ca_features, ca_label, ca_group)

            results_subset['Mixing Rate'].append(mixing_rate)
            run_values = (fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
            for column, value in zip(metric_columns, run_values):
                results_subset[column].append(value)

        results_subset_list.append(results_subset)

    # Aggregate across iterations (axis 0) for every mixing rate. The NaN-aware
    # reductions keep one undefined run (calculate_metrics returns NaN when a
    # metric is undefined) from blanking out the whole point on the curve.
    average_results_subset = {'Mixing Rate': mixing_rates}
    std_results_subset = {}
    for column in metric_columns:
        runs = np.array([result[column] for result in results_subset_list], dtype=float)
        average_results_subset[column] = np.nanmean(runs, axis=0)
        std_results_subset[column] = np.nanstd(runs, axis=0)

    average_results_subset_df = pd.DataFrame(average_results_subset)

    print(f"Original training set size: {len(X_train)}")
    print(f"Original number of females in train set: {np.sum(group_train == 2)}")
    print(f"Percentage of total females that's being currently graphed: {fixed_train_size / total_females * 100}%")

    fig, axs = plt.subplots(1, 3, figsize=(14, 5))
    # Figure-level title. The previous plt.title(...) call targeted the last
    # subplot, was overwritten by that subplot's own title, and named the
    # wrong survey year.
    fig.suptitle(f'CA 2015: ACSIncome MALE / FEMALE - Training Set Size: {fixed_train_size}')

    x_values = average_results_subset_df['Mixing Rate']
    for ax, (metric, y_label, y_limits) in zip(axs, plot_panels):
        # Mean curves first, then the +/- one standard deviation bands.
        for group_name, colour in group_styles:
            ax.plot(x_values, average_results_subset_df[f'{metric} {group_name}'],
                    label=group_name, color=colour)
        for group_name, colour in group_styles:
            column = f'{metric} {group_name}'
            ax.fill_between(x_values,
                            average_results_subset_df[column] - std_results_subset[column],
                            average_results_subset_df[column] + std_results_subset[column],
                            color=colour, alpha=0.2)
        ax.set_xlabel('Mixing Rate')
        ax.set_ylabel(y_label)
        ax.set_title(f'{metric} vs Mixing Rate')
        ax.legend()
        ax.set_ylim(*y_limits)

    plt.tight_layout()
    plt.show()


CA 2017

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import folktables
from folktables import ACSDataSource

def fetch_data_for_state(state, year='2017', prediction_task=None):
    """Download one state's ACS person records and convert them to arrays.

    Args:
        state: State code passed to ``ACSDataSource.get_data`` (e.g. ``'CA'``).
        year: Survey year of the 1-Year ACS release. Defaults to ``'2017'``,
            the year this cell analyses, so existing one-argument calls are
            unchanged.
        prediction_task: Object whose ``df_to_numpy`` converts the raw frame.
            Defaults to the module-level ``ACSIncomeNew`` task.

    Returns:
        The ``(features, label, group)`` triple produced by ``df_to_numpy``.
    """
    if prediction_task is None:
        # Resolved at call time, so the task may be defined after this function.
        prediction_task = ACSIncomeNew
    data_source = ACSDataSource(survey_year=year, horizon='1-Year', survey='person')
    acs_data = data_source.get_data(states=[state], download=True)
    features, label, group = prediction_task.df_to_numpy(acs_data)
    return features, label, group

def calculate_metrics(y_true, y_pred):
    """Return ``(fpr, fnr, accuracy)`` for binary labels and predictions.

    Labels are interpreted by truthiness (``True``/``1`` is the positive
    class). Each metric is guarded on its own denominator, so a group that
    lacks one class still reports the metrics that remain well defined:

    * FPR is NaN only when there are no negative samples.
    * FNR is NaN only when there are no positive samples.
    * All three are NaN for empty input.

    Args:
        y_true: Array-like of ground-truth binary labels.
        y_pred: Array-like of predicted binary labels, same length as y_true.

    Returns:
        Tuple ``(fpr, fnr, acc)`` of floats (NaN where undefined).
    """
    y_true = np.asarray(y_true).astype(bool)
    y_pred = np.asarray(y_pred).astype(bool)

    if y_true.size == 0:
        return np.nan, np.nan, np.nan

    # Count the four confusion-matrix cells directly; this does not depend on
    # which label values happen to be present in the data.
    tp = np.sum(y_true & y_pred)
    tn = np.sum(~y_true & ~y_pred)
    fp = np.sum(~y_true & y_pred)
    fn = np.sum(y_true & ~y_pred)

    fpr = fp / (fp + tn) if (fp + tn) > 0 else np.nan
    fnr = fn / (fn + tp) if (fn + tp) > 0 else np.nan
    acc = (tp + tn) / y_true.size
    return fpr, fnr, acc

# Custom folktables task: binary target "PINCP > 50000" with SEX as the group
# attribute. SEX is also included among the input features.
# Keep the task definitions in the other cells of this notebook identical to
# this one so that results stay comparable across states and years.
ACSIncomeNew = folktables.BasicProblem(
    features=[
        'AGEP',   # Age (1-99)
        'COW',    # Class of Worker (1-9)
        'SCHL',   # Education (1-24)
        'MAR',    # Marital status (1-5)
        'OCCP',   # Occupation 0000-9920
        'POBP',   # Place of birth 000-554
        'RELP',   # Relationship (00-17)
        'WKHP',   # Hours worked per week (0-99)
        'SEX',    # Sex (1, 2)
        'RAC1P',  # Race (1-9)
    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,
    group='SEX',
    preprocess=folktables.adult_filter,
    # The second positional parameter of np.nan_to_num is `copy`, not the fill
    # value, so `np.nan_to_num(x, -1)` silently replaces NaN with 0.0.
    # Pass `nan=` by keyword to actually fill missing values with -1.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)



# Evaluation function for different mixing rates and modified sizes
def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups):
    """Fit and score a logistic-regression model on a female-subsampled dataset.

    Every row with ``groups == 1`` is kept; a random ``mixing_rate`` fraction
    of the rows with ``groups == 2`` is kept (sampled without replacement).
    If ``modified_size`` is given and smaller than the mixed dataset, the
    mixed dataset is randomly cut down to that many rows. The result is split
    80/20 (fixed ``random_state=0``), a StandardScaler + LogisticRegression
    pipeline is fitted on the 80% part, and per-group metrics are computed on
    the 20% part via ``calculate_metrics``.

    Returns:
        An 11-tuple: ``(initial_size, modified_size, num_males_original,
        num_females_original, final_dataset_size, fpr_male, fnr_male,
        acc_male, fpr_female, fnr_female, acc_female)``.
    """
    is_male = groups == 1
    is_female = groups == 2

    # Bookkeeping about the untouched input.
    initial_size = len(features)
    num_males_original = np.sum(is_male)
    num_females_original = np.sum(is_female)

    # First random draw: which female rows survive the mixing rate.
    female_rows = np.where(is_female)[0]
    kept_female_rows = np.random.choice(
        female_rows, size=int(len(female_rows) * mixing_rate), replace=False)

    mixed_X = np.concatenate([features[is_male], features[kept_female_rows]])
    mixed_y = np.concatenate([labels[is_male], labels[kept_female_rows]])
    mixed_g = np.concatenate([groups[is_male], groups[kept_female_rows]])

    # Second random draw (only when needed): shrink to the requested size.
    needs_downsampling = modified_size is not None and modified_size < len(mixed_X)
    if needs_downsampling:
        keep = np.random.choice(np.arange(len(mixed_X)), size=modified_size, replace=False)
        mixed_X, mixed_y, mixed_g = mixed_X[keep], mixed_y[keep], mixed_g[keep]

    X_fit, X_eval, y_fit, y_eval, _, g_eval = train_test_split(
        mixed_X, mixed_y, mixed_g, test_size=0.2, random_state=0)

    classifier = make_pipeline(StandardScaler(), LogisticRegression())
    classifier.fit(X_fit, y_fit)
    predictions = classifier.predict(X_eval)

    # Score each group separately on the held-out part.
    eval_male = g_eval == 1
    eval_female = g_eval == 2
    fpr_male, fnr_male, acc_male = calculate_metrics(y_eval[eval_male], predictions[eval_male])
    fpr_female, fnr_female, acc_female = calculate_metrics(y_eval[eval_female], predictions[eval_female])

    return (
        initial_size, modified_size, num_males_original, num_females_original, len(mixed_X),
        fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female,
    )

# Fetch data for California
ca_features, ca_label, ca_group = fetch_data_for_state('CA')

# Reference 70/30 split of the full, unmodified dataset. It is only used for
# the "original" size statistics printed inside the loop below.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    ca_features, ca_label, ca_group, test_size=0.3, random_state=0)

# Baseline model on the unmodified data. The mixing-rate experiment does not
# use it: evaluate_model_with_mixing_rate_and_size fits its own model per call.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

mixing_rates = [0.07, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Total females in the dataset
total_females = len(np.where(ca_group == 2)[0])

print("CA 2017 Data")

# Metric columns, in the order they appear in the results table.
metric_columns = ['FPR Male', 'FNR Male', 'Accuracy Male',
                  'FPR Female', 'FNR Female', 'Accuracy Female']
# One subplot per metric: (column prefix, y-axis label, y-axis limits).
plot_panels = [('FPR', 'False Positive Rate', (0, 0.5)),
               ('FNR', 'False Negative Rate', (0, 0.5)),
               ('Accuracy', 'Accuracy', (0.5, 1.0))]
# (legend label and column suffix, colour) for each group.
group_styles = [('Male', 'blue'), ('Female', 'red')]

# Iterate over different fixed dataset sizes: 10%, 30% and 50% of the female
# count, each rounded to the nearest hundred.
fixed_train_sizes = [int(round(0.1 * total_females, -2)), int(round(0.3 * total_females, -2)), int(round(0.5 * total_females, -2))]
for fixed_train_size in fixed_train_sizes:
    print("Fixed training set size: ", fixed_train_size)

    # Repeat the whole mixing-rate sweep to estimate mean and spread.
    results_subset_list = []
    num_iterations = 30
    for _ in range(num_iterations):
        results_subset = {'Mixing Rate': [], **{column: [] for column in metric_columns}}
        for mixing_rate in mixing_rates:
            initial_size, _, num_males_original, num_females_original, _, \
            fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female = \
                evaluate_model_with_mixing_rate_and_size(
                    mixing_rate, fixed_train_size, ca_features, ca_label, ca_group)

            results_subset['Mixing Rate'].append(mixing_rate)
            run_values = (fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
            for column, value in zip(metric_columns, run_values):
                results_subset[column].append(value)

        results_subset_list.append(results_subset)

    # Aggregate across iterations (axis 0) for every mixing rate. The NaN-aware
    # reductions keep one undefined run (calculate_metrics returns NaN when a
    # metric is undefined) from blanking out the whole point on the curve.
    average_results_subset = {'Mixing Rate': mixing_rates}
    std_results_subset = {}
    for column in metric_columns:
        runs = np.array([result[column] for result in results_subset_list], dtype=float)
        average_results_subset[column] = np.nanmean(runs, axis=0)
        std_results_subset[column] = np.nanstd(runs, axis=0)

    average_results_subset_df = pd.DataFrame(average_results_subset)

    print(f"Original training set size: {len(X_train)}")
    print(f"Original number of females in train set: {np.sum(group_train == 2)}")
    print(f"Percentage of total females that's being currently graphed: {fixed_train_size / total_females * 100}%")

    fig, axs = plt.subplots(1, 3, figsize=(14, 5))
    # Figure-level title. The previous plt.title(...) call targeted the last
    # subplot, was overwritten by that subplot's own title, and named the
    # wrong survey year.
    fig.suptitle(f'CA 2017: ACSIncome MALE / FEMALE - Training Set Size: {fixed_train_size}')

    x_values = average_results_subset_df['Mixing Rate']
    for ax, (metric, y_label, y_limits) in zip(axs, plot_panels):
        # Mean curves first, then the +/- one standard deviation bands.
        for group_name, colour in group_styles:
            ax.plot(x_values, average_results_subset_df[f'{metric} {group_name}'],
                    label=group_name, color=colour)
        for group_name, colour in group_styles:
            column = f'{metric} {group_name}'
            ax.fill_between(x_values,
                            average_results_subset_df[column] - std_results_subset[column],
                            average_results_subset_df[column] + std_results_subset[column],
                            color=colour, alpha=0.2)
        ax.set_xlabel('Mixing Rate')
        ax.set_ylabel(y_label)
        ax.set_title(f'{metric} vs Mixing Rate')
        ax.legend()
        ax.set_ylim(*y_limits)

    plt.tight_layout()
    plt.show()


FL 2018, 2015, 2017

FL 2018

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import folktables
from folktables import ACSDataSource

def fetch_data_for_state(state, year='2018', prediction_task=None):
    """Download one state's ACS person records and convert them to arrays.

    Args:
        state: State code passed to ``ACSDataSource.get_data`` (e.g. ``'FL'``).
        year: Survey year of the 1-Year ACS release. Defaults to ``'2018'``,
            the year this cell analyses, so existing one-argument calls are
            unchanged.
        prediction_task: Object whose ``df_to_numpy`` converts the raw frame.
            Defaults to the module-level ``ACSIncomeNew`` task.

    Returns:
        The ``(features, label, group)`` triple produced by ``df_to_numpy``.
    """
    if prediction_task is None:
        # Resolved at call time, so the task may be defined after this function.
        prediction_task = ACSIncomeNew
    data_source = ACSDataSource(survey_year=year, horizon='1-Year', survey='person')
    acs_data = data_source.get_data(states=[state], download=True)
    features, label, group = prediction_task.df_to_numpy(acs_data)
    return features, label, group

def calculate_metrics(y_true, y_pred):
    """Return ``(fpr, fnr, accuracy)`` for binary labels and predictions.

    Labels are interpreted by truthiness (``True``/``1`` is the positive
    class). Each metric is guarded on its own denominator, so a group that
    lacks one class still reports the metrics that remain well defined:

    * FPR is NaN only when there are no negative samples.
    * FNR is NaN only when there are no positive samples.
    * All three are NaN for empty input.

    Args:
        y_true: Array-like of ground-truth binary labels.
        y_pred: Array-like of predicted binary labels, same length as y_true.

    Returns:
        Tuple ``(fpr, fnr, acc)`` of floats (NaN where undefined).
    """
    y_true = np.asarray(y_true).astype(bool)
    y_pred = np.asarray(y_pred).astype(bool)

    if y_true.size == 0:
        return np.nan, np.nan, np.nan

    # Count the four confusion-matrix cells directly; this does not depend on
    # which label values happen to be present in the data.
    tp = np.sum(y_true & y_pred)
    tn = np.sum(~y_true & ~y_pred)
    fp = np.sum(~y_true & y_pred)
    fn = np.sum(y_true & ~y_pred)

    fpr = fp / (fp + tn) if (fp + tn) > 0 else np.nan
    fnr = fn / (fn + tp) if (fn + tp) > 0 else np.nan
    acc = (tp + tn) / y_true.size
    return fpr, fnr, acc

# Custom folktables task: binary target "PINCP > 50000" with SEX as the group
# attribute. SEX is also included among the input features.
# Keep the task definitions in the other cells of this notebook identical to
# this one so that results stay comparable across states and years.
ACSIncomeNew = folktables.BasicProblem(
    features=[
        'AGEP',   # Age (1-99)
        'COW',    # Class of Worker (1-9)
        'SCHL',   # Education (1-24)
        'MAR',    # Marital status (1-5)
        'OCCP',   # Occupation 0000-9920
        'POBP',   # Place of birth 000-554
        'RELP',   # Relationship (00-17)
        'WKHP',   # Hours worked per week (0-99)
        'SEX',    # Sex (1, 2)
        'RAC1P',  # Race (1-9)
    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,
    group='SEX',
    preprocess=folktables.adult_filter,
    # The second positional parameter of np.nan_to_num is `copy`, not the fill
    # value, so `np.nan_to_num(x, -1)` silently replaces NaN with 0.0.
    # Pass `nan=` by keyword to actually fill missing values with -1.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)



# Evaluation function for different mixing rates and modified sizes
def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups):
    """Fit and score a logistic-regression model on a female-subsampled dataset.

    Every row with ``groups == 1`` is kept; a random ``mixing_rate`` fraction
    of the rows with ``groups == 2`` is kept (sampled without replacement).
    If ``modified_size`` is given and smaller than the mixed dataset, the
    mixed dataset is randomly cut down to that many rows. The result is split
    80/20 (fixed ``random_state=0``), a StandardScaler + LogisticRegression
    pipeline is fitted on the 80% part, and per-group metrics are computed on
    the 20% part via ``calculate_metrics``.

    Returns:
        An 11-tuple: ``(initial_size, modified_size, num_males_original,
        num_females_original, final_dataset_size, fpr_male, fnr_male,
        acc_male, fpr_female, fnr_female, acc_female)``.
    """
    is_male = groups == 1
    is_female = groups == 2

    # Bookkeeping about the untouched input.
    initial_size = len(features)
    num_males_original = np.sum(is_male)
    num_females_original = np.sum(is_female)

    # First random draw: which female rows survive the mixing rate.
    female_rows = np.where(is_female)[0]
    kept_female_rows = np.random.choice(
        female_rows, size=int(len(female_rows) * mixing_rate), replace=False)

    mixed_X = np.concatenate([features[is_male], features[kept_female_rows]])
    mixed_y = np.concatenate([labels[is_male], labels[kept_female_rows]])
    mixed_g = np.concatenate([groups[is_male], groups[kept_female_rows]])

    # Second random draw (only when needed): shrink to the requested size.
    needs_downsampling = modified_size is not None and modified_size < len(mixed_X)
    if needs_downsampling:
        keep = np.random.choice(np.arange(len(mixed_X)), size=modified_size, replace=False)
        mixed_X, mixed_y, mixed_g = mixed_X[keep], mixed_y[keep], mixed_g[keep]

    X_fit, X_eval, y_fit, y_eval, _, g_eval = train_test_split(
        mixed_X, mixed_y, mixed_g, test_size=0.2, random_state=0)

    classifier = make_pipeline(StandardScaler(), LogisticRegression())
    classifier.fit(X_fit, y_fit)
    predictions = classifier.predict(X_eval)

    # Score each group separately on the held-out part.
    eval_male = g_eval == 1
    eval_female = g_eval == 2
    fpr_male, fnr_male, acc_male = calculate_metrics(y_eval[eval_male], predictions[eval_male])
    fpr_female, fnr_female, acc_female = calculate_metrics(y_eval[eval_female], predictions[eval_female])

    return (
        initial_size, modified_size, num_males_original, num_females_original, len(mixed_X),
        fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female,
    )

# Fetch data for Florida. The ca_* variable names are kept from the
# California cells for compatibility; in this cell they hold Florida data.
ca_features, ca_label, ca_group = fetch_data_for_state('FL')

# Reference 70/30 split of the full, unmodified dataset. It is only used for
# the "original" size statistics printed inside the loop below.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    ca_features, ca_label, ca_group, test_size=0.3, random_state=0)

# Baseline model on the unmodified data. The mixing-rate experiment does not
# use it: evaluate_model_with_mixing_rate_and_size fits its own model per call.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

mixing_rates = [0.07, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Total females in the dataset
total_females = len(np.where(ca_group == 2)[0])

print("FL 2018 Data")

# Metric columns, in the order they appear in the results table.
metric_columns = ['FPR Male', 'FNR Male', 'Accuracy Male',
                  'FPR Female', 'FNR Female', 'Accuracy Female']
# One subplot per metric: (column prefix, y-axis label, y-axis limits).
# The FNR panel in this cell uses the full 0-1 range.
plot_panels = [('FPR', 'False Positive Rate', (0, 0.5)),
               ('FNR', 'False Negative Rate', (0, 1.0)),
               ('Accuracy', 'Accuracy', (0.5, 1.0))]
# (legend label and column suffix, colour) for each group.
group_styles = [('Male', 'blue'), ('Female', 'red')]

# Iterate over different fixed dataset sizes: 10%, 30% and 50% of the female
# count, each rounded to the nearest hundred.
fixed_train_sizes = [int(round(0.1 * total_females, -2)), int(round(0.3 * total_females, -2)), int(round(0.5 * total_females, -2))]
for fixed_train_size in fixed_train_sizes:
    print("Fixed training set size: ", fixed_train_size)

    # Repeat the whole mixing-rate sweep to estimate mean and spread.
    results_subset_list = []
    num_iterations = 30
    for _ in range(num_iterations):
        results_subset = {'Mixing Rate': [], **{column: [] for column in metric_columns}}
        for mixing_rate in mixing_rates:
            initial_size, _, num_males_original, num_females_original, _, \
            fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female = \
                evaluate_model_with_mixing_rate_and_size(
                    mixing_rate, fixed_train_size, ca_features, ca_label, ca_group)

            results_subset['Mixing Rate'].append(mixing_rate)
            run_values = (fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
            for column, value in zip(metric_columns, run_values):
                results_subset[column].append(value)

        results_subset_list.append(results_subset)

    # Aggregate across iterations (axis 0) for every mixing rate. The NaN-aware
    # reductions keep one undefined run (calculate_metrics returns NaN when a
    # metric is undefined) from blanking out the whole point on the curve.
    average_results_subset = {'Mixing Rate': mixing_rates}
    std_results_subset = {}
    for column in metric_columns:
        runs = np.array([result[column] for result in results_subset_list], dtype=float)
        average_results_subset[column] = np.nanmean(runs, axis=0)
        std_results_subset[column] = np.nanstd(runs, axis=0)

    average_results_subset_df = pd.DataFrame(average_results_subset)

    print(f"Original training set size: {len(X_train)}")
    print(f"Original number of females in train set: {np.sum(group_train == 2)}")
    print(f"Percentage of total females that's being currently graphed: {fixed_train_size / total_females * 100}%")

    fig, axs = plt.subplots(1, 3, figsize=(14, 5))
    # Figure-level title. The previous plt.title(...) call targeted the last
    # subplot, was overwritten by that subplot's own title, and named the
    # wrong state and dataset ("CA 2018").
    fig.suptitle(f'FL 2018: ACSIncome MALE / FEMALE - Training Set Size: {fixed_train_size}')

    x_values = average_results_subset_df['Mixing Rate']
    for ax, (metric, y_label, y_limits) in zip(axs, plot_panels):
        # Mean curves first, then the +/- one standard deviation bands.
        for group_name, colour in group_styles:
            ax.plot(x_values, average_results_subset_df[f'{metric} {group_name}'],
                    label=group_name, color=colour)
        for group_name, colour in group_styles:
            column = f'{metric} {group_name}'
            ax.fill_between(x_values,
                            average_results_subset_df[column] - std_results_subset[column],
                            average_results_subset_df[column] + std_results_subset[column],
                            color=colour, alpha=0.2)
        ax.set_xlabel('Mixing Rate')
        ax.set_ylabel(y_label)
        ax.set_title(f'{metric} vs Mixing Rate')
        ax.legend()
        ax.set_ylim(*y_limits)

    plt.tight_layout()
    plt.show()


FL 2015

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import folktables
from folktables import ACSDataSource

def fetch_data_for_state(state):
    """Download the 2015 1-Year ACS person survey for one state and convert it.

    Args:
        state: State code passed to ``ACSDataSource.get_data`` as a one-element
            ``states`` list.

    Returns:
        The ``(features, label, group)`` triple produced by
        ``ACSIncomeNew.df_to_numpy`` for the downloaded data.
    """
    source = ACSDataSource(survey_year='2015', horizon='1-Year', survey='person')
    raw_frame = source.get_data(states=[state], download=True)
    X, y, sex = ACSIncomeNew.df_to_numpy(raw_frame)
    return X, y, sex

def calculate_metrics(y_true, y_pred):
    """Compute false-positive rate, false-negative rate and accuracy.

    Args:
        y_true: Ground-truth binary labels (compared against 1 and 0).
        y_pred: Predicted labels for the same samples.

    Returns:
        ``(fpr, fnr, acc)``. All three are NaN when ``y_true`` does not
        contain at least one positive and one negative sample, because the
        rates are undefined in that case.
    """
    positives = np.sum(y_true == 1)
    negatives = np.sum(y_true == 0)
    if positives == 0 or negatives == 0:
        # One class is missing, so the 2x2 confusion matrix cannot be formed.
        return np.nan, np.nan, np.nan

    # ravel() on the 2x2 matrix yields the cells in (tn, fp, fn, tp) order.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    false_positive_rate = fp / (fp + tn)
    false_negative_rate = fn / (fn + tp)
    return false_positive_rate, false_negative_rate, accuracy_score(y_true, y_pred)

# Income prediction task: label is PINCP > 50000, group attribute is SEX.
ACSIncomeNew = folktables.BasicProblem(
    target='PINCP',
    # Binary label: True when the PINCP value exceeds 50000.
    target_transform=lambda income: income > 50000,
    group='SEX',
    features=[
        'AGEP',   # Age (1-99)
        'COW',    # Class of worker (1-9)
        'SCHL',   # Education (1-24)
        'MAR',    # Marital status (1-5)
        'OCCP',   # Occupation (0000-9920)
        'POBP',   # Place of birth (000-554)
        'RELP',   # Relationship (00-17)
        'WKHP',   # Hours worked per week (0-99)
        'SEX',    # Sex (1, 2)
        'RAC1P',  # Race (1-9)
    ],
    preprocess=folktables.adult_filter,
    # NOTE(review): the second positional argument of np.nan_to_num is `copy`,
    # not `nan`, so this presumably maps NaN to 0.0 rather than -1 -- confirm
    # which fill value is intended before changing it.
    postprocess=lambda values: np.nan_to_num(values, -1),
)



# Evaluation function for different mixing rates and modified sizes
def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups):
    """Train and score a logistic regression on a female-subsampled data set.

    All rows with ``groups == 1`` (male) are kept, and a random fraction
    ``mixing_rate`` of the rows with ``groups == 2`` (female) is added. The
    combined set is then optionally downsampled to ``modified_size`` rows,
    split 80/20 with a fixed ``random_state``, and a scaled logistic
    regression is fit on the training part.

    Both subsampling steps draw from the global ``np.random`` state, so
    repeated calls give different samples unless the caller seeds it.

    Args:
        mixing_rate: Fraction of the female rows to keep.
        modified_size: Target row count after mixing; ignored when ``None``
            or not smaller than the mixed set.
        features: Feature matrix, indexed by row.
        labels: Label array aligned with ``features``.
        groups: Group array aligned with ``features`` (1 = male, 2 = female).

    Returns:
        An 11-tuple: ``(initial_size, modified_size, num_males_original,
        num_females_original, final_row_count, fpr_male, fnr_male, acc_male,
        fpr_female, fnr_female, acc_female)``.
    """
    is_male = groups == 1
    is_female = groups == 2

    initial_size = len(features)
    num_males_original = np.sum(is_male)
    num_females_original = np.sum(is_female)

    # Draw the female subsample (without replacement) from the female row indices.
    female_pool = np.where(is_female)[0]
    keep_count = int(len(female_pool) * mixing_rate)
    kept_females = np.random.choice(female_pool, size=keep_count, replace=False)

    # Males first, then the kept females, for every aligned array.
    mixed_X, mixed_y, mixed_g = (
        np.concatenate([arr[is_male], arr[kept_females]])
        for arr in (features, labels, groups)
    )

    # Optionally shrink the mixed set to the requested size.
    if modified_size is not None and modified_size < len(mixed_X):
        picked = np.random.choice(np.arange(len(mixed_X)), size=modified_size, replace=False)
        mixed_X, mixed_y, mixed_g = mixed_X[picked], mixed_y[picked], mixed_g[picked]

    X_tr, X_te, y_tr, y_te, _, g_te = train_test_split(
        mixed_X, mixed_y, mixed_g, test_size=0.2, random_state=0)

    clf = make_pipeline(StandardScaler(), LogisticRegression())
    clf.fit(X_tr, y_tr)
    predictions = clf.predict(X_te)

    # Score each group separately on the held-out rows.
    male_rows = g_te == 1
    female_rows = g_te == 2
    male_stats = calculate_metrics(y_te[male_rows], predictions[male_rows])
    female_stats = calculate_metrics(y_te[female_rows], predictions[female_rows])

    return (initial_size, modified_size, num_males_original, num_females_original,
            len(mixed_X), *male_stats, *female_stats)

# Fetch data for Florida. The ca_* variable names are leftovers from the
# California cell this one was copied from; they are kept so that anything
# relying on these names keeps working.
ca_features, ca_label, ca_group = fetch_data_for_state('FL')

# Hold out 30% of the full data set. Only X_train / group_train are reported
# below; the mixing-rate experiment re-splits its own subsamples.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    ca_features, ca_label, ca_group, test_size=0.3, random_state=0)

# Train a model on the full training split.
# NOTE(review): nothing below in this cell reads `model` -- confirm whether this fit is still needed.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

# Fraction of the female records kept in each experiment.
mixing_rates = [0.07, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Total females in the dataset
total_females = len(np.where(ca_group == 2)[0])

# Label used in the banner and in every figure title for this cell.
dataset_label = 'FL 2015'
print(f"{dataset_label} Data")

# Result columns, in the order evaluate_model_with_mixing_rate_and_size returns them.
metric_columns = ['FPR Male', 'FNR Male', 'Accuracy Male',
                  'FPR Female', 'FNR Female', 'Accuracy Female']
# One subplot per metric: (column prefix, y-axis label, y-axis limits).
panels = [('FPR', 'False Positive Rate', (0, 0.5)),
          ('FNR', 'False Negative Rate', (0, 1.0)),
          ('Accuracy', 'Accuracy', (0.5, 1.0))]
sex_colors = [('Male', 'blue'), ('Female', 'red')]

# Iterate over different fixed training set sizes (10%, 30%, 50% of the
# female count, rounded to the nearest hundred).
fixed_train_sizes = [int(round(frac * total_females, -2)) for frac in (0.1, 0.3, 0.5)]
for fixed_train_size in fixed_train_sizes:
    print("Fixed training set size: ", fixed_train_size)

    # Repeat the whole mixing-rate sweep; the subsamples are redrawn on every run.
    results_subset_list = []
    num_iterations = 30
    for _ in range(num_iterations):
        results_subset = {'Mixing Rate': [], **{column: [] for column in metric_columns}}
        for mixing_rate in mixing_rates:
            (initial_size, _, num_males_original, num_females_original, _,
             fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female) = \
                evaluate_model_with_mixing_rate_and_size(
                    mixing_rate, fixed_train_size, ca_features, ca_label, ca_group)

            results_subset['Mixing Rate'].append(mixing_rate)
            run_metrics = (fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
            for column, value in zip(metric_columns, run_metrics):
                results_subset[column].append(value)

        results_subset_list.append(results_subset)

    # Mean and standard deviation across runs, per mixing rate.
    average_results_subset = {'Mixing Rate': mixing_rates}
    std_results_subset = {}
    for column in metric_columns:
        per_run = np.array([result[column] for result in results_subset_list])
        average_results_subset[column] = per_run.mean(axis=0)
        std_results_subset[column] = per_run.std(axis=0)

    average_results_subset_df = pd.DataFrame(average_results_subset)

    print(f"Original training set size: {len(X_train)}")
    print(f"Original number of females in train set: {np.sum(group_train == 2)}")
    print(f"Percentage of total females that's being currently graphed: {fixed_train_size / total_females * 100}%")

    fig, axs = plt.subplots(1, 3, figsize=(14, 5))
    # The dataset label goes in the figure-level title. The previous plt.title()
    # call targeted the last subplot, was overwritten by that subplot's own
    # title, and named the wrong dataset ("CA 2018").
    fig.suptitle(f'{dataset_label}: ACSIncome MALE / FEMALE - Training Set Size: {fixed_train_size}')

    mixing_rate_axis = average_results_subset_df['Mixing Rate']
    for ax, (metric, ylabel, ylim) in zip(axs, panels):
        # Mean curves first, then the +/- one standard deviation bands.
        for sex, color in sex_colors:
            ax.plot(mixing_rate_axis, average_results_subset_df[f'{metric} {sex}'],
                    label=sex, color=color)
        for sex, color in sex_colors:
            column = f'{metric} {sex}'
            ax.fill_between(mixing_rate_axis,
                            average_results_subset_df[column] - std_results_subset[column],
                            average_results_subset_df[column] + std_results_subset[column],
                            color=color, alpha=0.2)
        ax.set_xlabel('Mixing Rate')
        ax.set_ylabel(ylabel)
        ax.set_title(f'{metric} vs Mixing Rate')
        ax.legend()
        ax.set_ylim(*ylim)

    plt.tight_layout()
    plt.show()


FL 2017

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import folktables
from folktables import ACSDataSource

def fetch_data_for_state(state):
    """Download the 2017 1-Year ACS person survey for one state and convert it.

    Args:
        state: State code passed to ``ACSDataSource.get_data`` as a one-element
            ``states`` list.

    Returns:
        The ``(features, label, group)`` triple produced by
        ``ACSIncomeNew.df_to_numpy`` for the downloaded data.
    """
    source = ACSDataSource(survey_year='2017', horizon='1-Year', survey='person')
    raw_frame = source.get_data(states=[state], download=True)
    X, y, sex = ACSIncomeNew.df_to_numpy(raw_frame)
    return X, y, sex

def calculate_metrics(y_true, y_pred):
    """Compute false-positive rate, false-negative rate and accuracy.

    Args:
        y_true: Ground-truth binary labels (compared against 1 and 0).
        y_pred: Predicted labels for the same samples.

    Returns:
        ``(fpr, fnr, acc)``. All three are NaN when ``y_true`` does not
        contain at least one positive and one negative sample, because the
        rates are undefined in that case.
    """
    positives = np.sum(y_true == 1)
    negatives = np.sum(y_true == 0)
    if positives == 0 or negatives == 0:
        # One class is missing, so the 2x2 confusion matrix cannot be formed.
        return np.nan, np.nan, np.nan

    # ravel() on the 2x2 matrix yields the cells in (tn, fp, fn, tp) order.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    false_positive_rate = fp / (fp + tn)
    false_negative_rate = fn / (fn + tp)
    return false_positive_rate, false_negative_rate, accuracy_score(y_true, y_pred)

# Income prediction task: label is PINCP > 50000, group attribute is SEX.
ACSIncomeNew = folktables.BasicProblem(
    target='PINCP',
    # Binary label: True when the PINCP value exceeds 50000.
    target_transform=lambda income: income > 50000,
    group='SEX',
    features=[
        'AGEP',   # Age (1-99)
        'COW',    # Class of worker (1-9)
        'SCHL',   # Education (1-24)
        'MAR',    # Marital status (1-5)
        'OCCP',   # Occupation (0000-9920)
        'POBP',   # Place of birth (000-554)
        'RELP',   # Relationship (00-17)
        'WKHP',   # Hours worked per week (0-99)
        'SEX',    # Sex (1, 2)
        'RAC1P',  # Race (1-9)
    ],
    preprocess=folktables.adult_filter,
    # NOTE(review): the second positional argument of np.nan_to_num is `copy`,
    # not `nan`, so this presumably maps NaN to 0.0 rather than -1 -- confirm
    # which fill value is intended before changing it.
    postprocess=lambda values: np.nan_to_num(values, -1),
)



# Evaluation function for different mixing rates and modified sizes
def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups):
    """Train and score a logistic regression on a female-subsampled data set.

    All rows with ``groups == 1`` (male) are kept, and a random fraction
    ``mixing_rate`` of the rows with ``groups == 2`` (female) is added. The
    combined set is then optionally downsampled to ``modified_size`` rows,
    split 80/20 with a fixed ``random_state``, and a scaled logistic
    regression is fit on the training part.

    Both subsampling steps draw from the global ``np.random`` state, so
    repeated calls give different samples unless the caller seeds it.

    Args:
        mixing_rate: Fraction of the female rows to keep.
        modified_size: Target row count after mixing; ignored when ``None``
            or not smaller than the mixed set.
        features: Feature matrix, indexed by row.
        labels: Label array aligned with ``features``.
        groups: Group array aligned with ``features`` (1 = male, 2 = female).

    Returns:
        An 11-tuple: ``(initial_size, modified_size, num_males_original,
        num_females_original, final_row_count, fpr_male, fnr_male, acc_male,
        fpr_female, fnr_female, acc_female)``.
    """
    is_male = groups == 1
    is_female = groups == 2

    initial_size = len(features)
    num_males_original = np.sum(is_male)
    num_females_original = np.sum(is_female)

    # Draw the female subsample (without replacement) from the female row indices.
    female_pool = np.where(is_female)[0]
    keep_count = int(len(female_pool) * mixing_rate)
    kept_females = np.random.choice(female_pool, size=keep_count, replace=False)

    # Males first, then the kept females, for every aligned array.
    mixed_X, mixed_y, mixed_g = (
        np.concatenate([arr[is_male], arr[kept_females]])
        for arr in (features, labels, groups)
    )

    # Optionally shrink the mixed set to the requested size.
    if modified_size is not None and modified_size < len(mixed_X):
        picked = np.random.choice(np.arange(len(mixed_X)), size=modified_size, replace=False)
        mixed_X, mixed_y, mixed_g = mixed_X[picked], mixed_y[picked], mixed_g[picked]

    X_tr, X_te, y_tr, y_te, _, g_te = train_test_split(
        mixed_X, mixed_y, mixed_g, test_size=0.2, random_state=0)

    clf = make_pipeline(StandardScaler(), LogisticRegression())
    clf.fit(X_tr, y_tr)
    predictions = clf.predict(X_te)

    # Score each group separately on the held-out rows.
    male_rows = g_te == 1
    female_rows = g_te == 2
    male_stats = calculate_metrics(y_te[male_rows], predictions[male_rows])
    female_stats = calculate_metrics(y_te[female_rows], predictions[female_rows])

    return (initial_size, modified_size, num_males_original, num_females_original,
            len(mixed_X), *male_stats, *female_stats)

# Fetch data for Florida. The ca_* variable names are leftovers from the
# California cell this one was copied from; they are kept so that anything
# relying on these names keeps working.
ca_features, ca_label, ca_group = fetch_data_for_state('FL')

# Hold out 30% of the full data set. Only X_train / group_train are reported
# below; the mixing-rate experiment re-splits its own subsamples.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    ca_features, ca_label, ca_group, test_size=0.3, random_state=0)

# Train a model on the full training split.
# NOTE(review): nothing below in this cell reads `model` -- confirm whether this fit is still needed.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

# Fraction of the female records kept in each experiment.
mixing_rates = [0.07, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Total females in the dataset
total_females = len(np.where(ca_group == 2)[0])

# Label used in the banner and in every figure title for this cell. The banner
# print matches what the other state/year cells emit.
dataset_label = 'FL 2017'
print(f"{dataset_label} Data")

# Result columns, in the order evaluate_model_with_mixing_rate_and_size returns them.
metric_columns = ['FPR Male', 'FNR Male', 'Accuracy Male',
                  'FPR Female', 'FNR Female', 'Accuracy Female']
# One subplot per metric: (column prefix, y-axis label, y-axis limits).
panels = [('FPR', 'False Positive Rate', (0, 0.5)),
          ('FNR', 'False Negative Rate', (0, 1.0)),
          ('Accuracy', 'Accuracy', (0.5, 1.0))]
sex_colors = [('Male', 'blue'), ('Female', 'red')]

# Iterate over different fixed training set sizes (10%, 30%, 50% of the
# female count, rounded to the nearest hundred).
fixed_train_sizes = [int(round(frac * total_females, -2)) for frac in (0.1, 0.3, 0.5)]
for fixed_train_size in fixed_train_sizes:
    print("Fixed training set size: ", fixed_train_size)

    # Repeat the whole mixing-rate sweep; the subsamples are redrawn on every run.
    results_subset_list = []
    num_iterations = 30
    for _ in range(num_iterations):
        results_subset = {'Mixing Rate': [], **{column: [] for column in metric_columns}}
        for mixing_rate in mixing_rates:
            (initial_size, _, num_males_original, num_females_original, _,
             fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female) = \
                evaluate_model_with_mixing_rate_and_size(
                    mixing_rate, fixed_train_size, ca_features, ca_label, ca_group)

            results_subset['Mixing Rate'].append(mixing_rate)
            run_metrics = (fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
            for column, value in zip(metric_columns, run_metrics):
                results_subset[column].append(value)

        results_subset_list.append(results_subset)

    # Mean and standard deviation across runs, per mixing rate.
    average_results_subset = {'Mixing Rate': mixing_rates}
    std_results_subset = {}
    for column in metric_columns:
        per_run = np.array([result[column] for result in results_subset_list])
        average_results_subset[column] = per_run.mean(axis=0)
        std_results_subset[column] = per_run.std(axis=0)

    average_results_subset_df = pd.DataFrame(average_results_subset)

    print(f"Original training set size: {len(X_train)}")
    print(f"Original number of females in train set: {np.sum(group_train == 2)}")
    print(f"Percentage of total females that's being currently graphed: {fixed_train_size / total_females * 100}%")

    fig, axs = plt.subplots(1, 3, figsize=(14, 5))
    # The dataset label goes in the figure-level title. The previous plt.title()
    # call targeted the last subplot, was overwritten by that subplot's own
    # title, and named the wrong dataset ("CA 2018").
    fig.suptitle(f'{dataset_label}: ACSIncome MALE / FEMALE - Training Set Size: {fixed_train_size}')

    mixing_rate_axis = average_results_subset_df['Mixing Rate']
    for ax, (metric, ylabel, ylim) in zip(axs, panels):
        # Mean curves first, then the +/- one standard deviation bands.
        for sex, color in sex_colors:
            ax.plot(mixing_rate_axis, average_results_subset_df[f'{metric} {sex}'],
                    label=sex, color=color)
        for sex, color in sex_colors:
            column = f'{metric} {sex}'
            ax.fill_between(mixing_rate_axis,
                            average_results_subset_df[column] - std_results_subset[column],
                            average_results_subset_df[column] + std_results_subset[column],
                            color=color, alpha=0.2)
        ax.set_xlabel('Mixing Rate')
        ax.set_ylabel(ylabel)
        ax.set_title(f'{metric} vs Mixing Rate')
        ax.legend()
        ax.set_ylim(*ylim)

    plt.tight_layout()
    plt.show()


TX 2018, 2015, 2017

TX 2018

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import folktables
from folktables import ACSDataSource

def fetch_data_for_state(state):
    """Download the 2018 1-Year ACS person survey for one state and convert it.

    Args:
        state: State code passed to ``ACSDataSource.get_data`` as a one-element
            ``states`` list.

    Returns:
        The ``(features, label, group)`` triple produced by
        ``ACSIncomeNew.df_to_numpy`` for the downloaded data.
    """
    source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
    raw_frame = source.get_data(states=[state], download=True)
    X, y, sex = ACSIncomeNew.df_to_numpy(raw_frame)
    return X, y, sex

def calculate_metrics(y_true, y_pred):
    """Compute false-positive rate, false-negative rate and accuracy.

    Args:
        y_true: Ground-truth binary labels (compared against 1 and 0).
        y_pred: Predicted labels for the same samples.

    Returns:
        ``(fpr, fnr, acc)``. All three are NaN when ``y_true`` does not
        contain at least one positive and one negative sample, because the
        rates are undefined in that case.
    """
    positives = np.sum(y_true == 1)
    negatives = np.sum(y_true == 0)
    if positives == 0 or negatives == 0:
        # One class is missing, so the 2x2 confusion matrix cannot be formed.
        return np.nan, np.nan, np.nan

    # ravel() on the 2x2 matrix yields the cells in (tn, fp, fn, tp) order.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    false_positive_rate = fp / (fp + tn)
    false_negative_rate = fn / (fn + tp)
    return false_positive_rate, false_negative_rate, accuracy_score(y_true, y_pred)

# Income prediction task: label is PINCP > 50000, group attribute is SEX.
ACSIncomeNew = folktables.BasicProblem(
    target='PINCP',
    # Binary label: True when the PINCP value exceeds 50000.
    target_transform=lambda income: income > 50000,
    group='SEX',
    features=[
        'AGEP',   # Age (1-99)
        'COW',    # Class of worker (1-9)
        'SCHL',   # Education (1-24)
        'MAR',    # Marital status (1-5)
        'OCCP',   # Occupation (0000-9920)
        'POBP',   # Place of birth (000-554)
        'RELP',   # Relationship (00-17)
        'WKHP',   # Hours worked per week (0-99)
        'SEX',    # Sex (1, 2)
        'RAC1P',  # Race (1-9)
    ],
    preprocess=folktables.adult_filter,
    # NOTE(review): the second positional argument of np.nan_to_num is `copy`,
    # not `nan`, so this presumably maps NaN to 0.0 rather than -1 -- confirm
    # which fill value is intended before changing it.
    postprocess=lambda values: np.nan_to_num(values, -1),
)



# Evaluation function for different mixing rates and modified sizes
def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups):
    """Train and score a logistic regression on a female-subsampled data set.

    All rows with ``groups == 1`` (male) are kept, and a random fraction
    ``mixing_rate`` of the rows with ``groups == 2`` (female) is added. The
    combined set is then optionally downsampled to ``modified_size`` rows,
    split 80/20 with a fixed ``random_state``, and a scaled logistic
    regression is fit on the training part.

    Both subsampling steps draw from the global ``np.random`` state, so
    repeated calls give different samples unless the caller seeds it.

    Args:
        mixing_rate: Fraction of the female rows to keep.
        modified_size: Target row count after mixing; ignored when ``None``
            or not smaller than the mixed set.
        features: Feature matrix, indexed by row.
        labels: Label array aligned with ``features``.
        groups: Group array aligned with ``features`` (1 = male, 2 = female).

    Returns:
        An 11-tuple: ``(initial_size, modified_size, num_males_original,
        num_females_original, final_row_count, fpr_male, fnr_male, acc_male,
        fpr_female, fnr_female, acc_female)``.
    """
    is_male = groups == 1
    is_female = groups == 2

    initial_size = len(features)
    num_males_original = np.sum(is_male)
    num_females_original = np.sum(is_female)

    # Draw the female subsample (without replacement) from the female row indices.
    female_pool = np.where(is_female)[0]
    keep_count = int(len(female_pool) * mixing_rate)
    kept_females = np.random.choice(female_pool, size=keep_count, replace=False)

    # Males first, then the kept females, for every aligned array.
    mixed_X, mixed_y, mixed_g = (
        np.concatenate([arr[is_male], arr[kept_females]])
        for arr in (features, labels, groups)
    )

    # Optionally shrink the mixed set to the requested size.
    if modified_size is not None and modified_size < len(mixed_X):
        picked = np.random.choice(np.arange(len(mixed_X)), size=modified_size, replace=False)
        mixed_X, mixed_y, mixed_g = mixed_X[picked], mixed_y[picked], mixed_g[picked]

    X_tr, X_te, y_tr, y_te, _, g_te = train_test_split(
        mixed_X, mixed_y, mixed_g, test_size=0.2, random_state=0)

    clf = make_pipeline(StandardScaler(), LogisticRegression())
    clf.fit(X_tr, y_tr)
    predictions = clf.predict(X_te)

    # Score each group separately on the held-out rows.
    male_rows = g_te == 1
    female_rows = g_te == 2
    male_stats = calculate_metrics(y_te[male_rows], predictions[male_rows])
    female_stats = calculate_metrics(y_te[female_rows], predictions[female_rows])

    return (initial_size, modified_size, num_males_original, num_females_original,
            len(mixed_X), *male_stats, *female_stats)

# Fetch data for Texas. The ca_* variable names are leftovers from the
# California cell this one was copied from; they are kept so that anything
# relying on these names keeps working.
ca_features, ca_label, ca_group = fetch_data_for_state('TX')

# Hold out 30% of the full data set. Only X_train / group_train are reported
# below; the mixing-rate experiment re-splits its own subsamples.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    ca_features, ca_label, ca_group, test_size=0.3, random_state=0)

# Train a model on the full training split.
# NOTE(review): nothing below in this cell reads `model` -- confirm whether this fit is still needed.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

# Fraction of the female records kept in each experiment.
mixing_rates = [0.07, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Total females in the dataset
total_females = len(np.where(ca_group == 2)[0])

# Label used in the banner and in every figure title for this cell.
dataset_label = 'TX 2018'
print(f"{dataset_label} Data")

# Result columns, in the order evaluate_model_with_mixing_rate_and_size returns them.
metric_columns = ['FPR Male', 'FNR Male', 'Accuracy Male',
                  'FPR Female', 'FNR Female', 'Accuracy Female']
# One subplot per metric: (column prefix, y-axis label, y-axis limits).
panels = [('FPR', 'False Positive Rate', (0, 0.5)),
          ('FNR', 'False Negative Rate', (0, 1.0)),
          ('Accuracy', 'Accuracy', (0.5, 1.0))]
sex_colors = [('Male', 'blue'), ('Female', 'red')]

# Iterate over different fixed training set sizes (10%, 30%, 50% of the
# female count, rounded to the nearest hundred).
fixed_train_sizes = [int(round(frac * total_females, -2)) for frac in (0.1, 0.3, 0.5)]
for fixed_train_size in fixed_train_sizes:
    print("Fixed training set size: ", fixed_train_size)

    # Repeat the whole mixing-rate sweep; the subsamples are redrawn on every run.
    results_subset_list = []
    num_iterations = 30
    for _ in range(num_iterations):
        results_subset = {'Mixing Rate': [], **{column: [] for column in metric_columns}}
        for mixing_rate in mixing_rates:
            (initial_size, _, num_males_original, num_females_original, _,
             fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female) = \
                evaluate_model_with_mixing_rate_and_size(
                    mixing_rate, fixed_train_size, ca_features, ca_label, ca_group)

            results_subset['Mixing Rate'].append(mixing_rate)
            run_metrics = (fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
            for column, value in zip(metric_columns, run_metrics):
                results_subset[column].append(value)

        results_subset_list.append(results_subset)

    # Mean and standard deviation across runs, per mixing rate.
    average_results_subset = {'Mixing Rate': mixing_rates}
    std_results_subset = {}
    for column in metric_columns:
        per_run = np.array([result[column] for result in results_subset_list])
        average_results_subset[column] = per_run.mean(axis=0)
        std_results_subset[column] = per_run.std(axis=0)

    average_results_subset_df = pd.DataFrame(average_results_subset)

    print(f"Original training set size: {len(X_train)}")
    print(f"Original number of females in train set: {np.sum(group_train == 2)}")
    print(f"Percentage of total females that's being currently graphed: {fixed_train_size / total_females * 100}%")

    fig, axs = plt.subplots(1, 3, figsize=(14, 5))
    # The dataset label goes in the figure-level title. The previous plt.title()
    # call targeted the last subplot, was overwritten by that subplot's own
    # title, and named the wrong dataset ("CA 2018").
    fig.suptitle(f'{dataset_label}: ACSIncome MALE / FEMALE - Training Set Size: {fixed_train_size}')

    mixing_rate_axis = average_results_subset_df['Mixing Rate']
    for ax, (metric, ylabel, ylim) in zip(axs, panels):
        # Mean curves first, then the +/- one standard deviation bands.
        for sex, color in sex_colors:
            ax.plot(mixing_rate_axis, average_results_subset_df[f'{metric} {sex}'],
                    label=sex, color=color)
        for sex, color in sex_colors:
            column = f'{metric} {sex}'
            ax.fill_between(mixing_rate_axis,
                            average_results_subset_df[column] - std_results_subset[column],
                            average_results_subset_df[column] + std_results_subset[column],
                            color=color, alpha=0.2)
        ax.set_xlabel('Mixing Rate')
        ax.set_ylabel(ylabel)
        ax.set_title(f'{metric} vs Mixing Rate')
        ax.legend()
        ax.set_ylim(*ylim)

    plt.tight_layout()
    plt.show()


TX 2015

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import folktables
from folktables import ACSDataSource

def fetch_data_for_state(state):
    """Download the 2015 1-Year ACS person survey for `state` and featurize it.

    This cell is the "TX 2015" experiment (it prints "TX 2015 Data"), so the
    survey year is 2015. It was previously hard-coded to '2017', which made
    this cell an exact duplicate of the 2017 one.

    Args:
        state: State abbreviation forwarded to ACSDataSource.get_data.

    Returns:
        The (features, label, group) arrays produced by
        ACSIncomeNew.df_to_numpy.
    """
    data_source = ACSDataSource(survey_year='2015', horizon='1-Year', survey='person')
    acs_data = data_source.get_data(states=[state], download=True)
    # ACSIncomeNew is assigned further down in this cell; it only has to exist
    # by the time this function is called.
    features, label, group = ACSIncomeNew.df_to_numpy(acs_data)
    return features, label, group

def calculate_metrics(y_true, y_pred):
    """Return (false positive rate, false negative rate, accuracy).

    If `y_true` contains no 1s or no 0s, all three values are NaN.
    """
    has_positives = np.sum(y_true == 1) != 0
    has_negatives = np.sum(y_true == 0) != 0
    if not (has_positives and has_negatives):
        return np.nan, np.nan, np.nan

    # Flattened confusion matrix unpacks as tn, fp, fn, tp.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_true, y_pred).ravel()
    fpr = false_pos / (false_pos + true_neg)
    fnr = false_neg / (false_neg + true_pos)
    acc = accuracy_score(y_true, y_pred)
    return fpr, fnr, acc

# Custom income task: the label is True where PINCP > 50000, and SEX is the
# group attribute used for the per-group metrics.
ACSIncomeNew = folktables.BasicProblem(
    features=[
        'AGEP',   # Age (1-99)
        'COW',    # Class of Worker (1-9)
        'SCHL',   # Education (1-24)
        'MAR',    # Marital status (1-5)
        'OCCP',   # Occupation 0000-9920
        'POBP',   # Place of birth 000-554
        'RELP',   # Relationship (00-17)
        'WKHP',   # Hours worked per week (0-99)
        'SEX',    # sex (1,2)
        'RAC1P',  # race (1-9)
    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,
    group='SEX',
    preprocess=folktables.adult_filter,
    # The second positional argument of np.nan_to_num is `copy`, not the fill
    # value, so nan_to_num(x, -1) actually replaced NaN with 0.0. Pass `nan=`
    # by keyword to get the intended -1 sentinel.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)



# Evaluation function for different mixing rates and modified sizes
def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups):
    """Fit and score a logistic regression on a female-subsampled dataset.

    Every row with group == 1 is kept, together with a random `mixing_rate`
    fraction of the rows with group == 2. If `modified_size` is given and is
    smaller than that dataset, the dataset is randomly cut down to
    `modified_size` rows. The result is split 80/20 into train and test sets.

    Returns:
        (initial_size, modified_size, num_males_original,
         num_females_original, final dataset size,
         fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
    """
    male_mask = groups == 1
    female_rows = np.where(groups == 2)[0]

    initial_size = len(features)
    num_males_original = np.sum(male_mask)
    num_females_original = np.sum(groups == 2)

    # Keep only a fraction of the group == 2 rows.
    kept_female_rows = np.random.choice(
        female_rows, size=int(len(female_rows) * mixing_rate), replace=False)

    modified_features = np.concatenate([features[male_mask], features[kept_female_rows]])
    modified_labels = np.concatenate([labels[male_mask], labels[kept_female_rows]])
    modified_groups = np.concatenate([groups[male_mask], groups[kept_female_rows]])

    # Optionally shrink the combined dataset to the requested size.
    if modified_size is not None and modified_size < len(modified_features):
        chosen = np.random.choice(
            np.arange(len(modified_features)), size=modified_size, replace=False)
        modified_features = modified_features[chosen]
        modified_labels = modified_labels[chosen]
        modified_groups = modified_groups[chosen]

    X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
        modified_features, modified_labels, modified_groups, test_size=0.2, random_state=0)

    classifier = make_pipeline(StandardScaler(), LogisticRegression())
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # Per-group metrics on the held-out split.
    male_test = group_test == 1
    female_test = group_test == 2
    fpr_male, fnr_male, acc_male = calculate_metrics(y_test[male_test], predictions[male_test])
    fpr_female, fnr_female, acc_female = calculate_metrics(y_test[female_test], predictions[female_test])

    return (initial_size, modified_size, num_males_original, num_females_original,
            len(modified_features),
            fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)

# Dataset shown in this cell; used for the log line and the figure title.
dataset_label = "TX 2015"

# Fetch the Texas data (this used to request 'CA' although the cell is the
# Texas experiment). The `ca_` prefix is kept because the sibling cells use
# the same variable names.
ca_features, ca_label, ca_group = fetch_data_for_state('TX')

# Baseline split and model on the full data. Below, only X_train and
# group_train are used (for the summary printout).
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    ca_features, ca_label, ca_group, test_size=0.3, random_state=0)

model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

mixing_rates = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Total females in the dataset
total_females = len(np.where(ca_group == 2)[0])

print(f"{dataset_label} Data")

# Columns collected per run, in the order they appear in the results frame.
metric_names = ['FPR Male', 'FNR Male', 'Accuracy Male',
                'FPR Female', 'FNR Female', 'Accuracy Female']
# (column prefix, y-axis label, y-limits) for each of the three panels.
panels = [('FPR', 'False Positive Rate', (0, 0.5)),
          ('FNR', 'False Negative Rate', (0, 0.5)),
          ('Accuracy', 'Accuracy', (0.5, 1.0))]
series_styles = [('Male', 'blue'), ('Female', 'red')]

# Dataset sizes to evaluate: 10%, 30% and 50% of the female count, rounded to
# the nearest hundred.
fixed_train_sizes = [int(round(frac * total_females, -2)) for frac in (0.1, 0.3, 0.5)]
for fixed_train_size in fixed_train_sizes:
    print("Fixed training set size: ", fixed_train_size)

    # Repeat the sweep to average out the random subsampling.
    results_subset_list = []
    num_iterations = 10
    for _ in range(num_iterations):
        results_subset = {'Mixing Rate': [], **{name: [] for name in metric_names}}
        for mixing_rate in mixing_rates:
            initial_size, _, num_males_original, num_females_original, _, \
            fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female = \
            evaluate_model_with_mixing_rate_and_size(mixing_rate, fixed_train_size, ca_features, ca_label, ca_group)

            results_subset['Mixing Rate'].append(mixing_rate)
            results_subset['FPR Male'].append(fpr_male)
            results_subset['FNR Male'].append(fnr_male)
            results_subset['Accuracy Male'].append(acc_male)
            results_subset['FPR Female'].append(fpr_female)
            results_subset['FNR Female'].append(fnr_female)
            results_subset['Accuracy Female'].append(acc_female)

        results_subset_list.append(results_subset)

    # Mean and standard deviation across runs, per mixing rate.
    average_results_subset = {'Mixing Rate': mixing_rates}
    std_results_subset = {}
    for name in metric_names:
        per_run = [result[name] for result in results_subset_list]
        average_results_subset[name] = np.mean(per_run, axis=0)
        std_results_subset[name] = np.std(per_run, axis=0)

    average_results_subset_df = pd.DataFrame(average_results_subset)

    print(f"Original training set size: {len(X_train)}")
    print(f"Original number of females in train set: {np.sum(group_train == 2)}")
    print(f"Percentage of total females that's being currently graphed: {fixed_train_size / total_females * 100}%")

    fig, axs = plt.subplots(1, 3, figsize=(14, 5))
    # One figure-level title. The old plt.title() call wrote a stale
    # 'CA 2018' label onto the last subplot, where axs[2].set_title()
    # immediately replaced it, and suptitle was set twice.
    fig.suptitle(f'{dataset_label}: ACSIncome MALE / FEMALE\n'
                 f'Training Set Size: {fixed_train_size}')

    x_values = average_results_subset_df['Mixing Rate']
    for ax, (metric, y_label, y_limits) in zip(axs, panels):
        for sex, color in series_styles:
            ax.plot(x_values, average_results_subset_df[f'{metric} {sex}'], label=sex, color=color)
        # Shaded band: mean +/- one standard deviation across runs.
        for sex, color in series_styles:
            column = f'{metric} {sex}'
            ax.fill_between(x_values,
                            average_results_subset_df[column] - std_results_subset[column],
                            average_results_subset_df[column] + std_results_subset[column],
                            color=color, alpha=0.2)
        ax.set_xlabel('Mixing Rate')
        ax.set_ylabel(y_label)
        ax.set_title(f'{metric} vs Mixing Rate')
        ax.legend()
        ax.set_ylim(*y_limits)

    # Leave room at the top for the two-line figure title.
    plt.tight_layout(rect=(0, 0, 1, 0.92))
    plt.show()


TX 2017

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import folktables
from folktables import ACSDataSource

def fetch_data_for_state(state):
    """Load the 2017 1-Year ACS person data for `state` as ACSIncomeNew arrays.

    Returns the (features, label, group) triple from
    ACSIncomeNew.df_to_numpy.
    """
    source = ACSDataSource(survey_year='2017', horizon='1-Year', survey='person')
    features, label, group = ACSIncomeNew.df_to_numpy(
        source.get_data(states=[state], download=True))
    return features, label, group

def calculate_metrics(y_true, y_pred):
    """Return (false positive rate, false negative rate, accuracy).

    If `y_true` contains no 1s or no 0s, all three values are NaN.
    """
    has_positives = np.sum(y_true == 1) != 0
    has_negatives = np.sum(y_true == 0) != 0
    if not (has_positives and has_negatives):
        return np.nan, np.nan, np.nan

    # Flattened confusion matrix unpacks as tn, fp, fn, tp.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_true, y_pred).ravel()
    fpr = false_pos / (false_pos + true_neg)
    fnr = false_neg / (false_neg + true_pos)
    acc = accuracy_score(y_true, y_pred)
    return fpr, fnr, acc

# Custom income task: the label is True where PINCP > 50000, and SEX is the
# group attribute used for the per-group metrics.
ACSIncomeNew = folktables.BasicProblem(
    features=[
        'AGEP',   # Age (1-99)
        'COW',    # Class of Worker (1-9)
        'SCHL',   # Education (1-24)
        'MAR',    # Marital status (1-5)
        'OCCP',   # Occupation 0000-9920
        'POBP',   # Place of birth 000-554
        'RELP',   # Relationship (00-17)
        'WKHP',   # Hours worked per week (0-99)
        'SEX',    # sex (1,2)
        'RAC1P',  # race (1-9)
    ],
    target='PINCP',
    target_transform=lambda x: x > 50000,
    group='SEX',
    preprocess=folktables.adult_filter,
    # The second positional argument of np.nan_to_num is `copy`, not the fill
    # value, so nan_to_num(x, -1) actually replaced NaN with 0.0. Pass `nan=`
    # by keyword to get the intended -1 sentinel.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)



# Evaluation function for different mixing rates and modified sizes
def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups):
    """Fit and score a logistic regression on a female-subsampled dataset.

    Every row with group == 1 is kept, together with a random `mixing_rate`
    fraction of the rows with group == 2. If `modified_size` is given and is
    smaller than that dataset, the dataset is randomly cut down to
    `modified_size` rows. The result is split 80/20 into train and test sets.

    Returns:
        (initial_size, modified_size, num_males_original,
         num_females_original, final dataset size,
         fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
    """
    male_mask = groups == 1
    female_rows = np.where(groups == 2)[0]

    initial_size = len(features)
    num_males_original = np.sum(male_mask)
    num_females_original = np.sum(groups == 2)

    # Keep only a fraction of the group == 2 rows.
    kept_female_rows = np.random.choice(
        female_rows, size=int(len(female_rows) * mixing_rate), replace=False)

    modified_features = np.concatenate([features[male_mask], features[kept_female_rows]])
    modified_labels = np.concatenate([labels[male_mask], labels[kept_female_rows]])
    modified_groups = np.concatenate([groups[male_mask], groups[kept_female_rows]])

    # Optionally shrink the combined dataset to the requested size.
    if modified_size is not None and modified_size < len(modified_features):
        chosen = np.random.choice(
            np.arange(len(modified_features)), size=modified_size, replace=False)
        modified_features = modified_features[chosen]
        modified_labels = modified_labels[chosen]
        modified_groups = modified_groups[chosen]

    X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
        modified_features, modified_labels, modified_groups, test_size=0.2, random_state=0)

    classifier = make_pipeline(StandardScaler(), LogisticRegression())
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    # Per-group metrics on the held-out split.
    male_test = group_test == 1
    female_test = group_test == 2
    fpr_male, fnr_male, acc_male = calculate_metrics(y_test[male_test], predictions[male_test])
    fpr_female, fnr_female, acc_female = calculate_metrics(y_test[female_test], predictions[female_test])

    return (initial_size, modified_size, num_males_original, num_females_original,
            len(modified_features),
            fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)

# Dataset shown in this cell; used for the log line and the figure title.
dataset_label = "TX 2017"

# Fetch the Texas data (this used to request 'CA' although the cell is the
# Texas experiment). The `ca_` prefix is kept because the sibling cells use
# the same variable names.
ca_features, ca_label, ca_group = fetch_data_for_state('TX')

# Baseline split and model on the full data. Below, only X_train and
# group_train are used (for the summary printout).
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    ca_features, ca_label, ca_group, test_size=0.3, random_state=0)

model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

mixing_rates = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Total females in the dataset
total_females = len(np.where(ca_group == 2)[0])

print(f"{dataset_label} Data")

# Columns collected per run, in the order they appear in the results frame.
metric_names = ['FPR Male', 'FNR Male', 'Accuracy Male',
                'FPR Female', 'FNR Female', 'Accuracy Female']
# (column prefix, y-axis label, y-limits) for each of the three panels.
panels = [('FPR', 'False Positive Rate', (0, 0.5)),
          ('FNR', 'False Negative Rate', (0, 0.5)),
          ('Accuracy', 'Accuracy', (0.5, 1.0))]
series_styles = [('Male', 'blue'), ('Female', 'red')]

# Dataset sizes to evaluate: 10%, 30% and 50% of the female count, rounded to
# the nearest hundred.
fixed_train_sizes = [int(round(frac * total_females, -2)) for frac in (0.1, 0.3, 0.5)]
for fixed_train_size in fixed_train_sizes:
    print("Fixed training set size: ", fixed_train_size)

    # Repeat the sweep to average out the random subsampling.
    results_subset_list = []
    num_iterations = 10
    for _ in range(num_iterations):
        results_subset = {'Mixing Rate': [], **{name: [] for name in metric_names}}
        for mixing_rate in mixing_rates:
            initial_size, _, num_males_original, num_females_original, _, \
            fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female = \
            evaluate_model_with_mixing_rate_and_size(mixing_rate, fixed_train_size, ca_features, ca_label, ca_group)

            results_subset['Mixing Rate'].append(mixing_rate)
            results_subset['FPR Male'].append(fpr_male)
            results_subset['FNR Male'].append(fnr_male)
            results_subset['Accuracy Male'].append(acc_male)
            results_subset['FPR Female'].append(fpr_female)
            results_subset['FNR Female'].append(fnr_female)
            results_subset['Accuracy Female'].append(acc_female)

        results_subset_list.append(results_subset)

    # Mean and standard deviation across runs, per mixing rate.
    average_results_subset = {'Mixing Rate': mixing_rates}
    std_results_subset = {}
    for name in metric_names:
        per_run = [result[name] for result in results_subset_list]
        average_results_subset[name] = np.mean(per_run, axis=0)
        std_results_subset[name] = np.std(per_run, axis=0)

    average_results_subset_df = pd.DataFrame(average_results_subset)

    print(f"Original training set size: {len(X_train)}")
    print(f"Original number of females in train set: {np.sum(group_train == 2)}")
    print(f"Percentage of total females that's being currently graphed: {fixed_train_size / total_females * 100}%")

    fig, axs = plt.subplots(1, 3, figsize=(14, 5))
    # One figure-level title. The old plt.title() call wrote a stale
    # 'CA 2018' label onto the last subplot, where axs[2].set_title()
    # immediately replaced it, and suptitle was set twice.
    fig.suptitle(f'{dataset_label}: ACSIncome MALE / FEMALE\n'
                 f'Training Set Size: {fixed_train_size}')

    x_values = average_results_subset_df['Mixing Rate']
    for ax, (metric, y_label, y_limits) in zip(axs, panels):
        for sex, color in series_styles:
            ax.plot(x_values, average_results_subset_df[f'{metric} {sex}'], label=sex, color=color)
        # Shaded band: mean +/- one standard deviation across runs.
        for sex, color in series_styles:
            column = f'{metric} {sex}'
            ax.fill_between(x_values,
                            average_results_subset_df[column] - std_results_subset[column],
                            average_results_subset_df[column] + std_results_subset[column],
                            color=color, alpha=0.2)
        ax.set_xlabel('Mixing Rate')
        ax.set_ylabel(y_label)
        ax.set_title(f'{metric} vs Mixing Rate')
        ax.legend()
        ax.set_ylim(*y_limits)

    # Leave room at the top for the two-line figure title.
    plt.tight_layout(rect=(0, 0, 1, 0.92))
    plt.show()


In [None]:
# Display `results_df`. It is not assigned in the neighbouring cells, so this
# presumably relies on an earlier cell having been run — TODO confirm.
results_df

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import folktables
from folktables import ACSDataSource

def fetch_data_for_state(state):
    """Download the 2018 1-Year ACS person survey for `state` and featurize it.

    Uses the ACSEmployment problem defined in this cell. The function used to
    reference ACSIncomeNew, which this cell never defines, so it only worked
    if an earlier cell had left that name in the notebook namespace.

    Args:
        state: State abbreviation forwarded to ACSDataSource.get_data.

    Returns:
        The (features, label, group) arrays produced by
        ACSEmployment.df_to_numpy.
    """
    data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
    acs_data = data_source.get_data(states=[state], download=True)
    features, label, group = ACSEmployment.df_to_numpy(acs_data)
    return features, label, group

def calculate_metrics(y_true, y_pred):
    """Return (false positive rate, false negative rate, accuracy).

    If `y_true` contains no 1s or no 0s, all three values are NaN.
    """
    has_positives = np.sum(y_true == 1) != 0
    has_negatives = np.sum(y_true == 0) != 0
    if not (has_positives and has_negatives):
        return np.nan, np.nan, np.nan

    # Flattened confusion matrix unpacks as tn, fp, fn, tp.
    true_neg, false_pos, false_neg, true_pos = confusion_matrix(y_true, y_pred).ravel()
    fpr = false_pos / (false_pos + true_neg)
    fnr = false_neg / (false_neg + true_pos)
    acc = accuracy_score(y_true, y_pred)
    return fpr, fnr, acc

# Employment task: the label is True where ESR == 1, SEX is the group
# attribute, and no row filtering is applied (preprocess is the identity).
ACSEmployment = folktables.BasicProblem(
    features=[
        'AGEP',
        'SCHL',
        'MAR',
        'RELP',
        'DIS',
        'ESP',
        'CIT',
        'MIG',
        'MIL',
        'ANC',
        'NATIVITY',
        'DEAR',
        'DEYE',
        'DREM',
        'SEX',
        'RAC1P',
    ],
    target='ESR',
    target_transform=lambda x: x == 1,
    group='SEX',
    preprocess=lambda x: x,
    # The second positional argument of np.nan_to_num is `copy`, not the fill
    # value, so nan_to_num(x, -1) actually replaced NaN with 0.0. Pass `nan=`
    # by keyword to get the intended -1 sentinel.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)


def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups):
    """Fit and score a logistic regression on a female-subsampled dataset.

    Every row with group == 1 is kept, together with a random `mixing_rate`
    fraction of the rows with group == 2. If `modified_size` is given and is
    smaller than that dataset, the dataset is randomly cut down to
    `modified_size` rows. The result is split 80/20 into train and test sets.

    Args:
        mixing_rate: Fraction of the group == 2 rows to keep.
        modified_size: Target dataset size after subsampling, or None to
            skip the size cut.
        features, labels, groups: Arrays indexed by row.

    Returns:
        (initial_size, modified_size, num_males_original,
         num_females_original, final dataset size,
         fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
    """
    # Dataset statistics handed back to the caller. These assignments had been
    # commented out while the names were still returned, so the function
    # raised NameError or returned stale notebook globals of the same name.
    initial_size = len(features)
    num_males_original = np.sum(groups == 1)
    num_females_original = np.sum(groups == 2)

    # Adjust the mixing rate by subsampling the 'female' group
    female_indices = np.where(groups == 2)[0]
    num_female_to_keep = int(len(female_indices) * mixing_rate)
    subsampled_female_indices = np.random.choice(female_indices, size=num_female_to_keep, replace=False)

    # Create the modified dataset
    modified_features = np.concatenate([features[groups == 1], features[subsampled_female_indices]])
    modified_labels = np.concatenate([labels[groups == 1], labels[subsampled_female_indices]])
    modified_groups = np.concatenate([groups[groups == 1], groups[subsampled_female_indices]])

    # If the modified size is specified, subsample to the desired size
    if modified_size is not None and modified_size < len(modified_features):
        subsample_indices = np.random.choice(np.arange(len(modified_features)), size=modified_size, replace=False)
        modified_features = modified_features[subsample_indices]
        modified_labels = modified_labels[subsample_indices]
        modified_groups = modified_groups[subsample_indices]

    # Split the modified dataset into train and test sets
    X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
        modified_features, modified_labels, modified_groups, test_size=0.2, random_state=0)

    # Create and fit the model
    model = make_pipeline(StandardScaler(), LogisticRegression())
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate FPR, FNR, and Accuracy for male (group = 1) and female (group = 2)
    fpr_male, fnr_male, acc_male = calculate_metrics(y_test[group_test == 1], y_pred[group_test == 1])
    fpr_female, fnr_female, acc_female = calculate_metrics(y_test[group_test == 2], y_pred[group_test == 2])

    return initial_size, modified_size, num_males_original, num_females_original, len(modified_features), \
           fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female

# Load the 2018 CA person survey and featurize it with ACSEmployment.
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_ca = data_source.get_data(states=["CA"], download=True)
ca_features, ca_label, ca_group = ACSEmployment.df_to_numpy(acs_ca)

# Baseline 70/30 split and model on the full data.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    ca_features, ca_label, ca_group, test_size=0.3, random_state=0)
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

mixing_rates = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

totalFemales = len(np.where(ca_group == 2)[0])
print("total Females in dataset: ", totalFemales)

# Dataset size fixed at half the female count, rounded to the nearest 100.
fixed_train_size = int(round(0.5 * totalFemales, -2))
print("fixed training set size: ", fixed_train_size)

# Per-group metric columns, in results-frame order.
metric_columns = ['FPR Male', 'FNR Male', 'Accuracy Male',
                  'FPR Female', 'FNR Female', 'Accuracy Female']

results = {column: [] for column in
           ['Mixing Rate', 'Initial Size', 'Modified Size', 'Num Males Original',
            'Num Females Original'] + metric_columns}

# Run the whole mixing-rate sweep several times; each run yields one dict.
results_subset_list = []
num_iterations = 5
for _ in range(num_iterations):
    results_subset = {column: [] for column in ['Mixing Rate'] + metric_columns}
    for mixing_rate in mixing_rates:
        initial_size, _, num_males_original, num_females_original, _, \
        fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female = \
        evaluate_model_with_mixing_rate_and_size(mixing_rate, fixed_train_size, ca_features, ca_label, ca_group)

        row = (mixing_rate, fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
        for column, value in zip(['Mixing Rate'] + metric_columns, row):
            results_subset[column].append(value)

    results_subset_list.append(results_subset)

# Mean and standard deviation across runs, per mixing rate.
average_results_subset = {'Mixing Rate': mixing_rates}
std_results_subset = {}
for column in metric_columns:
    runs = [result[column] for result in results_subset_list]
    average_results_subset[column] = np.mean(runs, axis=0)
    std_results_subset[column] = np.std(runs, axis=0)

average_results_subset_df = pd.DataFrame(average_results_subset)

# One row of three panels: FPR, FNR, Accuracy.
fig, axs = plt.subplots(1, 3, figsize=(14, 5))
fig.suptitle(f'Training Set Size: {fixed_train_size}')

panel_specs = [('FPR', 'False Positive Rate', (0, 0.5)),
               ('FNR', 'False Negative Rate', (0, 0.5)),
               ('Accuracy', 'Accuracy', (0.5, 1.0))]
group_colors = [('Male', 'blue'), ('Female', 'red')]

rates = average_results_subset_df['Mixing Rate']
for ax, (metric, y_label, (y_low, y_high)) in zip(axs, panel_specs):
    # Mean curves first, then the +/- one standard deviation bands.
    for sex, color in group_colors:
        ax.plot(rates, average_results_subset_df[f'{metric} {sex}'], label=sex, color=color)
    for sex, color in group_colors:
        column = f'{metric} {sex}'
        ax.fill_between(rates,
                        average_results_subset_df[column] - std_results_subset[column],
                        average_results_subset_df[column] + std_results_subset[column],
                        color=color, alpha=0.2)
    ax.set_xlabel('Mixing Rate')
    ax.set_ylabel(y_label)
    ax.set_title(f'{metric} vs Mixing Rate')
    ax.legend()
    ax.set_ylim(y_low, y_high)

plt.tight_layout()
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import folktables
from folktables import ACSDataSource

def fetch_data_for_state(state, survey_year='2018', prediction_task=None):
    """Download one state's ACS person records and convert them to arrays.

    Args:
        state: State abbreviation passed to ``ACSDataSource.get_data``
            (e.g. ``"CA"``).
        survey_year: ACS survey year. Defaults to ``'2018'``, the value that
            was previously hard-coded.
        prediction_task: Object exposing ``df_to_numpy`` (a folktables
            problem definition). When omitted, the module-level
            ``ACSIncomeNew`` is used, as before.

    Returns:
        The ``(features, label, group)`` triple produced by
        ``prediction_task.df_to_numpy``.
    """
    if prediction_task is None:
        # NOTE(review): ACSIncomeNew is not defined in the visible part of
        # this cell (which defines ACSEmployment); it must exist at call time.
        # Pass ``prediction_task`` explicitly to avoid relying on it.
        prediction_task = ACSIncomeNew

    data_source = ACSDataSource(survey_year=survey_year, horizon='1-Year', survey='person')
    acs_data = data_source.get_data(states=[state], download=True)
    features, label, group = prediction_task.df_to_numpy(acs_data)
    return features, label, group

def calculate_metrics(y_true, y_pred):
    """Return ``(fpr, fnr, accuracy)`` for binary labels and predictions.

    Args:
        y_true: Ground-truth labels, encoded as 0/1 or booleans. Any
            array-like is accepted.
        y_pred: Predicted labels with the same shape and encoding.

    Returns:
        A tuple ``(fpr, fnr, acc)``:
        ``fpr`` is the share of true negatives predicted positive,
        ``fnr`` is the share of true positives not predicted positive,
        ``acc`` is the share of predictions equal to the truth.
        All three are ``nan`` when ``y_true`` lacks either class, because
        one of the two rates would then have a zero denominator.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.
    """
    # Coerce up front: on a plain list ``y_true == 1`` is a scalar False,
    # which would silently send every call down the NaN branch.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    actual_positive = y_true == 1
    actual_negative = y_true == 0
    n_positive = np.count_nonzero(actual_positive)
    n_negative = np.count_nonzero(actual_negative)

    # Rates are undefined unless both classes occur in the ground truth.
    if n_positive == 0 or n_negative == 0:
        return np.nan, np.nan, np.nan

    predicted_positive = y_pred == 1
    false_positives = np.count_nonzero(actual_negative & predicted_positive)
    false_negatives = np.count_nonzero(actual_positive & ~predicted_positive)

    fpr = false_positives / n_negative
    fnr = false_negatives / n_positive
    acc = float(np.mean(y_true == y_pred))
    return fpr, fnr, acc

# ACS column names used as model inputs for the employment task.
_EMPLOYMENT_FEATURES = [
    'AGEP', 'SCHL', 'MAR', 'RELP',
    'DIS', 'ESP', 'CIT', 'MIG',
    'MIL', 'ANC', 'NATIVITY',
    'DEAR', 'DEYE', 'DREM',
    'SEX', 'RAC1P',
]


def _employment_target(esr):
    """Binary target: True where the ESR column equals 1."""
    return esr == 1


def _employment_preprocess(frame):
    """Identity preprocessing: the raw data is passed through unchanged."""
    return frame


def _employment_postprocess(values):
    """Replace NaNs in the assembled feature array."""
    # NOTE(review): the second positional parameter of np.nan_to_num is
    # `copy`, not `nan`, so -1 is taken as a truthy `copy` flag and NaNs are
    # replaced by the default 0.0. If -1 was the intended fill value this
    # would need `nan=-1`; confirm before changing, as it alters the features.
    return np.nan_to_num(values, -1)


# Prediction task: ESR == 1 as the label, with SEX as the group attribute.
ACSEmployment = folktables.BasicProblem(
    features=_EMPLOYMENT_FEATURES,
    target='ESR',
    target_transform=_employment_target,
    group='SEX',
    preprocess=_employment_preprocess,
    postprocess=_employment_postprocess,
)


def evaluate_model_with_mixing_rate_and_size(mixing_rate, modified_size, features, labels, groups):
    """Train and score a logistic-regression pipeline on a re-mixed dataset.

    All rows with ``groups == 1`` (male) are kept, and a random fraction
    ``mixing_rate`` of the rows with ``groups == 2`` (female) is added. The
    result is optionally subsampled to ``modified_size`` rows, split 80/20
    into train and test, and evaluated per group on the test part.

    Args:
        mixing_rate: Fraction of the female rows to retain.
        modified_size: Target number of rows after mixing, or ``None`` to
            keep every mixed row. Ignored when the mixed data has no more
            rows than this.
        features: Feature matrix, indexed along axis 0 by row.
        labels: Label array aligned with ``features``.
        groups: Group array aligned with ``features`` (1 = male, 2 = female).

    Returns:
        An 11-tuple ``(initial_size, modified_size, num_males_original,
        num_females_original, actual_size, fpr_male, fnr_male, acc_male,
        fpr_female, fnr_female, acc_female)``. ``modified_size`` is the
        argument echoed back, and ``actual_size`` is the number of rows
        actually used. The metrics come from ``calculate_metrics``.
    """
    # Describe the untouched input. These were previously left undefined (the
    # assignments were commented out), so the return statement either raised
    # NameError or picked up stale globals from an earlier notebook cell.
    initial_size = len(features)
    male_indices = np.where(groups == 1)[0]
    female_indices = np.where(groups == 2)[0]
    num_males_original = len(male_indices)
    num_females_original = len(female_indices)

    # Adjust the mixing rate by subsampling the 'female' group
    num_female_to_keep = int(len(female_indices) * mixing_rate)
    subsampled_female_indices = np.random.choice(female_indices, size=num_female_to_keep, replace=False)

    # Create the modified dataset: every male row followed by the sampled female rows
    kept_indices = np.concatenate([male_indices, subsampled_female_indices])
    modified_features = features[kept_indices]
    modified_labels = labels[kept_indices]
    modified_groups = groups[kept_indices]

    # If the modified size is specified, subsample to the desired size
    if modified_size is not None and modified_size < len(modified_features):
        subsample_indices = np.random.choice(np.arange(len(modified_features)), size=modified_size, replace=False)
        modified_features = modified_features[subsample_indices]
        modified_labels = modified_labels[subsample_indices]
        modified_groups = modified_groups[subsample_indices]

    # Split the modified dataset into train and test sets. The split seed is
    # fixed, so run-to-run variation comes only from the subsampling above.
    X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
        modified_features, modified_labels, modified_groups, test_size=0.2, random_state=0)

    # Create and fit the model
    model = make_pipeline(StandardScaler(), LogisticRegression())
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Calculate FPR, FNR, and Accuracy for male (group = 1) and female (group = 2)
    fpr_male, fnr_male, acc_male = calculate_metrics(y_test[group_test == 1], y_pred[group_test == 1])
    fpr_female, fnr_female, acc_female = calculate_metrics(y_test[group_test == 2], y_pred[group_test == 2])

    return initial_size, modified_size, num_males_original, num_females_original, len(modified_features), \
           fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female

# Load the 2018 1-Year ACS person records for California and turn them into
# arrays using the ACSEmployment task definition.
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')
acs_ca = data_source.get_data(states=["CA"], download=True)
ca_features, ca_label, ca_group = ACSEmployment.df_to_numpy(acs_ca)

# Baseline: 70/30 split of the full CA data and a scaled logistic regression.
# NOTE(review): none of these names is referenced again in the visible part of
# this cell; kept in case later cells use them.
_ca_split = train_test_split(ca_features, ca_label, ca_group, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test, group_train, group_test = _ca_split
model = make_pipeline(StandardScaler(), LogisticRegression()).fit(X_train, y_train)

# Fractions of the female group to keep in each experiment.
mixing_rates = [0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

totalFemales = np.count_nonzero(ca_group == 2)
print("total Females in dataset: ", totalFemales)

# Dataset size used for every run: 0.25 * number of females, rounded to the
# nearest multiple of 100.
fixed_train_size = int(round(0.25 * totalFemales, -2))
print("fixed training set size: ", fixed_train_size)
print('\n')

# Empty per-column accumulator, one list per reported quantity.
results = {
    column: []
    for column in (
        'Mixing Rate', 'Initial Size', 'Modified Size', 'Num Males Original',
        'Num Females Original', 'FPR Male', 'FNR Male', 'Accuracy Male',
        'FPR Female', 'FNR Female', 'Accuracy Female',
    )
}

# Names of the six per-group metrics, in the order they are unpacked below.
_metric_columns = ['FPR Male', 'FNR Male', 'Accuracy Male',
                   'FPR Female', 'FNR Female', 'Accuracy Female']

# Repeat the whole mixing-rate sweep several times so that a mean and a
# standard deviation can be reported for every mixing rate.
results_subset_list = []  # one dict of metric lists per repetition
num_iterations = 2
for _ in range(num_iterations):
    results_subset = {_column: [] for _column in ['Mixing Rate'] + _metric_columns}
    for mixing_rate in mixing_rates:
        (initial_size, _, num_males_original, num_females_original, _,
         fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female) = \
            evaluate_model_with_mixing_rate_and_size(mixing_rate, fixed_train_size, ca_features, ca_label, ca_group)

        results_subset['Mixing Rate'].append(mixing_rate)
        _run_metrics = (fpr_male, fnr_male, acc_male, fpr_female, fnr_female, acc_female)
        for _column, _value in zip(_metric_columns, _run_metrics):
            results_subset[_column].append(_value)

    results_subset_list.append(results_subset)

# Stack each metric as rows = repetitions, columns = mixing rates.
_stacked = {
    _column: [_run[_column] for _run in results_subset_list]
    for _column in _metric_columns
}

# Mean across repetitions, per mixing rate.
average_results_subset = {'Mixing Rate': mixing_rates}
average_results_subset.update(
    {_column: np.mean(_stacked[_column], axis=0) for _column in _metric_columns}
)
average_results_subset_df = pd.DataFrame(average_results_subset)

# Standard deviation across repetitions, per mixing rate.
std_results_subset = {
    _column: np.std(_stacked[_column], axis=0) for _column in _metric_columns
}

# One figure with three side-by-side panels: FPR, FNR and Accuracy.
fig, axs = plt.subplots(1, 3, figsize=(14, 5))
fig.suptitle(f'Training Set Size: {fixed_train_size}')

_mixing_axis = average_results_subset_df['Mixing Rate']

# (metric prefix, y-axis label, panel title, y-limits) for each panel.
_panels = [
    ('FPR', 'False Positive Rate', 'FPR vs Mixing Rate', (0, 0.5)),
    ('FNR', 'False Negative Rate', 'FNR vs Mixing Rate', (0, 0.5)),
    ('Accuracy', 'Accuracy', 'Accuracy vs Mixing Rate', (0.5, 1.0)),
]
# (legend label / column suffix, colour) for each group.
_groups = [('Male', 'blue'), ('Female', 'red')]

for _ax, (_metric, _ylabel, _title, _ylim) in zip(axs, _panels):
    # Mean curves first, one per group.
    for _group, _color in _groups:
        _ax.plot(_mixing_axis, average_results_subset_df[f'{_metric} {_group}'],
                 label=_group, color=_color)

    # Then the shaded mean +/- standard deviation band for each group.
    for _group, _color in _groups:
        _column = f'{_metric} {_group}'
        _mean = average_results_subset_df[_column]
        _spread = std_results_subset[_column]
        _ax.fill_between(_mixing_axis, _mean - _spread, _mean + _spread,
                         color=_color, alpha=0.2)

    _ax.set_xlabel('Mixing Rate')
    _ax.set_ylabel(_ylabel)
    _ax.set_title(_title)
    _ax.legend()
    _ax.set_ylim(*_ylim)

plt.tight_layout()
plt.show()