In [9]:
# !pip install folktables

In [1]:
from folktables import ACSDataSource, ACSEmployment,ACSIncome
import numpy as np
from sklearn.model_selection import train_test_split
import os
import pandas as pd

In [2]:
# Data source handle for the 2018, 1-Year, person-level ACS survey.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)
# acs_data = data_source.get_data(states=["AL", "AK", "AZ", "AR", "CA"], download=True)
# acs_data = data_source.get_data(download=True)

In [3]:
# state_codes = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
#                "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
#                "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
#                "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
#                "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

state_codes = ["AL", "AK", "AZ", "NY", "CA"]


def _income_frames_for(state):
    """Fetch one state's ACS data and return its ACSIncome (features, label) pair."""
    state_rows = data_source.get_data(states=[state], download=True)
    feats, target, _group = ACSIncome.df_to_pandas(state_rows)
    return feats, target


# Maps state code -> (features, label); every state is fetched with its own call.
dfs = {state: _income_frames_for(state) for state in state_codes}


In [4]:
# Record every state's label count, then report feature/label sizes per state.
all_len = [len(label) for _, label in dfs.values()]
for state_code in dfs:
    features, label = dfs[state_code]
    print(f"State: {state_code}, Features Length: {len(features)}, Label Length: {len(label)}")

State: AL, Features Length: 22268, Label Length: 22268
State: AK, Features Length: 3546, Label Length: 3546
State: AZ, Features Length: 33277, Label Length: 33277
State: NY, Features Length: 103021, Label Length: 103021
State: CA, Features Length: 195665, Label Length: 195665


In [5]:
# Build one combined frame per state: features and label side by side, rows
# containing missing values dropped, and the True/False PINCP values replaced
# by the text labels '>50K' / '<=50K'.
merge_dfs = {}
for state_code, (features, label) in dfs.items():
    labelled = pd.concat([features, label], axis=1).dropna()
    merge_dfs[state_code] = labelled.assign(
        PINCP=labelled['PINCP'].replace({True: '>50K', False: '<=50K'})
    )

# Report the resulting row count for every state.
for state_code in merge_dfs:
    print(f"State: {state_code}, df Length: {len(merge_dfs[state_code])}")

State: AL, df Length: 22268
State: AK, df Length: 3546
State: AZ, df Length: 33277
State: NY, df Length: 103021
State: CA, df Length: 195665


In [6]:
import pickle

# Serialize the merge_dfs dictionary to 'dfs.pickle' in the working directory.
with open('dfs.pickle', mode='wb') as pickle_out:
    pickle.dump(merge_dfs, pickle_out)

In [7]:
# Read the pickled dictionary back from 'dfs.pickle'.
with open('dfs.pickle', mode='rb') as pickle_in:
    dfs_loaded = pickle.load(pickle_in)

In [8]:
# Print the first rows of every reloaded frame together with its row count.
for state_code in dfs_loaded:
    df = dfs_loaded[state_code]
    print(df.head())
    print(f"State: {state_code}, df Length: {len(df)}")

   AGEP  COW  SCHL  MAR    OCCP  POBP  RELP  WKHP  SEX  RAC1P  PINCP
0  18.0  1.0  18.0  5.0  4720.0  13.0  17.0  21.0  2.0    2.0  <=50K
1  53.0  5.0  17.0  5.0  3605.0  18.0  16.0  40.0  1.0    1.0  <=50K
2  41.0  1.0  16.0  5.0  7330.0   1.0  17.0  40.0  1.0    1.0  <=50K
3  18.0  6.0  18.0  5.0  2722.0   1.0  17.0   2.0  2.0    1.0  <=50K
4  21.0  5.0  19.0  5.0  3870.0  12.0  17.0  50.0  1.0    1.0  <=50K
State: AL, df Length: 22268
   AGEP  COW  SCHL  MAR    OCCP  POBP  RELP  WKHP  SEX  RAC1P  PINCP
0  19.0  5.0  16.0  5.0  5120.0  48.0  17.0  40.0  2.0    5.0  <=50K
1  23.0  1.0  18.0  5.0  9130.0   2.0  17.0  77.0  1.0    1.0  <=50K
2  19.0  1.0  19.0  5.0  4521.0   2.0  17.0  70.0  1.0    6.0  <=50K
3  33.0  3.0  17.0  3.0  8140.0   2.0  16.0  40.0  1.0    1.0  <=50K
4  62.0  1.0  19.0  1.0  1050.0  29.0  17.0  84.0  1.0    9.0   >50K
State: AK, df Length: 3546
   AGEP  COW  SCHL  MAR    OCCP   POBP  RELP  WKHP  SEX  RAC1P  PINCP
0  19.0  1.0  16.0  5.0  4150.0    4.0  17.0  2

In [41]:
# Disabled cell. When enabled it splits each state's frame 80/20
# (random_state=42) and writes, under 50_clients_data/raw_data/:
#   <state>.data  - the training rows
#   <state>.test  - the test rows, with '.' appended to every PINCP label
#   <state>.csv   - the full, re-indexed frame
# NOTE(review): the 100-row sampling cells further down list *.data / *.test
# files in 50_clients_data/raw_data/, and this is the only cell in view that
# writes there - those files must already exist on disk for them to run.
# for state_code, df in dfs_loaded.items():
#     df_temp=dfs_loaded[state_code].reset_index(drop=True)
#     train_data, test_data = train_test_split(df_temp, test_size=0.2, random_state=42)
#     train_data.to_csv(f'50_clients_data/raw_data/{state_code}.data', header=False, index=False)
    
#     test_data['PINCP'] = test_data['PINCP'].astype(str) + '.'

#     test_data.to_csv(f'50_clients_data/raw_data/{state_code}.test', header=False, index=False)
    
#     df_temp.to_csv(f'50_clients_data/raw_data/{state_code}.csv', header=False, index=False)

# Torch2 -- Group and Subgroup

In [42]:
# Work on an independent copy of the CA frame so merge_dfs stays untouched.
merge_df = merge_dfs['CA'].copy()

In [9]:
# Split the working frame into the four sex x race subgroups.
# Masks follow the coding used below: SEX 1 = men, 2 = women; RAC1P 1 = white, 2 = black.
is_man = merge_df['SEX'] == 1
is_woman = merge_df['SEX'] == 2
is_white = merge_df['RAC1P'] == 1
is_black = merge_df['RAC1P'] == 2

WM = merge_df.loc[is_man & is_white]      # white men
BM = merge_df.loc[is_man & is_black]      # black men
WW = merge_df.loc[is_woman & is_white]    # white women
BW = merge_df.loc[is_woman & is_black]    # black women

# Subgroup sizes, shown as the cell output.
len(WM), len(BM), len(WW), len(BW)

(64793, 4204, 56213, 4353)

In [10]:
# Write a train/test CSV pair for every sex x race subgroup.
#
# For each subgroup the first `n_train` rows (by position) go to <stem>.data and
# the remaining rows go to <stem>.test with a trailing '.' appended to the PINCP
# label. Both halves are cut with positional .iloc: the subgroup frames keep the
# row labels of the frame they were filtered from, so a label-based
# `.loc[n_train:]` does not select the same rows as the positional `[n_train:]`.
# The '.' suffix is applied to a copy, so the subgroup frames are never mutated
# and re-running this cell produces the same files.
SUBGROUP_DIR = '50_clients_data/client_subG_splits'
os.makedirs(SUBGROUP_DIR, exist_ok=True)  # to_csv does not create missing folders

# (file stem, subgroup frame, number of leading rows used for training)
# NOTE(review): with the subgroup sizes displayed above, BW has 4353 rows, so a
# cutoff of 5000 puts every BW row in training and leaves BW.test empty.
subgroup_splits = [
    ('WW', WW, 50000),
    ('WM', WM, 60000),
    ('BW', BW, 5000),
    ('BM', BM, 4000),
]

for stem, frame, n_train in subgroup_splits:
    # =====Training=============
    train_part = frame.iloc[:n_train]

    # =====Testing=============
    test_part = frame.iloc[n_train:].copy()
    test_part['PINCP'] = test_part['PINCP'].astype(str) + '.'

    if test_part.empty:
        print(f"Warning: {stem} has only {len(frame)} rows (cutoff {n_train}); {stem}.test will be empty")

    train_part.to_csv(f'{SUBGROUP_DIR}/{stem}.data', header=False, index=False)
    test_part.to_csv(f'{SUBGROUP_DIR}/{stem}.test', header=False, index=False)

#-------------- Group-------------

In [11]:
# Switch the working frame to an independent copy of the NY data and show its size.
merge_df = merge_dfs['NY'].copy()
len(merge_df)

103021

In [12]:
# Single-attribute groups: one frame per sex code and one per race code.
sex_col = merge_df['SEX']
race_col = merge_df['RAC1P']

men = merge_df.loc[sex_col == 1]
women = merge_df.loc[sex_col == 2]
white = merge_df.loc[race_col == 1]
black = merge_df.loc[race_col == 2]

# Group sizes, shown as the cell output.
len(men), len(women), len(white), len(black)

(52178, 50843, 73665, 11647)

In [13]:
# Write a train/test CSV pair for every single-attribute group.
#
# For each group the first `n_train` rows (by position) go to <stem>.data and the
# remaining rows go to <stem>.test with a trailing '.' appended to the PINCP
# label. Every group writes to its own stem: the women training rows go to
# women.data (they were previously written to men.data, overwriting the men
# file and never producing women.data).
# Both halves are cut with positional .iloc because the group frames keep the
# row labels of the frame they were filtered from; the '.' suffix is applied to
# a copy so the group frames are not mutated and the cell can be re-run safely.
GROUP_DIR = '50_clients_data/client_subG_splits'
os.makedirs(GROUP_DIR, exist_ok=True)  # to_csv does not create missing folders

# (file stem, group frame, number of leading rows used for training)
group_splits = [
    ('men', men, 40000),
    ('women', women, 40000),
    ('white', white, 40000),
    ('black', black, 10000),
]

for stem, frame, n_train in group_splits:
    # =====Training=============
    train_part = frame.iloc[:n_train]

    # =====Testing=============
    test_part = frame.iloc[n_train:].copy()
    test_part['PINCP'] = test_part['PINCP'].astype(str) + '.'

    if test_part.empty:
        print(f"Warning: {stem} has only {len(frame)} rows (cutoff {n_train}); {stem}.test will be empty")

    train_part.to_csv(f'{GROUP_DIR}/{stem}.data', header=False, index=False)
    test_part.to_csv(f'{GROUP_DIR}/{stem}.test', header=False, index=False)

# Take a random sample of 100 data points from each client

In [13]:
import os

In [14]:
# Training Ground Truth

In [15]:
# Draw a reproducible random sample of up to TRAIN_SAMPLE_SIZE rows from every
# *.data file in the raw-data folder and save it as <name>_100.data.
folder_path = '50_clients_data/raw_data/'
train_sample_dir = '50_clients_data/client_raw_data_100_sample'
TRAIN_SAMPLE_SIZE = 100
TRAIN_SAMPLE_SEED = 42  # fixed seed so a re-run reproduces the same sample

os.makedirs(train_sample_dir, exist_ok=True)  # to_csv does not create missing folders

# sorted() makes the processing order independent of the filesystem's listing order.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.data'):
        continue
    df = pd.read_csv(os.path.join(folder_path, file_name), header=None)
    # Cap the sample at the file's row count: sample() raises ValueError when
    # asked for more rows than exist (without replacement).
    temp_df = df.sample(n=min(TRAIN_SAMPLE_SIZE, len(df)), random_state=TRAIN_SAMPLE_SEED)
    base_name = os.path.splitext(file_name)[0]
    temp_df.to_csv(f'{train_sample_dir}/{base_name}_100.data', header=False, index=False)


In [16]:
# Testing

In [17]:
# Draw a reproducible random sample of up to TEST_SAMPLE_SIZE rows from every
# *.test file in the raw-data folder and save it as <name>_100.test.
folder_path = '50_clients_data/raw_data/'
test_sample_dir = '50_clients_data/client_raw_data_100_test'
TEST_SAMPLE_SIZE = 100
TEST_SAMPLE_SEED = 42  # fixed seed so a re-run reproduces the same sample

os.makedirs(test_sample_dir, exist_ok=True)  # to_csv does not create missing folders

# sorted() makes the processing order independent of the filesystem's listing order.
for file_name in sorted(os.listdir(folder_path)):
    if not file_name.endswith('.test'):
        continue
    df = pd.read_csv(os.path.join(folder_path, file_name), header=None)
    # Cap the sample at the file's row count: sample() raises ValueError when
    # asked for more rows than exist (without replacement).
    temp_df = df.sample(n=min(TEST_SAMPLE_SIZE, len(df)), random_state=TEST_SAMPLE_SEED)
    base_name = os.path.splitext(file_name)[0]
    temp_df.to_csv(f'{test_sample_dir}/{base_name}_100.test', header=False, index=False)

In [19]:
# for state_code, df in dfs_loaded.items():
#     df_temp=dfs_loaded[state_code].reset_index(drop=True)
#     train_data, test_data = train_test_split(df_temp, test_size=0.2, random_state=42)
    
#     train_data.to_csv(f'50_clients_data/client_raw_data_100_sample/{state_code}_100.data', header=False, index=False)

# sample_100_AK.to_csv(f'50_clients_data/raw_data/AK_100.data', header=False, index=False)

In [21]:
# for state_code, (features, label) in dfs.items():
#     # take 30%
#     num_rows_to_keep = int(len(features) * 0.3) 
#     random_indices = np.random.choice(len(features), num_rows_to_keep, replace=False)
#     reduced_features = features.iloc[random_indices]
#     reduced_label = label.iloc[random_indices]
#     dfs[state_code] = (reduced_features, reduced_label)

# for state_code, (reduced_features, reduced_label) in dfs.items():
#     print(f"State: {state_code}, Reduced Features Length: {len(reduced_features)}, Reduced Label Length: {len(reduced_label)}")

In [16]:
import pickle

# Write the merge_dfs dictionary to 'dfs.pickle', replacing any existing file.
with open('dfs.pickle', mode='wb') as sink:
    pickle.dump(merge_dfs, sink)

In [17]:
# Reload the dictionary stored in 'dfs.pickle' into dfs_loaded.
with open('dfs.pickle', mode='rb') as source:
    dfs_loaded = pickle.load(source)

In [18]:
# Report the row count of every reloaded frame.
for state_code in dfs_loaded:
    df = dfs_loaded[state_code]
    print(f"State: {state_code}, df Length: {len(df)}")

State: AL, df Length: 22268


In [20]:
# Display the AL frame with a fresh 0..n-1 index; the stored frame is not modified.
al_frame = dfs_loaded['AL']
al_frame.reset_index(drop=True)

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,18.0,1.0,18.0,5.0,4720.0,13.0,17.0,21.0,2.0,2.0,<=50K
1,53.0,5.0,17.0,5.0,3605.0,18.0,16.0,40.0,1.0,1.0,<=50K
2,41.0,1.0,16.0,5.0,7330.0,1.0,17.0,40.0,1.0,1.0,<=50K
3,18.0,6.0,18.0,5.0,2722.0,1.0,17.0,2.0,2.0,1.0,<=50K
4,21.0,5.0,19.0,5.0,3870.0,12.0,17.0,50.0,1.0,1.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...
22263,20.0,6.0,19.0,5.0,4251.0,1.0,4.0,25.0,1.0,1.0,<=50K
22264,63.0,1.0,16.0,1.0,440.0,1.0,0.0,48.0,1.0,1.0,>50K
22265,65.0,2.0,21.0,5.0,420.0,1.0,2.0,40.0,2.0,1.0,>50K
22266,37.0,1.0,16.0,4.0,340.0,6.0,0.0,50.0,2.0,1.0,<=50K
