In [1]:
from folktables import ACSDataSource, ACSEmployment,ACSIncome
import numpy as np
from sklearn.model_selection import train_test_split
import os
import pandas as pd

In [2]:
# Handle on the 2018 1-Year ACS person survey; later cells call
# data_source.get_data() again for each state.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)

# Initial two-state pull (download=True).
# Variant without a state filter:
# acs_data = data_source.get_data(download=True)
acs_data = data_source.get_data(
    states=["AL","AK"],
    download=True,
)

In [3]:
# All 50 state codes, kept for reference; uncomment to process every state.
# state_codes = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
#                "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
#                "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
#                "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
#                "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

# States processed in this run.
state_codes = ["AL", "AK", "AZ"]

# Per-state (features, label) pairs for the ACSIncome task, keyed by state code.
dfs = {}
for state_code in state_codes:
    acs_data = data_source.get_data(states=[state_code], download=True)
    # df_to_pandas also returns a group frame; it is not stored in dfs.
    features, label, group = ACSIncome.df_to_pandas(acs_data)
    dfs.update({state_code: (features, label)})


In [4]:
# Print the feature/label row counts per state and collect the label counts.
all_len = []
for state_code, (features, label) in dfs.items():
    n_features, n_labels = len(features), len(label)
    all_len.append(n_labels)
    print(f"State: {state_code}, Features Length: {n_features}, Label Length: {n_labels}")

State: AL, Features Length: 22268, Label Length: 22268
State: AK, Features Length: 3546, Label Length: 3546
State: AZ, Features Length: 33277, Label Length: 33277


In [5]:
# One labelled frame per state: the feature columns plus the PINCP target.
merge_dfs = {}

for state_code, (features, label) in dfs.items():
    # Join features and label column-wise, then drop rows with missing values.
    merge_df = pd.concat([features, label], axis=1).dropna()
    # Recode the True/False target values as the '>50K' / '<=50K' strings.
    income_labels = {True: '>50K', False: '<=50K'}
    merge_df['PINCP'] = merge_df['PINCP'].replace(income_labels)
    merge_dfs[state_code] = merge_df

# Row count of each merged frame.
for state_code, df in merge_dfs.items():
    n_rows = len(df)
    print(f"State: {state_code}, df Length: {n_rows}")

State: AL, df Length: 22268
State: AK, df Length: 3546
State: AZ, df Length: 33277


In [6]:
import pickle

# Persist the per-state frames to 'dfs.pickle' so they can be reloaded from disk.
with open('dfs.pickle', 'wb') as pickle_out:
    pickle.dump(merge_dfs, pickle_out)

In [7]:
# Reload the per-state frames from 'dfs.pickle'.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('dfs.pickle', 'rb') as pickle_in:
    dfs_loaded = pickle.load(pickle_in)

In [8]:
# Show the first rows and the row count of every reloaded state frame.
for state_code, df in dfs_loaded.items():
    preview = df.head()
    n_rows = len(df)
    print(preview)
    print(f"State: {state_code}, df Length: {n_rows}")

   AGEP  COW  SCHL  MAR    OCCP  POBP  RELP  WKHP  SEX  RAC1P  PINCP
0  18.0  1.0  18.0  5.0  4720.0  13.0  17.0  21.0  2.0    2.0  <=50K
1  53.0  5.0  17.0  5.0  3605.0  18.0  16.0  40.0  1.0    1.0  <=50K
2  41.0  1.0  16.0  5.0  7330.0   1.0  17.0  40.0  1.0    1.0  <=50K
3  18.0  6.0  18.0  5.0  2722.0   1.0  17.0   2.0  2.0    1.0  <=50K
4  21.0  5.0  19.0  5.0  3870.0  12.0  17.0  50.0  1.0    1.0  <=50K
State: AL, df Length: 22268
   AGEP  COW  SCHL  MAR    OCCP  POBP  RELP  WKHP  SEX  RAC1P  PINCP
0  19.0  5.0  16.0  5.0  5120.0  48.0  17.0  40.0  2.0    5.0  <=50K
1  23.0  1.0  18.0  5.0  9130.0   2.0  17.0  77.0  1.0    1.0  <=50K
2  19.0  1.0  19.0  5.0  4521.0   2.0  17.0  70.0  1.0    6.0  <=50K
3  33.0  3.0  17.0  3.0  8140.0   2.0  16.0  40.0  1.0    1.0  <=50K
4  62.0  1.0  19.0  1.0  1050.0  29.0  17.0  84.0  1.0    9.0   >50K
State: AK, df Length: 3546
   AGEP  COW  SCHL  MAR    OCCP   POBP  RELP  WKHP  SEX  RAC1P  PINCP
0  19.0  1.0  16.0  5.0  4150.0    4.0  17.0  2

In [9]:
# Split each state's frame 80/20 into train/test sets and write them as
# headerless CSV files: '<state>.data' (train) and '<state>.test' (test).

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('50_clients_data/raw_data', exist_ok=True)

for state_code, df in dfs_loaded.items():
    # Use the loop variable directly instead of looking the frame up again.
    df_temp = df.reset_index(drop=True)
    # The fixed random_state keeps the split identical across runs.
    train_data, test_data = train_test_split(df_temp, test_size=0.2, random_state=42)

    train_data.to_csv(f'50_clients_data/raw_data/{state_code}.data', header=False, index=False)

    # Test labels get a trailing '.' (presumably to mirror the UCI Adult
    # 'adult.test' label format -- TODO confirm). assign() builds a new frame
    # rather than writing into the frame returned by train_test_split, which
    # avoids pandas' SettingWithCopyWarning; the column keeps its position.
    test_data = test_data.assign(PINCP=test_data['PINCP'].astype(str) + '.')

    test_data.to_csv(f'50_clients_data/raw_data/{state_code}.test', header=False, index=False)

    # df_temp.to_csv(f'50_clients_data/raw_data/{state_code}.csv', header=False, index=False)

# Take 100 random data points from each client

In [13]:
# Training Ground Truth

In [10]:
# Draw a reproducible random sample of up to 100 training rows from every
# '<name>.data' file in the raw-data folder and save it as '<name>_100.data'.
folder_path = '50_clients_data/raw_data/'
output_dir = '50_clients_data/client_raw_data_100_sample'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for file_name in os.listdir(folder_path):
    if file_name.endswith('.data'):
        df = pd.read_csv(os.path.join(folder_path, file_name), header=None)
        # Cap the sample at the file's length: sample(n=100) without replacement
        # raises ValueError on smaller files. The fixed seed makes the drawn
        # rows repeatable across runs. The '_100' suffix is kept even when a
        # file yields fewer than 100 rows.
        temp_df = df.sample(n=min(100, len(df)), random_state=42)
        base_name = os.path.splitext(file_name)[0]
        temp_df.to_csv(os.path.join(output_dir, f'{base_name}_100.data'), header=False, index=False)


In [19]:


# Testing

In [11]:
# Draw a reproducible random sample of up to 100 test rows from every
# '<name>.test' file in the raw-data folder and save it as '<name>_100.test'.
folder_path = '50_clients_data/raw_data/'
output_dir = '50_clients_data/client_raw_data_100_test'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_dir, exist_ok=True)

for file_name in os.listdir(folder_path):
    if file_name.endswith('.test'):
        df = pd.read_csv(os.path.join(folder_path, file_name), header=None)
        # Cap the sample at the file's length: sample(n=100) without replacement
        # raises ValueError on smaller files. The fixed seed makes the drawn
        # rows repeatable across runs. The '_100' suffix is kept even when a
        # file yields fewer than 100 rows.
        temp_df = df.sample(n=min(100, len(df)), random_state=42)
        base_name = os.path.splitext(file_name)[0]
        temp_df.to_csv(os.path.join(output_dir, f'{base_name}_100.test'), header=False, index=False)

In [19]:
# for state_code, df in dfs_loaded.items():
#     df_temp=dfs_loaded[state_code].reset_index(drop=True)
#     train_data, test_data = train_test_split(df_temp, test_size=0.2, random_state=42)
    
#     train_data.to_csv(f'50_clients_data/client_raw_data_100_sample/{state_code}_100.data', header=False, index=False)

# sample_100_AK.to_csv(f'50_clients_data/raw_data/AK_100.data', header=False, index=False)

In [21]:
# for state_code, (features, label) in dfs.items():
#     # take 30%
#     num_rows_to_keep = int(len(features) * 0.3) 
#     random_indices = np.random.choice(len(features), num_rows_to_keep, replace=False)
#     reduced_features = features.iloc[random_indices]
#     reduced_label = label.iloc[random_indices]
#     dfs[state_code] = (reduced_features, reduced_label)

# for state_code, (reduced_features, reduced_label) in dfs.items():
#     print(f"State: {state_code}, Reduced Features Length: {len(reduced_features)}, Reduced Label Length: {len(reduced_label)}")

In [58]:
import pickle

# Write merge_dfs to 'dfs.pickle' again; mode 'wb' replaces any existing file.
with open('dfs.pickle', 'wb') as pickle_out:
    pickle.dump(merge_dfs, pickle_out)

In [59]:
# Read the per-state frames back from 'dfs.pickle'.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('dfs.pickle', 'rb') as pickle_in:
    dfs_loaded = pickle.load(pickle_in)

In [60]:
# Row count of every reloaded state frame.
for state_code, df in dfs_loaded.items():
    n_rows = len(df)
    print(f"State: {state_code}, df Length: {n_rows}")

State: AL, df Length: 4454
State: AK, df Length: 709
State: AZ, df Length: 6655
State: AR, df Length: 2786
State: CA, df Length: 39133
State: CO, df Length: 6261
State: CT, df Length: 3957
State: DE, df Length: 943
State: FL, df Length: 19785
State: GA, df Length: 10183
State: HI, df Length: 1546
State: ID, df Length: 1653
State: IL, df Length: 13403
State: IN, df Length: 7004
State: IA, df Length: 3549
State: KS, df Length: 3161
State: KY, df Length: 4401
State: LA, df Length: 4133
State: ME, df Length: 1400
State: MD, df Length: 6608
State: MA, df Length: 8023
State: MI, df Length: 10002
State: MN, df Length: 6204
State: MS, df Length: 2638
State: MO, df Length: 6333
State: MT, df Length: 1093
State: NE, df Length: 2157
State: NV, df Length: 2961
State: NH, df Length: 1593
State: NJ, df Length: 9556
State: NM, df Length: 1742
State: NY, df Length: 20604
State: NC, df Length: 10413
State: ND, df Length: 891
State: OH, df Length: 12427
State: OK, df Length: 3583
State: OR, df Length: 4

In [61]:
# Display one state's frame with a fresh 0..n-1 index. "TX" is shown when it
# was loaded; otherwise fall back to the first loaded state so the cell does
# not raise KeyError when only a subset of states was processed.
preview_state = "TX" if "TX" in dfs_loaded else next(iter(dfs_loaded))
dfs_loaded[preview_state].reset_index(drop=True)

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,20.0,2.0,16.0,5.0,5420.0,48.0,0.0,12.0,2.0,1.0,<=50K
1,27.0,3.0,19.0,1.0,2320.0,18.0,0.0,40.0,2.0,1.0,<=50K
2,56.0,1.0,19.0,5.0,4710.0,48.0,15.0,50.0,1.0,1.0,>50K
3,27.0,1.0,21.0,5.0,4000.0,48.0,0.0,38.0,2.0,1.0,<=50K
4,76.0,3.0,21.0,1.0,5940.0,35.0,1.0,13.0,1.0,1.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...
27180,60.0,1.0,16.0,1.0,9130.0,48.0,0.0,40.0,1.0,1.0,<=50K
27181,74.0,1.0,21.0,1.0,4435.0,6.0,0.0,45.0,1.0,1.0,>50K
27182,43.0,1.0,16.0,5.0,4720.0,48.0,0.0,40.0,2.0,1.0,<=50K
27183,32.0,1.0,19.0,5.0,5410.0,6.0,0.0,40.0,1.0,1.0,>50K
