In [1]:
# !pip install folktables

In [1]:
from folktables import ACSDataSource, ACSEmployment,ACSIncome
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
import folktables

In [32]:
# Data source for the 2019 1-Year ACS person survey. Nothing is fetched here;
# the per-state data is pulled later through data_source.get_data(...).
data_source = ACSDataSource(
    survey_year='2019',
    horizon='1-Year',
    survey='person',
)

In [33]:
# ============= Load DFs Dicts directly ===================

In [34]:
# Income-prediction problem definition: the label is PINCP > 50000, computed
# over the feature columns listed below.
# NOTE: this rebinds the name `ACSIncome` that was imported from folktables in
# the import cell; from here on `ACSIncome` refers to this custom problem.
ACSIncome = folktables.BasicProblem(
    features=[ 'AGEP','SCHL','COW', 'MAR','DIS','ESP','CIT','MIG','MIL', 'ANC','NATIVITY','DEAR',
        'DEYE','DREM','SEX','RELSHIPP','RAC1P','PUMA','POBP','ST', 'OCCP','JWTRNS', 'POWPUMA','WKHP'] ,
    target='PINCP',
    # Binary label: True when personal income exceeds 50,000.
    target_transform=lambda x: x > 50000,
    preprocess=folktables.adult_filter,
    # Replace missing values with -1. The fill value has to be passed by keyword:
    # the second positional parameter of np.nan_to_num is `copy`, so the earlier
    # `np.nan_to_num(x, -1)` left the fill at its default and turned NaN into 0.0.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)

In [35]:
# Two-letter codes of the 50 states, in the order they are downloaded.
state_codes = (
    "AL AK AZ AR CA CO CT DE FL GA "
    "HI ID IL IN IA KS KY LA ME MD "
    "MA MI MN MS MO MT NE NV NH NJ "
    "NM NY NC ND OH OK OR PA RI SC "
    "SD TN TX UT VT VA WA WV WI WY"
).split()

# Maps state code -> (features, label) as returned by ACSIncome.df_to_pandas.
dfs = {}
for state_code in state_codes:
    # One request per state; download=True is passed so the data is fetched if needed.
    acs_data = data_source.get_data(states=[state_code], download=True)
    # df_to_pandas yields three objects; the third (group) is not stored.
    features, label, group = ACSIncome.df_to_pandas(acs_data)
    dfs[state_code] = (features, label)


In [36]:
# Number of label rows per state, in dict order.
all_len = [len(state_label) for _state_features, state_label in dfs.values()]

# Per-state size report; features and label should have matching lengths.
for state_code, (features, label) in dfs.items():
    print(f"State: {state_code}, Features Length: {len(features)}, Label Length: {len(label)}")

State: AL, Features Length: 22798, Label Length: 22798
State: AK, Features Length: 3417, Label Length: 3417
State: AZ, Features Length: 34178, Label Length: 34178
State: AR, Features Length: 14104, Label Length: 14104
State: CA, Features Length: 197193, Label Length: 197193
State: CO, Features Length: 32264, Label Length: 32264
State: CT, Features Length: 19872, Label Length: 19872
State: DE, Features Length: 4755, Label Length: 4755
State: FL, Features Length: 100426, Label Length: 100426
State: GA, Features Length: 51032, Label Length: 51032
State: HI, Features Length: 7624, Label Length: 7624
State: ID, Features Length: 8564, Label Length: 8564
State: IL, Features Length: 66051, Label Length: 66051
State: IN, Features Length: 35222, Label Length: 35222
State: IA, Features Length: 17620, Label Length: 17620
State: KS, Features Length: 15793, Label Length: 15793
State: KY, Features Length: 22003, Label Length: 22003
State: LA, Features Length: 20897, Label Length: 20897
State: ME, Fea

In [37]:
check_df=dfs["AL"][0]
# check_df.head()

In [38]:
check_df[['AGEP', 'COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RAC1P', 'WKHP', 'SEX']]

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RAC1P,WKHP,SEX
0,19.0,2.0,19.0,5.0,5240.0,1.0,2.0,32.0,1.0
1,39.0,4.0,21.0,3.0,3602.0,1.0,1.0,40.0,2.0
2,19.0,1.0,19.0,5.0,4720.0,1.0,2.0,31.0,1.0
3,39.0,4.0,21.0,3.0,3602.0,1.0,1.0,40.0,2.0
4,21.0,1.0,19.0,5.0,4150.0,36.0,1.0,12.0,1.0
...,...,...,...,...,...,...,...,...,...
22793,28.0,1.0,19.0,5.0,5240.0,12.0,2.0,40.0,1.0
22794,37.0,1.0,16.0,3.0,6410.0,1.0,1.0,30.0,1.0
22795,43.0,1.0,21.0,1.0,300.0,1.0,1.0,40.0,1.0
22796,46.0,1.0,20.0,1.0,2145.0,1.0,1.0,40.0,2.0


In [39]:
dfs["AL"][1]

Unnamed: 0,PINCP
0,False
1,True
2,False
3,True
4,False
...,...
22793,False
22794,False
22795,True
22796,True


In [40]:
check_df.columns

Index(['AGEP', 'SCHL', 'COW', 'MAR', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC',
       'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RELSHIPP', 'RAC1P', 'PUMA',
       'POBP', 'ST', 'OCCP', 'JWTRNS', 'POWPUMA', 'WKHP'],
      dtype='object')

In [41]:
# col_names=['AGEP', 'COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RELP', 'WKHP', 'SEX', 'RAC1P', 'PINCP']

In [42]:
# Build one stratified 10% sample per state.
# merge_dfs maps state code -> sampled DataFrame (selected columns + string label).
merge_dfs={}
sample_size=12000    # legacy absolute target; not used, sampling is fraction-based
ratio_of_df = 0.1    # fraction of each state's rows kept by the stratified sample

# Columns kept for modelling; PINCP is the label column.
selected_columns = ['AGEP', 'COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RELSHIPP', 'WKHP', 'SEX','RAC1P','PINCP']

total_data=0
for state_code, (features, label) in dfs.items():

    # Join features and label column-wise, drop incomplete rows, then keep only
    # the selected columns. The .copy() makes merge_df an independent frame so
    # the column assignments below do not raise SettingWithCopyWarning.
    merge_df = pd.concat([features, label], axis=1).dropna()
    merge_df = merge_df[selected_columns].copy()
    merge_df['PINCP'] = merge_df['PINCP'].replace({True: '>50K', False: '<=50K'})

    # Random sampling while maintaining the statistics: stratify on the
    # combination of SEX and RELSHIPP.
    merge_df['strat'] = merge_df['SEX'].astype(str) + '_' + merge_df['RELSHIPP'].astype(str)

    # Drop combinations that occur fewer than 2 times before the stratified split.
    combination_counts = merge_df['strat'].value_counts()
    rare_combinations = combination_counts[combination_counts < 2].index
    merge_df = merge_df[~merge_df['strat'].isin(rare_combinations)]
    print(len(merge_df))

    # Only the "train" side of the split is kept; it is the sample.
    sampled_df, _ = train_test_split(
                        merge_df,
                        train_size=ratio_of_df,
                        stratify=merge_df['strat'],
                        random_state=42 )

    # The helper column is no longer needed once the split is done.
    sampled_df = sampled_df.drop(columns='strat').reset_index(drop=True)

    print("sampled_df",len(sampled_df))
    total_data += len(sampled_df)
    merge_dfs[state_code] = sampled_df
    
# for state_code, df in merge_dfs.items():
#     label_counts = df['PINCP'].value_counts()
#     print(f"State: {state_code}, df Length: {len(df)}, Label Counts: {label_counts.to_dict()}")

#     print(f"State: {state_code}, df Length: {len(df)}")

22796
sampled_df 2279
3414
sampled_df 341
34178
sampled_df 3417
14104
sampled_df 1410
197193
sampled_df 19719
32264
sampled_df 3226
19872
sampled_df 1987
4754
sampled_df 475
100426
sampled_df 10042
51031
sampled_df 5103
7624
sampled_df 762
8563
sampled_df 856
66051
sampled_df 6605
35222
sampled_df 3522
17618
sampled_df 1761
15792
sampled_df 1579
22002
sampled_df 2200
20894
sampled_df 2089
7103
sampled_df 710
32864
sampled_df 3286
40638
sampled_df 4063
50475
sampled_df 5047
31243
sampled_df 3124
13159
sampled_df 1315
31588
sampled_df 3158
5545
sampled_df 554
10815
sampled_df 1081
14915
sampled_df 1491
7902
sampled_df 790
48353
sampled_df 4835
9027
sampled_df 902
104330
sampled_df 10433
52040
sampled_df 5204
4452
sampled_df 445
62102
sampled_df 6210
17789
sampled_df 1778
21876
sampled_df 2187
68002
sampled_df 6800
5929
sampled_df 592
25084
sampled_df 2508
4947
sampled_df 494
34285
sampled_df 3428
138293
sampled_df 13829
16773
sampled_df 1677
3769
sampled_df 376
46107
sampled_df 4610
4095

In [43]:
total_data

166771

In [44]:
# Persist the per-state sampled frames to disk.
with open('dfs_test.pickle', mode='wb') as pickle_out:
    pickle.dump(merge_dfs, pickle_out)

In [45]:
# Read the sampled frames back from disk.
# Caution: pickle.load executes arbitrary code; only load files you created yourself.
with open('dfs_test.pickle', mode='rb') as pickle_in:
    dfs_loaded = pickle.load(pickle_in)

In [46]:
# Report how many rows each reloaded state frame holds.
for state_code in dfs_loaded:
    df = dfs_loaded[state_code]
    print(f"State: {state_code}, df Length: {len(df)}")

State: AL, df Length: 2279
State: AK, df Length: 341
State: AZ, df Length: 3417
State: AR, df Length: 1410
State: CA, df Length: 19719
State: CO, df Length: 3226
State: CT, df Length: 1987
State: DE, df Length: 475
State: FL, df Length: 10042
State: GA, df Length: 5103
State: HI, df Length: 762
State: ID, df Length: 856
State: IL, df Length: 6605
State: IN, df Length: 3522
State: IA, df Length: 1761
State: KS, df Length: 1579
State: KY, df Length: 2200
State: LA, df Length: 2089
State: ME, df Length: 710
State: MD, df Length: 3286
State: MA, df Length: 4063
State: MI, df Length: 5047
State: MN, df Length: 3124
State: MS, df Length: 1315
State: MO, df Length: 3158
State: MT, df Length: 554
State: NE, df Length: 1081
State: NV, df Length: 1491
State: NH, df Length: 790
State: NJ, df Length: 4835
State: NM, df Length: 902
State: NY, df Length: 10433
State: NC, df Length: 5204
State: ND, df Length: 445
State: OH, df Length: 6210
State: OK, df Length: 1778
State: OR, df Length: 2187
State: 

In [47]:
# NOTE(review): this cell was originally annotated "NY is removed... We use it
# for Testing", but "NY" is still present in the list below. Confirm whether NY
# should be excluded here or whether the annotation is stale.

state_codes = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
               "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
               "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
               "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
               "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

In [49]:
# Pool every state's sampled frame into a single frame and write it out.
all_dfs=[]
for state_code, df in dfs_loaded.items():
    # df_temp stays bound to the last state's frame after the loop.
    df_temp = df.reset_index(drop=True)
    all_dfs.append(df_temp)

# Concatenate once, after all frames are collected. Doing this inside the loop
# re-copied every earlier frame on each iteration and left final_df undefined
# when dfs_loaded was empty.
final_df = pd.concat(all_dfs, ignore_index=True)

# The same frame is written under both a .data and a .test name.
final_df.to_csv('50_clients_data_testing/raw_data/testing.data', header=False, index=False)
final_df.to_csv('50_clients_data_testing/raw_data/testing.test', header=False, index=False)

In [48]:
final_df.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELSHIPP,WKHP,SEX,RAC1P,PINCP
0,59.0,1.0,13.0,2.0,7150.0,1.0,37.0,47.0,1.0,3.0,<=50K
1,70.0,1.0,21.0,1.0,5000.0,18.0,20.0,45.0,1.0,1.0,>50K
2,45.0,1.0,14.0,1.0,6355.0,1.0,20.0,47.0,1.0,2.0,<=50K
3,40.0,3.0,16.0,1.0,4220.0,1.0,20.0,40.0,2.0,1.0,<=50K
4,60.0,1.0,20.0,1.0,1980.0,26.0,20.0,45.0,1.0,1.0,>50K


In [25]:
len(final_df)

final_df

166771

# Group split

In [26]:
pd.unique(final_df['SEX'])

array([1., 2.])

In [27]:
pd.unique(final_df['RAC1P'])

array([3., 1., 2., 8., 9., 6., 5., 4., 7.])

In [17]:
# Partition the pooled frame by the value of the SEX column (1.0 and 2.0).
df_sex_1 = final_df.loc[final_df['SEX'].eq(1.0)]
df_sex_2 = final_df.loc[final_df['SEX'].eq(2.0)]

In [18]:
len(df_sex_1), len(df_sex_2)

(86936, 79835)

In [19]:
# Rows whose RAC1P code is 1.0 and 2.0 respectively; other codes are not kept.
df_RAC_1 = final_df.loc[final_df['RAC1P'].eq(1.0)]
df_RAC_2 = final_df.loc[final_df['RAC1P'].eq(2.0)]

In [20]:
len(df_RAC_1), len(df_RAC_2)

(130463, 14480)

In [70]:
# Write each sex split as <name>.data and again, with identical contents, as
# <name>.test (the duplicate exists to avoid changes in the adult class).
sex_splits = (('male', df_sex_1), ('female', df_sex_2))
for extension in ('data', 'test'):
    for split_name, split_df in sex_splits:
        split_df.to_csv(
            f'50_clients_data_testing/client_subG_splits/{split_name}.{extension}',
            header=False,
            index=False,
        )

In [71]:
# Write each race split as <name>.data and again, with identical contents, as
# <name>.test (the duplicate exists to avoid changes in the adult class).
race_splits = (('white', df_RAC_1), ('black', df_RAC_2))
for extension in ('data', 'test'):
    for split_name, split_df in race_splits:
        split_df.to_csv(
            f'50_clients_data_testing/client_subG_splits/{split_name}.{extension}',
            header=False,
            index=False,
        )

# subGroup split ------Pending

In [22]:
# Four intersectional subgroups of the pooled data, selected on SEX x RAC1P.
sex_col = final_df['SEX']
race_col = final_df['RAC1P']

WM = final_df.loc[(sex_col == 1) & (race_col == 1)]   # White Men
BM = final_df.loc[(sex_col == 1) & (race_col == 2)]   # Black Men
WW = final_df.loc[(sex_col == 2) & (race_col == 1)]   # White Women
BW = final_df.loc[(sex_col == 2) & (race_col == 2)]   # Black Women

print("Length: ",len(WM),len(BM),len(WW),len(BW))

# Each subgroup is written as .data and, unchanged, as .test (the duplicate is
# there to avoid the adult class error).
for extension in ('data', 'test'):
    for tag, subgroup_df in (('WW', WW), ('WM', WM), ('BW', BW), ('BM', BM)):
        subgroup_df.to_csv(
            f'50_clients_data_testing/client_subG_splits/{tag}.{extension}',
            header=False,
            index=False,
        )

Length:  68843 6750 61620 7730


In [18]:
# for state_code, df in dfs_loaded.items():
#     df_temp=dfs_loaded[state_code].reset_index(drop=True)
#     train_data, test_data = train_test_split(df_temp, test_size=0.2, random_state=42)

In [44]:
# with open('dfs.pickle', 'rb') as f:
#     dfs_loaded = pickle.load(f)

# merge_df=dfs_loaded["NY"].reset_index(drop=True)

In [20]:
df_temp.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,53.0,5.0,16.0,1.0,4000.0,56.0,0.0,40.0,2.0,3.0,<=50K
1,60.0,1.0,19.0,3.0,8140.0,26.0,0.0,50.0,1.0,1.0,>50K
2,52.0,3.0,20.0,1.0,2310.0,6.0,1.0,12.0,2.0,1.0,<=50K
3,22.0,1.0,20.0,5.0,310.0,56.0,0.0,30.0,1.0,1.0,<=50K
4,58.0,1.0,19.0,1.0,4700.0,26.0,0.0,40.0,2.0,1.0,<=50K


In [21]:
merge_df=merge_dfs["NY"].copy()

In [22]:
col_names=['AGEP', 'COW', 'SCHL', 'MAR', 'OCCP', 'POBP', 'RELP', 'WKHP', 'SEX', 'RAC1P', 'PINCP']

In [23]:
# Reload the previously exported NY train/test files (they have no header row)
# and label their columns with col_names.
state_code = "NY"
raw_data_dir = '50_clients_data/raw_data'

temp_df_train = pd.read_csv(f'{raw_data_dir}/{state_code}.data', header=None)
temp_df_test = pd.read_csv(f'{raw_data_dir}/{state_code}.test', header=None)

for loaded_frame in (temp_df_train, temp_df_test):
    loaded_frame.columns = col_names

temp_df_train.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,44.0,1.0,11.0,3.0,4110.0,36.0,2.0,45.0,2.0,1.0,<=50K
1,22.0,1.0,16.0,5.0,4720.0,36.0,2.0,15.0,1.0,2.0,<=50K
2,54.0,1.0,23.0,3.0,3010.0,36.0,0.0,50.0,1.0,1.0,>50K
3,46.0,1.0,22.0,1.0,20.0,138.0,1.0,65.0,2.0,1.0,>50K
4,64.0,6.0,20.0,3.0,4230.0,36.0,11.0,25.0,2.0,1.0,<=50K


In [24]:
len(temp_df_train)

9600

In [25]:
temp_df_test.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,19.0,1.0,15.0,5.0,9620.0,210.0,3.0,15.0,1.0,6.0,<=50K.
1,54.0,1.0,16.0,1.0,5000.0,36.0,1.0,40.0,2.0,1.0,>50K.
2,34.0,1.0,21.0,1.0,9142.0,207.0,10.0,48.0,1.0,6.0,<=50K.
3,56.0,2.0,22.0,1.0,2310.0,36.0,0.0,35.0,2.0,1.0,>50K.
4,55.0,1.0,22.0,5.0,110.0,414.0,0.0,40.0,1.0,1.0,<=50K.


In [26]:
merge_df_train =temp_df_train
merge_df_test =temp_df_test
len(merge_df_train),len(merge_df_test)

(9600, 2400)

In [27]:
merge_df_train.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,44.0,1.0,11.0,3.0,4110.0,36.0,2.0,45.0,2.0,1.0,<=50K
1,22.0,1.0,16.0,5.0,4720.0,36.0,2.0,15.0,1.0,2.0,<=50K
2,54.0,1.0,23.0,3.0,3010.0,36.0,0.0,50.0,1.0,1.0,>50K
3,46.0,1.0,22.0,1.0,20.0,138.0,1.0,65.0,2.0,1.0,>50K
4,64.0,6.0,20.0,3.0,4230.0,36.0,11.0,25.0,2.0,1.0,<=50K


In [28]:
# Four intersectional subgroups of the NY training frame, selected on SEX x RAC1P.
train_sex = merge_df_train['SEX']
train_race = merge_df_train['RAC1P']

WM = merge_df_train.loc[(train_sex == 1) & (train_race == 1)]   # White Men
BM = merge_df_train.loc[(train_sex == 1) & (train_race == 2)]   # Black Men
WW = merge_df_train.loc[(train_sex == 2) & (train_race == 1)]   # White Women
BW = merge_df_train.loc[(train_sex == 2) & (train_race == 2)]   # Black Women

print("Length: ",len(WM),len(BM),len(WW),len(BW))

# Only the .data files are produced here.
for tag, subgroup_df in (('WW', WW), ('WM', WM), ('BW', BW), ('BM', BM)):
    subgroup_df.to_csv(
        f'50_clients_data/client_subG_splits/{tag}.data',
        header=False,
        index=False,
    )

Length:  3537 449 3353 593


In [29]:
WM.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
2,54.0,1.0,23.0,3.0,3010.0,36.0,0.0,50.0,1.0,1.0,>50K
6,41.0,3.0,22.0,1.0,230.0,36.0,0.0,50.0,1.0,1.0,>50K
7,27.0,1.0,21.0,5.0,2640.0,42.0,12.0,45.0,1.0,1.0,>50K
10,52.0,1.0,17.0,1.0,310.0,109.0,1.0,98.0,1.0,1.0,<=50K
11,55.0,1.0,21.0,1.0,5860.0,36.0,0.0,35.0,1.0,1.0,<=50K


In [38]:
# # # filter the dataframe for each combination

# #White Men
# WM_test = merge_df_test.loc[(merge_df_test['SEX'] == 1) & (merge_df_test['RAC1P'] == 1)]
# #Black Men
# BM_test = merge_df_test.loc[(merge_df_test['SEX'] == 1) & (merge_df_test['RAC1P'] == 2)]
# #white Women
# WW_test = merge_df_test.loc[(merge_df_test['SEX'] == 2) & (merge_df_test['RAC1P'] == 1)]
# #Black Women
# BW_test = merge_df_test.loc[(merge_df_test['SEX'] == 2) & (merge_df_test['RAC1P'] == 2)]

# print("Length: ",len(WM_test),len(BM_test),len(WW_test),len(BW_test))

# WW_test.to_csv(f'50_clients_data/client_subG_splits/WW.test', header=False, index=False)
# WM_test.to_csv(f'50_clients_data/client_subG_splits/WM.test', header=False, index=False)
# BW_test.to_csv(f'50_clients_data/client_subG_splits/BW.test', header=False, index=False)
# BM_test.to_csv(f'50_clients_data/client_subG_splits/BM.test', header=False, index=False)

# BM_test.head()

In [39]:
# WM_test['PINCP'] = WM_test['PINCP'].astype(str) + '.'


In [40]:

# # =====Training=============

# WW_cut= 111
# WM_cut= 111
# BW_cut=111
# BM_cut=111

# # =====Testing=============


# WW.loc[WW_cut:, 'PINCP'] = WW.loc[WW_cut:, 'PINCP'].astype(str) + '.'
# WW[WW_cut:].to_csv(f'50_clients_data/client_subG_splits/WW.test', header=False, index=False)

# WM.loc[WM_cut:, 'PINCP'] = WM.loc[WM_cut:, 'PINCP'].astype(str) + '.'
# WM[WM_cut:].to_csv(f'50_clients_data/client_subG_splits/WM.test', header=False, index=False)

# BW.loc[BW_cut:, 'PINCP'] = BW.loc[BW_cut:, 'PINCP'].astype(str) + '.'
# BW[BW_cut:].to_csv(f'50_clients_data/client_subG_splits/BW.test', header=False, index=False)

# BM.loc[BM_cut:, 'PINCP'] = BM.loc[BM_cut:, 'PINCP'].astype(str) + '.'
# BM[BM_cut:].to_csv(f'50_clients_data/client_subG_splits/BM.test', header=False, index=False)


# WW[WW_cut:].head()


#-------------- Group data with NY-------------
### subgroup data with NY

In [12]:
merge_df=merge_dfs["NY"].copy()
len(merge_df)

103021

In [13]:
# Group-level splits of the NY frame: by SEX (1 / 2) and by RAC1P (1 / 2).
men = merge_df.loc[(merge_df['SEX'] == 1)]
women = merge_df.loc[(merge_df['SEX'] == 2)]
white = merge_df.loc[(merge_df['RAC1P'] == 1)]
black = merge_df.loc[(merge_df['RAC1P'] == 2)]

print("Length: ",len(men),len(women),len(white),len(black))

# (file stem, frame, number of leading rows that go to the training file)
group_specs = (
    ('men', men, 40000),
    ('women', women, 40000),
    ('white', white, 40000),
    ('black', black, 10000),
)

for group_name, group_df, n_train in group_specs:
    # Split purely by position. The earlier version mixed positional slicing
    # ([:n]) with label-based .loc[n:]; on these filtered frames the index
    # labels are not positions, so the '.' suffix was also applied in memory to
    # rows of the training slice, and the assignment on a filtered slice raised
    # SettingWithCopyWarning.
    train_part = group_df.iloc[:n_train]
    test_part = group_df.iloc[n_train:].copy()

    # Test labels carry a trailing '.'; the training labels are left as they are.
    test_part['PINCP'] = test_part['PINCP'].astype(str) + '.'

    train_part.to_csv(f'50_clients_data/client_subG_splits/{group_name}.data', header=False, index=False)
    test_part.to_csv(f'50_clients_data/client_subG_splits/{group_name}.test', header=False, index=False)

(52178, 50843, 73665, 11647)

# Take 100 random data points from each client

In [13]:
import os

In [14]:
# Training Ground Truth

In [4]:
# Draw a 100-row sample from every client's .data file in the raw-data folder.
folder_path = '50_clients_data/raw_data/'
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.data'):
        continue
    df = pd.read_csv(os.path.join(folder_path, file_name), header=None)
    # Cap n at the file size so small files do not raise, and fix the seed so
    # the sample is the same on every run.
    temp_df = df.sample(n=min(100, len(df)), random_state=42)
    base_name = os.path.splitext(file_name)[0]
    temp_df.to_csv(f'50_clients_data/client_raw_data_100_sample/{base_name}_100.data', header=False, index=False)


In [16]:
# Testing

In [5]:
# Draw a 100-row sample from every client's .test file in the raw-data folder.
folder_path = '50_clients_data/raw_data/'
for file_name in os.listdir(folder_path):
    if not file_name.endswith('.test'):
        continue
    df = pd.read_csv(os.path.join(folder_path, file_name), header=None)
    # Cap n at the file size so small files do not raise, and fix the seed so
    # the sample is the same on every run.
    temp_df = df.sample(n=min(100, len(df)), random_state=42)
    base_name = os.path.splitext(file_name)[0]
    temp_df.to_csv(f'50_clients_data/client_raw_data_100_test/{base_name}_100.test', header=False, index=False)

In [19]:
# for state_code, df in dfs_loaded.items():
#     df_temp=dfs_loaded[state_code].reset_index(drop=True)
#     train_data, test_data = train_test_split(df_temp, test_size=0.2, random_state=42)
    
#     train_data.to_csv(f'50_clients_data/client_raw_data_100_sample/{state_code}_100.data', header=False, index=False)

# sample_100_AK.to_csv(f'50_clients_data/raw_data/AK_100.data', header=False, index=False)

In [21]:
# for state_code, (features, label) in dfs.items():
#     # take 30%
#     num_rows_to_keep = int(len(features) * 0.3) 
#     random_indices = np.random.choice(len(features), num_rows_to_keep, replace=False)
#     reduced_features = features.iloc[random_indices]
#     reduced_label = label.iloc[random_indices]
#     dfs[state_code] = (reduced_features, reduced_label)

# for state_code, (reduced_features, reduced_label) in dfs.items():
#     print(f"State: {state_code}, Reduced Features Length: {len(reduced_features)}, Reduced Label Length: {len(reduced_label)}")

In [16]:
import pickle

# Persist the merge_dfs dictionary to dfs.pickle.
with open('dfs.pickle', mode='wb') as pickle_out:
    pickle.dump(merge_dfs, pickle_out)

In [17]:
# Reload the dictionary from dfs.pickle.
# Caution: pickle.load executes arbitrary code; only load files you created yourself.
with open('dfs.pickle', mode='rb') as pickle_in:
    dfs_loaded = pickle.load(pickle_in)

In [18]:
# Report how many rows each reloaded state frame holds.
for state_code in dfs_loaded:
    df = dfs_loaded[state_code]
    print(f"State: {state_code}, df Length: {len(df)}")

State: AL, df Length: 22268


In [20]:
dfs_loaded["AL"].reset_index(drop=True)

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,18.0,1.0,18.0,5.0,4720.0,13.0,17.0,21.0,2.0,2.0,<=50K
1,53.0,5.0,17.0,5.0,3605.0,18.0,16.0,40.0,1.0,1.0,<=50K
2,41.0,1.0,16.0,5.0,7330.0,1.0,17.0,40.0,1.0,1.0,<=50K
3,18.0,6.0,18.0,5.0,2722.0,1.0,17.0,2.0,2.0,1.0,<=50K
4,21.0,5.0,19.0,5.0,3870.0,12.0,17.0,50.0,1.0,1.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...
22263,20.0,6.0,19.0,5.0,4251.0,1.0,4.0,25.0,1.0,1.0,<=50K
22264,63.0,1.0,16.0,1.0,440.0,1.0,0.0,48.0,1.0,1.0,>50K
22265,65.0,2.0,21.0,5.0,420.0,1.0,2.0,40.0,2.0,1.0,>50K
22266,37.0,1.0,16.0,4.0,340.0,6.0,0.0,50.0,2.0,1.0,<=50K
