In [1]:
import torch
import numpy as np
import pandas as pd
from datasets.base_dataset import BaseDataset
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset, DataLoader
import sys
sys.path.append("..")
from utils import to_numeric
import pickle

In [36]:
class ADULT(BaseDataset):

    def __init__(self, name='ADULT', single_bit_binary=False, device='cpu', random_state=42, name_state="AL"):
        super(ADULT, self).__init__(name=name, device=device, random_state=random_state)

        self.features = {
            'AGEP': None,
            'COW': None,
            'SCHL': None,
            'MAR': None,
            'OCCP': None,
            'POBP': None,
            'RELP': None,
            'WKHP': None,
            'SEX': None,
            'RAC1P': None,      
            'PINCP': ['>50K', '<=50K']
        }
        
        self.single_bit_binary = single_bit_binary
        self.label = 'PINCP'

        self.train_features = {key: self.features[key] for key in self.features.keys() if key != self.label}

        # name_state="GA"
        self.train_data_df = pd.read_csv(f'50_clients_data/raw_data/{name_state}.data', delimiter=',', names=list(self.features.keys()), engine='python')
        self.test_data_df = pd.read_csv(f'50_clients_data/raw_data/{name_state}.test', delimiter=',', names=list(self.features.keys()), skiprows=1, engine='python')

        train_data = self.train_data_df.to_numpy()
        test_data = self.test_data_df.to_numpy()

        train_rows_to_keep = [not ('?' in row) for row in train_data]
        test_rows_to_keep = [not ('?' in row) for row in test_data]

        train_data = train_data[train_rows_to_keep]
        test_data = test_data[test_rows_to_keep]

        # remove the annoying dot from the test labels
        for row in test_data:
            # print(len(row))
            # print(row[-1])

            row[-1] = row[-1][:-1]

        # convert to numeric features
        train_data_num = to_numeric(train_data, self.features, label=self.label, single_bit_binary=self.single_bit_binary)
        test_data_num = to_numeric(test_data, self.features, label=self.label, single_bit_binary=self.single_bit_binary)

        # split features and labels
        Xtrain, Xtest = train_data_num[:, :-1].astype(np.float32), test_data_num[:, :-1].astype(np.float32)
        ytrain, ytest = train_data_num[:, -1].astype(np.float32), test_data_num[:, -1].astype(np.float32)

        print(name_state,len(Xtrain))
        print(np.unique(ytrain))
        print(np.unique(ytest))
        
        self.num_features = Xtrain.shape[1]

        # transfer to torch
        self.Xtrain, self.Xtest = torch.tensor(Xtrain).to(self.device), torch.tensor(Xtest).to(self.device)
        self.ytrain, self.ytest = torch.tensor(ytrain, dtype=torch.long).to(self.device), torch.tensor(ytest, dtype=torch.long).to(self.device)

        # set to train mode as base
        self.train()

        # calculate the standardization statistics
        self._calculate_mean_std()

        # calculate the histograms and feature bounds
        self._calculate_categorical_feature_distributions_and_continuous_bounds()

In [37]:
state_codes = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
               "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
               "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
               "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
               "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

In [38]:
for state_code in state_codes:
    sta_name=state_code
    adult_dataset = ADULT(name_state=sta_name)
    adult_dataset.standardize()
    dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

    with open(f'50_clients_data/processed_data/{sta_name}.pkl', 'wb') as f:
        pickle.dump(dataloader, f)
        
    dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  
    
    with open(f'50_clients_data/processed_data/{sta_name}_test.pkl', 'wb') as f:
        pickle.dump(dataloader, f)

AL 17814
[0. 1.]
[0. 1.]
AK 2836
[0. 1.]
[0. 1.]
AZ 26621
[0. 1.]
[0. 1.]
AR 11143
[0. 1.]
[0. 1.]
CA 156532
[0. 1.]
[0. 1.]
CO 25044
[0. 1.]
[0. 1.]
CT 15828
[0. 1.]
[0. 1.]
DE 3770
[0. 1.]
[0. 1.]
FL 79140
[0. 1.]
[0. 1.]
GA 40732
[0. 1.]
[0. 1.]
HI 6184
[0. 1.]
[0. 1.]
ID 6612
[0. 1.]
[0. 1.]
IL 53612
[0. 1.]
[0. 1.]
IN 28017
[0. 1.]
[0. 1.]
IA 14196
[0. 1.]
[0. 1.]
KS 12645
[0. 1.]
[0. 1.]
KY 17604
[0. 1.]
[0. 1.]
LA 16533
[0. 1.]
[0. 1.]
ME 5601
[0. 1.]
[0. 1.]
MD 26433
[0. 1.]
[0. 1.]
MA 32091
[0. 1.]
[0. 1.]
MI 40006
[0. 1.]
[0. 1.]
MN 24816
[0. 1.]
[0. 1.]
MS 10551
[0. 1.]
[0. 1.]
MO 25331
[0. 1.]
[0. 1.]
MT 4370
[0. 1.]
[0. 1.]
NE 8628
[0. 1.]
[0. 1.]
NV 11845
[0. 1.]
[0. 1.]
NH 6372
[0. 1.]
[0. 1.]
NJ 38224
[0. 1.]
[0. 1.]
NM 6968
[0. 1.]
[0. 1.]
NY 82416
[0. 1.]
[0. 1.]
NC 41653
[0. 1.]
[0. 1.]
ND 3564
[0. 1.]
[0. 1.]
OH 49708
[0. 1.]
[0. 1.]
OK 14333
[0. 1.]
[0. 1.]
OR 17535
[0. 1.]
[0. 1.]
PA 54646
[0. 1.]
[0. 1.]
RI 4569
[0. 1.]
[0. 1.]
SC 19903
[0. 1.]
[0. 1.]
SD 3919
[0

In [43]:
state_name="CA"

client_data_dir="50_clients_data/processed_data/"

with open(client_data_dir+f'{state_name}.pkl', 'rb') as f:
    train_data_all_client  = pickle.load(f)

len(train_data_all_client)*32

156544

557

In [32]:
sta_name=state_code
adult_dataset = ADULT(name_state=sta_name)

6184
[0. 1.]
[0. 1.]


In [33]:
adult_dataset.standardize()

In [34]:
adult_dataset.Xtrain

tensor([[ 1.0452,  1.7328,  0.4168,  ..., -1.3975, -0.9475, -1.4733],
        [-0.1802, -0.7781,  0.1062,  ...,  0.1364, -0.9475,  1.2291],
        [-0.3092, -0.7781, -0.8253,  ..., -1.5509,  1.0553,  1.2291],
        ...,
        [ 1.1097, -0.7781,  0.1062,  ...,  0.1364,  1.0553,  0.2157],
        [ 1.9481,  2.2349,  1.0378,  ..., -2.7781,  1.0553,  0.5535],
        [-1.2121, -0.7781,  1.0378,  ...,  0.1364, -0.9475,  0.2157]])

In [35]:
dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

with open(f'50_clients_data/processed_data/{sta_name}.pkl', 'wb') as f:
    pickle.dump(dataloader, f)


dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

with open(f'50_clients_data/processed_data/{sta_name}_test.pkl', 'wb') as f:
    pickle.dump(dataloader, f)

In [None]:
with open('WI.pickle', 'rb') as f:
    dfs_loaded = pickle.load(f)

In [21]:
# for state_code, (features, label) in dfs.items():
#     # take 30%
#     num_rows_to_keep = int(len(features) * 0.3) 
#     random_indices = np.random.choice(len(features), num_rows_to_keep, replace=False)
#     reduced_features = features.iloc[random_indices]
#     reduced_label = label.iloc[random_indices]
#     dfs[state_code] = (reduced_features, reduced_label)

# for state_code, (reduced_features, reduced_label) in dfs.items():
#     print(f"State: {state_code}, Reduced Features Length: {len(reduced_features)}, Reduced Label Length: {len(reduced_label)}")

In [58]:
# import pickle

# # Save the dictionary to a file
# with open('dfs.pickle', 'wb') as f:
#     pickle.dump(merge_dfs, f)

In [59]:
# with open('dfs.pickle', 'rb') as f:
#     dfs_loaded = pickle.load(f)

In [60]:
# for state_code, df in dfs_loaded.items():
#     print(f"State: {state_code}, df Length: {len(df)}")

State: AL, df Length: 4454
State: AK, df Length: 709
State: AZ, df Length: 6655
State: AR, df Length: 2786
State: CA, df Length: 39133
State: CO, df Length: 6261
State: CT, df Length: 3957
State: DE, df Length: 943
State: FL, df Length: 19785
State: GA, df Length: 10183
State: HI, df Length: 1546
State: ID, df Length: 1653
State: IL, df Length: 13403
State: IN, df Length: 7004
State: IA, df Length: 3549
State: KS, df Length: 3161
State: KY, df Length: 4401
State: LA, df Length: 4133
State: ME, df Length: 1400
State: MD, df Length: 6608
State: MA, df Length: 8023
State: MI, df Length: 10002
State: MN, df Length: 6204
State: MS, df Length: 2638
State: MO, df Length: 6333
State: MT, df Length: 1093
State: NE, df Length: 2157
State: NV, df Length: 2961
State: NH, df Length: 1593
State: NJ, df Length: 9556
State: NM, df Length: 1742
State: NY, df Length: 20604
State: NC, df Length: 10413
State: ND, df Length: 891
State: OH, df Length: 12427
State: OK, df Length: 3583
State: OR, df Length: 4

In [61]:
# dfs_loaded["TX"].reset_index(drop=True)

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,20.0,2.0,16.0,5.0,5420.0,48.0,0.0,12.0,2.0,1.0,<=50K
1,27.0,3.0,19.0,1.0,2320.0,18.0,0.0,40.0,2.0,1.0,<=50K
2,56.0,1.0,19.0,5.0,4710.0,48.0,15.0,50.0,1.0,1.0,>50K
3,27.0,1.0,21.0,5.0,4000.0,48.0,0.0,38.0,2.0,1.0,<=50K
4,76.0,3.0,21.0,1.0,5940.0,35.0,1.0,13.0,1.0,1.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...
27180,60.0,1.0,16.0,1.0,9130.0,48.0,0.0,40.0,1.0,1.0,<=50K
27181,74.0,1.0,21.0,1.0,4435.0,6.0,0.0,45.0,1.0,1.0,>50K
27182,43.0,1.0,16.0,5.0,4720.0,48.0,0.0,40.0,2.0,1.0,<=50K
27183,32.0,1.0,19.0,5.0,5410.0,6.0,0.0,40.0,1.0,1.0,>50K
