In [1]:
import torch
import numpy as np
import pandas as pd
from datasets.base_dataset import BaseDataset
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset, DataLoader
import sys
sys.path.append("..")
from utils import to_numeric
import pickle

In [7]:
class ADULT(BaseDataset):

    def __init__(self, name='ADULT', single_bit_binary=False, device='cpu', random_state=42, name_state="CA"):
        super(ADULT, self).__init__(name=name, device=device, random_state=random_state)
        print(name_state)
        self.features = {
            'AGEP': None,
            'COW': None,
            'SCHL': None,
            'MAR': None,
            'OCCP': None,
            'POBP': None,
            'RELP': None,
            'WKHP': None,
            'SEX': None,
            'RAC1P': None,      
            'PINCP': ['>50K', '<=50K']
        }
        
        self.single_bit_binary = single_bit_binary
        self.label = 'PINCP'

        self.train_features = {key: self.features[key] for key in self.features.keys() if key != self.label}

        # name_state="CA"
        # self.train_data_df = pd.read_csv(f'50_clients_data/client_subG_splits/{name_state}.data', delimiter=',', names=list(self.features.keys()), engine='python')        
        # self.test_data_df = pd.read_csv(f'50_clients_data/client_subG_splits/{name_state}.test', delimiter=',', names=list(self.features.keys()), skiprows=1, engine='python')


        self.train_data_df = pd.read_csv(f'50_clients_data_testing/raw_data/testing.data', delimiter=',', names=list(self.features.keys()), engine='python')        
        self.test_data_df = pd.read_csv(f'50_clients_data_testing/raw_data/testing.data', delimiter=',', names=list(self.features.keys()), skiprows=1, engine='python')

        train_data = self.train_data_df.to_numpy()
        test_data = self.test_data_df.to_numpy()

        train_rows_to_keep = [not ('?' in row) for row in train_data]
        test_rows_to_keep = [not ('?' in row) for row in test_data]

        train_data = train_data[train_rows_to_keep]
        test_data = test_data[test_rows_to_keep]

        # remove the annoying dot from the test labels
        for row in test_data:
            # print(len(row))
            # print(row[-1])

            row[-1] = row[-1][:-1]

        # convert to numeric features
        train_data_num = to_numeric(train_data, self.features, label=self.label, single_bit_binary=self.single_bit_binary)
        test_data_num = to_numeric(test_data, self.features, label=self.label, single_bit_binary=self.single_bit_binary)

        # split features and labels
        Xtrain, Xtest = train_data_num[:, :-1].astype(np.float32), test_data_num[:, :-1].astype(np.float32)
        ytrain, ytest = train_data_num[:, -1].astype(np.float32), test_data_num[:, -1].astype(np.float32)

        print(name_state,len(Xtrain))
        print("ytrain",np.unique(ytrain))
        print("ytest",np.unique(ytest))
        
        self.num_features = Xtrain.shape[1]

        # transfer to torch
        self.Xtrain, self.Xtest = torch.tensor(Xtrain).to(self.device), torch.tensor(Xtest).to(self.device)
        self.ytrain, self.ytest = torch.tensor(ytrain, dtype=torch.long).to(self.device), torch.tensor(ytest, dtype=torch.long).to(self.device)

        # set to train mode as base
        self.train()

        # calculate the standardization statistics
        self._calculate_mean_std()

        # calculate the histograms and feature bounds
        self._calculate_categorical_feature_distributions_and_continuous_bounds()

In [8]:
state_codes = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
               "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
               "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
               "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
               "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

# state_codes=["AK"]
# state_codes=["BM","BW","WM","WW"]
# state_codes=["men","women","white","black"]

# pytorch 2.3 Loader -- SubGroup Processed


### all 166k data

In [9]:
adult_dataset = ADULT(name_state="CA")
adult_dataset.standardize()
dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

with open(f'50_clients_data/client_subG_processed/testing_all.pkl', 'wb') as f:
    pickle.dump(dataloader, f)
    

CA
CA 166771
ytrain [0. 1.]
ytest [0.]


In [5]:
# for state_code in state_codes:
#     state_name=state_code
#     adult_dataset = ADULT(name_state=state_name)
#     adult_dataset.standardize()
#     dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
#     dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

#     with open(f'50_clients_data/client_subG_processed/testing_all.pkl', 'wb') as f:
#         pickle.dump(dataloader, f)
        

BM


FileNotFoundError: [Errno 2] No such file or directory: '50_clients_data/client_subG_splits/testing.data'

# only for testing


In [9]:
for state_code in state_codes:
    state_name=state_code
    adult_dataset = ADULT(name_state=state_name)
    adult_dataset.standardize()
    dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

    with open(f'50_clients_data/client_subG_processed/{state_name}.pkl', 'wb') as f:
        pickle.dump(dataloader, f)
        
    # dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
    # dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  
    
    # with open(f'50_clients_data/client_subG_processed/{state_name}_test.pkl', 'wb') as f:
    #     pickle.dump(dataloader, f)

BM
BM 76
ytrain [0. 1.]
ytest [0. 1.]
BW
BW 107
ytrain [0. 1.]
ytest []
WM
WM 592
ytrain [0. 1.]
ytest [0. 1.]
WW
WW 570
ytrain [0. 1.]
ytest [0. 1.]


In [8]:
adult_dataset.train_data_df

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,24.0,1.0,19.0,5.0,520.0,26.0,12.0,30.0,2.0,1.0,<=50K
1,41.0,5.0,22.0,3.0,2360.0,36.0,0.0,40.0,2.0,1.0,>50K
2,68.0,1.0,19.0,2.0,4055.0,36.0,0.0,20.0,2.0,1.0,<=50K
3,54.0,1.0,21.0,1.0,705.0,36.0,1.0,50.0,2.0,1.0,>50K
4,63.0,2.0,16.0,1.0,5160.0,36.0,1.0,40.0,2.0,1.0,>50K
...,...,...,...,...,...,...,...,...,...,...,...
4432,26.0,1.0,11.0,1.0,2545.0,36.0,1.0,1.0,2.0,1.0,<=50K
4433,54.0,1.0,19.0,1.0,5110.0,36.0,0.0,37.0,2.0,1.0,<=50K
4434,23.0,1.0,21.0,5.0,540.0,36.0,15.0,40.0,2.0,1.0,<=50K
4435,56.0,1.0,17.0,4.0,3603.0,36.0,0.0,38.0,2.0,1.0,<=50K


In [6]:
state_name="male"
client_data_dir="50_clients_data/client_subG_processed/"

with open(client_data_dir+f'{state_name}.pkl', 'rb') as f:
    train_data_all_client  = pickle.load(f)

len(train_data_all_client)*32

10016

In [15]:
# adult_dataset.standardize()
# adult_dataset.Xtest

# # SEX, RAC1P are same... thats why its 0

In [16]:
# adult_dataset.Xtest[2]

In [17]:
# adult_dataset.de_standardize()
# adult_dataset.Xtest

In [18]:
# for state_code in state_codes:
#     state_name=state_code
#     adult_dataset = ADULT(name_state=state_name)
#     adult_dataset.standardize()
#     dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
#     dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

#     with open(f'50_clients_data/processed_data/{state_name}.pkl', 'wb') as f:
#         pickle.dump(dataloader, f)
        
#     dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
#     dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  
    
#     with open(f'50_clients_data/processed_data/{state_name}_test.pkl', 'wb') as f:
#         pickle.dump(dataloader, f)

In [8]:
state_name="WM"

client_data_dir="50_clients_data/client_subG_processed/"

with open(client_data_dir+f'{state_name}.pkl', 'rb') as f:
    train_data_all_client  = pickle.load(f)

len(train_data_all_client)*32

64000

In [6]:
# state_name=state_code
# adult_dataset = ADULT(name_state=state_name)

In [7]:
# adult_dataset.standardize()

In [8]:
# adult_dataset.Xtrain

In [9]:
# dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

# with open(f'50_clients_data/processed_data/{sta_name}.pkl', 'wb') as f:
#     pickle.dump(dataloader, f)


# dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

# with open(f'50_clients_data/processed_data/{sta_name}_test.pkl', 'wb') as f:
#     pickle.dump(dataloader, f)

In [10]:
with open('50_clients_data/processed_data/AL.pkl', 'rb') as f:
    dfs_loaded = pickle.load(f)

In [11]:
# for state_code, (features, label) in dfs.items():
#     # take 30%
#     num_rows_to_keep = int(len(features) * 0.3) 
#     random_indices = np.random.choice(len(features), num_rows_to_keep, replace=False)
#     reduced_features = features.iloc[random_indices]
#     reduced_label = label.iloc[random_indices]
#     dfs[state_code] = (reduced_features, reduced_label)

# for state_code, (reduced_features, reduced_label) in dfs.items():
#     print(f"State: {state_code}, Reduced Features Length: {len(reduced_features)}, Reduced Label Length: {len(reduced_label)}")

In [12]:
# import pickle

# # Save the dictionary to a file
# with open('dfs.pickle', 'wb') as f:
#     pickle.dump(merge_dfs, f)

In [13]:
# with open('dfs.pickle', 'rb') as f:
#     dfs_loaded = pickle.load(f)

In [14]:
# for state_code, df in dfs_loaded.items():
#     print(f"State: {state_code}, df Length: {len(df)}")

In [15]:
# dfs_loaded["TX"].reset_index(drop=True)