In [8]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset, DataLoader
import pickle
import sys
sys.path.append("..")
from utils import to_numeric
from datasets.base_dataset import BaseDataset


In [9]:

class ADULT(BaseDataset):

    def __init__(self, name='ADULT', single_bit_binary=False, device='cpu', random_state=42,name_state=0):
        super(ADULT, self).__init__(name=name, device=device, random_state=random_state)

        self.features = {
            'age': None,
            'workclass': ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov',
                          'Without-pay', 'Never-worked'],
            'fnlwgt': None,
            'education': ['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm', 'Assoc-voc',
                          '9th', '7th-8th', '12th', 'Masters',
                          '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool'],
            'education-num': None,
            'marital-status': ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed',
                               'Married-spouse-absent', 'Married-AF-spouse'],
            'occupation': ['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial',
                           'Prof-specialty', 'Handlers-cleaners',
                           'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', 'Transport-moving',
                           'Priv-house-serv', 'Protective-serv', 'Armed-Forces'],
            'relationship': ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'],
            'race': ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'],
            'sex': ['Female', 'Male'],
            'capital-gain': None,
            'capital-loss': None,
            'hours-per-week': None,
            'native-country': ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
                               'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece',
                               'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland',
                               'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland',
                               'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia',
                               'Hungary', 'Guatemala', 'Nicaragua', 'Scotland',
                               'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong',
                               'Holand-Netherlands'],
            'income': ['>50K', '<=50K']
        }

        self.single_bit_binary = single_bit_binary
        self.label = 'income'

        self.train_features = {key: self.features[key] for key in self.features.keys() if key != self.label}

        if name_state in ["WM","WW","BM","BW"]:
            train_data_df = pd.read_csv(f'../clients_data/raw_data/{name_state}.data', delimiter=',', names=list(self.features.keys()), engine='python')
            test_data_df = pd.read_csv(f'../clients_data/raw_data/client_adult.test', delimiter=',', names=list(self.features.keys()), skiprows=1, engine='python')
        elif name_state in ["male","female","white","black"]:
            train_data_df = pd.read_csv(f'../clients_data/raw_data/{name_state}.data', delimiter=',', names=list(self.features.keys()), engine='python')
            test_data_df = pd.read_csv(f'../clients_data/raw_data/client_adult.test', delimiter=',', names=list(self.features.keys()), skiprows=1, engine='python')
        elif name_state in ["client_adult"]:
            #do not change this.. this is for creating test split of 10k
            train_data_df = pd.read_csv(f'../clients_data/raw_data/{name_state}.test', delimiter=',', names=list(self.features.keys()), engine='python')
            test_data_df = pd.read_csv(f'../clients_data/raw_data/client_adult.test', delimiter=',', names=list(self.features.keys()), skiprows=1, engine='python')
        elif name_state in ["data_pre_trained_model"]:
            
            train_data_df = pd.read_csv(f'../clients_data/raw_data/client_{name_state}.data', delimiter=',', names=list(self.features.keys()), engine='python')
            test_data_df = pd.read_csv(f'../clients_data/raw_data/client_adult.test', delimiter=',', names=list(self.features.keys()), skiprows=1, engine='python')
        else:
            train_data_df = pd.read_csv(f'../clients_data/raw_data/client_{name_state}.data', delimiter=',', names=list(self.features.keys()), engine='python')        
            test_data_df = pd.read_csv(f'../clients_data/raw_data/client_{name_state}.test', delimiter=',', names=list(self.features.keys()), skiprows=1, engine='python')
        
        print(f"training sample:: {name_state}.data and len is {len(train_data_df)}")
        print(f"testing sample:: {name_state}.test and len is {len(test_data_df)}")

        # train_data_df = pd.read_csv('datasets/ADULT/adult.data', delimiter=', ', names=list(self.features.keys()), engine='python')
        # test_data_df = pd.read_csv('datasets/ADULT/adult.test', delimiter=', ', names=list(self.features.keys()), skiprows=1, engine='python')

        train_data = train_data_df.to_numpy()
        test_data = test_data_df.to_numpy()

        # drop missing values
        # note that the category never worked always comes with a missing value for the occupation field, hence this
        # step effectively removes the never worked category from the dataset
        
        train_rows_to_keep = [not ('?' in row) for row in train_data]
        test_rows_to_keep = [not ('?' in row) for row in test_data]
        train_data = train_data[train_rows_to_keep]
        test_data = test_data[test_rows_to_keep]

        # remove the annoying dot from the test labels
        # for row in test_data:
        #     row[-1] = row[-1][:-1]

        # convert to numeric features
        train_data_num = to_numeric(train_data, self.features, label=self.label, single_bit_binary=self.single_bit_binary)
        test_data_num = to_numeric(test_data, self.features, label=self.label, single_bit_binary=self.single_bit_binary)

        # split features and labels
        Xtrain, Xtest = train_data_num[:, :-1].astype(np.float32), test_data_num[:, :-1].astype(np.float32)
        ytrain, ytest = train_data_num[:, -1].astype(np.float32), test_data_num[:, -1].astype(np.float32)
        self.num_features = Xtrain.shape[1]

        print(np.unique(ytrain))
        print(np.unique(ytest))

        # transfer to torch
        self.Xtrain, self.Xtest = torch.tensor(Xtrain).to(self.device), torch.tensor(Xtest).to(self.device)
        self.ytrain, self.ytest = torch.tensor(ytrain, dtype=torch.long).to(self.device), torch.tensor(ytest, dtype=torch.long).to(self.device)

        # set to train mode as base
        self.train()

        # calculate the standardization statistics
        self._calculate_mean_std()

        # calculate the histograms and feature bounds
        self._calculate_categorical_feature_distributions_and_continuous_bounds()


In [15]:
state_codes = [0,1,2,3,4,5,6,7,8,9]

for state_code in state_codes:
    state_name=state_code
    adult_dataset = ADULT(name_state=state_name)
    adult_dataset.standardize()
    dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

    with open(f'../clients_data/processed_data/{state_name}.pkl', 'wb') as f:
        pickle.dump(dataloader, f)
        
    dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  
    
    with open(f'../clients_data/processed_data/{state_name}_test.pkl', 'wb') as f:
        pickle.dump(dataloader, f)

training sample:: 0.data and len is 2000
testing sample:: 0.test and len is 999
[0. 1.]
[0. 1.]
training sample:: 1.data and len is 2000
testing sample:: 1.test and len is 999
[0. 1.]
[0. 1.]
training sample:: 2.data and len is 2000
testing sample:: 2.test and len is 999
[0. 1.]
[0. 1.]
training sample:: 3.data and len is 2000
testing sample:: 3.test and len is 999
[0. 1.]
[0. 1.]
training sample:: 4.data and len is 2000
testing sample:: 4.test and len is 999
[0. 1.]
[0. 1.]
training sample:: 5.data and len is 2000
testing sample:: 5.test and len is 999
[0. 1.]
[0. 1.]
training sample:: 6.data and len is 2000
testing sample:: 6.test and len is 999
[0. 1.]
[0. 1.]
training sample:: 7.data and len is 2000
testing sample:: 7.test and len is 999
[0. 1.]
[0. 1.]
training sample:: 8.data and len is 2000
testing sample:: 8.test and len is 998
[0. 1.]
[0. 1.]
training sample:: 9.data and len is 2000
testing sample:: 9.test and len is 998
[0. 1.]
[0. 1.]


# 10K traning processed data

In [11]:
state_name="data_pre_trained_model"
adult_dataset = ADULT(name_state=state_name)
adult_dataset.standardize()
dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True) 
with open(f'../clients_data/processed_data/{state_name}.pkl', 'wb') as f:
    pickle.dump(dataloader, f)

training sample:: data_pre_trained_model.data and len is 2616
testing sample:: data_pre_trained_model.test and len is 9999
[0. 1.]
[0. 1.]


# testing data

In [12]:
state_name="client_adult"
adult_dataset = ADULT(name_state=state_name)
adult_dataset.standardize()
dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True) 
with open(f'../clients_data/processed_data/{state_name}.pkl', 'wb') as f:
    pickle.dump(dataloader, f)

training sample:: client_adult.data and len is 10000
testing sample:: client_adult.test and len is 9999
[0. 1.]
[0. 1.]


# group processed

In [13]:
#============================ For testing =============================
state_codes=["male","female","white","black"]

for state_code in state_codes:
    state_name=state_code
    adult_dataset = ADULT(name_state=state_name)
    adult_dataset.standardize()
    dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

    with open(f'../clients_data/client_subG_processed/{state_name}.pkl', 'wb') as f:
        pickle.dump(dataloader, f)
        
    dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  
    
    with open(f'../clients_data/client_subG_processed/{state_name}_test.pkl', 'wb') as f:
        pickle.dump(dataloader, f)

training sample:: male.data and len is 6750
testing sample:: male.test and len is 9999
[0. 1.]
[0. 1.]
training sample:: female.data and len is 3250
testing sample:: female.test and len is 9999
[0. 1.]
[0. 1.]
training sample:: white.data and len is 8603
testing sample:: white.test and len is 9999
[0. 1.]
[0. 1.]
training sample:: black.data and len is 935
testing sample:: black.test and len is 9999
[0. 1.]
[0. 1.]


# Subgroup processed 

In [14]:
state_codes=["WM","WW","BM","BW"]

for state_code in state_codes:
    state_name=state_code
    adult_dataset = ADULT(name_state=state_name)
    adult_dataset.standardize()
    dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

    with open(f'../clients_data/client_subG_processed/{state_name}.pkl', 'wb') as f:
        pickle.dump(dataloader, f)
        
    dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  
    
    with open(f'../clients_data/client_subG_processed/{state_name}_test.pkl', 'wb') as f:
        pickle.dump(dataloader, f)

training sample:: WM.data and len is 5975
testing sample:: WM.test and len is 9999
[0. 1.]
[0. 1.]
training sample:: WW.data and len is 2628
testing sample:: WW.test and len is 9999
[0. 1.]
[0. 1.]
training sample:: BM.data and len is 474
testing sample:: BM.test and len is 9999
[0. 1.]
[0. 1.]
training sample:: BW.data and len is 461
testing sample:: BW.test and len is 9999
[0. 1.]
[0. 1.]
