In [1]:
import torch
import numpy as np
import pandas as pd
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset, DataLoader
import pickle
import sys
sys.path.append("..")
from utils import to_numeric
from datasets.base_dataset import BaseDataset


In [4]:

class ADULT(BaseDataset):
    """Adult (census income) dataset assembled from two comma-separated files.

    Training rows are read from ``train_path`` and test rows from ``test_path``.
    Rows that contain the missing-value marker ``'?'`` are dropped, the remaining
    rows are encoded with the project helper ``to_numeric`` and stored as torch
    tensors on ``self.device``.
    """

    def __init__(self, name='ADULT', single_bit_binary=False, device='cpu', random_state=42, name_state=0,
                 train_path=None, test_path=None):
        """Load, clean, encode and tensorize the training and test data.

        Args:
            name: Dataset name forwarded to ``BaseDataset``.
            single_bit_binary: Forwarded unchanged to ``to_numeric``.
            device: Forwarded to ``BaseDataset``; the tensors are moved to ``self.device``.
            random_state: Forwarded to ``BaseDataset``.
            name_state: Client identifier; only used to build the default test path
                ``../clients_data/raw_data/client_{name_state}.test``.
            train_path: CSV file with the training rows. Defaults to the reconstructed
                data file ``../result_analysis/hello.data``.
            test_path: CSV file with the test rows (its first line is skipped).
                Defaults to the raw test file of client ``name_state``.
        """
        super(ADULT, self).__init__(name=name, device=device, random_state=random_state)

        # None marks a continuous feature; a list enumerates the categories of a categorical one.
        self.features = {
            'age': None,
            'workclass': ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov',
                          'Without-pay', 'Never-worked'],
            'fnlwgt': None,
            'education': ['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school', 'Assoc-acdm', 'Assoc-voc',
                          '9th', '7th-8th', '12th', 'Masters',
                          '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool'],
            'education-num': None,
            'marital-status': ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed',
                               'Married-spouse-absent', 'Married-AF-spouse'],
            'occupation': ['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial',
                           'Prof-specialty', 'Handlers-cleaners',
                           'Machine-op-inspct', 'Adm-clerical', 'Farming-fishing', 'Transport-moving',
                           'Priv-house-serv', 'Protective-serv', 'Armed-Forces'],
            'relationship': ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'],
            'race': ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black'],
            'sex': ['Female', 'Male'],
            'capital-gain': None,
            'capital-loss': None,
            'hours-per-week': None,
            'native-country': ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
                               'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece',
                               'South', 'China', 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland',
                               'Jamaica', 'Vietnam', 'Mexico', 'Portugal', 'Ireland',
                               'France', 'Dominican-Republic', 'Laos', 'Ecuador', 'Taiwan', 'Haiti', 'Columbia',
                               'Hungary', 'Guatemala', 'Nicaragua', 'Scotland',
                               'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong',
                               'Holand-Netherlands'],
            'income': ['>50K', '<=50K']
        }

        self.single_bit_binary = single_bit_binary
        self.label = 'income'

        # every feature except the label column
        self.train_features = {key: self.features[key] for key in self.features.keys() if key != self.label}

        # fall back to the locations this notebook has always read from
        if train_path is None:
            train_path = '../result_analysis/hello.data'
        if test_path is None:
            test_path = f'../clients_data/raw_data/client_{name_state}.test'

        column_names = list(self.features.keys())

        print("WE take Recon. data for processing")
        train_data_df = pd.read_csv(train_path, delimiter=',', names=column_names, engine='python')
        # the first line of the test file is skipped
        test_data_df = pd.read_csv(test_path, delimiter=',', names=column_names, skiprows=1, engine='python')
        print(train_data_df.head())
        # report the files that were actually read (the training file does not depend on name_state)
        print(f"training sample:: {train_path} and len is {len(train_data_df)}")
        print(f"testing sample:: {test_path} and len is {len(test_data_df)}")

        # original full-dataset loading, kept for reference (note the ', ' delimiter):
        # train_data_df = pd.read_csv('datasets/ADULT/adult.data', delimiter=', ', names=list(self.features.keys()), engine='python')
        # test_data_df = pd.read_csv('datasets/ADULT/adult.test', delimiter=', ', names=list(self.features.keys()), skiprows=1, engine='python')

        train_data = train_data_df.to_numpy()
        test_data = test_data_df.to_numpy()

        # drop missing values
        # note that the category never worked always comes with a missing value for the occupation field, hence this
        # step effectively removes the never worked category from the dataset
        train_rows_to_keep = ['?' not in row for row in train_data]
        test_rows_to_keep = ['?' not in row for row in test_data]
        train_data = train_data[train_rows_to_keep]
        test_data = test_data[test_rows_to_keep]

        # remove the annoying dot from the test labels (disabled for the client files)
        # for row in test_data:
        #     row[-1] = row[-1][:-1]

        # convert to numeric features
        train_data_num = to_numeric(train_data, self.features, label=self.label, single_bit_binary=self.single_bit_binary)
        test_data_num = to_numeric(test_data, self.features, label=self.label, single_bit_binary=self.single_bit_binary)

        # split features and labels: the last column is the label, everything before it is a feature
        Xtrain, Xtest = train_data_num[:, :-1].astype(np.float32), test_data_num[:, :-1].astype(np.float32)
        ytrain, ytest = train_data_num[:, -1].astype(np.float32), test_data_num[:, -1].astype(np.float32)
        self.num_features = Xtrain.shape[1]

        # show which label values are present in each split
        print(np.unique(ytrain))
        print(np.unique(ytest))

        # transfer to torch
        self.Xtrain, self.Xtest = torch.tensor(Xtrain).to(self.device), torch.tensor(Xtest).to(self.device)
        self.ytrain, self.ytest = torch.tensor(ytrain, dtype=torch.long).to(self.device), torch.tensor(ytest, dtype=torch.long).to(self.device)

        # set to train mode as base
        self.train()

        # calculate the standardization statistics
        self._calculate_mean_std()

        # calculate the histograms and feature bounds
        self._calculate_categorical_feature_distributions_and_continuous_bounds()


In [5]:
# Build the dataset from the reconstructed training file. name_state only selects
# which client test file is read alongside it.
adult_dataset = ADULT(name_state=0)
adult_dataset.standardize()

# Alternative export kept for reference: wrap features and labels in a shuffled DataLoader.
# dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
# with open('../clients_data/processed_data/hello.pkl', 'wb') as f:
#     pickle.dump(dataloader, f)

# Persist only the training feature tensor as it stands after the standardize() call.
with open('../clients_data/processed_data/directInputX_hello.pkl', 'wb') as xtrain_file:
    pickle.dump(adult_dataset.Xtrain, xtrain_file)

WE take Recon. data for processing
   age         workclass  fnlwgt     education  education-num  \
0   36           Private  172974       HS-grad             10   
1   37      Self-emp-inc  189901     Bachelors             10   
2   86  Self-emp-not-inc  182707   Prof-school             18   
3   71           Private  246104  Some-college             14   
4   36           Private  174460       HS-grad             10   

       marital-status         occupation   relationship                race  \
0       Never-married  Machine-op-inspct      Own-child               White   
1  Married-civ-spouse    Exec-managerial  Not-in-family  Amer-Indian-Eskimo   
2  Married-civ-spouse     Prof-specialty           Wife               White   
3  Married-civ-spouse    Exec-managerial  Not-in-family               White   
4       Never-married      Other-service  Not-in-family               White   

    sex  capital-gain  capital-loss  hours-per-week native-country income  
0  Male             0  

In [16]:
# Check the shape of the training feature tensor (the recorded output shows 16 rows x 105 columns).
adult_dataset.Xtrain.shape

torch.Size([16, 105])

In [None]:
# Peek at the training row with index 2, restricted to its first 100 feature columns.
adult_dataset.Xtrain[2:3,:100]

tensor([[ 2.5490, -1.7301, -0.2983,  5.6312, -0.1832, -0.2593, -0.2077, -0.0314,
          0.0000, -0.0073, -0.4578, -0.5471, -0.1714,  1.4764, -0.1380, -0.2102,
         -0.2077, -0.1046, -0.1224, -0.1093, -0.2461, -0.0771, -0.1488, -0.1093,
         -0.1224, -0.0444, -2.0276, -0.9136, -0.4195,  1.4531, -0.2024, -0.1683,
         -0.0771, -0.0314, -0.1971, -0.4113, -0.3267,  2.6549, -0.4064, -0.3948,
         -0.1944, -0.2323, -0.3898, -0.1744, -0.2102, -0.0444, -0.1652,  0.0000,
         -0.2024, -0.4064, -0.8330,  1.7165, -0.1621, -0.3864,  0.3965, -0.1714,
         -0.0945, -0.1182, -0.3041, -0.6937,  0.6937,  0.8126,  3.0399, -1.8255,
          0.3041,  0.0000, -0.0444, -0.0833, -0.0314, -0.0629,  0.0000, -0.0771,
         -0.0703, -0.0314,  0.0000, -0.0314, -0.0444,  0.0000,  0.0000, -0.0703,
         -0.0444, -0.0314,  0.0000, -0.0314, -0.1522, -0.0544, -0.0314, -0.0444,
         -0.0444,  0.0000,  0.0000, -0.0314, -0.0314,  0.0000, -0.0314, -0.0544,
         -0.0314, -0.0444,  

In [None]:
# tensor([[     2.5150,     -2.2710,     -0.5714,      5.3013,     -0.6019,
#              -1.4201,     -0.1139,      1.1448,      0.3965,      0.0021,
#               2.0328,     -0.8172,     -0.2513,      2.6770,     -2.1119,
#               0.0942,     -1.2541,      0.0855,     -0.2165,      0.0419,
#              -0.8917,     -0.4869,      0.1530,     -0.1718,     -0.0201,

In [1]:
# for i in range(0, 8): 
#     print(f"Index {i}:")
#     print(adult_dataset.Xtrain[i, :15]) 
#     print("-" * 50)  # Separator for readability

In [30]:
# Reload the object pickled to this path earlier in the notebook. Despite its name,
# loaded_dataloader holds the dumped adult_dataset.Xtrain tensor, not a DataLoader.
# NOTE: pickle.load can execute arbitrary code; only use it on files you produced yourself.
with open('../clients_data/processed_data/directInputX_hello.pkl', 'rb') as pickled_xtrain:
    loaded_dataloader = pickle.load(pickled_xtrain)

In [31]:
# Turn on gradient tracking for the loaded tensor in place; requires_grad_ returns the
# tensor itself, which is what the cell displays.
loaded_dataloader.requires_grad_(True)

tensor([[ 0.5827, -1.7439, -0.2916, -0.1781, -0.1840,  3.8690, -0.2085, -0.0315,
          0.0000, -0.8007, -0.4600, -0.5453, -0.1721, -0.6744, -0.1386, -0.2111,
         -0.2059, -0.1050, -0.1229, -0.1098,  4.0430, -0.0774, -0.1494, -0.1098,
         -0.1229, -0.0446,  0.6941, -0.9166, -0.4165, -0.6884,  4.9795, -0.1690,
         -0.0774, -0.0315, -0.1979, -0.4066, -0.3245, -0.3746, -0.4132,  2.5512,
         -0.1952, -0.2356, -0.3899, -0.1751, -0.2111, -0.0446, -0.1659,  0.0000,
         -0.2033, -0.4066, -0.8368,  1.7300, -0.1627, -0.3865,  0.3983, -0.1721,
         -0.0949, -0.1187, -0.3054, -0.6931,  0.6931, -0.1515,  2.7169,  0.4917,
          0.3054,  0.0000, -0.0446, -0.0836, -0.0315, -0.0631,  0.0000, -0.0774,
         -0.0706, -0.0315,  0.0000, -0.0315, -0.0446,  0.0000,  0.0000, -0.0706,
         -0.0446, -0.0315,  0.0000, -0.0315, -0.1528, -0.0546, -0.0315, -0.0446,
         -0.0446,  0.0000,  0.0000, -0.0315, -0.0315,  0.0000, -0.0315, -0.0546,
         -0.0315, -0.0446,  

In [5]:
# state_codes = [0,1,2,3,4,5,6,7,8,9]

# for state_code in state_codes:
#     state_name=state_code
#     adult_dataset = ADULT(name_state=state_name)
#     adult_dataset.standardize()
#     dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
#     dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

#     with open(f'../clients_data/processed_data/{state_name}.pkl', 'wb') as f:
#         pickle.dump(dataloader, f)
        
#     dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
#     dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  
    
#     with open(f'../clients_data/processed_data/{state_name}_test.pkl', 'wb') as f:
#         pickle.dump(dataloader, f)

# 10K training processed data

In [9]:
# state_name="data_pre_trained_model"
# adult_dataset = ADULT(name_state=state_name)
# adult_dataset.standardize()
# dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True) 
# with open(f'../clients_data/processed_data/{state_name}.pkl', 'wb') as f:
#     pickle.dump(dataloader, f)

# testing data

In [8]:
# state_name="client_adult"
# adult_dataset = ADULT(name_state=state_name)
# adult_dataset.standardize()
# dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True) 
# with open(f'../clients_data/processed_data/{state_name}.pkl', 'wb') as f:
#     pickle.dump(dataloader, f)

# group processed

In [7]:
# #============================ For testing =============================
# state_codes=["male","female","white","black"]

# for state_code in state_codes:
#     state_name=state_code
#     adult_dataset = ADULT(name_state=state_name)
#     adult_dataset.standardize()
#     dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
#     dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

#     with open(f'../clients_data/client_subG_processed/{state_name}.pkl', 'wb') as f:
#         pickle.dump(dataloader, f)
        
#     dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
#     dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  
    
#     with open(f'../clients_data/client_subG_processed/{state_name}_test.pkl', 'wb') as f:
#         pickle.dump(dataloader, f)

# Subgroup processed 

In [6]:
# state_codes=["WM","WW","BM","BW"]

# for state_code in state_codes:
#     state_name=state_code
#     adult_dataset = ADULT(name_state=state_name)
#     adult_dataset.standardize()
#     dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
#     dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

#     with open(f'../clients_data/client_subG_processed/{state_name}.pkl', 'wb') as f:
#         pickle.dump(dataloader, f)
        
#     dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
#     dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  
    
#     with open(f'../clients_data/client_subG_processed/{state_name}_test.pkl', 'wb') as f:
#         pickle.dump(dataloader, f)