In [None]:
from folktables import ACSDataSource, ACSEmployment,ACSIncome
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from datasets.base_dataset import BaseDataset
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import Dataset, DataLoader
import sys
from utils import to_numeric
import pickle


In [4]:
data_source = ACSDataSource(survey_year='2018', horizon='1-Year', survey='person')


# data collection

In [3]:
state_codes = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
               "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
               "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
               "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
               "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]


dfs = {}
for state_code in state_codes:
    acs_data = data_source.get_data(states=[state_code],download=True)
    features, label, group = ACSIncome.df_to_pandas(acs_data)
    dfs[state_code] = (features, label)


In [4]:
all_len=[]
for state_code, (features, label) in dfs.items():
    all_len.append(len(label))
    print(f"State: {state_code}, Features Length: {len(features)}, Label Length: {len(label)}")

State: AL, Features Length: 22268, Label Length: 22268
State: AK, Features Length: 3546, Label Length: 3546
State: AZ, Features Length: 33277, Label Length: 33277
State: AR, Features Length: 13929, Label Length: 13929
State: CA, Features Length: 195665, Label Length: 195665
State: CO, Features Length: 31306, Label Length: 31306
State: CT, Features Length: 19785, Label Length: 19785
State: DE, Features Length: 4713, Label Length: 4713
State: FL, Features Length: 98925, Label Length: 98925
State: GA, Features Length: 50915, Label Length: 50915
State: HI, Features Length: 7731, Label Length: 7731
State: ID, Features Length: 8265, Label Length: 8265
State: IL, Features Length: 67016, Label Length: 67016
State: IN, Features Length: 35022, Label Length: 35022
State: IA, Features Length: 17745, Label Length: 17745
State: KS, Features Length: 15807, Label Length: 15807
State: KY, Features Length: 22006, Label Length: 22006
State: LA, Features Length: 20667, Label Length: 20667
State: ME, Featu

In [7]:
merge_dfs={}
sample_size=500

for state_code, (features, label) in dfs.items():   

    merge_df = pd.concat([features, label], axis=1)
    merge_df = merge_df.dropna()
    print(len(merge_df))
    merge_df['PINCP'] = merge_df['PINCP'].replace({True: '>50K', False: '<=50K'})

    # Random sampling with maintaning the statistics 
    # merge_df['strat'] = merge_df['SEX'].astype(str) + '_' + merge_df['RELP'].astype(str)
    merge_df['strat'] = merge_df['SEX'].astype(str) + '_' + merge_df['RELP'].astype(str) + '_' + merge_df['PINCP'].astype(str)
    
    # merge_df['strat'] = (
    #     merge_df['COW'].astype(str) + '_' +
    #     merge_df['MAR'].astype(str) + '_' +
    #     merge_df['SEX'].astype(str) + '_' +
    #     merge_df['RAC1P'].astype(str) + '_' +
    #     merge_df['PINCP'].astype(str)
    #     )

    # Check for rare combinations (occur less than 2 times)
    #     
    
    combination_counts = merge_df['strat'].value_counts()
    rare_combinations = combination_counts[combination_counts < 2].index
    # print(len(merge_df))
    print("rare_combinations:",len(rare_combinations))

    # Remove rare combination:- E.g 2.0_14.0_False from FL
    
    merge_df = merge_df[~merge_df['strat'].isin(rare_combinations)]
    print(len(merge_df))

    sampled_df, _ = train_test_split(
    merge_df,
    train_size=sample_size,
    stratify=merge_df['strat'],
    random_state=42  
)

    sampled_df = sampled_df.drop('strat', axis=1)
    sampled_df = sampled_df.reset_index(drop=True)

    merge_dfs[state_code] = sampled_df
    # merge_dfs[state_code] = merge_df
    
for state_code, df in merge_dfs.items():
    label_counts = df['PINCP'].value_counts()
    print(f"State: {state_code}, df Length: {len(df)}, Label Counts: {label_counts.to_dict()}")

    # print(f"State: {state_code}, df Length: {len(df)}")

22268
rare_combinations: 4
22264
3546
rare_combinations: 5
3541
33277
rare_combinations: 0
33277
13929
rare_combinations: 2
13927
195665
rare_combinations: 0
195665
31306
rare_combinations: 1
31305
19785
rare_combinations: 2
19783
4713
rare_combinations: 5
4708
98925
rare_combinations: 1
98924
50915
rare_combinations: 3
50912
7731
rare_combinations: 4
7727
8265
rare_combinations: 6
8259
67016
rare_combinations: 0
67016
35022
rare_combinations: 1
35021
17745
rare_combinations: 5
17740
15807
rare_combinations: 6
15801
22006
rare_combinations: 7
21999
20667
rare_combinations: 5
20662
7002
rare_combinations: 6
6996
33042
rare_combinations: 1
33041
40114
rare_combinations: 0
40114
50008
rare_combinations: 1
50007
31021
rare_combinations: 3
31018
13189
rare_combinations: 6
13183
31664
rare_combinations: 2
31662
5463
rare_combinations: 11
5452
10785
rare_combinations: 5
10780
14807
rare_combinations: 6
14801
7966
rare_combinations: 4
7962
47781
rare_combinations: 0
47781
8711
rare_combination

In [16]:
final_data = pd.concat(merge_dfs.values(), keys=merge_dfs.keys(), names=['state_code', 'index'])

final_data = final_data.reset_index()
final_data=final_data.drop(columns=['state_code', 'index'])


In [18]:
len(final_data)

25000

In [20]:
# take only 1 and 2 
final_data=final_data[(final_data['RAC1P'].isin([1, 2])) & (final_data['SEX'].isin([1, 2]))]
len(final_data)

22218

In [21]:
final_data.head()

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
1,33.0,1.0,16.0,3.0,8990.0,1.0,0.0,40.0,1.0,1.0,<=50K
2,76.0,1.0,16.0,1.0,4230.0,1.0,1.0,40.0,2.0,2.0,<=50K
3,79.0,1.0,1.0,1.0,9645.0,1.0,0.0,20.0,2.0,1.0,<=50K
4,33.0,1.0,18.0,1.0,1105.0,1.0,1.0,45.0,1.0,1.0,>50K
5,45.0,1.0,12.0,3.0,4230.0,1.0,0.0,40.0,2.0,1.0,<=50K


In [22]:
final_data['PINCP'].value_counts()

PINCP
<=50K    14253
>50K      7965
Name: count, dtype: int64

In [23]:
final_data['SEX'].value_counts()

SEX
1.0    11600
2.0    10618
Name: count, dtype: int64

In [24]:
final_data['RAC1P'].value_counts()

RAC1P
1.0    20304
2.0     1914
Name: count, dtype: int64

In [26]:
train_data, test_data = train_test_split(final_data, test_size=0.2, random_state=42)

In [27]:
train_data.to_csv(f'50_clients_data/raw_data/all_client_25K_data_train.data', header=False, index=False)

In [28]:
test_data['PINCP'] = test_data['PINCP'].astype(str) + '.'
print(test_data.head())

test_data.to_csv(f'50_clients_data/raw_data/all_client_25K.test', header=False, index=False)

       AGEP  COW  SCHL  MAR    OCCP  POBP  RELP  WKHP  SEX  RAC1P   PINCP
1845   65.0  3.0  22.0  1.0  2320.0   5.0   1.0  45.0  2.0    1.0   >50K.
8801   25.0  1.0  21.0  5.0  7000.0  22.0   2.0  40.0  1.0    1.0  <=50K.
11179  22.0  1.0  16.0  5.0  9645.0  27.0   2.0  50.0  1.0    1.0  <=50K.
9005   64.0  1.0  16.0  3.0  9121.0  36.0   0.0  28.0  1.0    1.0  <=50K.
11663  40.0  2.0  16.0  5.0  4010.0  28.0   2.0  40.0  1.0    2.0  <=50K.


In [29]:
# take BLACK AND WHITE

# data processing

In [5]:
sys.path.append("..")


In [6]:
class ADULT(BaseDataset):

    def __init__(self, name='ADULT', single_bit_binary=False, device='cpu', random_state=42, name_state="AL"):
        super(ADULT, self).__init__(name=name, device=device, random_state=random_state)
        print(name_state)
        self.features = {
            'AGEP': None,
            'COW': None,
            'SCHL': None,
            'MAR': None,
            'OCCP': None,
            'POBP': None,
            'RELP': None,
            'WKHP': None,
            'SEX': None,
            'RAC1P': None,      
            'PINCP': ['>50K', '<=50K']
        }
        
        self.single_bit_binary = single_bit_binary
        self.label = 'PINCP'

        self.train_features = {key: self.features[key] for key in self.features.keys() if key != self.label}

        # name_state="GA"
        print("Path is static")
        self.train_data_df = pd.read_csv(f'50_clients_data/raw_data/all_client_25K_data_train.data', delimiter=',', names=list(self.features.keys()), engine='python')
        self.test_data_df = pd.read_csv(f'50_clients_data/raw_data/all_client_25K.test', delimiter=',', names=list(self.features.keys()), skiprows=1, engine='python')

        train_data = self.train_data_df.to_numpy()
        test_data = self.test_data_df.to_numpy()

        train_rows_to_keep = [not ('?' in row) for row in train_data]
        test_rows_to_keep = [not ('?' in row) for row in test_data]

        train_data = train_data[train_rows_to_keep]
        test_data = test_data[test_rows_to_keep]

        # remove the annoying dot from the test labels
        for row in test_data:
            # print(len(row))
            # print(row[-1])

            row[-1] = row[-1][:-1]

        # convert to numeric features
        train_data_num = to_numeric(train_data, self.features, label=self.label, single_bit_binary=self.single_bit_binary)
        test_data_num = to_numeric(test_data, self.features, label=self.label, single_bit_binary=self.single_bit_binary)

        # split features and labels
        Xtrain, Xtest = train_data_num[:, :-1].astype(np.float32), test_data_num[:, :-1].astype(np.float32)
        ytrain, ytest = train_data_num[:, -1].astype(np.float32), test_data_num[:, -1].astype(np.float32)

        print(name_state,len(Xtrain))
        print("ytrain ",np.unique(ytrain))
        print("ytest ",np.unique(ytest))
        
        self.num_features = Xtrain.shape[1]

        # transfer to torch
        self.Xtrain, self.Xtest = torch.tensor(Xtrain).to(self.device), torch.tensor(Xtest).to(self.device)
        self.ytrain, self.ytest = torch.tensor(ytrain, dtype=torch.long).to(self.device), torch.tensor(ytest, dtype=torch.long).to(self.device)

        # set to train mode as base
        self.train()

        # calculate the standardization statistics
        self._calculate_mean_std()

        # calculate the histograms and feature bounds
        self._calculate_categorical_feature_distributions_and_continuous_bounds()

In [32]:
adult_dataset = ADULT()
adult_dataset.standardize()


AL
Path is static
AL 17774
ytrain  [0. 1.]
ytest  [0. 1.]


In [33]:

dataset = TensorDataset(adult_dataset.Xtrain, adult_dataset.ytrain)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

with open(f'50_clients_data/processed_data/all_client_25K_train.pkl', 'wb') as f:
        pickle.dump(dataloader, f)

dataset = TensorDataset(adult_dataset.Xtest, adult_dataset.ytest)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  

with open(f'50_clients_data/processed_data/all_client_25K_test.pkl', 'wb') as f:
        pickle.dump(dataloader, f)



In [34]:
"""
    0: >50k
    1: <=50k
    
"""

'\n    0: >50k\n    1: <=50k\n    \n'

In [35]:
torch.unique(adult_dataset.ytest, return_counts=True)

(tensor([0, 1]), tensor([1574, 2869]))

In [36]:
torch.unique(adult_dataset.ytrain, return_counts=True)

(tensor([0, 1]), tensor([ 6390, 11384]))

In [38]:
adult_dataset.de_standardize()

In [39]:
adult_dataset.ytest

tensor([1, 0, 1,  ..., 0, 1, 1])

# Model Training

In [2]:
class LinReLU(nn.Module):

    """
    A linear layer followed by a ReLU activation layer.
    """    
    
    def __init__(self, in_size, out_size):
        super(LinReLU, self).__init__()      
        linear = nn.Linear(in_size, out_size)
        ReLU = nn.ReLU()
        # self.Dropout = nn.Dropout(0.25)
        self.layers = nn.Sequential(linear, ReLU)

    def reset_parameters(self):
        self.layers[0].reset_parameters()
        return self

    def forward(self, x):
        x = self.layers(x)
        return x

class FullyConnected(nn.Module):
    """
    A simple fully connected neural network with ReLU activations.
    """
    def __init__(self, input_size, layout):

        super(FullyConnected, self).__init__()
        layers = [nn.Flatten()]  # does not play any role, but makes the code neater
        prev_fc_size = input_size
        for i, fc_size in enumerate(layout):
            if i + 1 < len(layout):
                layers += [LinReLU(prev_fc_size, fc_size)]
            else:
                layers += [nn.Linear(prev_fc_size, 1), nn.Sigmoid()]
                # layers += [nn.Linear(prev_fc_size, fc_size)]
            prev_fc_size = fc_size
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        x = self.layers(x)
        return x

In [None]:
client_data_dir="50_clients_data/processed_data/"

layout = [100, 100, 2]
batch_size = 32
num_epochs = 10  
input_dim = 10
lr=0.01

model = FullyConnected(input_dim, layout)
criterion = nn.BCELoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001) 



In [12]:
with open('50_clients_data/processed_data/all_client_25K_train.pkl', 'rb') as f:
    train_data_all_client  = pickle.load(f)

with open('50_clients_data/processed_data/all_client_25K_test.pkl', 'rb') as f:
    test_data  = pickle.load(f)

print(f"all data points_", len(train_data_all_client)*batch_size)   


all data points_ 17792


In [41]:

gradients = [] 

for epoch in range(num_epochs):
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in train_data_all_client:
        labels=labels.unsqueeze(1).float()
        optimizer.zero_grad()
        
        outputs = model(inputs)
        
        loss = criterion(outputs, labels)
        
        loss.backward()
        
        batch_gradients = []
        
        for param in model.parameters():
            if param.grad is not None:
                batch_gradients.append(param.grad.clone())
        gradients.append(batch_gradients)

        optimizer.step()
        
        running_loss += loss.item()
        predicted_classes = (outputs > 0.5).float()
        correct += (predicted_classes == labels).sum().item()
        total += labels.size(0)

    epoch_loss = running_loss / len(train_data_all_client)
    accuracy = correct / total
    
    # print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}, Training Accuracy: {accuracy:.4f}")
   
    model.eval()
    with torch.no_grad(): 
        val_running_loss = 0.0
        val_correct = 0
        val_total = 0
        
        for inputs, labels in test_data:
            labels=labels.unsqueeze(1).float()
            outputs = model(inputs)
            val_loss = criterion(outputs, labels)
            val_running_loss += val_loss.item()

            predicted_classes = (outputs > 0.5).float()
            val_correct += (predicted_classes == labels).sum().item()
            val_total += labels.size(0)

            
            # _, val_predicted = torch.max(outputs, 1)
            # val_total += labels.size(0)
            # val_correct += (val_predicted == labels).sum().item()
    
    val_epoch_loss = val_running_loss / len(test_data)
    val_accuracy = val_correct / val_total
    model.train()
    
    print(f"Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")


all data points_ 17792
Validation Loss: 0.4346, Validation Accuracy: 0.7907
Validation Loss: 0.4289, Validation Accuracy: 0.7961
Validation Loss: 0.4272, Validation Accuracy: 0.7943
Validation Loss: 0.4266, Validation Accuracy: 0.7947
Validation Loss: 0.4255, Validation Accuracy: 0.7950
Validation Loss: 0.4319, Validation Accuracy: 0.7941
Validation Loss: 0.4308, Validation Accuracy: 0.7945
Validation Loss: 0.4276, Validation Accuracy: 0.7947
Validation Loss: 0.4269, Validation Accuracy: 0.7952
Validation Loss: 0.4310, Validation Accuracy: 0.7954


In [42]:
# torch.save(gradients, 'AL_gradients.pth')

model_path = f"50_clients_data/clients_trained_model/pre_trained_model.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}\n")

Model saved to 50_clients_data/clients_trained_model/pre_trained_model.pth



# Transfer learning approach

In [3]:
client_data_dir="50_clients_data/processed_data/"
model_path = f"50_clients_data/clients_trained_model/pre_trained_model.pth"

layout = [100, 100, 2]
batch_size = 32
num_epochs = 10 
input_dim = 10
lr=0.01

model = FullyConnected(input_dim, layout)
criterion = nn.BCELoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001) 

In [4]:
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [6]:
# tableak_FT/50_clients_data/processed_data/AZ.pkl

In [8]:

with open('50_clients_data/processed_data/AK.pkl', 'rb') as f:
    new_train_data_all_client  = pickle.load(f)

with open('50_clients_data/processed_data/AK_test.pkl', 'rb') as f:
    new_test_data  = pickle.load(f)

print(f"all data points_", len(new_train_data_all_client)*batch_size)   


print(f"all data points_", len(new_test_data)*batch_size)   


all data points_ 2848
all data points_ 736


In [21]:
gradients = [] 

for epoch in range(num_epochs):
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, labels in new_test_data:
        labels=labels.unsqueeze(1).float()
        optimizer.zero_grad()
        
        outputs = model(inputs)
        
        loss = criterion(outputs, labels)
        
        loss.backward()
        
        batch_gradients = []
        
        for param in model.parameters():
            if param.grad is not None:
                batch_gradients.append(param.grad.clone())
        gradients.append(batch_gradients)

        optimizer.step()
        
        running_loss += loss.item()
        predicted_classes = (outputs > 0.5).float()
        correct += (predicted_classes == labels).sum().item()
        total += labels.size(0)

    epoch_loss = running_loss / len(train_data_all_client)
    accuracy = correct / total
    
    # print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}, Training Accuracy: {accuracy:.4f}")
   
    model.eval()
    with torch.no_grad(): 
        val_running_loss = 0.0
        val_correct = 0
        val_total = 0
        
        for inputs, labels in new_train_data_all_client:
            labels=labels.unsqueeze(1).float()
            outputs = model(inputs)
            val_loss = criterion(outputs, labels)
            val_running_loss += val_loss.item()

            predicted_classes = (outputs > 0.5).float()
            val_correct += (predicted_classes == labels).sum().item()
            val_total += labels.size(0)

            
            # _, val_predicted = torch.max(outputs, 1)
            # val_total += labels.size(0)
            # val_correct += (val_predicted == labels).sum().item()
    
    val_epoch_loss = val_running_loss / len(test_data)
    val_accuracy = val_correct / val_total
    model.train()
    
    print(f"Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

Epoch [1/10], Training Loss: 0.0142, Training Accuracy: 0.8307
Validation Loss: 1.9419, Validation Accuracy: 0.7634
Epoch [2/10], Training Loss: 0.0142, Training Accuracy: 0.8350
Validation Loss: 1.9615, Validation Accuracy: 0.7613
Epoch [3/10], Training Loss: 0.0139, Training Accuracy: 0.8392
Validation Loss: 1.9626, Validation Accuracy: 0.7602
Epoch [4/10], Training Loss: 0.0134, Training Accuracy: 0.8378
Validation Loss: 1.9960, Validation Accuracy: 0.7606
Epoch [5/10], Training Loss: 0.0131, Training Accuracy: 0.8491
Validation Loss: 2.0020, Validation Accuracy: 0.7599
Epoch [6/10], Training Loss: 0.0132, Training Accuracy: 0.8547
Validation Loss: 2.0300, Validation Accuracy: 0.7567
Epoch [7/10], Training Loss: 0.0133, Training Accuracy: 0.8533
Validation Loss: 2.0439, Validation Accuracy: 0.7595
Epoch [8/10], Training Loss: 0.0124, Training Accuracy: 0.8561
Validation Loss: 2.0553, Validation Accuracy: 0.7574
Epoch [9/10], Training Loss: 0.0129, Training Accuracy: 0.8632
Validatio