In [1]:
import torch.nn as nn
import torch.optim as optim
import shutil
import os
from torch.utils.data import DataLoader
import torch
import numpy as np
import pandas as pd
import pickle
from fair_loss import FairLoss
from torch.nn import functional as F
from datasets import ADULT, German, Lawschool, HealthHeritage
from defenses import dp_defense

In [2]:
import torch
print(torch.__version__)

1.7.1


In [3]:
class LinReLU(nn.Module):

    """
    A linear layer followed by a ReLU activation layer.
    """    
    
    def __init__(self, in_size, out_size):
        super(LinReLU, self).__init__()      
        linear = nn.Linear(in_size, out_size)
        ReLU = nn.ReLU()
        # self.Dropout = nn.Dropout(0.25)
        self.layers = nn.Sequential(linear, ReLU)

    def reset_parameters(self):
        self.layers[0].reset_parameters()
        return self

    def forward(self, x):
        x = self.layers(x)
        return x

class FullyConnected(nn.Module):
    """
    A simple fully connected neural network with ReLU activations.
    """
    def __init__(self, input_size, layout):

        super(FullyConnected, self).__init__()
        layers = [nn.Flatten()]  # does not play any role, but makes the code neater
        prev_fc_size = input_size
        for i, fc_size in enumerate(layout):
            if i + 1 < len(layout):
                layers += [LinReLU(prev_fc_size, fc_size)]
            else:
                layers += [nn.Linear(prev_fc_size, 1), nn.Sigmoid()]
                # layers += [nn.Linear(prev_fc_size, fc_size)]
            prev_fc_size = fc_size
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        x = self.layers(x)
        return x

In [4]:
state_codes = ["AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
               "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
               "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
               "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
               "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

# Normal training

In [5]:
# client_data_dir="50_clients_data/processed_data/"

# layout = [100, 100, 2]
# batch_size = 32
# num_epochs = 10  
# input_dim = 10

# model = FullyConnected(input_dim, layout)
# criterion = nn.BCELoss() 
# optimizer = optim.Adam(model.parameters(), lr=0.001) 

# for state_code in state_codes:
    
#     # print(state_code)    
#     state_name=state_code
    
#     with open(client_data_dir+f'{state_name}.pkl', 'rb') as f:
#         train_data_all_client  = pickle.load(f)
    
#     with open(client_data_dir+f'{state_name}_test.pkl', 'rb') as f:
#         test_data  = pickle.load(f)

#     print(f"data points_{state_code}", len(train_data_all_client)*batch_size)    
    
#     for epoch in range(num_epochs):
#         running_loss = 0.0
#         correct = 0
#         total = 0
    
#         for inputs, labels in train_data_all_client:
#             labels=labels.unsqueeze(1).float()
#             optimizer.zero_grad()
            
#             outputs = model(inputs)
            
#             loss = criterion(outputs, labels)
            
#             loss.backward()
#             optimizer.step()
            
#             running_loss += loss.item()
#             predicted_classes = (outputs > 0.5).float()
#             correct += (predicted_classes == labels).sum().item()
#             total += labels.size(0)
               
#         epoch_loss = running_loss / len(train_data_all_client)
#         accuracy = correct / total
        
#         # print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}, Training Accuracy: {accuracy:.4f}")
       
#         model.eval()
#         with torch.no_grad(): 
#             val_running_loss = 0.0
#             val_correct = 0
#             val_total = 0
            
#             for inputs, labels in test_data:
#                 labels=labels.unsqueeze(1).float()
#                 outputs = model(inputs)
#                 val_loss = criterion(outputs, labels)
#                 val_running_loss += val_loss.item()

#                 predicted_classes = (outputs > 0.5).float()
#                 val_correct += (predicted_classes == labels).sum().item()
#                 val_total += labels.size(0)

                
#                 # _, val_predicted = torch.max(outputs, 1)
#                 # val_total += labels.size(0)
#                 # val_correct += (val_predicted == labels).sum().item()
        
#         val_epoch_loss = val_running_loss / len(test_data)
#         val_accuracy = val_correct / val_total
#         model.train()
        
#     print(f"Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
        
#     model_path = f"50_clients_data/clients_trained_model/{state_name}.pth"
#     torch.save(model.state_dict(), model_path)
#     print(f"Model saved to {model_path}\n")

# Training DP

In [6]:
# model = FullyConnected(input_dim, layout)
# criterion = nn.BCELoss() 
# optimizer = optim.Adam(model.parameters(), lr=0.001) 

In [14]:
k=0
client_data_dir="50_clients_data/processed_data/"

layout = [100, 100, 2]
batch_size = 32
num_epochs = 1
input_dim = 10
lr=0.001
noise_scale =0.1

for state_code in state_codes:
    # if k==1:
    #     break
    # print(state_code)    
    state_name=state_code
    
    model = FullyConnected(input_dim, layout)
    criterion = nn.BCELoss() 
    optimizer = optim.Adam(model.parameters(), lr=0.001) 
    
    with open(client_data_dir+f'{state_name}.pkl', 'rb') as f:
        train_data_all_client  = pickle.load(f)
    
    with open(client_data_dir+f'{state_name}_test.pkl', 'rb') as f:
        test_data  = pickle.load(f)

    print(f"data points_{state_code}", len(train_data_all_client)*batch_size)    
    
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        
        for inputs, labels in train_data_all_client:  

            # print("inputs:: ",inputs[:2])
            # print("labels:: ",labels[:2])
            
            permutation_indices = np.random.permutation(len(inputs))
            X_train_permuted, y_train_permuted = inputs[permutation_indices].detach().clone(), labels[permutation_indices].detach().clone()

            # print("X_train_permuted:: ",X_train_permuted[:2])
            # print("y_train_permuted:: ",y_train_permuted[:2])
            
            labels=labels.unsqueeze(1).float()
            y_train_permuted=y_train_permuted.unsqueeze(1).float()

            # print(y_train_permuted)
            # print(labels)
            
            # optimizer.zero_grad()
            model.zero_grad()
            
            outputs = model(X_train_permuted)
            loss = criterion(outputs, y_train_permuted)
                        
            grad = [g.detach() for g in torch.autograd.grad(loss, model.parameters(),retain_graph=True)]
            # print("grad",grad[0][:2])
            
            perturbed_grad = dp_defense(grad, noise_scale) if noise_scale > 0 else grad
            # print("perturbed_grad",perturbed_grad[0][:2])
            
            with torch.no_grad():
                for p, g in zip(model.parameters(), perturbed_grad):
                    p.data = p.data - lr * g
                               
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            predicted_classes = (outputs > 0.5).float()
            correct += (predicted_classes == y_train_permuted).sum().item()
            total += y_train_permuted.size(0)
               
        epoch_loss = running_loss / len(train_data_all_client)
        accuracy = correct / total
        
        # print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}, Training Accuracy: {accuracy:.4f}")
        # k+=1
        model.eval()
        with torch.no_grad(): 
            val_running_loss = 0.0
            val_correct = 0
            val_total = 0
            
            for inputs, labels in test_data:
                labels=labels.unsqueeze(1).float()
                outputs = model(inputs)
                val_loss = criterion(outputs, labels)
                val_running_loss += val_loss.item()

                predicted_classes = (outputs > 0.5).float()
                val_correct += (predicted_classes == labels).sum().item()
                val_total += labels.size(0)
           
        
        val_epoch_loss = val_running_loss / len(test_data)
        val_accuracy = val_correct / val_total
        model.train()
        
    print(f"Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
        
    model_path = f"50_clients_data/client_DP_trained_model/{state_name}.pth"
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}\n")

data points_AL 17824
Validation Loss: 0.4283, Validation Accuracy: 0.7941
Model saved to 50_clients_data/client_DP_trained_model/AL.pth

data points_AK 2848
Validation Loss: 0.4762, Validation Accuracy: 0.7630
Model saved to 50_clients_data/client_DP_trained_model/AK.pth

data points_AZ 26624
Validation Loss: 0.4333, Validation Accuracy: 0.7949
Model saved to 50_clients_data/client_DP_trained_model/AZ.pth

data points_AR 11168
Validation Loss: 0.4218, Validation Accuracy: 0.8025
Model saved to 50_clients_data/client_DP_trained_model/AR.pth

data points_CA 156544
Validation Loss: 0.4251, Validation Accuracy: 0.8013
Model saved to 50_clients_data/client_DP_trained_model/CA.pth

data points_CO 25056
Validation Loss: 0.4659, Validation Accuracy: 0.7745
Model saved to 50_clients_data/client_DP_trained_model/CO.pth

data points_CT 15840
Validation Loss: 0.4437, Validation Accuracy: 0.7915
Model saved to 50_clients_data/client_DP_trained_model/CT.pth

data points_DE 3776
Validation Loss: 0.48

# Training with Fair Loss

In [9]:
# http://vi.le.gitlab.io/fair-loss/

In [49]:
# data_iter = iter(train_data_all_client)
# batch = next(data_iter)
# inputs, labels = batch
# print(inputs[0])

In [15]:
#Gender
# print(inputs[:, 8].detach().unique())

tensor([-0.9571,  1.0448])


In [5]:
client_data_dir="50_clients_data/processed_data/"

layout = [100, 100, 2]
batch_size = 32
num_epochs = 10  
input_dim = 10


In [5]:
# state_codes = ["ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
#                "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
#                "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
#                "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

In [6]:
model = FullyConnected(input_dim, layout)
criterion = nn.BCELoss() 
optimizer = optim.Adam(model.parameters(), lr=0.001) 

for state_code in state_codes:
    
    # print(state_code)    
    state_name=state_code

    model = FullyConnected(input_dim, layout)
    criterion = nn.BCELoss() 
    optimizer = optim.Adam(model.parameters(), lr=0.001) 
    
    with open(client_data_dir+f'{state_name}.pkl', 'rb') as f:
        train_data_all_client  = pickle.load(f)
    
    with open(client_data_dir+f'{state_name}_test.pkl', 'rb') as f:
        test_data  = pickle.load(f)

    print(f"data points_{state_code}", len(train_data_all_client)*batch_size)    
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0
    
        for inputs, labels in train_data_all_client:
            fair_loss=FairLoss(torch.nn.BCELoss(), inputs[:, 8].detach().unique(), 'accuracy')
            # print(inputs[:, 8].detach().unique())
            
            labels=labels.unsqueeze(1).float()
            optimizer.zero_grad()
            
            outputs = model(inputs)
            
            loss_1 = criterion(outputs, labels)
            loss_2 = fair_loss(inputs[:, 8],outputs,labels)              
            final_loss=loss_1+loss_2
            final_loss.backward()
            
            optimizer.step()            
            running_loss += final_loss.item()
            predicted_classes = (outputs > 0.5).float()
            correct += (predicted_classes == labels).sum().item()
            total += labels.size(0)
            
        # print(loss2,"and",final_loss) 
        
        epoch_loss = running_loss / len(train_data_all_client)
        accuracy = correct / total
        
        # print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}, Training Accuracy: {accuracy:.4f}")
       
        model.eval()
        with torch.no_grad(): 
            val_running_loss = 0.0
            val_correct = 0
            val_total = 0
            
            for inputs, labels in test_data:
                fair_loss=FairLoss(torch.nn.BCELoss(), inputs[:, 8].detach().unique(), 'accuracy')
                
                labels=labels.unsqueeze(1).float()
                outputs = model(inputs)
                val_loss1 = criterion(outputs, labels)
                val_loss2 = fair_loss(inputs[:, 8],outputs,labels) 
                val_final_loss=val_loss1+val_loss2
                
                val_running_loss += val_final_loss.item()

                predicted_classes = (outputs > 0.5).float()
                val_correct += (predicted_classes == labels).sum().item()
                val_total += labels.size(0)

        
        val_epoch_loss = val_running_loss / len(test_data)
        val_accuracy = val_correct / val_total
        model.train()
        
    # print(f"Training Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}, Training Accuracy: {accuracy:.4f}")
    print(f"Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")   
    
    model_path = f"50_clients_data/clients_fair_trained_model/{state_name}.pth"
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}\n")

data points_AL 17824
Training Epoch [10/10], Training Loss: 0.8103, Training Accuracy: 0.8078
Validation Loss: 0.8219, Validation Accuracy: 0.8037
Model saved to 50_clients_data/clients_fair_trained_model/AL.pth

data points_AK 2848
Training Epoch [10/10], Training Loss: 0.8587, Training Accuracy: 0.7948
Validation Loss: 0.8838, Validation Accuracy: 0.7757
Model saved to 50_clients_data/clients_fair_trained_model/AK.pth

data points_AZ 26624
Training Epoch [10/10], Training Loss: 0.8229, Training Accuracy: 0.8040
Validation Loss: 0.8389, Validation Accuracy: 0.8044
Model saved to 50_clients_data/clients_fair_trained_model/AZ.pth

data points_AR 11168
Training Epoch [10/10], Training Loss: 0.7921, Training Accuracy: 0.8122
Validation Loss: 0.8747, Validation Accuracy: 0.8108
Model saved to 50_clients_data/clients_fair_trained_model/AR.pth

data points_CA 156544
Training Epoch [10/10], Training Loss: 0.8114, Training Accuracy: 0.8106
Validation Loss: 0.8165, Validation Accuracy: 0.8114
M

# DP and Fairness: Training

In [7]:
client_data_dir="50_clients_data/processed_data/"

layout = [100, 100, 2]
batch_size = 32
num_epochs = 10
input_dim = 10
lr=0.001
noise_scale =0.1

In [8]:
# state_codes = ["ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
#                "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
#                "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
#                "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"]

In [9]:
# k=0

for state_code in state_codes:
    # if k==1:
    #     break
    # print(state_code)    
    state_name=state_code
        
    model = FullyConnected(input_dim, layout)
    criterion = nn.BCELoss() 
    optimizer = optim.Adam(model.parameters(), lr=0.001) 
    
    with open(client_data_dir+f'{state_name}.pkl', 'rb') as f:
        train_data_all_client  = pickle.load(f)
    
    with open(client_data_dir+f'{state_name}_test.pkl', 'rb') as f:
        test_data  = pickle.load(f)

    print(f"data points_{state_code}", len(train_data_all_client)*batch_size)    
        
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0
        
        for inputs, labels in train_data_all_client:  
            
            permutation_indices = np.random.permutation(len(inputs))
            X_train_permuted, y_train_permuted = inputs[permutation_indices].detach().clone(), labels[permutation_indices].detach().clone()

            fair_loss=FairLoss(torch.nn.BCELoss(), X_train_permuted[:, 8].detach().unique(), 'accuracy')
            
            labels=labels.unsqueeze(1).float()
            y_train_permuted=y_train_permuted.unsqueeze(1).float()

            # print(y_train_permuted)
            # print(labels)
            
            # optimizer.zero_grad()
            model.zero_grad()
            
            outputs = model(X_train_permuted)
            # loss = criterion(outputs, y_train_permuted)

            loss_1 = criterion(outputs, y_train_permuted)
            loss_2 = fair_loss(X_train_permuted[:, 8],outputs,y_train_permuted)              
            final_loss=loss_1+loss_2
            
            grad = [g.detach() for g in torch.autograd.grad(final_loss, model.parameters(),retain_graph=True)]
            
            perturbed_grad = dp_defense(grad, noise_scale) if noise_scale > 0 else grad
            
            with torch.no_grad():
                for p, g in zip(model.parameters(), perturbed_grad):
                    p.data = p.data - lr * g
                               
            final_loss.backward()
            optimizer.step()
            
            running_loss += final_loss.item()
            predicted_classes = (outputs > 0.5).float()
            correct += (predicted_classes == y_train_permuted).sum().item()
            total += y_train_permuted.size(0)
               
        epoch_loss = running_loss / len(train_data_all_client)
        accuracy = correct / total
        
        # print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}, Training Accuracy: {accuracy:.4f}")
        # k+=1
        model.eval()
        with torch.no_grad(): 
            val_running_loss = 0.0
            val_correct = 0
            val_total = 0
            
            for inputs, labels in test_data:
                fair_loss=FairLoss(torch.nn.BCELoss(), inputs[:, 8].detach().unique(), 'accuracy')
                
                labels=labels.unsqueeze(1).float()
                outputs = model(inputs)
                val_loss1 = criterion(outputs, labels)
                val_loss2 = fair_loss(inputs[:, 8],outputs,labels) 
                
                val_final_loss = val_loss1+val_loss2
                
                val_running_loss += val_final_loss.item()

                predicted_classes = (outputs > 0.5).float()
                val_correct += (predicted_classes == labels).sum().item()
                val_total += labels.size(0)
           
        
        val_epoch_loss = val_running_loss / len(test_data)
        val_accuracy = val_correct / val_total
        model.train()
        
    # print(f"Training Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}, Training Accuracy: {accuracy:.4f}") 
    print(f"Validation Loss: {val_epoch_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
        
    model_path = f"50_clients_data/clients_DP_Fair_trained_model/{state_name}.pth"
    torch.save(model.state_dict(), model_path)
    print(f"Model saved to {model_path}\n")

data points_AL 17824
Validation Loss: 0.8294, Validation Accuracy: 0.8028
Model saved to 50_clients_data/clients_DP_Fair_trained_model/AL.pth

data points_AK 2848
Validation Loss: 0.8874, Validation Accuracy: 0.7842
Model saved to 50_clients_data/clients_DP_Fair_trained_model/AK.pth

data points_AZ 26624
Validation Loss: 0.8357, Validation Accuracy: 0.8021
Model saved to 50_clients_data/clients_DP_Fair_trained_model/AZ.pth

data points_AR 11168
Validation Loss: 0.8249, Validation Accuracy: 0.8176
Model saved to 50_clients_data/clients_DP_Fair_trained_model/AR.pth

data points_CA 156544
Validation Loss: 0.8165, Validation Accuracy: 0.8113
Model saved to 50_clients_data/clients_DP_Fair_trained_model/CA.pth

data points_CO 25056
Validation Loss: 0.9146, Validation Accuracy: 0.7777
Model saved to 50_clients_data/clients_DP_Fair_trained_model/CO.pth

data points_CT 15840
Validation Loss: 0.8694, Validation Accuracy: 0.7955
Model saved to 50_clients_data/clients_DP_Fair_trained_model/CT.pth


In [3]:
# state_name="AL"

# client_data_dir="50_clients_data/processed_data/"

# with open(client_data_dir+f'{state_name}.pkl', 'rb') as f:
#     train_data_all_client  = pickle.load(f)

# with open(client_data_dir+f'{state_name}_test.pkl', 'rb') as f:
#     test_data  = pickle.load(f)
    

In [13]:
import torch
from fair_loss import FairLoss

model = torch.nn.Sequential(torch.nn.Linear(5, 1), torch.nn.ReLU())
data = torch.randint(0, 5, (10, 5), dtype=torch.float, requires_grad=True)

y_true = torch.randint(0, 5, (10, 1), dtype=torch.float)
y_pred = model(data)

# Let's say the sensitive attribute is in the second dimension
dim = 1
criterion = FairLoss(torch.nn.BCELoss(), data[:, dim].detach().unique(), 'accuracy')

print(data[:, dim], y_pred, y_true)
print(data[:, dim].shape, y_pred.shape, y_true.shape)

loss = criterion(data[:, dim], y_pred, y_true)

# loss = criterion(data[:, dim], y_pred.unsqueeze(1), y_true.unsqueeze(1))

loss.backward()
print(data)

tensor([2., 1., 2., 0., 2., 1., 0., 4., 0., 2.], grad_fn=<SelectBackward>) tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], grad_fn=<ReluBackward0>) tensor([[2.],
        [1.],
        [4.],
        [3.],
        [0.],
        [4.],
        [3.],
        [2.],
        [3.],
        [1.]])
torch.Size([10]) torch.Size([10, 1]) torch.Size([10, 1])
tensor([[1., 2., 2., 3., 3.],
        [3., 1., 0., 2., 0.],
        [4., 2., 3., 2., 3.],
        [3., 0., 3., 2., 1.],
        [3., 2., 4., 4., 0.],
        [4., 1., 1., 3., 1.],
        [1., 0., 2., 0., 1.],
        [4., 4., 0., 0., 0.],
        [1., 0., 0., 0., 4.],
        [3., 2., 1., 1., 0.]], requires_grad=True)


In [5]:
# Example of target with class indices
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
output = loss(input, target)
output.backward()
# Example of target with class probabilities
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
output = loss(input, target)
output.backward()

RuntimeError: 1D target tensor expected, multi-target not supported

#  Testing one client

In [9]:
client_data_dir="50_clients_data/processed_data/"

with open(client_data_dir+'AL_test.pkl', 'rb') as f:
    test_data  = pickle.load(f)
    
model =  FullyConnected(input_dim, layout)
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [10]:
model.eval()  # Set the model to evaluation mode
correct = 0
total = 0
test_loss = 0.0

criterion = nn.CrossEntropyLoss()

with torch.no_grad():  # Disable gradient calculation
    for inputs, labels in test_data:
        # Move data to the same device as model
        # inputs, labels = inputs.to(device), labels.to(device)
        
        outputs = model(inputs)
        # print(outputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

# Calculate average loss and accuracy
average_test_loss = test_loss / len(test_data)
accuracy = correct / total

print(f"Test Loss: {average_test_loss:.4f}, Test Accuracy: {accuracy:.4f}")

Test Loss: 0.4091, Test Accuracy: 0.8040


In [85]:
merge_dfs={}
for state_code, (features, label) in dfs.items():


    merge_df = pd.concat([features, label], axis=1)
    merge_df = merge_df.dropna()

    if 8000 <len(merge_df) < 20000:
        merge_df_sampled = merge_df.sample(frac=0.5, random_state=42)

    elif 20000 <len(merge_df) < 40000:
        merge_df_sampled = merge_df.sample(frac=0.4, random_state=42)
    
    elif 40000 <len(merge_df) < 70000:        
         merge_df_sampled = merge_df.sample(frac=0.3, random_state=42)
        
    elif 70000<len(merge_df)<99000:       
        merge_df_sampled = merge_df.sample(frac=0.2, random_state=42)

    elif len(merge_df)>100000:      
        merge_df_sampled = merge_df.sample(frac=0.1, random_state=42)
    else:
        merge_df_sampled = merge_df
    
    # merge_df_sampled = merge_df.sample(frac=0.2, random_state=42)
    merge_df_sampled['PINCP'] = merge_df_sampled['PINCP'].replace({True: '>50K', False: '<=50K'})

    merge_dfs[state_code] = merge_df_sampled

for state_code, df in merge_dfs.items():
    print(f"State: {state_code}, df Length: {len(df)}")

State: AL, df Length: 8907
State: AK, df Length: 3546
State: AZ, df Length: 13311
State: AR, df Length: 6964
State: CA, df Length: 19566
State: CO, df Length: 12522
State: CT, df Length: 9892
State: DE, df Length: 4713
State: FL, df Length: 19785
State: GA, df Length: 15274
State: HI, df Length: 7731
State: ID, df Length: 4132
State: IL, df Length: 20105
State: IN, df Length: 14009
State: IA, df Length: 8872
State: KS, df Length: 7904
State: KY, df Length: 8802
State: LA, df Length: 8267
State: ME, df Length: 7002
State: MD, df Length: 13217
State: MA, df Length: 12034
State: MI, df Length: 15002
State: MN, df Length: 12408
State: MS, df Length: 6594
State: MO, df Length: 12666
State: MT, df Length: 5463
State: NE, df Length: 5392
State: NV, df Length: 7404
State: NH, df Length: 7966
State: NJ, df Length: 14334
State: NM, df Length: 4356
State: NY, df Length: 10302
State: NC, df Length: 15620
State: ND, df Length: 4455
State: OH, df Length: 18640
State: OK, df Length: 8958
State: OR, d

In [21]:
# for state_code, (features, label) in dfs.items():
#     # take 30%
#     num_rows_to_keep = int(len(features) * 0.3) 
#     random_indices = np.random.choice(len(features), num_rows_to_keep, replace=False)
#     reduced_features = features.iloc[random_indices]
#     reduced_label = label.iloc[random_indices]
#     dfs[state_code] = (reduced_features, reduced_label)

# for state_code, (reduced_features, reduced_label) in dfs.items():
#     print(f"State: {state_code}, Reduced Features Length: {len(reduced_features)}, Reduced Label Length: {len(reduced_label)}")

In [58]:
import pickle

# Save the dictionary to a file
with open('dfs.pickle', 'wb') as f:
    pickle.dump(merge_dfs, f)

In [59]:
with open('dfs.pickle', 'rb') as f:
    dfs_loaded = pickle.load(f)

In [60]:
for state_code, df in dfs_loaded.items():
    print(f"State: {state_code}, df Length: {len(df)}")

State: AL, df Length: 4454
State: AK, df Length: 709
State: AZ, df Length: 6655
State: AR, df Length: 2786
State: CA, df Length: 39133
State: CO, df Length: 6261
State: CT, df Length: 3957
State: DE, df Length: 943
State: FL, df Length: 19785
State: GA, df Length: 10183
State: HI, df Length: 1546
State: ID, df Length: 1653
State: IL, df Length: 13403
State: IN, df Length: 7004
State: IA, df Length: 3549
State: KS, df Length: 3161
State: KY, df Length: 4401
State: LA, df Length: 4133
State: ME, df Length: 1400
State: MD, df Length: 6608
State: MA, df Length: 8023
State: MI, df Length: 10002
State: MN, df Length: 6204
State: MS, df Length: 2638
State: MO, df Length: 6333
State: MT, df Length: 1093
State: NE, df Length: 2157
State: NV, df Length: 2961
State: NH, df Length: 1593
State: NJ, df Length: 9556
State: NM, df Length: 1742
State: NY, df Length: 20604
State: NC, df Length: 10413
State: ND, df Length: 891
State: OH, df Length: 12427
State: OK, df Length: 3583
State: OR, df Length: 4

In [61]:
dfs_loaded["TX"].reset_index(drop=True)

Unnamed: 0,AGEP,COW,SCHL,MAR,OCCP,POBP,RELP,WKHP,SEX,RAC1P,PINCP
0,20.0,2.0,16.0,5.0,5420.0,48.0,0.0,12.0,2.0,1.0,<=50K
1,27.0,3.0,19.0,1.0,2320.0,18.0,0.0,40.0,2.0,1.0,<=50K
2,56.0,1.0,19.0,5.0,4710.0,48.0,15.0,50.0,1.0,1.0,>50K
3,27.0,1.0,21.0,5.0,4000.0,48.0,0.0,38.0,2.0,1.0,<=50K
4,76.0,3.0,21.0,1.0,5940.0,35.0,1.0,13.0,1.0,1.0,<=50K
...,...,...,...,...,...,...,...,...,...,...,...
27180,60.0,1.0,16.0,1.0,9130.0,48.0,0.0,40.0,1.0,1.0,<=50K
27181,74.0,1.0,21.0,1.0,4435.0,6.0,0.0,45.0,1.0,1.0,>50K
27182,43.0,1.0,16.0,5.0,4720.0,48.0,0.0,40.0,2.0,1.0,<=50K
27183,32.0,1.0,19.0,5.0,5410.0,6.0,0.0,40.0,1.0,1.0,>50K
