In [None]:
import pandas as pd
import numpy as np
import cvxpy as cp
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from scipy.special import expit
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.modules.loss import _Loss
from torch import Tensor
from sklearn.utils import resample



# Read the raw table from disk and wrap it in a DataFrame.
data = pd.DataFrame(pd.read_csv('.../adult_reconstruction.csv'))

# In these three columns the literal string ' ?' (leading space included) is
# turned into NaN so the rows can be discarded below.
# NOTE(review): confirm the CSV really spells unknown values with the leading
# space; if it uses a bare '?', nothing is replaced here.
for column in ('native-country', 'workclass', 'occupation'):
  data[column] = data[column].replace(' ?', np.nan)

# Remove the 'education-num' column, then discard every row holding any NaN.
data = data.drop(columns=['education-num']).dropna(how='any')


def bi(input):
  """Binarise values: 1 where ``input`` is strictly above 50000, else 0.

  Returns whatever ``np.where`` produces for the comparison, i.e. an integer
  array with the same shape as ``input``.
  """
  exceeds_threshold = input > 50000
  return np.where(exceeds_threshold, 1, 0)


# Any column that does not appear in the describe() summary is converted to
# the pandas 'category' dtype.
described_columns = set(data.describe().columns)
for col in data.columns:
  if col not in described_columns:
    data[col] = data[col].astype('category')

def oneHotCatVars(df, df_cols):
    """Replace the columns ``df_cols`` of ``df`` with their dummy encoding.

    The untouched columns come first in the result, followed by the
    ``pd.get_dummies`` columns; the two parts are joined column-wise on the
    shared index. ``df`` itself is not modified.
    """
    encoded_part = pd.get_dummies(df[df_cols])
    untouched_part = df.drop(columns=df_cols)

    return pd.concat([untouched_part, encoded_part], axis=1, join='inner')

# One-hot encode every column that currently carries the 'category' dtype.
data_preprocessed = oneHotCatVars(
    data,
    data.select_dtypes(include='category').columns,
)
# Columns handed to normalize() further down.
normalize_columns = [
    'age',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

def normalize(columns):
  """Standardise ``columns`` of the global ``data_preprocessed`` in place.

  A fresh ``StandardScaler`` is fitted on exactly those columns and the
  transformed values are written back into the frame. Nothing is returned.

  NOTE(review): the scaler is fitted on whatever rows the frame holds when
  this is called; in this file the call precedes ``train_test_split``, so
  rows that later land in the test split contribute to the statistics.
  """
  scaled_values = preprocessing.StandardScaler().fit_transform(data_preprocessed[columns])
  data_preprocessed[columns] = scaled_values


normalize(normalize_columns)

# Keep only the rows whose one-hot race flag marks them as White or Black.
data_w = data_preprocessed[data_preprocessed['race_White'] == 1]
data_b = data_preprocessed[data_preprocessed['race_Black'] == 1]

data_preprocessed = pd.concat([data_w, data_b])

# A fixed seed makes the train/test partition repeatable across kernel
# restarts; without it every run trains and evaluates on different rows.
SPLIT_SEED = 0
x_train, x_test = train_test_split(data_preprocessed, random_state=SPLIT_SEED)

# 'income' is the quantity the labels are derived from and the two race flags
# carry the sensitive attribute, so none of them is passed on as a feature.
non_feature_columns = ['income', 'race_White', 'race_Black']

train_x = x_train.drop(columns=non_feature_columns)
train_label = bi(x_train['income'].to_numpy())
train_sen = x_train['race_Black'].to_numpy()
test_x = x_test.drop(columns=non_feature_columns)
test_label = bi(x_test['income'].to_numpy())
test_sen = x_test['race_Black'].to_numpy()


class Classifier(nn.Module):
    """One-hidden-layer network that outputs log-probabilities for two classes.

    Args:
        input_size: width of the input feature vector.
        hidden_size: width of the hidden layer.
        latent_size: accepted by the constructor but not used by this class.
    """

    def __init__(self, input_size = 104, hidden_size = 256, latent_size = 50):
        super().__init__()

        self.dense1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.dense2 = nn.Linear(in_features=hidden_size, out_features=2)

    def forward(self, x):
        """Apply dense1 -> ReLU -> dense2 and return log_softmax over the last dim."""
        hidden = F.relu(self.dense1(x))
        scores = self.dense2(hidden)

        return F.log_softmax(scores, dim=-1)

class Classifier_complx(nn.Module):
    """Three-hidden-layer network that outputs log-probabilities for two classes.

    Args:
        input_size: width of the input feature vector.
        hidden_size: width shared by all three hidden layers.
        latent_size: accepted by the constructor but not used by this class.
    """

    def __init__(self, input_size = 104, hidden_size = 256, latent_size = 50):
        super(Classifier_complx, self).__init__()

        self.dense1 = nn.Linear(input_size, hidden_size)
        self.dense2 = nn.Linear(hidden_size, hidden_size)
        self.dense3 = nn.Linear(hidden_size, hidden_size)
        self.dense4 = nn.Linear(hidden_size, 2)

    def forward(self, x):
        """Run the ReLU stack and return log_softmax over the last dimension."""
        x = F.relu(self.dense1(x))
        x = F.relu(self.dense2(x))
        x = F.relu(self.dense3(x))
        x = self.dense4(x)

        # The original line read `return return ...`, which is a SyntaxError
        # and prevented the whole cell from being executed.
        return F.log_softmax(x, -1)

class ShelterOutcomeDataset(Dataset):
    """Map-style dataset yielding ``(features, label, extra attribute)`` triples.

    ``X`` must provide ``to_numpy()``; its values are stored as float32.
    ``Y`` and ``A`` are stored as given and indexed with the same position.
    """

    def __init__(self, X, Y, A):
        feature_matrix = X.to_numpy()
        self.x = feature_matrix.astype(np.float32)
        self.y, self.a = Y, A

    def __len__(self):
        # The number of samples is taken from the label container.
        return len(self.y)

    def __getitem__(self, idx):
        return tuple(field[idx] for field in (self.x, self.y, self.a))


class ShelterDataset(Dataset):
    """Map-style dataset yielding ``(features, label)`` pairs.

    ``X`` must provide ``to_numpy()``; its values are stored as float32.
    ``Y`` is stored as given and indexed with the same position.
    """

    def __init__(self, X, Y):
        feature_matrix = X.to_numpy()
        self.x = feature_matrix.astype(np.float32)
        self.y = Y

    def __len__(self):
        # The number of samples is taken from the label container.
        return len(self.y)

    def __getitem__(self, idx):
        features, label = self.x[idx], self.y[idx]
        return features, label

# Training batches carry (features, label); test batches additionally carry
# the sensitive attribute.
train_ds = ShelterDataset(train_x, train_label)
test_ds = ShelterOutcomeDataset(test_x, test_label, test_sen)

# The whole test split is served as one batch.
batch_size_train, batch_size_test = 256, test_x.shape[0]
train_loader = DataLoader(train_ds, batch_size=batch_size_train, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=batch_size_test, shuffle=False)

# Keep using the third GPU when it exists, but fall back to the default CUDA
# device or the CPU instead of crashing on machines with fewer GPUs.
if torch.cuda.is_available():
  device = 'cuda:2' if torch.cuda.device_count() > 2 else 'cuda'
else:
  device = 'cpu'

In [None]:
# teacher model

epochs = 150

cls_cplx = Classifier_complx().to(device)
optimizer = optim.Adam(cls_cplx.parameters(), lr = 1e-3, weight_decay = 1e-4)
# The network ends in log_softmax over two classes, so the matching loss is
# negative log-likelihood on integer class labels. BCEWithLogitsLoss expects a
# single raw logit per sample and rejects the (N, 2) output.
criterion = torch.nn.NLLLoss().to(device)

# Loss history of the teacher; filled by train().
train_losses_cplx = []
train_counter_cplx = []


def train(epoch):
  """Run one pass over ``train_loader`` updating the teacher ``cls_cplx``.

  Every 10th batch the loss and the number of samples seen so far are appended
  to ``train_losses_cplx`` / ``train_counter_cplx``; every 20th batch a
  progress line is printed.
  """
  cls_cplx.train()

  # train_loader wraps ShelterDataset, which yields (features, label) pairs.
  for batch_idx, (x, y) in enumerate(train_loader):
    x, y = x.to(device).float(), y.to(device).long()
    pred = cls_cplx(x)

    loss = criterion(pred, y)
    optimizer.zero_grad()

    loss.backward()
    optimizer.step()
    if batch_idx % 10 == 0:
      train_losses_cplx.append(loss.item())
      # Use the loader's real batch size rather than a stale constant.
      train_counter_cplx.append(
        (batch_idx*train_loader.batch_size) + ((epoch-1)*len(train_loader.dataset)))
    if batch_idx % 20 == 0:
      print(f'Epoch {epoch}: [{batch_idx*len(x)}/{len(train_loader.dataset)}] Loss: {loss.item()}')


for epoch in range(1, epochs+1):
  train(epoch)

In [None]:
# student model

class LabelSmoothingCrossEntropy(_Loss):
    """Cross-entropy between soft binary targets and two-class log-probabilities.

    ``forward`` expects ``input`` of shape (N, 2) holding log-probabilities
    (column 0 for class 0, column 1 for class 1) and ``target`` holding, per
    sample, the weight given to class 1. The per-sample loss is
    ``-target * input[:, 1] - (1 - target) * input[:, 0]``.

    Args:
        eps: stored on the instance; ``forward`` does not apply any smoothing.
        size_average, reduce: forwarded unchanged to ``_Loss``.
        reduction: ``'none'``, ``'mean'`` or ``'sum'``.

    Raises:
        ValueError: from ``forward`` when ``reduction`` is not one of the above.
    """

    def __init__(self, eps: float = 0.1, size_average=None, reduce=None, reduction: str = 'mean'):
        super().__init__(size_average, reduce, reduction)
        self.eps = eps

    def forward(self, input: Tensor, target: Tensor) -> Tensor:
        # Flatten the target so that an (N, 1) tensor lines up element-wise
        # with the (N,) columns of `input`. Without this the product
        # broadcasts to (N, N) and pairs every target with every sample.
        target = target.reshape(-1)
        loss = (- target * input[:,1] - (1-target)* input[:,0])
        if self.reduction == "none":
            ret = loss
        elif self.reduction == "mean":
            ret = loss.mean()
        elif self.reduction == "sum":
            ret = loss.sum()
        else:
            raise ValueError(self.reduction + " is not valid")
        return ret

# Student network together with its own Adam optimiser.
cls_std = Classifier().to(device)
optimizer_std = optim.Adam(
    cls_std.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)
# From here on the name `criterion` is rebound to the soft-target loss.
criterion = LabelSmoothingCrossEntropy()
# Number of student epochs.
max_epoch = 50

def train(epoch):
  """Run one distillation pass over ``train_loader`` updating ``cls_std``.

  The student is fitted to the teacher's predicted probability of class 1;
  the hard labels coming from the loader are not used. Every 10th batch the
  loss and the number of samples seen so far are appended to
  ``train_losses`` / ``train_counter``; every 20th batch a progress line is
  printed.
  """
  cls_std.train()
  # The teacher only supplies targets here and is never updated.
  cls_cplx.eval()

  for batch_idx, (x, _) in enumerate(train_loader):
    x = x.to(device).float()
    pred = cls_std(x)
    # The teacher returns log-probabilities; exponentiate column 1 to obtain
    # a soft target in [0, 1]. no_grad keeps the teacher out of the graph.
    with torch.no_grad():
      target = cls_cplx(x)[:, 1].exp()
    # The target stays 1-D so it pairs element-wise with the student's output.
    loss = criterion(pred, target).mean()
    # Step the student's optimiser; `optimizer` belongs to the teacher and
    # stepping it would leave the student's weights untouched.
    optimizer_std.zero_grad()

    loss.backward()
    optimizer_std.step()
    if batch_idx % 10 == 0:
      train_losses.append(loss.item())
      # Use the loader's real batch size rather than a stale constant.
      train_counter.append(
        (batch_idx*train_loader.batch_size) + ((epoch-1)*len(train_loader.dataset)))
    if batch_idx % 20 == 0:
      print(f'Epoch {epoch}: [{batch_idx*len(x)}/{len(train_loader.dataset)}] Loss: {loss.item()}')

def model_eval(actual, pred):
    """Compute binary confusion-matrix rates for 0/1 labels and predictions.

    Args:
        actual: array-like of true labels, expected to be the integers 0 and 1.
        pred: array-like of predicted labels, same length and encoding.

    Returns:
        dict with keys
        ``ALL`` (number of samples), ``DP`` (share predicted positive),
        ``TPR``, ``TNR``, ``FPR``, ``FNR`` and ``ACR`` (accuracy).
        A rate whose denominator is zero is returned as ``nan``.
    """
    confusion = pd.crosstab(actual, pred, rownames=['Actual'], colnames=['Predicted'])
    # crosstab only emits rows/columns for values that actually occur. When a
    # class is missing (e.g. the model predicts a single class) the lookups
    # below would raise KeyError, so force the full 2x2 layout with zeros.
    confusion = confusion.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

    TP = int(confusion.loc[1, 1])
    TN = int(confusion.loc[0, 0])
    FP = int(confusion.loc[0, 1])
    FN = int(confusion.loc[1, 0])

    def _ratio(numerator, denominator):
        # Undefined rates (empty denominator) are reported as nan.
        return numerator / denominator if denominator else float('nan')

    total = TP + TN + FP + FN
    positives = TP + FN
    negatives = FP + TN

    out = {}
    out['ALL'] = total
    out['DP'] = _ratio(TP + FP, total)
    out['TPR'] = _ratio(TP, positives)
    out['TNR'] = _ratio(TN, negatives)
    out['FPR'] = _ratio(FP, negatives)
    out['FNR'] = _ratio(FN, positives)
    out['ACR'] = _ratio(TP + TN, total)

    return out

def test(epoch):
  """Evaluate ``cls_std`` on ``test_loader`` and record loss and fairness gaps.

  Appends to the module-level histories:
    * ``test_losses``  - sum over batches of the batch-mean NLL loss,
    * ``test_counter`` - training samples seen after ``epoch`` epochs,
    * ``test_DIs``     - 100 * |DP(a == 0) - DP(a == 1)|,
    * ``test_EOds``    - 100 * (|TNR gap| + |TPR gap|) between the two groups.
  """
  cls_std.eval()
  test_loss = 0
  label_parts, pred_parts, group_parts = [], [], []

  with torch.no_grad():
    for x, y, a in test_loader:
      # The model lives on `device`, so the inputs have to be moved there too.
      x, y = x.to(device).float(), y.to(device).long()
      output = cls_std(x)
      test_loss += F.nll_loss(output, y).item()
      # Collect everything on the CPU so it can be converted to NumPy, and so
      # the metrics cover every batch rather than only the last one.
      label_parts.append(y.cpu())
      pred_parts.append(output.argmax(dim=1).cpu())
      group_parts.append(a.cpu())

  labels = torch.cat(label_parts).numpy()
  preds = torch.cat(pred_parts).numpy()
  groups = torch.cat(group_parts).numpy()

  # Boolean masks select the two groups of the sensitive attribute.
  in_group_b = groups == 1
  in_group_w = groups == 0
  w = model_eval(labels[in_group_w], preds[in_group_w])
  b = model_eval(labels[in_group_b], preds[in_group_b])

  DI = 100 * abs(w['DP'] - b['DP'])
  # FPR = 1 - TNR and FNR = 1 - TPR, so the absolute TNR/TPR gaps equal the
  # absolute FPR/FNR gaps.
  DFPR = 100 * abs(w['TNR'] - b['TNR'])
  DFNR = 100 * abs(w['TPR'] - b['TPR'])
  eod = DFPR + DFNR

  test_losses.append(test_loss)
  test_counter.append(len(train_loader.dataset)*epoch)
  test_DIs.append(DI)
  test_EOds.append(eod)

# Histories filled in by train() and test() while the student is fitted.
train_losses, train_counter = [], []
test_losses, test_counter = [], []
test_DIs, test_EOds = [], []

# One training pass followed by one evaluation pass per epoch.
for epoch in range(1, max_epoch + 1):
  train(epoch)
  test(epoch)