In [31]:
""" FINE TUNING MODEL WITH FAIRNESS"""
import heapq
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from models import NCF3
from fairness_measures import Measures
from evaluators import evaluate_model
import data as data_parser
from importlib import reload
reload(data_parser)
from data import AttributeData, TargetData




In [32]:
emb_size = 128
num_layers = 4
# hidden_layers = np.array([emb_size, 64, 32, 16])
output_size = 1

num_epochs = 10
batch_size = 256

num_negatives = 5

random_samples = 15
top_k = 10

learning_rate = .001

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

fairness_thres = torch.tensor(0.1).to(device)
epsilonBase = torch.tensor(0.0).to(device)


In [33]:
td = TargetData()
data = AttributeData()
m = Measures()


In [34]:
# LOAD PRE-TRAINED MODEL
ncf = NCF3(td.num_users, td.num_movies, emb_size, num_layers, output_size).to(device)
ncf.load_state_dict(torch.load("saved_models/NCF2"))

# FETCH NUMBER OF UNIQUE CAREERS
n_careers = data.num_jobs

ncf.embed_item_GMF = nn.Embedding(n_careers, emb_size).to(device)
ncf.embed_item_GMF.weight.requires_grad = False

ncf.embed_item_MLP = nn.Embedding(n_careers, emb_size * (2 ** (num_layers - 1))).to(device)
ncf.embed_item_MLP.weight.requires_grad = False
# CHANGE EMBEDDING SIZE TO FIT SENSITIVE INFO
# ncf.like_emb = nn.Embedding(n_careers, emb_size).to(device)


In [35]:
user_embeds = ncf.embed_user_MLP.weight.data.cpu().detach().numpy()
user_embeds = user_embeds.astype('float')


''' COMPUTE GENDER EMBEDDING '''
gender_embed = np.zeros((2,user_embeds.shape[1]))
num_users_x_group = np.zeros((2, 1))

for i in range(data.train.shape[0]):
    u = data.train['uid'].iloc[i]
    if data.train['gender'].iloc[i] == 0:
        gender_embed[0] +=  user_embeds[u]
        num_users_x_group[0] += 1.0
    else:
        gender_embed[1] +=  user_embeds[u]
        gender_embed[1] += 1.0
        num_users_x_group[1] += 1.0


In [36]:
''' VERTICAL BIAS'''
gender_embed = gender_embed / num_users_x_group
# vBias = compute_bias_direction(gender_embed)
vBias = gender_embed[1].reshape((1,-1)) - gender_embed[0].reshape((1,-1))
vBias = vBias / np.linalg.norm(vBias,axis=1,keepdims=1)

vBias


array([[0.03232344, 0.03123073, 0.03107457, ..., 0.03259981, 0.03171002,
        0.03077014]])

In [37]:
''' LINEAR PROJECTION '''
debiased_user_embeds = user_embeds
for i in range(len(data.data)):
    u = data.all_data['uid'].iloc[i]
    debiased_user_embeds[u] = user_embeds[u] - (np.inner(user_embeds[u].reshape(1,-1),vBias)[0][0])*vBias

AttributeError: 

In [None]:
data.all_data

In [None]:
'''UPDATE USER EMBEDDINGS'''
fairness_thres = torch.tensor(0.1).to(device)
epsilonBase = torch.tensor(0.0).to(device)

# replace user embedding of the model with debiased embeddings
ncf.embed_user_MLP.weight.data = torch.from_numpy(debiased_user_embeds.astype(np.float32)).to(device)



In [38]:
# criterion = nn.BCELoss()

# optimizer = torch.optim.Adam(ncf.parameters(), lr=learning_rate, weight_decay=1e-6)

ncf.train()

NCF3(
  (embed_user_GMF): Embedding(4920, 128)
  (embed_item_GMF): Embedding(17, 128)
  (embed_user_MLP): Embedding(4920, 1024)
  (embed_item_MLP): Embedding(17, 1024)
  (out_act): Sigmoid()
  (MLP_layers): Sequential(
    (0): Dropout(p=1, inplace=False)
    (1): Linear(in_features=2048, out_features=1024, bias=True)
    (2): ReLU()
    (3): Dropout(p=1, inplace=False)
    (4): Linear(in_features=1024, out_features=512, bias=True)
    (5): ReLU()
    (6): Dropout(p=1, inplace=False)
    (7): Linear(in_features=512, out_features=256, bias=True)
    (8): ReLU()
    (9): Dropout(p=1, inplace=False)
    (10): Linear(in_features=256, out_features=128, bias=True)
    (11): ReLU()
  )
  (predict_layer): Linear(in_features=256, out_features=1, bias=True)
)

In [39]:
# LOAD TRAINING DATA
all_users = torch.LongTensor(data.train['uid'].values).to(device)
all_items = torch.LongTensor(data.train['job'].values).to(device)

# PROTECTED ATTRIBUTE
all_genders = torch.LongTensor(data.train['gender'].values).to(device)
# from opacus import PrivacyEngine
#
# privacy_engine = PrivacyEngine(
#     ncf,
#     sample_rate=0.01,
#     alphas=[10, 100],
#     noise_multiplier=1.3,
#     max_grad_norm=1.0,
# )
# optimizer = torch.optim.Adam(ncf.parameters(), lr=learning_rate, weight_decay=1e-6)
# privacy_engine.attach(optimizer)

In [44]:
def train_normal(model):
    # REMOVES JOBS BASED ON THRESHOLD + SPLIT DATA
    # train, test = data.train_test_split(train_fraction)
    # num_batches = np.int64(np.floor(train.shape[0] / batch_size))
    loss = nn.BCELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-6)
    final_loss, loss1, loss2 = 0, 0, 0

    for i in range(num_epochs):
        j=0
        dataloader = DataLoader(data, batch_size=batch_size,
                                shuffle=True, num_workers=0)

        it_per_epoch = len(data) / batch_size

        for batch in dataloader:
            usr, jb, _, rt = batch
            # LOAD BATCH
            users = usr.to(device)
            jobs = jb.to(device)  # career
            # genders = g.to(device)
            ratings = rt.to(device)

            # PREDICTIONS
            y_hat = model(users.squeeze(1), jobs.squeeze(1))

            # BINARY CROSS-ENTROPY LOSS
            loss1 = loss(y_hat, ratings.float())

            predicted_probs = model(all_users, all_items)
            avg_epsilon = m.compute_edf(all_genders.cpu(), predicted_probs, data.num_jobs, all_items, device)

            # criteroin hinge
            loss2 = torch.max(torch.tensor(0.0).to(device), (avg_epsilon - epsilonBase))

            final_loss = loss1 + fairness_thres*loss2

            optimizer.zero_grad()
            final_loss.backward()
            optimizer.step()

            if j % int(1 + it_per_epoch / 10) == 0:
                print(f"\r Epoch {i + 1}, Progress: {round(100 * j / it_per_epoch)}%", end='', flush=True)
            j += 1
        ht, ndcg = evaluate_model(model, data.test[['uid', 'job']].values, top_k, random_samples, data.num_jobs, device)
        print(f'\nHit Ratio: {round(ht[-1], 2)}  NDCG: {round(ndcg[-1], 2)}   LOSS1: {final_loss}')


# -----------------------------------------------------------------

In [45]:
ht, ndcg = evaluate_model(ncf, data.test[['uid', 'job']].values, top_k, random_samples, data.num_jobs, device)
ht

array([0.        , 0.25187567, 0.27974277, 0.3204716 , 0.38585209,
       0.47695606, 0.58199357, 0.67631297, 0.76956056, 0.84780279])

In [46]:
train_normal(ncf)

Progress: 0%
Progress: 11%
Progress: 21%
Progress: 32%
Progress: 42%
Progress: 53%
Progress: 63%
Progress: 74%
Progress: 85%
Progress: 95%
Hit Ratio: [0.         0.15326902 0.27759914 0.39121115 0.49732047 0.59914255
 0.69989282 0.76956056 0.84137192 0.89496249]  NDCG: [0.         0.15326902 0.2317126  0.2885186  0.3342174  0.37360755
 0.40949552 0.4327181  0.45537206 0.47150443]   LOSS1: 4.7397894859313965  LOSS2: 0.2597944438457489 
Progress: 0%
Progress: 11%
Progress: 21%
Progress: 32%
Progress: 42%
Progress: 53%
Progress: 63%
Progress: 74%
Progress: 85%
Progress: 95%
Hit Ratio: [0.         0.15755627 0.27867095 0.40728832 0.52411576 0.62165059
 0.70310825 0.78135048 0.85637728 0.90782422]  NDCG: [0.         0.15755627 0.23397113 0.29827981 0.34859465 0.38632627
 0.41534208 0.44142282 0.46509114 0.48057821]   LOSS1: 3.7208361625671387  LOSS2: 0.09305262565612793 
Progress: 0%
Progress: 11%
Progress: 21%
Progress: 32%
Progress: 42%
Progress: 53%
Progress: 63%
Progress: 74%
Progress: 

In [49]:
'''MEASURE THE FAIRNESS OF THE MODEL'''

m.fairness_measures(ncf, data.test, all_genders.cpu(), n_careers, device)

AttributeError: 'Measures' object has no attribute 'fairness_measures'

In [50]:
torch.save(ncf.state_dict(), "saved_models/NFCF")