In [1]:
""" FINE TUNING MODEL WITH DIFFERENTIAL PRIVACY """
import heapq
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from models import NCF
from fairness_measures import Measures

import data
from importlib import reload
reload(data)
from data import AttributeData, TargetData


In [2]:
# ---- Network architecture -------------------------------------------------
# Embedding width; also the input width of the first hidden layer below.
emb_size = 128
hidden_layers = np.asarray((emb_size, 64, 32, 16))
output_size = 1

# ---- Optimisation schedule ------------------------------------------------
num_epochs = 10
batch_size = 256
learning_rate = 1e-3

# ---- Sampling / evaluation ------------------------------------------------
num_negatives = 5
random_samples = 15  # passed to evaluate_fine_tune as the number of sampled negatives
top_k = 10           # passed to evaluate_fine_tune as the ranking cutoff

# ---- Hardware -------------------------------------------------------------
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Dataset wrapper used by the DataLoader and the evaluation helper further down.
# NOTE(review): this rebinds `data`, which cell 1 imported as a module
# (`import data` / `reload(data)`); re-running cell 1 afterwards turns `data`
# back into the module and breaks the later cells. Consider a distinct name.
# NOTE(review): constructing AttributeData emits the pandas SettingWithCopy
# warnings shown in this cell's output -- worth fixing inside data.py.
data = AttributeData()
# Metric helper; evaluate_fine_tune calls its get_hit_ratio / get_ndcg methods.
m = Measures()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [4]:
# Rebuild the NCF architecture and restore the pre-trained weights.
# NOTE(review): 6040 / 3952 are presumably the user and item counts of the
# pre-training dataset -- confirm against the pre-training notebook.
ncf = NCF(6040, 3952, emb_size, hidden_layers, output_size).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU machine still loads on a CPU-only machine.
ncf.load_state_dict(torch.load("models/preTrained_NCF", map_location=device))

# FETCH NUMBER OF UNIQUE CAREERS
n_careers = data.num_jobs

# CHANGE EMBEDDING SIZE TO FIT SENSITIVE INFO
# The replacement embedding is freshly initialised (not pre-trained); it is
# what the fine-tuning loop below has to learn.
ncf.like_emb = nn.Embedding(n_careers, emb_size).to(device)

In [None]:
from opacus import PrivacyEngine

# Differential-privacy engine wrapping the model.
# The sample rate is derived from the real batch size and dataset length so the
# privacy accountant matches the DataLoader used in training, instead of a
# hard-coded 0.01 that is unrelated to `batch_size`.
privacy_engine = PrivacyEngine(
    ncf,
    sample_rate=batch_size / len(data),
    # NOTE(review): only two Renyi orders are tracked; a denser grid of alphas
    # usually gives a tighter epsilon estimate.
    alphas=[10, 100],
    noise_multiplier=1.3,
    max_grad_norm=1.0,
)


In [None]:
def train_differential_privacy_model():
    """Fine-tune the global ``ncf`` model with the attached privacy engine.

    Runs ``num_epochs`` passes over ``data`` with an SGD optimizer that has
    ``privacy_engine`` attached, minimising binary cross-entropy between the
    model's predictions and the batch ratings. After every epoch the model is
    evaluated on ``data.test`` and hit ratio, NDCG and the last batch loss are
    printed.

    Relies on notebook globals: ``ncf``, ``privacy_engine``, ``data``,
    ``device``, ``learning_rate``, ``num_epochs``, ``batch_size``, ``top_k``,
    ``random_samples`` and ``evaluate_fine_tune``.

    Returns:
        None. Progress and metrics are printed; ``ncf`` is updated in place.
    """
    criterion = nn.BCELoss()
    optimizer = torch.optim.SGD(ncf.parameters(), lr=learning_rate, weight_decay=1e-6)
    privacy_engine.attach(optimizer)
    final_loss = 0

    for epoch in range(num_epochs):
        # evaluate_fine_tune() puts the model in eval mode at the end of each
        # epoch, so training mode has to be re-enabled before the next pass.
        ncf.train()

        dataloader = DataLoader(data, batch_size=batch_size,
                                shuffle=True, num_workers=0)
        it_per_epoch = len(data) / batch_size
        report_every = int(1 + it_per_epoch / 10)  # roughly ten progress lines per epoch

        for j, batch in enumerate(dataloader):
            # Third element is unused here (presumably the gender attribute -- TODO confirm).
            usr, jb, _, rt = batch
            # LOAD BATCH
            users = usr.to(device)
            jobs = jb.to(device)  # career
            ratings = rt.to(device)
            # PREDICTIONS
            y_hat = ncf(users.squeeze(1), jobs.squeeze(1))

            # BINARY CROSS-ENTROPY LOSS
            batch_loss = criterion(y_hat, ratings.float())

            optimizer.zero_grad()
            # Back-propagate through the loss *tensor*; the nn.BCELoss module
            # itself has no backward() method.
            batch_loss.backward()
            optimizer.step()

            # Keep a plain float so the autograd graph is not retained.
            final_loss = batch_loss.item()

            if j % report_every == 0:
                print(f"Progress: {round(100 * j / it_per_epoch)}%")

        ht, ndcg = evaluate_fine_tune(ncf, data.test, top_k, random_samples)
        print(f'Hit Ratio: {ht}  NDCG: {ndcg}   LOSS1: {final_loss}')



# features = ['age', 'gender', 'job']



    # local_epsilon = 4

    # job = get_dummies(data.train.job, drop_first=True)
    # gender = get_dummies(data.train.gender, drop_first=True)
    # age = 2 * ( (data.train.age - data.train.age.min()) / (data.train.age.max() - data.train.age.min()) ) - 1

    # slack = np.max(1, n_features, np.int8(local_epsilon/2.5))
    # job.apply(lambda x: np.float32(.5) if x==1 else 1/(np.exp(local_epsilon/slack) + 1))
    # users_tensor = torch.LongTensor(data.train.uid.values).to(device)
    # jobs_tensor = torch.LongTensor(job.values).to(device)
    # genders_tensor = torch.LongTensor(gender.values).to(device)
    # ages_tensor = torch.LongTensor(age.values).to(device)
    #
    # all_features =
    # optimizer = torch.optim.Adam(ncf.parameters(), lr=learning_rate, weight_decay=1e-6)




In [None]:
# optimizer = torch.optim.Adam(ncf.parameters(), lr=learning_rate, weight_decay=1e-6)

def evaluate_fine_tune(model, df_val, k, random_samples):
    """Compute mean hit ratio and NDCG at every cutoff from 1 to ``k``.

    For each row index of ``df_val`` a candidate set is built with
    ``data.add_negatives``, scored by ``model``, and ranked by score. The first
    item of the candidate set is treated as the ground-truth item.

    Args:
        model: network called as ``model(users, items)`` to produce scores.
        df_val: validation frame; its length sets the number of evaluation rounds.
        k: largest ranking cutoff to evaluate.
        random_samples: forwarded to ``data.add_negatives`` as ``n_samples``.

    Returns:
        Tuple ``(avg_hr, avg_ndcg)`` of 1-D arrays of length ``k``; entry ``j``
        is the metric at cutoff ``j + 1`` averaged over all evaluation rounds.

    Side effects:
        The model is switched to eval mode while scoring and returned to
        whatever mode it was in on entry.
    """
    was_training = model.training
    model.eval()

    n_rows = len(df_val)
    hr_per_row = np.zeros((n_rows, k))
    ndcg_per_row = np.zeros((n_rows, k))

    try:
        # No gradients are needed for evaluation.
        with torch.no_grad():
            for i in range(n_rows):
                # NOTE(review): the whole of df_val is passed on every round and
                # the ground truth is taken as items[0]; confirm add_negatives
                # is not meant to receive only row i.
                test_df = data.add_negatives(
                    df_val,
                    item='job',
                    items=data.jobs,
                    n_samples=random_samples
                )
                users = torch.LongTensor(test_df.uid).to(device)
                items = torch.LongTensor(test_df.job).to(device)

                scores = model(users, items).cpu().detach().numpy().reshape((-1,))
                item_ids = items.cpu().detach().numpy().reshape((-1,))

                # Later duplicates overwrite earlier ones, as in a plain loop.
                map_item_score = dict(zip(item_ids, scores))
                gt_item = item_ids[0]

                # Rank once at the largest cutoff; every smaller cutoff is a prefix.
                ranked = heapq.nlargest(k, map_item_score, key=map_item_score.get)
                for cutoff in range(1, k + 1):
                    ranklist = ranked[:cutoff]
                    hr_per_row[i, cutoff - 1] = m.get_hit_ratio(ranklist, gt_item)
                    ndcg_per_row[i, cutoff - 1] = m.get_ndcg(ranklist, gt_item)
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    # Average over rounds only after all rounds have been evaluated.
    avg_hr = np.mean(hr_per_row, axis=0)
    avg_ndcg = np.mean(ndcg_per_row, axis=0)
    return avg_hr, avg_ndcg

# SET MODEL TO TRAINING MODE
# Done once here, before the fine-tuning loop is launched in a later cell.
ncf.train()

In [None]:
# all_users = torch.LongTensor(train['uid'].values).to(device)
# all_items = torch.LongTensor(train['job'].values).to(device)
# all_genders = torch.LongTensor(train['gender'].values).to(device)
# def train_differential(train_fraction):
#     # REMOVES JOBS BASED ON THRESHOLD + SPLIT DATA
#     train, test = data.train_test_split(train_fraction)
#     # num_batches = np.int64(np.floor(train.shape[0] / batch_size))
#
#     # ADAM OPTIMIZER
#     optimizer = torch.optim.Adam(ncf.parameters(), lr=learning_rate, weight_decay=1e-6)
#
#     for i in range(num_epochs):
#         dataloader = DataLoader(data, batch_size=batch_size,
#                                 shuffle=True, num_workers=0)
#
#         it_per_epoch = len(data) / batch_size
#         loss1, loss2, j = 0, 0, 1
#
#         for batch in dataloader:
#             u, c, g, r = batch
#
#             # LOAD BATCH
#             users = u.to(device)
#             jobs = c.to(device)  # career
#             # genders = g.to(device)
#             ratings = r.to(device)
#
#             # PREDICTIONS
#             y_hat = ncf(users, jobs)
#
#             noise = np.random.laplace(delta/n_inputs)
#             # BINARY CROSS-ENTROPY LOSS
#             loss = nn.BCELoss(y_hat + np.random.laplace(), ratings.unsqueeze(1)) + noise
#
#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()
#
#             if j % int(1 + it_per_epoch / 10) == 0:
#                 print(f"Progress: {round(100 * j / it_per_epoch)}%")
#             j += 1
#         ht, ndcg = evaluate_fine_tune(ncf, test, top_k, random_samples)
#         print(f'Hit Ratio: {ht}  NDCG: {ndcg}   LOSS1: {loss1}  LOSS2: {loss2} ')

# -----------------------------------------------------------------
# Launch the differentially private fine-tuning run; it prints progress and
# per-epoch metrics and updates `ncf` in place.
train_differential_privacy_model()

In [None]:
# Persist only the model's state_dict (not the full module object) to disk.
torch.save(ncf.state_dict(), "models/DF_NCF")