In [1]:
import math
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from attack import (
    reconstruct_interactions,
    reconstruct_interactions_functional,
    optimize_ltr_data_manipulation_es,
    optimize_ltr_data_manipulation_grad,
)
from dataset import (
    LearningToRankDataset,
)
from more_itertools import grouper
from ranker import (
    LinearPDGDRanker,
    Neural1LayerPDGDRanker,
    Neural2LayerPDGDRanker,
)
from tqdm.notebook import tqdm
from utils import (
    CascadeClickModel,
    Metrics,
    apply_gaussian_mechanism,
)

In [None]:
def set_seed(seed=2023):
    """Seed the torch, `random` and numpy global RNGs with one shared value.

    Args:
        seed: Integer seed handed to all three generators. Defaults to 2023,
            so a bare ``set_seed()`` call behaves exactly as before.
    """
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

# Training split shared by the cells below. MQ2008 is loaded un-normalized
# here; pass normalize=True instead when pointing the path at MSLR.
data = LearningToRankDataset(
    "../dataset/MQ2008/Fold1/train.txt",
    normalize=False,
)
num_features = data.get_num_features()

# Rankers under test, keyed by the label that ends up in the metric names.
# Further variants that can be switched on by moving them into the dict:
#   neural_4_pdgd=Neural1LayerPDGDRanker(num_features, hidden_size=4),
#   neural_8_pdgd=Neural1LayerPDGDRanker(num_features, hidden_size=8),
#   neural_16_pdgd=Neural1LayerPDGDRanker(num_features, hidden_size=16),
#   neural_4_2_pdgd=Neural2LayerPDGDRanker(
#       num_features, hidden_size=4, hidden_size2=2
#   ),
#   neural_8_4_pdgd=Neural2LayerPDGDRanker(
#       num_features, hidden_size=8, hidden_size2=4
#   ),
models = dict(
    linear_pdgd=LinearPDGDRanker(num_features),
    neural_16_8_pdgd=Neural2LayerPDGDRanker(
        num_features,
        hidden_size=16,
        hidden_size2=8,
    ),
)

# Simulated click behaviours, keyed by the label used in the metric names.
# Each list has three entries -- presumably one probability per relevance
# grade; confirm against CascadeClickModel.
# Disabled variant:
#   perfect=CascadeClickModel(prob_click=[0.0, 0.5, 1.0], prob_stop=[0.0, 0.0, 0.0]),
click_models = dict(
    navigational=CascadeClickModel(
        prob_click=[0.05, 0.5, 0.95],
        prob_stop=[0.2, 0.5, 0.9],
    ),
    informational=CascadeClickModel(
        prob_click=[0.4, 0.7, 0.9],
        prob_stop=[0.1, 0.3, 0.5],
    ),
)

# MSLR click models (five-entry probability lists); swap in for the
# definition above when running on MSLR.
# click_models = dict(
#     # perfect=CascadeClickModel(
#     #     prob_click=[0.0, 0.2, 0.4, 0.8, 1.0], prob_stop=[0.0, 0.0, 0.0, 0.0, 0.0]
#     # ),
#     navigational=CascadeClickModel(
#         prob_click=[0.05, 0.3, 0.5, 0.7, 0.95], prob_stop=[0.2, 0.3, 0.5, 0.7, 0.9]
#     ),
#     # informational=CascadeClickModel(
#     #     prob_click=[0.4, 0.6, 0.7, 0.8, 0.9], prob_stop=[0.1, 0.2, 0.3, 0.4, 0.5]
#     # ),
# )

In [None]:
# Optimize manipulation for multiple queries + DP
# Configuration for the attack experiment in this cell. Every name below is a
# module-level global read by train(), gen_adm_perturbation(),
# simulate_attack() and the driver loop.

# Reseed torch / random / numpy before anything in this cell samples.
set_seed()

# Group sizes to sweep: the shuffled query ids are chunked into groups of this
# many queries, and each group is attacked as one unit.
num_query_per_user = [4, 8, 12, 16]
# Rankings are truncated to at most this many items.
num_item_per_ranking = 10
# Step size applied to each per-query gradient inside train().
local_lr = 1e-01
# Number of passes of the outer simulation loop.
num_sim_round = 1

# Optimization for data
# (forwarded to optimize_ltr_data_manipulation_es as max_epochs,
# num_reconstructions_per_step and lr respectively)
adm_epochs = 50
num_reconstructions_per_step = 1
adm_reconstruct_lr = 0.1
# Disabled knobs, not referenced by the code in this cell:
# adm_reconstruct_epochs = 30
# adm_lr = 1e-02
# adm_max_grad_l1_norm = None
# adm_grad_clip = None
# adm_l1_factor = 0

# Reconstruction
# (forwarded to reconstruct_interactions as num_rounds, max_iter and lr)
num_atk = 1
max_iter = 1000
atk_lr = 0.1

# Arguments for apply_gaussian_mechanism; one run is made per epsilon.
# math.inf presumably means "no noise" -- confirm in apply_gaussian_mechanism.
epsilons = [1.0, 10.0, 20.0, 100.0, 500.0, math.inf]
delta = 1e-08
sensitivity = 0.5

# Accumulator that simulate_attack() updates and the driver loop summarizes.
metrics = Metrics()

def train(model, params, grouped_train_data, local_lr=local_lr):
    """Apply one sequential pass of per-query updates and return the delta.

    Starting from a copy of ``params``, each ``(features, ranking,
    interactions)`` triple contributes ``local_lr * model.grad(...)``, where
    the gradient is evaluated at the parameters produced by the preceding
    triples. ``params`` itself is never modified.

    Args:
        model: Object exposing ``grad(params, features, ranking, interactions)``.
        params: Starting parameter tensor.
        grouped_train_data: Iterable of ``(features, ranking, interactions)``.
        local_lr: Step size. The default is whatever the module-level
            ``local_lr`` held at the moment this ``def`` was executed.

    Returns:
        The accumulated change, i.e. final parameters minus ``params``.
    """
    updated = params.clone()
    for sample in grouped_train_data:
        feats, order, clicks = sample
        step = model.grad(updated, feats, order, clicks)
        # Out-of-place add: a new tensor is bound on every iteration.
        updated = updated + local_lr * step
    delta = updated - params
    return delta


def gen_adm_perturbation(model, num_query):
    """Run the ES-based data-manipulation optimizer for one model / group size.

    Thin wrapper around ``optimize_ltr_data_manipulation_es`` that supplies the
    notebook-level ``train`` function, dataset and hyper-parameters.

    Args:
        model: Ranker to optimize the manipulation for.
        num_query: Number of queries per group.

    Returns:
        Whatever the optimizer returns, unchanged. The driver loop unpacks it
        as ``(adm_perturb, adm_masks)``.
    """
    es_options = dict(
        max_epochs=adm_epochs,
        num_reconstructions_per_step=num_reconstructions_per_step,
        seed=2023,
        lr=adm_reconstruct_lr,
        max_iter=100,
    )
    result = optimize_ltr_data_manipulation_es(
        train,
        model,
        data,
        num_query,
        num_features,
        num_item_per_ranking,
        **es_options,
    )
    return result

def simulate_attack(model, model_name, grouped_data, click_model, epsilons, click_model_name, num_query, adm_perturb, adm_masks):
    """Simulate one local update on a group of queries and attack it.

    For every query in ``grouped_data`` a ranking is sampled, clicks are
    simulated, and three feature variants are built: the original features,
    the features multiplied by the ADM mask, and the masked features plus
    Gaussian noise. Each variant is turned into a parameter update with
    ``train``. For every epsilon, that update is passed through
    ``apply_gaussian_mechanism`` and handed to ``reconstruct_interactions``,
    whose output is compared with the simulated clicks via the module-level
    ``metrics`` object.

    Besides its arguments the function reads the module-level names
    ``num_item_per_ranking``, ``local_lr``, ``delta``, ``sensitivity``,
    ``atk_lr``, ``max_iter``, ``num_atk`` and ``metrics``.

    Args:
        model: Ranker exposing ``gen_params()`` and ``rank(...)`` (and
            ``grad(...)``, used through ``train``).
        model_name: Label used only in the metric name.
        grouped_data: Sequence of ``(relevances, features)`` pairs, one per query.
        click_model: Object exposing ``click(ranking, relevances)``.
        epsilons: Iterable of epsilon values; one attack is run per value.
        click_model_name: Label used only in the metric name.
        num_query: Group size; used only in the metric name.
        adm_perturb: Indexable whose items 0 and 1 are passed to
            ``torch.normal`` as mean and standard deviation.
        adm_masks: Indexable by query position in ``grouped_data``; each entry
            is sliced to the ranking length and multiplied with the features.

    Returns:
        None. Results are recorded through ``metrics.update``. Nothing is
        recorded when every query was skipped.
    """
    # Fresh parameters for this simulated update.
    params = model.gen_params()

    grouped_train_data = []
    grouped_train_data_adm_simple = []
    grouped_train_data_adm_opt = []
    # (start, end) offsets of each kept query inside the concatenated click vector.
    indices = []
    start_ind = 0

    for i, (relevances, features) in enumerate(grouped_data):
        # Queries with a single candidate are skipped.
        if len(relevances) == 1:
            continue
        features = torch.Tensor(features)
        ranking = model.rank(params, features, sample=True)[:num_item_per_ranking]
        # Keep only the ranked rows, in ranking order.
        features = features[ranking]
        num_items = len(ranking)
        # i is the position in grouped_data, so skipped queries still consume a mask slot.
        mask = adm_masks[i][:num_items,:]
        # "simple" ADM: elementwise product with the mask (presumably 0/1 valued -- confirm).
        features_adm = features * mask
        # "optimized" ADM: additionally add Gaussian noise weighted by (1 - mask).
        features_adm_opt = features_adm + torch.normal(adm_perturb[0], adm_perturb[1], features_adm.shape) * (1.0 - mask)
        # features_adm_opt = features_adm + adm_perturb[:num_items,:] * (1.0 - mask)
        interactions = torch.Tensor(click_model.click(ranking, relevances))

        # Remap the original ranking into the correct range
        # `ranking` holds indices into the query's full candidate list, but
        # `features` now only has num_items rows. The equality matrix compares
        # the ascending-sorted indices (rows) with `ranking` (columns), and
        # torch.where returns, for each sorted index in turn, the column where
        # it occurs in `ranking` (assuming the entries are unique).
        # NOTE(review): `features` is already in ranking order here; confirm
        # model.grad expects this position vector rather than the identity order.
        _, ranking = torch.where(
            torch.sort(ranking)[0].unsqueeze(1) == ranking.unsqueeze(0)
        )

        # All three variants share the same ranking and simulated clicks.
        grouped_train_data.append((features, ranking, interactions))
        grouped_train_data_adm_simple.append((features_adm, ranking, interactions))
        grouped_train_data_adm_opt.append((features_adm_opt, ranking, interactions))
        indices.append((start_ind, start_ind + num_items))
        start_ind += num_items

    # Nothing to attack if every query was skipped.
    if len(grouped_train_data) < 1:
        return
        
    grouped_train_data_dict = {
        "no_adm": grouped_train_data,
        "adm_simple": grouped_train_data_adm_simple,
        "adm_optimized": grouped_train_data_adm_opt,
    }
    # Noise-free update per variant. random.sample with the full length yields
    # a shuffled copy, so each update is computed over a random query order.
    raw_target_dict = {
        key: train(
            model,
            params,
            random.sample(train_data, len(train_data)),
            local_lr,
        ) for key, train_data in grouped_train_data_dict.items()
    }

    for epsilon in epsilons:
        for key, raw_target in raw_target_dict.items():
            # The attacker observes the update after the Gaussian mechanism.
            target = (apply_gaussian_mechanism(raw_target, epsilon, delta, sensitivity))
            train_data = grouped_train_data_dict[key]
            # The callable replays train() with candidate clicks I, sliced per
            # query via `indices`; features and rankings are the true ones.
            # NOTE(review): the replay uses the unshuffled query order while
            # the target came from a shuffled order -- confirm this is intended.
            preds_raw, _ = reconstruct_interactions(
                lambda I: (train(
                    model,
                    params,
                    [
                        (features, ranking, I[indices[idx][0] : indices[idx][1]])
                        for idx, (features, ranking, _) in enumerate(train_data)
                    ],
                    local_lr,
                )),
                target,
                indices[-1][1],  # total number of click entries across kept queries
                lr=atk_lr,
                max_iter=max_iter,
                num_rounds=num_atk,
                return_raw=True,
            )
            # Sigmoid then round: binarize the raw predictions at 0.5.
            preds = preds_raw.sigmoid().round().long()
            # Simulated clicks, concatenated in the same order as `indices`.
            interactions = torch.cat([I for (_, _, I) in train_data])

            metrics.update(
                f"{model_name}_{click_model_name}_{num_query}_query_eps_{epsilon}_{key}",
                interactions,
                preds,
                preds_raw=preds_raw,
            )

# Driver: each round reshuffles the query ids, then sweeps every group size,
# model and click model. The loop order determines how the seeded RNG streams
# are consumed, so reordering statements changes the results.
for _ in tqdm(range(num_sim_round)):
    query_ids = data.get_all_query_ids()
    # random.sample with k == len(...) returns a shuffled copy.
    query_ids = random.sample(query_ids, len(query_ids))

    for num_query in num_query_per_user:
        # Optimize one ADM perturbation per model for this group size up
        # front; it is reused for every query group below.
        model_perturb = {}
        for model_name, model in models.items():
            print(f"Optimizing ADM for {model_name} with {num_query} queries")
            model_perturb[model_name] = gen_adm_perturbation(model, num_query)
            print(f"ADM params: {model_perturb[model_name][0]}")
        print("Num query", num_query)
        # incomplete="ignore" drops a trailing group with fewer than num_query
        # ids, which is why the progress-bar total uses floor division.
        for qids in tqdm(grouper(query_ids, num_query, incomplete="ignore"), total=len(query_ids)//num_query):
            grouped_data = data.get_data_for_queries(list(qids))
            for model_name, model in models.items():
                adm_perturb, adm_masks = model_perturb[model_name]
                for click_model_name, click_model in click_models.items():                    
                    simulate_attack(model, model_name, grouped_data, click_model, epsilons, click_model_name, num_query, adm_perturb, adm_masks)

# Summary statistics of the "auc" and "auc-pr" columns per experiment name.
print(metrics.df[["name", "auc", "auc-pr"]].groupby("name").describe().to_string())
# metrics.save("../output/ltr_metrics.csv")

In [None]:
# PDGD: Evaluate NDCG on test
import os
import pickle
from utils import (
    LtrEvaluator,
)

set_seed()

# The test split must come from the same collection, with the same
# normalization, as the training `data`: the rankers in `models` were sized
# from that dataset's feature count. Update this line together with the
# training dataset whenever the collection is switched.
test_data = LearningToRankDataset("../dataset/MQ2008/Fold1/test.txt", normalize=False)
assert test_data.get_num_features() == num_features, (
    "test_data and the training data disagree on the number of features; "
    "load both splits from the same dataset"
)

# Kept for parity with the attack cell; not read by the code below.
num_query_per_user = [1]
num_item_per_ranking = 10
local_lr = 1e-01
epsilons = [1.0, 10.0, 20.0, 100.0, 500.0, math.inf]
delta = 1e-08
sensitivity = 0.5
# Number of noisy per-query updates averaged into one parameter step.
num_users_per_agg = 100

# One (model_name, click_model_name, epsilon, ndcg_curve) tuple per configuration.
results = []
evaluator = LtrEvaluator(test_data, num_item_per_ranking)

# One shuffled query order, reused for every configuration.
query_ids = data.get_all_query_ids()
query_ids = random.sample(query_ids, len(query_ids))

for model_name, model in models.items():
    for click_model_name, click_model in click_models.items():
        for epsilon in epsilons:
            print(f"Model: {model_name} | Click model: {click_model_name} | Epsilon: {epsilon}")
            ndcgs = []
            model_params = model.gen_params()
            grad_arr = []

            # NDCG of the freshly generated parameters, before any update.
            ndcgs.append(evaluator.calculate_average_offline_ndcg(model, model_params))

            for qid in tqdm(query_ids):
                relevances, features = data.get_data_for_queries([qid])[0]

                features = torch.Tensor(features)
                ranking = model.rank(model_params, features, sample=True)[:num_item_per_ranking]
                features = features[ranking]
                interactions = torch.Tensor(click_model.click(ranking, relevances))

                # Remap the original ranking into the correct range: for each
                # selected index in ascending order, take the position where
                # it occurs in `ranking`.
                _, ranking = torch.where(
                    torch.sort(ranking)[0].unsqueeze(1) == ranking.unsqueeze(0)
                )

                raw_grad = local_lr * model.grad(
                    model_params,
                    features,
                    ranking,
                    interactions,
                )

                # Each per-query update is noised before it is aggregated.
                grad_arr.append(apply_gaussian_mechanism(raw_grad, epsilon, delta, sensitivity))

                # Once a full batch is collected, apply its mean and re-evaluate.
                if len(grad_arr) == num_users_per_agg:
                    model_params = model_params + torch.stack(grad_arr).mean(dim=0)
                    grad_arr = []

                    ndcgs.append(evaluator.calculate_average_offline_ndcg(model, model_params))

            # Flush the final, partially filled batch.
            if grad_arr:
                model_params = model_params + torch.stack(grad_arr).mean(dim=0)
                ndcgs.append(evaluator.calculate_average_offline_ndcg(model, model_params))

            # Store the click model's name, like the model's name, so the
            # pickle holds plain data rather than a CascadeClickModel object.
            results.append((model_name, click_model_name, epsilon, ndcgs))
            print(ndcgs[-1])

# Create the output directory first so a long run is not lost at the final
# write when the directory does not exist yet.
os.makedirs("../output", exist_ok=True)
with open("../output/ltr_utility_eval.pkl", 'wb') as fp:
    pickle.dump(results, fp)