In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from attack import (
    reconstruct_interactions,
)
from dataset import (
    LearningToRankDataset,
)
from more_itertools import grouper
from ranker import (
    LinearPDGDRanker,
    Neural1LayerPDGDRanker,
    Neural2LayerPDGDRanker,
)
from scipy.stats import ks_2samp
from tqdm.notebook import tqdm
from utils import (
    CascadeClickModel,
    Metrics,
    apply_gaussian_mechanism,
)

In [None]:
def set_seed(seed):
    """Seed every random number generator used by the experiments.

    Args:
        seed: Integer seed applied to PyTorch, Python's ``random`` module and
            NumPy's legacy global generator.
    """
    # Each library keeps its own RNG state, so all three are seeded with the
    # caller-supplied value (rather than a fixed constant) to make runs with
    # different seeds actually differ.
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)

# Remember to enable normalization when switching to the MSLR dataset.
data = LearningToRankDataset(
    "../dataset/MQ2008/Fold1/test.txt",
    normalize=False,
)
num_features = data.get_num_features()

# Rankers under evaluation, keyed by the name used in the metric labels.
# Uncomment an entry to include that architecture in the run.
models = dict(
    linear_pdgd=LinearPDGDRanker(num_features),
    # neural_4_pdgd=Neural1LayerPDGDRanker(num_features, hidden_size=4),
    # neural_8_pdgd=Neural1LayerPDGDRanker(num_features, hidden_size=8),
    # neural_16_pdgd=Neural1LayerPDGDRanker(num_features, hidden_size=16),
    # neural_4_2_pdgd=Neural2LayerPDGDRanker(
    #     num_features, hidden_size=4, hidden_size2=2
    # ),
    # neural_8_4_pdgd=Neural2LayerPDGDRanker(
    #     num_features, hidden_size=8, hidden_size2=4
    # ),
    # neural_16_8_pdgd=Neural2LayerPDGDRanker(
    #     num_features, hidden_size=16, hidden_size2=8
    # ),
)

# Simulated user behaviours, keyed by the name used in the metric labels.
# NOTE(review): the probability lists are presumably indexed by relevance
# grade (three grades here) -- confirm against CascadeClickModel.
click_models = dict(
    # perfect=CascadeClickModel(prob_click=[0.0, 0.5, 1.0], prob_stop=[0.0, 0.0, 0.0]),
    navigational=CascadeClickModel(
        prob_click=[0.05, 0.5, 0.95],
        prob_stop=[0.2, 0.5, 0.9],
    ),
    informational=CascadeClickModel(
        prob_click=[0.4, 0.7, 0.9],
        prob_stop=[0.1, 0.3, 0.5],
    ),
)

# MSLR Click Model
# click_models = {
#     "perfect": CascadeClickModel(
#         prob_click=[0.0, 0.2, 0.4, 0.8, 1.0], prob_stop=[0.0, 0.0, 0.0, 0.0, 0.0]
#     ),
#     "navigational": CascadeClickModel(
#         prob_click=[0.05, 0.3, 0.5, 0.7, 0.95], prob_stop=[0.2, 0.3, 0.5, 0.7, 0.9]
#     ),
#     "informational": CascadeClickModel(
#         prob_click=[0.4, 0.6, 0.7, 0.8, 0.9], prob_stop=[0.1, 0.2, 0.3, 0.4, 0.5]
#     ),
# }

In [None]:
# PDGD: multiple queries, random order, DP

def worker(num_query):
    """Simulate the click-reconstruction attack on updates built from ``num_query`` queries.

    Query ids are shuffled and split into groups of ``num_query`` (an
    incomplete trailing group is dropped).  For every group, every ranker in
    the module-level ``models`` dict and every simulator in the module-level
    ``click_models`` dict, the function

    1. samples a ranking per query and simulates clicks on it,
    2. applies the resulting local updates in random order,
    3. passes the updated parameters through ``apply_gaussian_mechanism``
       once per privacy budget in ``epsilons``, and
    4. calls ``reconstruct_interactions`` to recover the clicks, recording
       the outcome and a random-guess baseline in a ``Metrics`` object.

    Args:
        num_query: Number of queries whose updates are combined into one
            observed update.  Also passed to ``set_seed``.

    Returns:
        The ``Metrics`` instance accumulated during the run, so the caller can
        aggregate results across workers.  The same metrics are also saved to
        ``../output/ltr_{num_query}_metrics.csv``.
        NOTE(review): when run through ``multiprocessing.Pool`` the returned
        object must be picklable -- confirm for ``Metrics``.
    """
    set_seed(num_query)

    data = LearningToRankDataset("../dataset/MQ2008/Fold1/test.txt", normalize=False)

    # Simulated-user settings.
    num_item_per_ranking = 10  # rankings are truncated to this many documents
    local_lr = 1e-01  # step size of each local update

    # Attack settings, forwarded to reconstruct_interactions.
    num_sim_round = 1
    atk_lr = 1e-01
    max_iter = 1000
    num_atk = 1

    # Differential-privacy settings, forwarded to apply_gaussian_mechanism.
    epsilons = [1.0, 10.0, 100.0, math.inf]
    delta = 1e-08
    sensitivity = 4.0

    metrics = Metrics()

    def train(model, params, grouped_train_data, local_lr):
        """Apply one update step per query, in list order, starting from a copy of ``params``.

        Args:
            model: Ranker exposing ``grad(params, features, ranking, interactions)``.
            params: Initial parameters; left unmodified.
            grouped_train_data: Iterable of ``(features, ranking, interactions)`` triples.
            local_lr: Step size multiplied with each value returned by ``model.grad``.

        Returns:
            The parameters after all steps have been applied.
        """
        cur_params = params.clone()

        for features, ranking, interactions in grouped_train_data:
            cur_grad = model.grad(
                cur_params,
                features,
                ranking,
                interactions,
            )

            # The step is added, not subtracted.
            cur_params = cur_params + local_lr * cur_grad

        return cur_params

    def simulate_attack(
        model,
        model_name,
        grouped_data,
        click_model,
        epsilons,
        click_model_name,
        num_query,
    ):
        """Simulate one user's update and attack it once per privacy budget.

        Results are written to the enclosing ``metrics`` object; nothing is
        returned.  ``model_name``, ``click_model_name`` and ``num_query`` are
        only used to build the metric labels.
        """
        params = model.gen_params()

        grouped_train_data = []
        # (start, end) slice of each query's clicks inside the flat interaction vector.
        indices = []
        start_ind = 0
        for relevances, features in grouped_data:
            # Queries with a single document are skipped.
            if len(relevances) == 1:
                continue
            features = torch.Tensor(features)
            ranking = model.rank(params, features, sample=True)[:num_item_per_ranking]
            features = features[ranking]
            interactions = torch.Tensor(click_model.click(ranking, relevances))

            # Remap the original ranking into the correct range: the document
            # indices are replaced by positions within the truncated feature
            # matrix (assumes the indices in `ranking` are unique).
            _, ranking = torch.where(
                torch.sort(ranking)[0].unsqueeze(1) == ranking.unsqueeze(0)
            )
            grouped_train_data.append((features, ranking, interactions))
            indices.append((start_ind, start_ind + len(ranking)))
            start_ind += len(ranking)

        if len(grouped_train_data) < 1:
            return

        # The updates are applied in a random query order.
        raw_target = train(
            model,
            params,
            random.sample(grouped_train_data, len(grouped_train_data)),
            local_lr,
        )

        for epsilon in epsilons:
            # Express the observed parameters as an update direction relative
            # to the initial parameters, in the same units as the attack's
            # forward function below.
            target = (
                apply_gaussian_mechanism(raw_target, epsilon, delta, sensitivity) - params
            ) / local_lr

            preds_raw, _ = reconstruct_interactions(
                # Forward function: re-run training with candidate clicks `I`,
                # sliced back into per-query chunks via `indices`.
                lambda I: (train(
                    model,
                    params,
                    [
                        (features, ranking, I[indices[idx][0] : indices[idx][1]])
                        for idx, (features, ranking, _) in enumerate(grouped_train_data)
                    ],
                    local_lr,
                ) - params) / local_lr,
                target,
                indices[-1][1],  # total number of interactions to reconstruct
                lr=atk_lr,
                max_iter=max_iter,
                num_rounds=num_atk,
                return_raw=True,
            )
            preds = preds_raw.sigmoid().round().long()
            interactions = torch.cat([I for (_, _, I) in grouped_train_data])

            metrics.update(
                f"{model_name}_{click_model_name}_{num_query}_query_eps_{epsilon}",
                interactions,
                preds,
                preds_raw=preds_raw,
            )

            # Random guess
            random_preds_raw = torch.rand(preds_raw.shape)
            random_preds = random_preds_raw.round()
            metrics.update(
                f"random_{click_model_name}_{num_query}_query_eps_{epsilon}",
                interactions,
                random_preds,
                preds_raw=random_preds_raw,
            )

    for _ in tqdm(range(num_sim_round)):
        query_ids = data.get_all_query_ids()
        query_ids = random.sample(query_ids, len(query_ids))
        for qids in tqdm(grouper(query_ids, num_query, incomplete="ignore"), total=len(query_ids)//num_query):
            grouped_data = data.get_data_for_queries(list(qids))

            for model_name, model in models.items():
                for click_model_name, click_model in click_models.items():
                    simulate_attack(
                        model,
                        model_name,
                        grouped_data,
                        click_model,
                        epsilons,
                        click_model_name,
                        num_query,
                    )

    metrics.save(f"../output/ltr_{num_query}_metrics.csv")

    # Hand the metrics back so a driver can combine the results of several workers.
    return metrics

In [None]:
from multiprocessing import Pool, cpu_count

import pandas as pd

if __name__ == "__main__":
    # Run one worker per "queries per user" setting in parallel, then merge
    # the per-worker metrics into a single CSV.
    num_query_per_user = [1, 4, 8, 12, 16]

    # One process per configuration is enough; never request more than the
    # machine offers.
    num_procs = min(len(num_query_per_user), cpu_count())

    # The context manager shuts the pool down once all results are collected.
    with Pool(processes=num_procs) as pool:
        metrics = pool.map(worker, num_query_per_user)

    # Skip missing results so a worker that returned nothing gives a clear
    # message instead of an AttributeError, and guard pd.concat against an
    # empty list (it raises ValueError).
    metric_frames = [m.df for m in metrics if m is not None]
    if metric_frames:
        final_metrics_df = pd.concat(metric_frames)
        final_metrics_df.to_csv("../output/ltr_output_final.csv", index=False)
    else:
        print(
            "No metrics were returned by the workers; "
            "../output/ltr_output_final.csv was not written."
        )