In [1]:
import math
import matplotlib.pyplot as plt
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from attack import (
    reconstruct_interactions,
    interaction_mia_fedrec,
)
from dataset import (
    LearningToRankDataset,
    MovieLens,
)
from more_itertools import grouper
from ranker import (
    LinearPDGDRanker,
    Neural1LayerPDGDRanker,
    Neural2LayerPDGDRanker,
    CollaborativeFilteringRecommender,
    NeuralCollaborativeFilteringRecommender,
)
from scipy.stats import ks_2samp
from tqdm.notebook import tqdm
from utils import (
    CascadeClickModel,
    Metrics,
    apply_gaussian_mechanism,
)

In [2]:
def set_seed():
    torch.manual_seed(2023)
    random.seed(2023)
    np.random.seed(2023)

data = LearningToRankDataset("../dataset/MQ2008/Fold1/test.txt")
num_features = data.get_num_features()

models = {
    "linear_pdgd": LinearPDGDRanker(num_features),
    # "neural_4_pdgd": Neural1LayerPDGDRanker(num_features, hidden_size=4),
    # "neural_8_pdgd": Neural1LayerPDGDRanker(num_features, hidden_size=8),
    # "neural_16_pdgd": Neural1LayerPDGDRanker(num_features, hidden_size=16),
    # "neural_4_2_pdgd": Neural2LayerPDGDRanker(
    #     num_features, hidden_size=4, hidden_size2=2
    # ),
    # "neural_8_4_pdgd": Neural2LayerPDGDRanker(
    #     num_features, hidden_size=8, hidden_size2=4
    # ),
    # "neural_16_8_pdgd": Neural2LayerPDGDRanker(
    #     num_features, hidden_size=16, hidden_size2=4
    # ),
}

click_models = {
    # "perfect": CascadeClickModel(prob_click=[0.0, 0.5, 1.0], prob_stop=[0.0, 0.0, 0.0]),
    "navigational": CascadeClickModel(
        prob_click=[0.05, 0.5, 0.95], prob_stop=[0.2, 0.5, 0.9]
    ),
    "informational": CascadeClickModel(
        prob_click=[0.4, 0.7, 0.9], prob_stop=[0.1, 0.3, 0.5]
    ),
}

Loading ../dataset/MQ2008/Fold1/test.txt


2874it [00:00, 43676.74it/s]


In [27]:
# Simulation for LTR

set_seed()

num_sim_round = 10
num_features = 10
num_data = 100
atk_lr = 1e-01
max_iter = 1000
num_atk = 1

metrics = Metrics()

models = {
    "linear_pdgd": LinearPDGDRanker(num_features),
    # "neural_1_pdgd": Neural1LayerPDGDRanker(num_features, hidden_size=5),
    # "neural_2_pdgd": Neural2LayerPDGDRanker(
    #     num_features, hidden_size=4, hidden_size2=2
    # ),
}

for _ in tqdm(range(num_sim_round)):
    features = torch.rand(num_data, num_features) * 2 - 1
    interactions = torch.randint(0, 2, (num_data,))
    while interactions.sum() == 0:
        interactions = torch.randint(0, 2, (num_data,))
    
    ranking = list(range(num_data))
    random.shuffle(ranking)
    ranking = torch.LongTensor(ranking)

    for model_name, model in models.items():
        params = model.gen_params()
        log_pos_bias_weight = model.calc_log_pos_bias_weight(
            ranking, model.forward_multiple(params, features), num_data
        )
        
        target = model.grad(
            params,
            features,
            ranking,
            interactions,
            log_pos_bias_weight=log_pos_bias_weight,
        )

        preds_raw, _ = reconstruct_interactions(
            lambda I: model.grad(
                params, features, ranking, I, log_pos_bias_weight=log_pos_bias_weight
            ),
            target,
            num_data,
            lr=atk_lr,
            max_iter=max_iter,
            num_rounds=num_atk,
            return_raw=True,
        )
        preds = preds_raw.sigmoid().round().long()

        metrics.update(model_name, interactions, preds, preds_raw=preds_raw)

    # Data manipulation
    if num_data > num_features:
        num_new_features = num_data - num_features
        new_features = torch.rand(num_data, num_new_features)
        features = torch.cat([features, new_features], dim=1)

        model = LinearPDGDRanker(num_features + num_new_features)
        params = model.gen_params()
        log_pos_bias_weight = model.calc_log_pos_bias_weight(
            ranking, model.forward_multiple(params, features), num_data
        )
        
        target = model.grad(
            params,
            features,
            ranking,
            interactions,
            log_pos_bias_weight=log_pos_bias_weight,
        )

        preds_raw, _ = reconstruct_interactions(
            lambda I: model.grad(
                params, features, ranking, I, log_pos_bias_weight=log_pos_bias_weight
            ),
            target,
            num_data,
            lr=atk_lr,
            max_iter=max_iter,
            num_rounds=num_atk,
            return_raw=True,
        )
        preds = preds_raw.sigmoid().round().long()

        metrics.update(model_name + "_DM", interactions, preds, preds_raw=preds_raw)

100%|██████████| 10/10 [05:11<00:00, 31.16s/it]


In [29]:
print(metrics.get_dataframe().to_string())

              name  accuracy        f1  precision    recall       auc    auc-pr extra_data
0      linear_pdgd    0.5425  0.548148   0.587302  0.513889  0.558399  0.607230         {}
1   linear_pdgd_DM    1.0000  1.000000   1.000000  1.000000  1.000000  1.000000         {}
2      linear_pdgd    0.4700  0.459184   0.505618  0.420561  0.500879  0.536752         {}
3   linear_pdgd_DM    1.0000  1.000000   1.000000  1.000000  1.000000  1.000000         {}
4      linear_pdgd    0.5250  0.520202   0.512438  0.528205  0.517724  0.485470         {}
5   linear_pdgd_DM    1.0000  1.000000   1.000000  1.000000  1.000000  1.000000         {}
6      linear_pdgd    0.4950  0.459893   0.457447  0.462366  0.504648  0.476432         {}
7   linear_pdgd_DM    1.0000  1.000000   1.000000  1.000000  1.000000  1.000000         {}
8      linear_pdgd    0.5200  0.505155   0.505155  0.505155  0.500651  0.472847         {}
9   linear_pdgd_DM    1.0000  1.000000   1.000000  1.000000  1.000000  1.000000         {}

In [4]:
# Simulation for collaborative filtering

set_seed()

num_sim_round = 10
num_features = 64
num_data = 1000
atk_lr = 1e-01
max_iter = 100000
num_atk = 10

metrics = Metrics()

for _ in tqdm(range(num_sim_round)):
    # features = torch.rand(num_data, num_features) * 2 - 1
    # user_embedding = torch.rand(num_features) * 2 - 1
    # user_embedding2 = torch.rand(num_features) * 2 - 1
    features = torch.normal(0, 1, (num_data, num_features))
    user_embedding = torch.normal(0, 1, (num_features,))
    user_embedding2 = torch.normal(0, 1, (num_features,))

    interactions = torch.randint(0, 2, (num_data,))
    while interactions.sum() == 0:
        interactions = torch.randint(0, 2, (num_data,))

    preds_raw = torch.rand((num_data),)
    metrics.update("Random", interactions, preds_raw.sigmoid().round().long(), preds_raw=preds_raw)

    ncf_rec = NeuralCollaborativeFilteringRecommender(num_features, [128, 64, 32])

    target = ncf_rec.item_grad(user_embedding, features, interactions.float())
    scale = max(1.0 / target.mean().abs(), 1.0)
    target = scale * target

    preds_raw, _ = reconstruct_interactions(
        lambda I: scale * ncf_rec.item_grad(user_embedding2, features, I),
        target,
        num_data,
        lr=atk_lr,
        max_iter=max_iter,
        num_rounds=num_atk,
        return_raw=True,
    )
    preds = preds_raw.sigmoid().round().long()

    metrics.update(
        "FedNCF_simple",
        interactions,
        preds,
        preds_raw=preds_raw,
    )

    target = ncf_rec.item_grad(user_embedding, features, interactions.float())
    scale = max(1.0 / target.mean().abs(), 1.0)
    target = scale * target

    preds_raw, user_embedding_est, _ = reconstruct_interactions(
        lambda I, U: scale * ncf_rec.item_grad(U, features, I),
        target,
        num_data,
        private_params_size=num_features,
        lr=atk_lr,
        max_iter=max_iter,
        num_rounds=num_atk,
        return_raw=True,
    )
    preds = preds_raw.sigmoid().round().long()
    
    embedding_err = F.mse_loss(user_embedding_est, user_embedding).item()

    metrics.update(
        "FedNCF_private",
        interactions,
        preds,
        preds_raw=preds_raw,
        extra_data={"embedding_err": embedding_err},
    )

    item_grad = ncf_rec.item_grad(user_embedding, features, interactions.float()).flatten()
    scale = max(1.0 / item_grad.mean().abs(), 1.0)

    target = torch.cat(
        [
            scale * item_grad,
            ncf_rec.feature_grad(user_embedding, features, interactions.float()),
        ]
    )

    preds_raw, user_embedding_est, _ = reconstruct_interactions(
        lambda I, U: torch.cat(
            [
                scale * ncf_rec.item_grad(U, features, I).flatten(),
                ncf_rec.feature_grad(U, features, I, retain_graph=True),
            ]
        ),
        target,
        num_data,
        private_params_size=num_features,
        lr=atk_lr,
        max_iter=max_iter,
        num_rounds=num_atk,
        return_raw=True,
    )
    preds = preds_raw.sigmoid().round().long()

    embedding_err = F.mse_loss(user_embedding_est, user_embedding).item()

    metrics.update(
        "FedNCF_private2",
        interactions,
        preds,
        preds_raw=preds_raw,
        extra_data={"embedding_err": embedding_err},
    )

    target = ncf_rec.item_grad(user_embedding, features, interactions.float())

    preds = interaction_mia_fedrec(
        lambda I: ncf_rec.item_grad(user_embedding2, features, I.float()),
        target,
        num_data,
        select_ratio=interactions.float().mean(),
    )

    metrics.update(
        "FedNCF_IMIA",
        interactions,
        preds,
    )

100%|██████████| 10/10 [01:07<00:00,  6.77s/it]


In [5]:
print(metrics.get_dataframe().to_string())

               name  accuracy        f1  precision    recall       auc    auc-pr                             extra_data
0            Random     0.486  0.654105   0.486000  1.000000  0.514337  0.491345                                     {}
1     FedNCF_simple     0.968  0.967871   0.945098  0.991770  0.996982  0.996428                                     {}
2    FedNCF_private     1.000  1.000000   1.000000  1.000000  1.000000  1.000000  {"embedding_err": 0.9334539175033569}
3   FedNCF_private2     0.991  0.990769   0.987730  0.993827  0.999868  0.999861  {"embedding_err": 1.2190757989883423}
4       FedNCF_IMIA     0.516  0.502058   0.502058  0.502058       NaN       NaN                                     {}
5            Random     0.507  0.672860   0.507000  1.000000  0.521126  0.518385                                     {}
6     FedNCF_simple     0.971  0.970971   0.985772  0.956607  0.997447  0.997581                                     {}
7    FedNCF_private     0.994  0.994071 

In [None]:
# PDGD: single query, single epoch

set_seed()

num_item_per_ranking = 10
num_sim_round = 1
atk_lr = 1e-01
max_iter = 1000
num_atk = 1

metrics = Metrics()

def simulate_attack(model, features, relevances, click_model):
    params = model.gen_params()
    ranking = model.rank(params, features, sample=True)[:num_item_per_ranking]
    features = features[ranking]
    interactions = torch.Tensor(click_model.click(ranking, relevances))
    num_data = len(ranking)

    # Remap the original ranking into the correct range
    _, ranking = torch.where(
        torch.sort(ranking)[0].unsqueeze(1) == ranking.unsqueeze(0)
    )

    log_pos_bias_weight = model.calc_log_pos_bias_weight(
        ranking, model.forward_multiple(params, features), num_data
    )

    target = model.grad(
        params,
        features,
        ranking,
        interactions,
        log_pos_bias_weight=log_pos_bias_weight,
    )

    preds_raw, _ = reconstruct_interactions(
        lambda I: model.grad(
            params,
            features,
            ranking,
            I,
            log_pos_bias_weight=log_pos_bias_weight,
        ),
        target,
        num_data,
        lr=atk_lr,
        max_iter=max_iter,
        num_rounds=num_atk,
        return_raw=True,
    )
    preds = preds_raw.sigmoid().round().long()

    return (interactions, preds, preds_raw)


for _ in tqdm(range(num_sim_round)):
    for qid in tqdm(data.get_all_query_ids()):
        relevances, features = data.get_data_for_queries([qid])[0]
        features = torch.Tensor(features)

        for model_name, model in models.items():
            for click_model_name, click_model in click_models.items():
                interactions, preds, preds_raw = simulate_attack(
                    model, features, relevances, click_model
                )
                metrics.update(
                    f"{model_name}_{click_model_name}",
                    interactions,
                    preds,
                    preds_raw=preds_raw,
                )

                # Random guess
                random_preds_raw = torch.rand(preds_raw.shape)
                random_preds = random_preds_raw.round()
                metrics.update(
                    f"random_{click_model_name}",
                    interactions,
                    random_preds,
                    preds_raw=random_preds_raw,
                )

print(metrics.df[["name", "auc", "auc-pr"]].groupby("name").describe().to_string())

In [None]:
# PDGD: single query, multiple epochs

set_seed()

num_item_per_ranking = 10
num_local_epochs = 5
local_lr = 1e-01

num_sim_round = 1
atk_lr = 1e-01
max_iter = 1000
num_atk = 1

metrics = Metrics()

def train(model, params, features, ranking, interactions, num_local_epochs, local_lr):
    cur_params = params.clone()

    for _ in range(num_local_epochs):
        cur_grad = model.grad(
            cur_params,
            features,
            ranking,
            interactions,
        )

        cur_params = cur_params + local_lr * cur_grad

    return cur_params


def simulate_attack(model, features, relevances, click_model):
    params = model.gen_params()
    ranking = model.rank(params, features, sample=True)[:num_item_per_ranking]
    features = features[ranking]
    interactions = torch.Tensor(click_model.click(ranking, relevances))
    num_data = len(ranking)

    # Remap the original ranking into the correct range
    _, ranking = torch.where(
        torch.sort(ranking)[0].unsqueeze(1) == ranking.unsqueeze(0)
    )

    target = train(model, params, features, ranking, interactions, num_local_epochs, local_lr)

    preds_raw, _ = reconstruct_interactions(
        lambda I: train(model, params, features, ranking, I, num_local_epochs, local_lr),
        target,
        num_data,
        lr=atk_lr,
        max_iter=max_iter,
        num_rounds=num_atk,
        return_raw=True,
    )
    preds = preds_raw.sigmoid().round().long()

    return (interactions, preds, preds_raw)


for _ in tqdm(range(num_sim_round)):
    for qid in tqdm(data.get_all_query_ids()):
        relevances, features = data.get_data_for_queries([qid])[0]
        features = torch.Tensor(features)

        for model_name, model in models.items():
            for click_model_name, click_model in click_models.items():
                interactions, preds, preds_raw = simulate_attack(
                    model, features, relevances, click_model
                )
                metrics.update(
                    f"{model_name}_{click_model_name}",
                    interactions,
                    preds,
                    preds_raw=preds_raw,
                )

                # Random guess
                random_preds_raw = torch.rand(preds_raw.shape)
                random_preds = random_preds_raw.round()
                metrics.update(
                    f"random_{click_model_name}",
                    interactions,
                    random_preds,
                    preds_raw=random_preds_raw,
                )

print(metrics.df[["name", "auc", "auc-pr"]].groupby("name").describe().to_string())

In [None]:
# PDGD: multiple queries, no randomness

set_seed()

num_query_per_user = [1, 8, 16, 24, 32]
num_item_per_ranking = 10
local_lr = 1e-01

num_sim_round = 1
atk_lr = 1e-01
max_iter = 1000
num_atk = 1

metrics = Metrics()

def train(model, params, grouped_train_data, local_lr):
    cur_params = params.clone()

    for features, ranking, interactions in grouped_train_data:
        cur_grad = model.grad(
            cur_params,
            features,
            ranking,
            interactions,
        )

        cur_params = cur_params + local_lr * cur_grad

    return cur_params


def simulate_attack(model, grouped_data, click_model):
    params = model.gen_params()

    grouped_train_data = []
    indices = []
    start_ind = 0
    for relevances, features in grouped_data:
        features = torch.Tensor(features)
        ranking = model.rank(params, features, sample=True)[:num_item_per_ranking]
        features = features[ranking]
        interactions = torch.Tensor(click_model.click(ranking, relevances))

        # Remap the original ranking into the correct range
        _, ranking = torch.where(
            torch.sort(ranking)[0].unsqueeze(1) == ranking.unsqueeze(0)
        )
        grouped_train_data.append((features, ranking, interactions))
        indices.append((start_ind, start_ind + len(ranking)))
        start_ind += len(ranking)

    target = train(model, params, grouped_train_data, local_lr)

    preds_raw, _ = reconstruct_interactions(
        lambda I: train(
            model,
            params,
            [
                (features, ranking, I[indices[idx][0] : indices[idx][1]])
                for idx, (features, ranking, _) in enumerate(grouped_train_data)
            ],
            local_lr,
        ),
        target,
        indices[-1][1],
        lr=atk_lr,
        max_iter=max_iter,
        num_rounds=num_atk,
        return_raw=True,
    )
    preds = preds_raw.sigmoid().round().long()
    interactions = torch.cat([I for (_, _, I) in grouped_train_data])
    return (interactions, preds, preds_raw)


for num_query in num_query_per_user:
    print(f"Num query: {num_query}")
    for _ in tqdm(range(num_sim_round)):
        for qids in tqdm(
            grouper(data.get_all_query_ids(), num_query, incomplete="ignore"),
            total=len(data.get_all_query_ids()) // num_query
        ):
            grouped_data = data.get_data_for_queries(list(qids))

            for model_name, model in models.items():
                for click_model_name, click_model in click_models.items():
                    interactions, preds, preds_raw = simulate_attack(
                        model, grouped_data, click_model
                    )
                    metrics.update(
                        f"{model_name}_{click_model_name}_{num_query}_query",
                        interactions,
                        preds,
                        preds_raw=preds_raw,
                    )

                    # Random guess
                    random_preds_raw = torch.rand(preds_raw.shape)
                    random_preds = random_preds_raw.round()
                    metrics.update(
                        f"random_{click_model_name}_{num_query}_query",
                        interactions,
                        random_preds,
                        preds_raw=random_preds_raw,
                    )

print(metrics.df[["name", "auc", "auc-pr"]].groupby("name").describe().to_string())

In [None]:
# PDGD: multiple queries, random order

set_seed()

num_query_per_user = [8, 16, 24, 32]
num_item_per_ranking = 10
local_lr = 1e-01

num_sim_round = 1
atk_lr = 1e-01
max_iter = 1000
num_atk = 1

metrics = Metrics()

def train(model, params, grouped_train_data, local_lr):
    cur_params = params.clone()

    for features, ranking, interactions in grouped_train_data:
        cur_grad = model.grad(
            cur_params,
            features,
            ranking,
            interactions,
        )

        cur_params = cur_params + local_lr * cur_grad

    return cur_params


def simulate_attack(model, grouped_data, click_model):
    params = model.gen_params()

    grouped_train_data = []
    indices = []
    start_ind = 0
    for relevances, features in grouped_data:
        features = torch.Tensor(features)
        ranking = model.rank(params, features, sample=True)[:num_item_per_ranking]
        features = features[ranking]
        interactions = torch.Tensor(click_model.click(ranking, relevances))

        # Remap the original ranking into the correct range
        _, ranking = torch.where(
            torch.sort(ranking)[0].unsqueeze(1) == ranking.unsqueeze(0)
        )
        grouped_train_data.append((features, ranking, interactions))
        indices.append((start_ind, start_ind + len(ranking)))
        start_ind += len(ranking)

    target = train(
        model,
        params,
        random.sample(grouped_train_data, len(grouped_train_data)),
        local_lr,
    )

    preds_raw, _ = reconstruct_interactions(
        lambda I: train(
            model,
            params,
            [
                (features, ranking, I[indices[idx][0] : indices[idx][1]])
                for idx, (features, ranking, _) in enumerate(grouped_train_data)
            ],
            local_lr,
        ),
        target,
        indices[-1][1],
        lr=atk_lr,
        max_iter=max_iter,
        num_rounds=num_atk,
        return_raw=True,
    )
    preds = preds_raw.sigmoid().round().long()
    interactions = torch.cat([I for (_, _, I) in grouped_train_data])
    return (interactions, preds, preds_raw)


for num_query in num_query_per_user:
    print(f"Num query: {num_query}")
    for _ in tqdm(range(num_sim_round)):
        for qids in tqdm(
            grouper(data.get_all_query_ids(), num_query, incomplete="ignore"),
            total=len(data.get_all_query_ids()) // num_query,
        ):
            grouped_data = data.get_data_for_queries(list(qids))

            for model_name, model in models.items():
                for click_model_name, click_model in click_models.items():
                    interactions, preds, preds_raw = simulate_attack(
                        model, grouped_data, click_model
                    )
                    metrics.update(
                        f"{model_name}_{click_model_name}_{num_query}_query",
                        interactions,
                        preds,
                        preds_raw=preds_raw,
                    )

                    # Random guess
                    random_preds_raw = torch.rand(preds_raw.shape)
                    random_preds = random_preds_raw.round()
                    metrics.update(
                        f"random_{click_model_name}_{num_query}_query",
                        interactions,
                        random_preds,
                        preds_raw=random_preds_raw,
                    )

print(metrics.df[["name", "auc", "auc-pr"]].groupby("name").describe().to_string())

In [None]:
# PDGD: random queries

set_seed()

num_query_per_user = [8]
num_train_query = 3
num_item_per_ranking = 10
local_lr = 1.0

num_sim_round = 1
atk_lr = 1e-01
max_iter = 1000
num_atk = 1

metrics = Metrics()

def train(model, params, grouped_train_data, local_lr):
    cur_params = params.clone()

    for features, ranking, interactions in grouped_train_data:
        cur_grad = model.grad(
            cur_params,
            features,
            ranking,
            interactions,
        )

        cur_params = cur_params + local_lr * cur_grad

    return cur_params


def simulate_attack(model, grouped_data, click_model):
    params = model.gen_params()

    grouped_train_data = []
    indices = []
    start_ind = 0
    for relevances, features in grouped_data:
        features = torch.Tensor(features)
        ranking = model.rank(params, features, sample=True)[:num_item_per_ranking]
        features = features[ranking]
        interactions = torch.Tensor(click_model.click(ranking, relevances))

        # Remap the original ranking into the correct range
        _, ranking = torch.where(
            torch.sort(ranking)[0].unsqueeze(1) == ranking.unsqueeze(0)
        )
        grouped_train_data.append((features, ranking, interactions))
        indices.append((start_ind, start_ind + len(ranking)))
        start_ind += len(ranking)

    target_ind = random.sample(range(len(grouped_train_data)), num_train_query)

    target = train(
        model,
        params,
        [grouped_train_data[i] for i in target_ind],
        local_lr,
    )

    preds_raw, _ = reconstruct_interactions(
        lambda I: train(
            model,
            params,
            [
                (features, ranking, I[indices[idx][0] : indices[idx][1]])
                for idx, (features, ranking, _) in enumerate(grouped_train_data)
            ],
            local_lr,
        ),
        target,
        indices[-1][1],
        lr=atk_lr,
        max_iter=max_iter,
        num_rounds=num_atk,
        return_raw=True,
    )
    preds = preds_raw.sigmoid().round().long()
    interactions = torch.cat([I for (_, _, I) in grouped_train_data])
    return (interactions, preds, preds_raw, [indices[i] for i in target_ind])


for num_query in num_query_per_user:
    print(f"Num query: {num_query}")
    for _ in tqdm(range(num_sim_round)):
        for qids in tqdm(
            grouper(data.get_all_query_ids(), num_query, incomplete="ignore"),
            total=len(data.get_all_query_ids()) // num_query,
        ):
            grouped_data = data.get_data_for_queries(list(qids))

            for model_name, model in models.items():
                for click_model_name, click_model in click_models.items():
                    interactions, preds, preds_raw, indices = simulate_attack(
                        model, grouped_data, click_model
                    )

                    metrics.update(
                        f"{model_name}_{click_model_name}_{num_query}_query",
                        interactions,
                        preds,
                        preds_raw=preds_raw,
                    )

                    actual_interactions = []
                    actual_preds = []
                    actual_preds_raw = []
                    for (i1, i2) in indices:
                        actual_interactions += interactions[i1:i2]
                        actual_preds += preds[i1:i2]
                        actual_preds_raw += preds_raw[i1:i2]
                    
                    metrics.update(
                        f"{model_name}_{click_model_name}_{num_query}_query_actual",
                        actual_interactions,
                        actual_preds,
                        preds_raw=actual_preds_raw,
                    )

                    # Random guess
                    random_preds_raw = torch.rand(preds_raw.shape)
                    random_preds = random_preds_raw.round()
                    metrics.update(
                        f"random_{click_model_name}_{num_query}_query",
                        interactions,
                        random_preds,
                        preds_raw=random_preds_raw,
                    )

print(metrics.df[["name", "auc", "auc-pr"]].groupby("name").describe().to_string())

In [None]:
# PDGD: multiple queries, random order, DP

set_seed()

num_query_per_user = [1, 4, 8, 12, 16]
num_item_per_ranking = 10
local_lr = 1e-01

num_sim_round = 1
atk_lr = 1e-01
max_iter = 1000
num_atk = 1

epsilons = [0.125, 0.25, 0.5, 1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]
delta = 1e-08
sensitivity = 4.0

metrics = Metrics()

def train(model, params, grouped_train_data, local_lr):
    cur_params = params.clone()

    for features, ranking, interactions in grouped_train_data:
        cur_grad = model.grad(
            cur_params,
            features,
            ranking,
            interactions,
        )

        cur_params = cur_params + local_lr * cur_grad

    return cur_params


def simulate_attack(model, grouped_data, click_model, epsilon):
    params = model.gen_params()

    grouped_train_data = []
    indices = []
    start_ind = 0
    for relevances, features in grouped_data:
        features = torch.Tensor(features)
        ranking = model.rank(params, features, sample=True)[:num_item_per_ranking]
        features = features[ranking]
        interactions = torch.Tensor(click_model.click(ranking, relevances))

        # Remap the original ranking into the correct range
        _, ranking = torch.where(
            torch.sort(ranking)[0].unsqueeze(1) == ranking.unsqueeze(0)
        )
        grouped_train_data.append((features, ranking, interactions))
        indices.append((start_ind, start_ind + len(ranking)))
        start_ind += len(ranking)

    target = train(
        model,
        params,
        random.sample(grouped_train_data, len(grouped_train_data)),
        local_lr,
    )
    target = (apply_gaussian_mechanism(target, epsilon, delta, sensitivity) - params) / local_lr

    preds_raw, _ = reconstruct_interactions(
        lambda I: (train(
            model,
            params,
            [
                (features, ranking, I[indices[idx][0] : indices[idx][1]])
                for idx, (features, ranking, _) in enumerate(grouped_train_data)
            ],
            local_lr,
        ) - params) / local_lr,
        target,
        indices[-1][1],
        lr=atk_lr,
        max_iter=max_iter,
        num_rounds=num_atk,
        return_raw=True,
    )
    preds = preds_raw.sigmoid().round().long()
    interactions = torch.cat([I for (_, _, I) in grouped_train_data])
    return (interactions, preds, preds_raw)


for num_query in num_query_per_user:
    print(f"Num query: {num_query}")
    for _ in tqdm(range(num_sim_round)):
        for qids in tqdm(
            grouper(data.get_all_query_ids(), num_query, incomplete="ignore"),
            total=len(data.get_all_query_ids()) // num_query,
        ):
            grouped_data = data.get_data_for_queries(list(qids))

            for model_name, model in models.items():
                for click_model_name, click_model in click_models.items():
                    for epsilon in epsilons:
                        interactions, preds, preds_raw = simulate_attack(
                            model, grouped_data, click_model, epsilon
                        )
                        metrics.update(
                            f"{model_name}_{click_model_name}_{num_query}_query_eps_{epsilon}",
                            interactions,
                            preds,
                            preds_raw=preds_raw,
                        )

                        # Random guess
                        random_preds_raw = torch.rand(preds_raw.shape)
                        random_preds = random_preds_raw.round()
                        metrics.update(
                            f"random_{click_model_name}_{num_query}_query_eps_{epsilon}",
                            interactions,
                            random_preds,
                            preds_raw=random_preds_raw,
                        )

print(metrics.df[["name", "auc", "auc-pr"]].groupby("name").describe().to_string())

In [None]:
dp = metrics.df[["name", "auc", "auc-pr"]].groupby("name").describe()
dp["eps"] = dp.index.to_series().apply(lambda name: float(name.split("_")[name.split("_").index("eps") + 1]))
dp["model"] = dp.index.to_series().apply(lambda name: "_".join(name.split("_")[:name.split("_").index("eps")]))
dp = dp.reset_index()

fig, ax = plt.subplots(ncols=2)

for model in dp[("model", "")].unique().tolist():
    df = dp[dp[("model", "")] == model].sort_values(by=["eps"])
    ax[0].plot(df[("eps", "")].astype(str), df[("auc", "mean")], 'o-', label=model)
    ax[1].plot(df[("eps", "")].astype(str), df[("auc-pr", "mean")], 'o-', label=model)
    ax[0].set_xticks(df[("eps", "")].astype(str))
    ax[1].set_xticks(df[("eps", "")].astype(str))

ax[0].set_ylabel("auc")
ax[1].set_ylabel("auc-pr")

ax[0].legend()
fig.set_figwidth(10)
fig.tight_layout()
plt.show()

In [None]:
for epsilon in epsilons:
    for num_query in num_query_per_user:
        print(f"Epsilon {epsilon}, num query {num_query}")
        for click_model in click_models.keys():
            print(f"{click_model} AUC p-value:", ks_2samp(
                metrics.df[metrics.df["name"] == f"linear_pdgd_{click_model}_{num_query}_query_eps_{epsilon}"].loc[:, "auc"],
                metrics.df[metrics.df["name"] == f"random_{click_model}_{num_query}_query_eps_{epsilon}"].loc[:, "auc"],
            ).pvalue)
            print(f"{click_model} AUC-PR p-value:", ks_2samp(
                metrics.df[metrics.df["name"] == f"linear_pdgd_{click_model}_{num_query}_query_eps_{epsilon}"].loc[:, "auc-pr"],
                metrics.df[metrics.df["name"] == f"random_{click_model}_{num_query}_query_eps_{epsilon}"].loc[:, "auc-pr"],
            ).pvalue)

In [None]:
# Collaborative Filtering

set_seed()

data = MovieLens("../dataset/ML-100K/u.data")
user_ids = data.get_all_user_ids()
item_ids = data.get_all_item_ids()
user_id_to_idx = {id: idx for idx, id in enumerate(user_ids)}
item_id_to_idx = {id: idx for idx, id in enumerate(item_ids)}
num_users = len(user_ids)
num_items = len(item_ids)
embedding_dim = 64
neg_sample_ratio = 4

local_lr = 1e-01

num_sim_round = 1
atk_lr = 1e-01
max_iter = 1000
num_atk = 5

metrics = Metrics()

for _ in tqdm(range(num_sim_round)):
    user_embeddings = nn.Embedding(num_users, embedding_dim)
    item_embeddings = nn.Embedding(num_items, embedding_dim)

    fcf = CollaborativeFilteringRecommender()
    fncf = NeuralCollaborativeFilteringRecommender(embedding_dim, [16, 8])

    for user_id in tqdm(user_ids):
        # Set up training data
        interacted_items = data.get_item_ids_for_users([user_id])[0]
        non_interacted_items = data.get_non_interacted_item_ids_for_users([user_id])[0]

        num_pos = len(interacted_items)
        sampled_non_interacted_items = random.sample(
            non_interacted_items,
            min(num_pos * neg_sample_ratio, len(non_interacted_items)),
        )
        num_neg = len(sampled_non_interacted_items)
        num_data = num_pos + num_neg

        user_embedding = (
            user_embeddings(torch.LongTensor([user_id_to_idx[user_id]]))
            .detach()
            .view(-1)
        )
        item_embedding = item_embeddings(
            torch.cat(
                [
                    torch.LongTensor([item_id_to_idx[id] for id in interacted_items]),
                    torch.LongTensor(
                        [item_id_to_idx[id] for id in sampled_non_interacted_items]
                    ),
                ]
            )
        ).detach()
        interactions = torch.cat([torch.ones(num_pos), torch.zeros(num_neg)])
        random_user_emb = torch.rand(embedding_dim)

        # FCF Simple
        target = fcf.item_grad(user_embedding, item_embedding, interactions)

        preds_raw, _ = reconstruct_interactions(
            lambda I: fcf.item_grad(random_user_emb, item_embedding, I),
            target,
            num_data,
            lr=atk_lr,
            max_iter=max_iter,
            num_rounds=num_atk,
            return_raw=True,
        )
        preds = preds_raw.sigmoid().round().long()

        metrics.update(
            f"FCF_emb_{embedding_dim}_simple",
            interactions,
            preds,
            preds_raw=preds_raw,
        )

        # FCF jointly estimate user embedding
        preds_raw, user_embedding_est, _ = reconstruct_interactions(
            lambda I, U: fcf.item_grad(U, item_embedding, I),
            target,
            num_data,
            private_params_size=embedding_dim,
            lr=atk_lr,
            max_iter=max_iter,
            num_rounds=num_atk,
            return_raw=True,
        )
        preds = preds_raw.sigmoid().round().long()
        embedding_err = F.mse_loss(user_embedding_est, user_embedding).item()

        metrics.update(
            f"FCF_emb_{embedding_dim}_joint",
            interactions,
            preds,
            preds_raw=preds_raw,
            extra_data={"embedding_err": embedding_err},
        )

        # FNCF setup
        target = fncf.item_grad(user_embedding, item_embedding, interactions)
        mean_norm = torch.linalg.vector_norm(target, dim=1).mean()
        norm_scale = max(torch.Tensor([1.0]), torch.Tensor([1e+02]) / mean_norm)
        custom_loss = lambda e1, e2: (e1 - e2).pow(2).sum(dim=1).sqrt().mean() * norm_scale

        # FNCF simple
        preds_raw, _ = reconstruct_interactions(
            lambda I: fncf.item_grad(random_user_emb, item_embedding, I),
            target,
            num_data,
            lr=atk_lr,
            max_iter=max_iter,
            num_rounds=num_atk,
            loss_func=custom_loss,
            return_raw=True,
        )
        preds = preds_raw.sigmoid().round().long()

        metrics.update(
            f"FNCF_emb_{embedding_dim}_simple",
            interactions,
            preds,
            preds_raw=preds_raw,
        )

        # FNCF jointly estimate user embedding
        preds_raw, user_embedding_est, _ = reconstruct_interactions(
            lambda I, U: fncf.item_grad(U, item_embedding, I),
            target,
            num_data,
            private_params_size=embedding_dim,
            lr=atk_lr,
            max_iter=max_iter,
            num_rounds=num_atk,
            loss_func=custom_loss,
            return_raw=True,
        )
        preds = preds_raw.sigmoid().round().long()
        embedding_err = F.mse_loss(user_embedding_est, user_embedding).item()

        metrics.update(
            f"FNCF_emb_{embedding_dim}_joint",
            interactions,
            preds,
            preds_raw=preds_raw,
            extra_data={"embedding_err": embedding_err},
        )

        # FNCF jointly estimate user embedding with neural net params
        feature_grad = fncf.feature_grad(user_embedding, item_embedding, interactions)

        preds_raw, user_embedding_est, _ = reconstruct_interactions(
            lambda I, U: (
                fncf.item_grad(U, item_embedding, I),
                fncf.feature_grad(U, item_embedding, I, retain_graph=True),
            ),
            (target, feature_grad),
            num_data,
            private_params_size=embedding_dim,
            lr=atk_lr,
            max_iter=max_iter,
            num_rounds=num_atk,
            loss_func=lambda t1, t2: custom_loss(t1[0], t2[0]) + F.mse_loss(t1[1], t2[1]),
            return_raw=True,
        )
        preds = preds_raw.sigmoid().round().long()
        embedding_err = F.mse_loss(user_embedding_est, user_embedding).item()

        metrics.update(
            f"FNCF_emb_{embedding_dim}_joint_model",
            interactions,
            preds,
            preds_raw=preds_raw,
            extra_data={"embedding_err": embedding_err},
        )

        # FNCF simple with neural net params
        feature_grad = fncf.feature_grad(user_embedding, item_embedding, interactions)

        preds_raw, _ = reconstruct_interactions(
            lambda I: (
                fncf.item_grad(random_user_emb, item_embedding, I),
                fncf.feature_grad(random_user_emb, item_embedding, I, retain_graph=True),
            ),
            (target, feature_grad),
            num_data,
            lr=atk_lr,
            max_iter=max_iter,
            num_rounds=num_atk,
            loss_func=lambda t1, t2: custom_loss(t1[0], t2[0]) + F.mse_loss(t1[1], t2[1]),
            return_raw=True,
        )
        preds = preds_raw.sigmoid().round().long()

        metrics.update(
            f"FNCF_emb_{embedding_dim}_simple_model",
            interactions,
            preds,
            preds_raw=preds_raw,
        )

        # Random guess
        preds_raw = 2 * torch.rand(num_data) - 1
        metrics.update(
            f"Random_emb_{embedding_dim}",
            interactions,
            preds_raw.sigmoid().round().long(),
            preds_raw=preds_raw,
        )

        # IMIA FCF
        # target = fcf.item_grad(user_embedding, item_embedding, interactions)
        # preds = interaction_mia_fedrec(
        #     lambda I: fcf.item_grad(random_user_emb, item_embedding, I),
        #     target,
        #     num_data,
        #     select_ratio=interactions.mean(),
        # )

        # metrics.update(
        #     "FCF_IMIA",
        #     interactions,
        #     preds,
        # )

print(metrics.df[["name", "auc", "auc-pr"]].groupby("name").describe().to_string())

In [None]:
print(metrics.df[["name", "auc", "auc-pr"]].groupby("name").describe().to_string())
pairs = [("joint", "simple"), ("joint_model", "joint"), ("simple_model", "simple"), ("joint", "simple_model")]
for model1, model2 in pairs:
    print(f"FNCF {model1} vs FNCF {model2} AUC p-value:", ks_2samp(
        metrics.df[metrics.df["name"] == f"FNCF_emb_{embedding_dim}_{model1}"].loc[:, "auc"],
        metrics.df[metrics.df["name"] == f"FNCF_emb_{embedding_dim}_{model2}"].loc[:, "auc"],
        alternative="less",
    ).pvalue)
    print(f"FNCF {model1} vs FNCF {model2} AUC-PR p-value:", ks_2samp(
        metrics.df[metrics.df["name"] == f"FNCF_emb_{embedding_dim}_{model1}"].loc[:, "auc-pr"],
        metrics.df[metrics.df["name"] == f"FNCF_emb_{embedding_dim}_{model2}"].loc[:, "auc-pr"],
        alternative="less",
    ).pvalue)