In [6]:
!pip install diffprivlib
!pip install torchopt

import functorch
import math
import numpy as np
import random
import torch
import torchopt
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import warnings
import json
import os.path
import pandas as pd
import itertools
import traceback
from torch.func import grad, vmap
from tqdm import tqdm
from typing import Dict, List, Tuple
from sklearn.preprocessing import StandardScaler

from diffprivlib.mechanisms import (
    Gaussian,
    GaussianAnalytic,
)
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from tqdm.notebook import tqdm

warnings.filterwarnings("ignore", message='.*make_functional.*')

class RecommendationDataset:
    """In-memory lookup of user -> {item id: rating} built from a DataFrame.

    Attributes are populated once in ``__init__``:

    * ``user_ids`` / ``item_ids``: sorted lists of the unique ids found in
      the respective columns.
    * ``user_to_item_rating_map``: a pandas Series indexed by user id whose
      values are dicts mapping item id to rating.
    """

    def __init__(self, df, user_id_col, item_id_col, rating_col):
        """Index ``df`` by user.

        Args:
            df: DataFrame holding one row per (user, item) rating.
            user_id_col: Name of the column holding user ids.
            item_id_col: Name of the column holding item ids.
            rating_col: Name of the column holding ratings.
        """
        self.user_ids = np.sort(df[user_id_col].unique()).tolist()
        self.item_ids = np.sort(df[item_id_col].unique()).tolist()
        # Only the item and rating columns are needed inside each group.
        # Selecting them explicitly keeps ``apply`` from operating on the
        # grouping column, which newer pandas versions deprecate.
        self.user_to_item_rating_map = df.groupby(user_id_col)[
            [item_id_col, rating_col]
        ].apply(lambda x: dict(zip(x[item_id_col], x[rating_col])))

    def get_all_user_ids(self) -> List:
        """Return the sorted list of unique user ids."""
        return self.user_ids

    def get_all_item_ids(self) -> List:
        """Return the sorted list of unique item ids."""
        return self.item_ids

    def get_item_ids_for_users(self, user_ids: List) -> List[List]:
        """Return, for each requested user, the list of item ids they rated."""
        return (
            self.user_to_item_rating_map.loc[user_ids]
            .apply(lambda x: list(x.keys()))
            .tolist()
        )

    def get_non_interacted_item_ids_for_users(self, user_ids: List) -> List[List]:
        """Return, for each requested user, the item ids they did NOT rate.

        The order of ids inside each inner list is the iteration order of a
        set difference and should not be relied upon.
        """
        # Build the full item set once instead of once per requested user.
        all_item_ids = set(self.get_all_item_ids())
        return (
            self.user_to_item_rating_map.loc[user_ids]
            .apply(lambda x: list(all_item_ids - set(x.keys())))
            .tolist()
        )

    def get_item_ratings_for_users(self, user_ids: List) -> List[Dict]:
        """Return, for each requested user, a dict from item id to rating."""
        return self.user_to_item_rating_map.loc[user_ids].tolist()


class MovieLens(RecommendationDataset):
    """Recommendation dataset read from a header-less delimited ratings file.

    The file is parsed with the column names ``user_id``, ``item_id`` and
    ``rating`` and handed to ``RecommendationDataset`` unchanged.
    """

    def __init__(self, path="../dataset/ML-100K/u.data", sep="\t"):
        """Read the ratings file at ``path`` using ``sep`` as the delimiter."""
        columns = ("user_id", "item_id", "rating")
        ratings = pd.read_csv(
            path,
            sep=sep,
            header=None,
            names=list(columns),
            usecols=list(columns),
        )
        user_col, item_col, rating_col = columns
        super().__init__(ratings, user_col, item_col, rating_col)


class Steam200K(RecommendationDataset):
    """Recommendation dataset read from a header-less CSV of user/item events.

    Every distinct (user, item) pair is kept once and given a rating of 1;
    item identifiers are replaced by integer codes from ``pd.factorize``.
    """

    def __init__(self, path="../dataset/STEAM-200K/steam-200k.csv"):
        """Read the CSV at ``path`` and turn it into implicit-feedback ratings."""
        columns = ("user_id", "item_id", "behavior")
        events = pd.read_csv(
            path,
            header=None,
            names=list(columns),
            usecols=list(columns),
        )
        # Keep a single row per (user, item) pair regardless of behavior.
        events = events.drop_duplicates(
            subset=["user_id", "item_id"], ignore_index=True
        )
        item_codes, _ = pd.factorize(events["item_id"])
        events["item_id"] = item_codes
        # Any recorded event counts as a positive interaction.
        events["rating"] = 1
        super().__init__(events, "user_id", "item_id", "rating")

def reconstruct_interactions(
    trainer,
    target_params,
    num_items,
    private_params_size=0,
    loss_fn=F.mse_loss,
    num_rounds=1,
    return_raw=False,
    prior_penalty=None,
    device=torch.device("cpu"),
    **kwargs,
):
    """Search for the interaction vector whose training output matches a target.

    A free parameter vector of length ``num_items + private_params_size`` is
    initialised uniformly in [-1, 1) and optimised with L-BFGS (strong-Wolfe
    line search). The first ``num_items`` entries are passed through a sigmoid
    to form candidate interactions; the remaining entries, if any, are handed
    to ``trainer`` untouched as "private" parameters.

    Args:
        trainer: Callable mapping candidate interactions (and, when
            ``private_params_size > 0``, the private parameters) to the
            quantity that is compared against ``target_params``. It must be
            differentiable with respect to its inputs.
        target_params: Observed value the trainer output should reproduce.
        num_items: Number of interaction entries to reconstruct.
        private_params_size: Number of extra free parameters optimised jointly.
        loss_fn: Callable ``loss_fn(trainer_output, target_params)`` returning
            a scalar tensor.
        num_rounds: Number of independent random restarts; the restart with
            the lowest recorded loss wins.
        return_raw: If true, return the pre-sigmoid values instead of the
            rounded 0/1 predictions.
        prior_penalty: Optional callable applied to the candidate interactions
            whose result is added to the loss. Defaults to no penalty.
        device: Device on which the optimised parameters are created.
        **kwargs: Forwarded to ``torch.optim.LBFGS`` (e.g. ``lr``, ``max_iter``).

    Returns:
        ``(interactions, best_loss)`` when ``private_params_size == 0``,
        otherwise ``(interactions, private_params, best_loss)``.
        ``interactions`` holds the raw pre-sigmoid values if ``return_raw`` is
        true and a long tensor of 0/1 predictions otherwise. If every round
        fails, a uniformly random vector is returned and ``best_loss`` is
        ``math.inf``.
    """
    if prior_penalty is None:
        prior_penalty = lambda _: 0.0

    total_size = num_items + private_params_size
    best_loss = math.inf
    best_opt_params = None

    for _ in range(num_rounds):
        opt_params = nn.Parameter(torch.rand(total_size, device=device) * 2 - 1)
        optimizer = optim.LBFGS([opt_params], line_search_fn="strong_wolfe", **kwargs)

        def calc_loss():
            # Closure re-evaluated by L-BFGS on every function evaluation.
            optimizer.zero_grad()
            interactions = opt_params[:num_items].sigmoid()
            shadow_params = (
                trainer(interactions)
                if private_params_size == 0
                else trainer(interactions, opt_params[num_items:])
            )
            loss = loss_fn(shadow_params, target_params) + prior_penalty(interactions)
            # Restrict backprop to the attack variables only.
            loss.backward(inputs=[opt_params])
            return loss

        try:
            optimizer.step(calc_loss)
        except Exception:
            # Best effort: a failed restart is reported and skipped so the
            # remaining restarts still get a chance.
            print("An exception occurred in the optimization step!")
            traceback.print_exc()
            continue

        # L-BFGS keeps its bookkeeping under the (single) optimised parameter;
        # "prev_loss" is only present once at least one iteration was taken.
        optimizer_state = optimizer.state[opt_params]
        if "prev_loss" not in optimizer_state:
            print("Optimization did not take any step!")
            continue
        cur_loss = optimizer_state["prev_loss"]

        # A NaN loss compares false here and is therefore never selected.
        if cur_loss < best_loss:
            best_loss = cur_loss
            best_opt_params = opt_params.detach()

    if best_opt_params is None:
        print("Optimization failed! Defaulting to random guessing.")
        # Create the fallback on the requested device so the result lives on
        # the same device as a successfully optimised one would.
        best_opt_params = torch.rand(total_size, device=device) * 2 - 1

    if private_params_size == 0:
        if return_raw:
            return best_opt_params, best_loss
        return best_opt_params.sigmoid().round().long(), best_loss

    raw_interactions = best_opt_params[:num_items]
    private_params = best_opt_params[num_items:]
    if return_raw:
        return raw_interactions, private_params, best_loss
    return raw_interactions.sigmoid().round().long(), private_params, best_loss

class NeuralCollaborativeFilteringRecommender(nn.Module):
    """MLP that scores (user, item) embedding pairs with a sigmoid output.

    The network is ``Linear(2 * embedding_size -> hidden_sizes[0])`` followed
    by one ``Linear`` per consecutive pair in ``hidden_sizes`` and a final
    ``Linear(hidden_sizes[-1] -> 1)``; every layer but the last is followed
    by ReLU.
    """

    def __init__(self, embedding_size, hidden_sizes, bias=True):
        super().__init__()
        # Layers are created in forward order so parameter initialisation and
        # parameter ordering stay the same.
        self.first_layer = nn.Linear(embedding_size * 2, hidden_sizes[0], bias=bias)
        self.fc_layers = nn.ModuleList(
            [
                nn.Linear(width_in, width_out, bias=bias)
                for width_in, width_out in zip(hidden_sizes[:-1], hidden_sizes[1:])
            ]
        )
        self.final_layer = nn.Linear(hidden_sizes[-1], 1, bias=bias)

    def forward(self, user_embedding, item_embeddings):
        """Return one score in (0, 1) per row of ``item_embeddings``.

        ``user_embedding`` is expanded to one copy per item row and
        concatenated with the item embeddings along dim 1.
        """
        num_rows = item_embeddings.shape[0]
        tiled_user = user_embedding.expand(num_rows, -1)
        hidden = torch.cat([tiled_user, item_embeddings], dim=1)
        hidden = F.relu(self.first_layer(hidden))
        for hidden_layer in self.fc_layers:
            hidden = F.relu(hidden_layer(hidden))
        return torch.sigmoid(self.final_layer(hidden))

    def _interaction_loss(self, user_embedding, item_embeddings, interactions):
        """Binary cross-entropy between flattened predictions and ``interactions``."""
        scores = self.forward(user_embedding, item_embeddings)
        return F.binary_cross_entropy(scores.view(-1), interactions)

    def item_grad(self, user_embedding, item_embeddings, interactions, create_graph=False):
        """Return the gradient of the BCE loss w.r.t. ``item_embeddings``.

        Switches the module to eval mode and clears ``item_embeddings.grad``
        before computing the loss.
        """
        self.eval()
        item_embeddings.grad = None
        loss = self._interaction_loss(user_embedding, item_embeddings, interactions)
        (gradient,) = autograd.grad(loss, item_embeddings, create_graph=create_graph)
        return gradient

    def feature_grad(self, user_embedding, item_embeddings, interactions, create_graph=False):
        """Return the BCE-loss gradient w.r.t. all module parameters, flattened.

        Switches the module to train mode, clears every parameter's ``.grad``
        and concatenates the per-parameter gradients in ``self.parameters()``
        order.
        """
        self.train()
        weights = list(self.parameters())
        for weight in weights:
            weight.grad = None
        loss = self._interaction_loss(user_embedding, item_embeddings, interactions)
        per_param = autograd.grad(loss, weights, create_graph=create_graph)
        return torch.cat([g.flatten() for g in per_param])

class Metrics:
    """Accumulates per-run classification metrics in a pandas DataFrame.

    Each call to ``update`` appends one row with the columns ``name``,
    ``accuracy``, ``f1``, ``precision``, ``recall``, ``auc``, ``auc-pr`` and
    ``extra_data`` (a JSON string).
    """

    def __init__(self, path=None):
        """Start from the CSV at ``path`` if given, otherwise from an empty table."""
        if path is not None:
            self.df = pd.read_csv(path)
        else:
            self.df = pd.DataFrame(
                {
                    "name": [],
                    "accuracy": [],
                    "f1": [],
                    "precision": [],
                    "recall": [],
                    "auc": [],
                    "auc-pr": [],
                    "extra_data": [],
                }
            )

    def update(self, name, target, preds, preds_raw=None, extra_data=None):
        """Compute metrics for one run and append them as a new row.

        Args:
            name: Label stored in the ``name`` column (used for grouping in
                ``print_summary``).
            target: Ground-truth binary labels.
            preds: Hard (0/1) predictions.
            preds_raw: Optional continuous scores; when omitted, ``auc`` and
                ``auc-pr`` are stored as ``None``.
            extra_data: Optional JSON-serialisable object stored alongside the
                metrics. Defaults to an empty dict.
        """
        # ``None`` sentinel instead of a shared mutable default argument.
        if extra_data is None:
            extra_data = {}
        row = {
            "name": name,
            "accuracy": accuracy_score(target, preds),
            # zero_division=0 on all three ratio metrics: an undefined ratio is
            # scored as 0 consistently and without a warning.
            "f1": f1_score(target, preds, zero_division=0),
            "precision": precision_score(target, preds, zero_division=0),
            "recall": recall_score(target, preds, zero_division=0),
            "auc": None if preds_raw is None else roc_auc_score(target, preds_raw),
            "auc-pr": None
            if preds_raw is None
            else average_precision_score(target, preds_raw),
            "extra_data": json.dumps(extra_data),
        }
        self.df.loc[len(self.df.index), :] = row

    def get_dataframe(self):
        """Return the underlying DataFrame (not a copy)."""
        return self.df

    def save(self, path):
        """Write the table to ``path`` as CSV without the index column."""
        self.df.to_csv(path, index=False)

    def load(self, path):
        """Replace the current table with the CSV at ``path``."""
        self.df = pd.read_csv(path)

    def print_summary(self, metrics=None):
        """Print ``describe()`` statistics of the chosen metric columns per name.

        Args:
            metrics: List of metric column names to summarise. Defaults to
                ``["auc"]``.
        """
        if metrics is None:
            metrics = ["auc"]
        columns = ["name"] + list(metrics)
        print(self.df[columns].groupby("name").describe().to_string())

def apply_gaussian_mechanism(input, epsilon, delta, sensitivity, scale_only=False):
    """Clip ``input`` in L2 norm and perturb it with Gaussian noise.

    Args:
        input: Tensor to protect. It is never modified in place.
        epsilon: Privacy budget; ``math.inf`` disables the mechanism and the
            input is returned untouched (not even clipped).
        delta: Delta parameter passed to the noise mechanism.
        sensitivity: L2 sensitivity passed to the noise mechanism; the tensor
            is clipped so that its overall L2 norm is at most half of it.
        scale_only: If true, return the clipped tensor without adding noise.

    Returns:
        ``input`` itself when ``epsilon`` is infinite, otherwise a new tensor
        holding the clipped (and, unless ``scale_only``, noised) values.
    """
    if math.isinf(epsilon):
        return input

    # Two vectors of norm <= c differ by at most 2c, so clipping the norm to
    # half the sensitivity bounds the global L2 sensitivity by `sensitivity`.
    total_norm = torch.linalg.vector_norm(input)
    clip_factor = torch.minimum(torch.tensor(1.0), 0.5 * sensitivity / total_norm)
    clipped = input * clip_factor
    if scale_only:
        return clipped

    # The plain Gaussian mechanism is used for epsilon <= 1, the analytic
    # variant for larger budgets.
    if epsilon <= 1.0:
        mechanism_cls = Gaussian
    else:
        mechanism_cls = GaussianAnalytic
    mechanism = mechanism_cls(epsilon=epsilon, delta=delta, sensitivity=sensitivity)
    # Noise is added element by element, in place on the clipped copy.
    return clipped.apply_(mechanism.randomise)

In [None]:
# Load the dataset: pick either ML-100K or STEAM-200K.

data = MovieLens("./u.data")
# data = Steam200K("../dataset/STEAM-200K/steam-200k.csv")

user_ids = data.get_all_user_ids()
item_ids = data.get_all_item_ids()
num_users, num_items = len(user_ids), len(item_ids)

# Position of every id inside the sorted id lists; used as embedding row index.
user_id_to_idx = {uid: row for row, uid in enumerate(user_ids)}
item_id_to_idx = {iid: row for row, iid in enumerate(item_ids)}

# Scaled down for artifact evaluation: only the first 30 users are attacked.
# Comment this out to run on the entire dataset.
user_ids = user_ids[:30]

In [None]:
# Main results in Table V and VI of Section VI.A as well as DP results in Table IX of Section VII.A

def set_seed(seed=2023):
    """Seed the torch, Python ``random`` and NumPy generators with ``seed``."""
    for seed_fn in (torch.manual_seed, random.seed, np.random.seed):
        seed_fn(seed)
# Seed all RNGs before anything random is created so that the embedding and
# model initialisation below is reproducible.
set_seed()

# Initialize embeddings and model
embedding_dim = 64
# Number of non-interacted items sampled per interacted item for each user
# (capped by how many non-interacted items the user actually has).
neg_sample_ratio = 4
user_embeddings = nn.Embedding(num_users, embedding_dim)
item_embeddings = nn.Embedding(num_items, embedding_dim)
# MLP scorer with hidden layer widths 128 -> 64 -> 32.
fncf = NeuralCollaborativeFilteringRecommender(embedding_dim, [128, 64, 32])

# Reconstruction attack parameters
# atk_lr and max_iter are forwarded to the L-BFGS optimizer inside
# reconstruct_interactions; num_atk is its number of random restarts.
atk_lr = 1e-01
max_iter = 1000
num_atk = 1

# Differential privacy parameters
# math.inf means "no DP": apply_gaussian_mechanism returns its input unchanged.
epsilons = [1.0, 20.0, 100.0, 500.0, math.inf]
delta = 1e-08
sensitivity = 1e-01

# Local learning parameters
# Number of optimizer steps and learning rate used by train_fncf_functional.
local_epoch = 20
local_lr = 0.001
reg_factors = [0.0, 1.0] # IMIA defense mu, 0.0 means no defense

# Collects one row of attack metrics per (user, epsilon, reg_factor) run.
metrics = Metrics()

# Local training algorithm
def train_fncf_functional(model, user_embedding, item_embeddings, interactions, num_epoch, lr, reg_factor):
    """Run local training functionally and return the item-embedding update.

    The user embedding, the item embeddings and the model parameters are
    optimised jointly for ``num_epoch`` Adam steps on the binary cross-entropy
    between the model's predictions and ``interactions``, plus
    ``reg_factor`` times the L1 distance between the current and the initial
    item embeddings. The inputs themselves are not updated; new tensors are
    produced at every step.

    Returns:
        A pair ``(item_embeddings - trained_item_embeddings, trained_model_params)``.
    """
    user_embedding.grad = None
    item_embeddings.grad = None

    stateless_model, initial_weights = functorch.make_functional(model)
    # NOTE: use_accelerated_op=True would be faster, but it prevents the
    # reconstruction from working for an unknown reason (possibly a bug).
    # eps_root must be set.
    optimizer = torchopt.FuncOptimizer(torchopt.adam(lr=lr, eps_root=1e-08))

    # Layout of the trainable tuple: (user embedding, item embeddings, *model weights).
    trainable = (user_embedding, item_embeddings, *initial_weights)
    for _ in range(num_epoch):
        scores = stateless_model(trainable[2:], trainable[0], trainable[1])
        # IMIA defense: L1 penalty on the drift of the item embeddings.
        drift_penalty = reg_factor * F.l1_loss(trainable[1], item_embeddings)
        objective = F.binary_cross_entropy(scores.view(-1), interactions) + drift_penalty
        trainable = optimizer.step(objective, trainable)

    item_update = item_embeddings - trainable[1]
    return item_update, trainable[2:]

# Simulate attack on each user
# For every user: build their local training set, simulate one round of local
# training (optionally with the IMIA defense and DP noise), then let the
# attacker try to recover the interaction labels from the shared update.
for user_id in tqdm(user_ids):
    # Sample items and interactions
    interacted_items = data.get_item_ids_for_users([user_id])[0]
    non_interacted_items = data.get_non_interacted_item_ids_for_users([user_id])[0]

    # Draw neg_sample_ratio negatives per positive, or all negatives if the
    # user has fewer than that.
    num_pos = len(interacted_items)
    sampled_non_interacted_items = random.sample(
        non_interacted_items,
        min(num_pos * neg_sample_ratio, len(non_interacted_items)),
    )
    num_neg = len(sampled_non_interacted_items)
    num_data = num_pos + num_neg

    # Detached copies of the embeddings: the user's own row (flattened to 1-D)
    # and the rows of the positive items followed by the sampled negatives.
    user_embedding = (
        user_embeddings(torch.LongTensor([user_id_to_idx[user_id]]))
        .detach()
        .view(-1)
    )
    item_embedding = item_embeddings(
        torch.cat(
            [
                torch.LongTensor([item_id_to_idx[id] for id in interacted_items]),
                torch.LongTensor(
                    [item_id_to_idx[id] for id in sampled_non_interacted_items]
                ),
            ]
        )
    ).detach()
    user_embedding.requires_grad_()
    item_embedding.requires_grad_()
    # Labels follow the same ordering as item_embedding: positives first.
    interactions = torch.cat([torch.ones(num_pos), torch.zeros(num_neg)]) # Ground truth interactions
    random_user_emb = torch.rand(embedding_dim, requires_grad=True) # The server doesn't know the real user embedding

    for epsilon in epsilons:
        for reg_factor in reg_factors:
            # Local training
            # `target` is the item-embedding update the user would share; it is
            # clipped and noised according to the current epsilon.
            target, target_model_params = train_fncf_functional(fncf, user_embedding, item_embedding, interactions, local_epoch, local_lr, reg_factor)
            target = apply_gaussian_mechanism(target.detach(), epsilon, delta, sensitivity=sensitivity)

            # Attack
            # The loss is the mean row-wise distance between the simulated and
            # the observed update, multiplied by max(1, 100 / mean row norm of
            # the target). Both updates are divided by local_lr beforehand.
            mean_norm = torch.linalg.vector_norm(target, dim=1).mean()
            norm_scale = max(torch.Tensor([1.0]), torch.Tensor([1e+02]) / mean_norm)
            custom_loss = lambda e1, e2: F.pairwise_distance(e1, e2).mean() * norm_scale
            # The attacker re-runs the same local training with a random user
            # embedding and candidate interactions I, and fits I to the target.
            preds_raw, _ = reconstruct_interactions(
                lambda I: train_fncf_functional(fncf, random_user_emb, item_embedding, I, local_epoch, local_lr, reg_factor)[0] / local_lr,
                target / local_lr,
                num_data,
                lr=atk_lr,
                max_iter=max_iter,
                num_rounds=num_atk,
                loss_fn=custom_loss,
                return_raw=True,
            )
            # preds_raw holds pre-sigmoid values; threshold them at 0.5 after
            # the sigmoid to get hard 0/1 predictions.
            preds = preds_raw.sigmoid().round().long()

            # Update attack performance
            # One row per (user, epsilon, reg_factor); rows sharing a name are
            # aggregated across users by print_summary.
            metrics.update(
                f"FNCF_eps_{epsilon}_IMIA_{reg_factor}",
                interactions,
                preds,
                preds_raw=preds_raw,
            )

# Persist all rows and print per-configuration statistics of AUC and F1.
metrics.save("./rec_metrics.csv")
metrics.print_summary(["auc", "f1"])