# Data Minimization: Inference Black-Box Attack on the Nursery Dataset

## Load data

In [None]:
# Add the directory containing shamir.py to the PYTHONPATH if needed.
import os
import sys
sys.path.insert(0, os.path.abspath('.'))

# Import the secret sharing module
from shamir import ShamirSecretSharingWrapper

# Import your minimizer and dataset utilities.
# (Assuming GeneralizeToRepresentative is defined in your minimizer module.)
from apt.minimization import GeneralizeToRepresentative
from apt.utils.dataset_utils import get_nursery_dataset_pd
import pandas as pd
import numpy as np

# ---------------------------
# Load the Nursery Dataset
# ---------------------------
# The loader's result is unpacked as two (features, labels) pairs: one for
# the training partition and one for the test partition. The
# transform_social=True flag is forwarded to the loader as-is.
train_split, test_split = get_nursery_dataset_pd(transform_social=True)
x_train, y_train = train_split
x_test, y_test = test_split

## Train: XGBoost Model

In [None]:
from apt.utils.datasets import ArrayDataset
from apt.utils.models import SklearnClassifier, ModelOutputType
from xgboost import XGBClassifier

# Build the XGBoost classifier with the label encoder switched off and
# log-loss selected as the evaluation metric.
base_est = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Wrap the estimator, declaring that its output type is class probabilities.
model = SklearnClassifier(base_est, ModelOutputType.CLASSIFIER_PROBABILITIES)

# Train on the training features and labels, packaged as a single dataset.
train_dataset = ArrayDataset(x_train, y_train)
model.fit(train_dataset)

# Score the fitted model on the test partition and report the result.
test_dataset = ArrayDataset(x_test, y_test)
base_accuracy = model.score(test_dataset)
print('Base model accuracy:', base_accuracy)

## Train: XGBoost Minimization

In [None]:
from apt.minimization import GeneralizeToRepresentative
from sklearn.model_selection import train_test_split

# The minimizer is constructed from the model alone; no feature subset is
# passed, so all columns are used in this example.
minimizer = GeneralizeToRepresentative(model)

# Split the current test data (stratified on the labels): the larger part is
# used to fit the minimizer, the remaining 40% is kept for evaluation.
# NOTE: this rebinds x_test / y_test to the smaller part, so executing this
# cell again would split the already-reduced data a second time.
X_generalizer_train, x_test, y_generalizer_train, y_test = train_test_split(
    x_test,
    y_test,
    test_size=0.4,
    stratify=y_test,
    random_state=38,
)

# The minimizer is fitted against the model's own predictions rather than the
# true labels.
x_train_predictions = model.predict(ArrayDataset(X_generalizer_train))
n_outputs = x_train_predictions.shape[1]
if n_outputs > 1:
    # Several output columns: keep the index of the largest one per record.
    x_train_predictions = np.argmax(x_train_predictions, axis=1)

generalizer_dataset = ArrayDataset(X_generalizer_train, x_train_predictions)
minimizer.fit(dataset=generalizer_dataset)

# Apply the learned generalizations to the held-out records.
transformed = minimizer.transform(dataset=ArrayDataset(x_test))

minimized_accuracy = model.score(ArrayDataset(transformed, y_test))
print('Accuracy on minimized data: ', minimized_accuracy)
print('generalizations: ', minimizer.generalizations_)

## Shamir Secret Sharing: Application

In [None]:
# Choose sensitive features (for example, the first two columns).
sensitive_features = [x_train.columns[0], x_train.columns[1]]

# Keep the sharing parameters in one place so the reconstruction demo below
# cannot drift out of sync with how the shares were created.
N_SHARES = 5
THRESHOLD = 3
sss = ShamirSecretSharingWrapper(n_shares=N_SHARES, threshold=THRESHOLD)

# Split the minimized records. `transformed` is the output of
# minimizer.transform() from the minimization cell; the name previously used
# here (`generalized_data`) was never defined and raised a NameError.
shares = sss.split_dataframe(transformed, sensitive_features)

# Display the secret shares for one sensitive column.
first_feature = sensitive_features[0]
print("Secret Shares for sensitive feature:", first_feature)
print(shares[first_feature].head())

# Demonstrate reconstruction: rebuild the secret for the first record.
first_record_shares = shares[first_feature].iloc[0].tolist()
# Re-create (x, share) tuples with 1-based x-indices.
# NOTE(review): assumes the i-th stored share was evaluated at x = i + 1 --
# confirm against shamir.py.
share_tuples = list(enumerate(first_record_shares, start=1))
# Use the first THRESHOLD shares for the reconstruction.
reconstructed_value = sss.reconstruct_value(share_tuples[:THRESHOLD])
print("Reconstructed value for first record, {}: {}".format(first_feature, reconstructed_value))

# ---------------------------
# Evaluate Model Accuracy
# ---------------------------
# Compute accuracy on the generalized (minimized) data, i.e. the output of
# minimizer.transform() stored in `transformed`. (The name `generalized_data`
# used here before was never defined and raised a NameError.)
# Compare against None explicitly: the truthiness of an encoder object is not
# a reliable "is an encoder configured" test.
if minimizer.encoder is not None:
    data_for_scoring = minimizer.encoder.transform(transformed)
else:
    data_for_scoring = transformed
# Pass features and labels wrapped in a single ArrayDataset, matching the
# model.score(ArrayDataset(...)) calls used elsewhere in this notebook.
# NOTE(review): assumes minimizer.estimator exposes the same score() interface
# as the `model` wrapper handed to the minimizer -- confirm.
model_accuracy = minimizer.estimator.score(ArrayDataset(data_for_scoring, y_test))
print("Model accuracy on minimized data:", model_accuracy)

# ---------------------------
# Membership Inference Attack Metrics using IBM ART
# ---------------------------
# Install the ART library if not already installed:
# !pip install adversarial-robustness-toolbox

from art.attacks.inference.membership_inference import MembershipInferenceBlackBox
# Import ART's wrapper under an alias so it does not rebind the
# `SklearnClassifier` name imported from apt.utils.models earlier in the
# notebook.
from art.estimators.classification import SklearnClassifier as ArtSklearnClassifier
from sklearn.metrics import accuracy_score

# Wrap the underlying scikit-learn classifier used by your minimizer.
# (Assuming minimizer.estimator._model holds the trained sklearn model.)
art_classifier = ArtSklearnClassifier(model=minimizer.estimator._model)

# Create a membership inference attack (black-box, random forest attack model).
attack = MembershipInferenceBlackBox(art_classifier, attack_model_type='rf')

# Members are the records the target model was fitted on (x_train);
# non-members are the held-out records (x_test). The attack also needs the
# true class label of every record.
member_classes = np.asarray(y_train)
non_member_classes = np.asarray(y_test)

# The attack model has to be fitted before it can infer anything. Use the
# first half of each group for fitting and keep the second half for
# evaluation, so the reported accuracy is measured on records the attack
# model has not seen.
member_cut = len(x_train) // 2
non_member_cut = len(x_test) // 2
attack.fit(
    x_train.iloc[:member_cut].to_numpy(),
    member_classes[:member_cut],
    x_test.iloc[:non_member_cut].to_numpy(),
    non_member_classes[:non_member_cut],
)

# Prepare the evaluation data:
# label training samples as members (1) and test samples as non-members (0).
membership_data = pd.concat([x_train.iloc[member_cut:], x_test.iloc[non_member_cut:]])
membership_classes = np.concatenate(
    [member_classes[member_cut:], non_member_classes[non_member_cut:]]
)
membership_labels = np.concatenate([
    np.ones(len(x_train) - member_cut),
    np.zeros(len(x_test) - non_member_cut),
])

# Run the attack. infer() requires the true class labels alongside the
# features; per the ART documentation it returns hard 0/1 membership
# decisions unless probabilities are explicitly requested.
attack_memberships = attack.infer(membership_data.to_numpy(), membership_classes)

# Flatten (the result may be a column vector) and threshold at 0.5, which is
# correct for both hard decisions and probability scores.
attack_pred = (np.ravel(attack_memberships) > 0.5).astype(int)

attack_accuracy = accuracy_score(membership_labels, attack_pred)
print("Membership inference attack accuracy:", attack_accuracy)