# Training Notebook

In [2]:
import sys
import os

# Add the parent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from kyber.mlwe import MLWE
from ml_attack import check_secret, clean_secret, get_no_mod, LWEDataset, get_filename_from_params
from ml_attack.utils import get_lwe_default_params, get_reduction_default_params, get_continuous_reduction_default_params, get_default_params, get_b_distribution, get_percentage_true_b, get_true_mask, \
    get_vector_distribution
from ml_attack.train import LinearComplex, train_until_stall

import numpy as np
import torch
import torch.nn as nn
from scipy.stats import norm

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import HuberRegressor

  from .siever_params import SieverParams  # noqa


## Dataset creation

Training debug:

In [5]:
# Experiment configuration: library defaults plus the overrides for this run.
experiment_overrides = {
    'n': 128,
    'q': 3329,
    'secret_type': 'binary',

    'num_gen': 4,
    'seed': 0,

    'reduction_factor': 0.875,
    'reduction_resampling': True,

    'penalty': 4,
    'verbose': True,
    'save_to': './../data/'
}
params = get_default_params()
params.update(experiment_overrides)

# The name derived from the parameters is computed first ...
filename = get_filename_from_params(params)
# ... and then replaced by a fixed, previously reduced dataset file.
filename = "./../reduced_data/data_n_128_k_1_s_binary_f4818_48.pkl"

# Set `reload` to False to force regeneration even when the file exists.
reload = True
use_cached_dataset = reload and os.path.exists(filename)

if not use_cached_dataset:
    # Full pipeline: build the dataset, reduce it, approximate b, then persist it.
    print(f"Generating dataset and saving to {filename}")
    dataset = LWEDataset(params)
    dataset.initialize()
    dataset.reduction()
    dataset.approximate_b()
    dataset.save_reduced()
else:
    print(f"Loading dataset from {filename}")
    dataset = LWEDataset.load_reduced(filename)
    # From here on, use the parameters stored with the loaded dataset.
    params = dataset.params

Loading dataset from ./../reduced_data/data_n_128_k_1_s_binary_f4818_48.pkl


In [6]:
# Initialise the secret on the dataset object, then recompute the b approximation.
# NOTE(review): presumably needed because a dataset restored with load_reduced()
# does not carry this state — confirm against LWEDataset.
dataset.initialize_secret()
dataset.approximate_b()

In [7]:
# Report how often the best b candidate equals the true b over the whole dataset;
# with verbose=True a summary line is printed, and the fraction is the cell's output.
get_percentage_true_b(dataset, verbose=True)

True B is the best candidate: 27270 / 39996 (68.18%)


np.float64(0.6818181818181818)

In [5]:
#for idx, value in enumerate(b_real):
#  print(f"Index {idx}: True B = {value}, best_b = {dataset.b_candidates[idx][np.argmax(dataset.b_probs[idx])]}, prob = {np.max(dataset.b_probs[idx]):.4f}")

IDEA: recover the real b from the approximated reduced b.
-> Does not seem to work.

In [None]:
# Modulus taken from the experiment parameters.
q = params['q']

# Stack the A matrices and B vectors selected by dataset.indices into batched arrays.
A = np.stack([dataset.A[i] for i in dataset.indices])
B = np.stack([dataset.B[i] for i in dataset.indices])
batch_size, m, n = A.shape
print(f"Shape of A: {A.shape}")
print(f"Shape of B: {B.shape}")

R = dataset.R
print(f"Shape of R: {R.shape}")

# `@` and `%` share precedence and associate left-to-right, so this is (R @ A) % q.
# Entries above q // 2 are then shifted down by q.
RA = R @ A % q
RA[RA > q // 2] -= q
print(f"Shape of RA: {RA.shape}")

# NOTE(review): in the recorded run this reshape fails with
# "ValueError: cannot reshape array of size 39996 into shape (167,newaxis,1)".
# 39996 is not a multiple of batch_size (167), while R has 167 * 240 = 40080 rows,
# so best_b seems to hold 84 fewer entries than R has rows. Presumably some rows
# are dropped before best_b is built — confirm before relying on this cell.
Rb = np.array(dataset.best_b).reshape(batch_size, -1, 1)
print(f"Shape of Rb: {Rb.shape}")

# Not reached in the recorded run because of the error above.
b_real = get_no_mod(params, A, dataset.secret, B).reshape(batch_size, -1, 1)
print(f"Shape of b_real: {b_real.shape}")

Shape of A: (167, 112, 128)
Shape of B: (167, 112)
Shape of R: (167, 240, 112)
Shape of RA: (167, 240, 128)


ValueError: cannot reshape array of size 39996 into shape (167,newaxis,1)

In [None]:
# Compute norms
# With an integer `axis`, np.linalg.norm returns vector 2-norms along that axis,
# so these are per-row Euclidean norms (one value per row of each matrix),
# not one Frobenius norm per matrix.
R_norm = np.linalg.norm(R, axis=2)      # Euclidean norm of each row of every R matrix
RA_norm = np.linalg.norm(RA, axis=2)    # Euclidean norm of each row of every RA matrix

# Only the second element of each returned triple is used.
# NOTE(review): presumably the variance of the secret / error distribution — confirm.
_, var_s, _ = get_vector_distribution(params, params["secret_type"])
_, var_e, _ = get_vector_distribution(params, params["error_type"])

# Per-row spread estimate: sqrt(||RA_row||^2 * var_s + ||R_row||^2 * var_e).
std_Rb = np.sqrt(RA_norm**2 * var_s + R_norm**2 * var_e)
print(f"Shape of std_b: {std_Rb.shape}")

# S_inv is the standard-normal quantile at (p + 1) / 2, i.e. the z with P(|Z| <= z) = p.
p = 0.75
S_inv = norm.ppf((p + 1) / 2)
# `q` comes from the previous cell.
upper_bound = q / (2*S_inv)

# NOTE(review): upper_bound, w and hermite are assigned but not used in this cell.
w = params['penalty']
hermite = 1.02



Shape of std_b: (39, 60)
Det: 7775100061539390838314905510079722288876042707760630293127826525112188626504198409567314368635645340053270064556670976


In [9]:
# Regression inputs: the dataset's A matrix and the best b candidate per sample.
A_reduced = dataset.get_A()
best_b = np.array(dataset.best_b)

# 1) Train until stall
# Two alternative solvers: a Huber regression (default) or gradient training.
use_gradient = False
if not use_gradient:
    # Fit the robust linear model; its coefficients are the raw secret estimate.
    model = HuberRegressor(epsilon=1.25, alpha=0.0001, max_iter=10000, fit_intercept=True)
    model.fit(A_reduced, best_b)
    raw_guessed_secret = model.coef_

    # Clean the raw estimate and verify it against the original samples.
    guessed_secret = clean_secret(raw_guessed_secret, params)
    secret_is_correct = check_secret(guessed_secret, dataset.A, dataset.B, params)
    print("Secret guessed correctly!" if secret_is_correct else "Wrong secret guessed!")
else:
    lr = 1e-3
    check_every = 10

    # Prefer the GPU when one is available.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = LinearComplex(params).to(device)
    A_reduced = torch.tensor(A_reduced, dtype=torch.float).to(device)
    best_b = torch.tensor(best_b, dtype=torch.float).to(device)

    epoch = 0
    loss, epoch = train_until_stall(model, A_reduced, best_b, dataset, epoch=epoch)
    if loss != 0:
        print(f"Stalling detected at loss {loss:.4f}.")
    else:
        print("Secret guessed correctly at epoch {}!".format(epoch))

    # Keep the estimate as a CPU tensor without gradient tracking.
    raw_guessed_secret = model.guessed_secret.detach().cpu()

Secret guessed correctly!


In [10]:
# Check how many of the samples flagged by the fitted model are also
# outside the true mask. `outliers_` exists on the HuberRegressor fitted
# above, so this cell only applies to the non-gradient path.
outlier_mask = model.outliers_
real_mask = get_true_mask(dataset)
outside_real_mask = ~real_mask

num_outliers = np.sum(outlier_mask)
num_actual_outliers = np.sum(outlier_mask & outside_real_mask)

print(f"Number of outliers detected by model: {num_outliers}")
print(f"Number of actual outliers among detected: {num_actual_outliers}")
if num_outliers > 0:
    # Only meaningful when at least one sample was flagged.
    actual_fraction = num_actual_outliers / num_outliers
    print(f"Fraction of detected outliers that are actual: {actual_fraction:.2%}")

Number of outliers detected by model: 1133
Number of actual outliers among detected: 0
Fraction of detected outliers that are actual: 0.00%


In [11]:
# Positions of the samples the fitted model did not flag as outliers.
non_outlier_indices = np.where(~outlier_mask)[0]
# Same true-b check as before, restricted to those samples.
get_percentage_true_b(dataset, verbose=True, indices=non_outlier_indices)

True B is the best candidate: 1207 / 1207 (100.00%)


np.float64(1.0)

In [12]:
# Check the guessed secret
raw_guessed_secret = raw_guessed_secret.cpu().detach().numpy() if use_gradient else raw_guessed_secret
guessed_secret = clean_secret(raw_guessed_secret, params)

real_secret = dataset.get_secret()

print("Raw Guessed secret:", raw_guessed_secret)
print("Guessed secret:", guessed_secret)
print("Actual secret:", real_secret)

Raw Guessed secret: [-3.30121720e-02  9.94070836e-01  1.00088861e+00  2.44779457e-02
  2.87433294e-02  8.21152801e-02  1.96291540e+00  1.25796908e-03
 -2.00125692e+00 -4.33339175e-02  1.92993940e+00  1.01227460e+00
 -1.02322144e+00 -9.61276681e-01 -1.03169826e+00 -2.80201576e-03
  9.80239215e-01  1.02770001e+00  2.31378724e-02  3.33541469e-02
  2.16773082e-03 -1.03868079e+00  1.00786894e+00  1.96812631e+00
  1.90708957e-02 -2.53998183e-02  2.00452338e-02  9.78496181e-01
  3.13591396e-02  9.95787920e-01 -1.05617563e+00 -9.96587743e-01]
Guessed secret: [ 0.  1.  1.  0.  0.  0.  2.  0. -2.  0.  2.  1. -1. -1. -1.  0.  1.  1.
  0.  0.  0. -1.  1.  2.  0.  0.  0.  1.  0.  1. -1. -1.]
Actual secret: [ 0  1  1  0  0  0  2  0 -2  0  2  1 -1 -1 -1  0  1  1  0  0  0 -1  1  2
  0  0  0  1  0  1 -1 -1]


In [13]:
# Check the differences between the guessed and actual secret
diff = guessed_secret - real_secret
raw_diff = raw_guessed_secret[diff != 0]
raw_diff[raw_diff > params['q'] // 2] -= params['q']
diff_indices = np.nonzero(diff)
if len(diff[diff != 0]) > 0:
    print("Number of differences:", len(diff[diff != 0]))
    print("Difference:", raw_diff)
    print("real_secret:", real_secret[diff != 0])
    print("guessed_secret:", guessed_secret[diff != 0])
    print("Indices of differences:", diff_indices)

In [14]:
# Distance of every raw coefficient from its nearest integer
# (larger distance means a less certain coefficient).
nearest_integer = np.round(raw_guessed_secret)
close_to_integer = np.abs(raw_guessed_secret - nearest_integer)

# Coefficient indices ordered from most to least uncertain.
sorted_indices = np.argsort(-close_to_integer)
print("Sorted uncertain indices:", sorted_indices)
print("Sorted uncertain values:", np.round(close_to_integer[sorted_indices], 3))

# For every wrongly guessed coefficient, look up its position in that ordering;
# the largest position shows how far down the ranking an error still occurs.
wrong_positions = diff_indices[0]
if len(wrong_positions) > 0:
    diff_indices_in_sorted = []
    for wrong_index in wrong_positions:
        rank = np.where(sorted_indices == wrong_index)[0][0]
        diff_indices_in_sorted.append(rank)
    print("Worst case scenario:", max(diff_indices_in_sorted))

Sorted uncertain indices: [ 5 10 30  9 13 21  6 19  0 23 14 28  4 17 25  3 12 18 27 26 16 24 11 22
  1 29 31 15 20  7  8  2]
Sorted uncertain values: [0.082 0.07  0.056 0.043 0.039 0.039 0.037 0.033 0.033 0.032 0.032 0.031
 0.029 0.028 0.025 0.024 0.023 0.023 0.022 0.02  0.02  0.019 0.012 0.008
 0.006 0.004 0.003 0.003 0.002 0.001 0.001 0.001]


In [15]:
from itertools import product

# A coefficient counts as certain when it lies closer than this to an integer.
UNCERTAINTY_THRESHOLD = 0.2

# Find values in raw_guessed_secret that are within ±UNCERTAINTY_THRESHOLD of an integer
close_to_integer = np.abs(raw_guessed_secret - np.round(raw_guessed_secret)) < UNCERTAINTY_THRESHOLD
# Convert to a plain Python int: with a NumPy integer exponent, 2 ** count is
# computed in fixed-width arithmetic and silently wraps for large counts.
uncertain_count = int(np.sum(~close_to_integer))
print("Number of uncertain values:", uncertain_count)

# Calculate the number of brute force attacks to perform
# (two candidates, floor and ceil, per uncertain value)
brute_force_attempts = 2 ** uncertain_count
print("Number of brute force attempts required:", brute_force_attempts)

# Get the indices of uncertain values
uncertain_indices = np.where(~close_to_integer)[0]

real_uncertain_secret = real_secret[uncertain_indices]
print("Real uncertain secret:", real_uncertain_secret)

# Perform brute force attack
# Fancy indexing returns a copy, so raw_guessed_secret is not modified below.
raw_uncertain_secret = raw_guessed_secret[uncertain_indices]
raw_uncertain_secret[raw_uncertain_secret > params['q'] // 2] -= params['q']

# Keep only values whose magnitude is at most params['eta'], and keep the
# matching coefficient indices so values and indices stay aligned one-to-one.
within_eta = np.abs(raw_uncertain_secret) <= params['eta']
raw_uncertain_secret = raw_uncertain_secret[within_eta]
brute_force_indices = uncertain_indices[within_eta]

lower_values = np.floor(raw_uncertain_secret)
upper_values = np.ceil(raw_uncertain_secret)

# Disabled brute-force search, kept for reference. It pairs candidates with
# brute_force_indices (not uncertain_indices) because of the filter above.
#values = product(*zip(lower_values, upper_values))

#for value in values:
#    print("Trying values:", value)
    # Create a copy of the guessed secret
#    brute_force_secret = guessed_secret.copy()
    # Update the uncertain values with the current combination
#    for idx, val in zip(brute_force_indices, value):
#        brute_force_secret[idx] = val
    # Check if the guessed secret is correct
#    if check_secret(brute_force_secret, dataset.A, dataset.B, params):
#        print("Brute force attack successful! Guessed secret:", brute_force_secret)
#        break

Number of uncertain values: 0
Number of brute force attempts required: 1
Real uncertain secret: []


In [16]:
def report(real_secret, guessed_secret):
    """
    Print a confusion matrix with per-row accuracy, then a classification report.

    Args:
        real_secret: Array of true secret coefficients.
        guessed_secret: Array of guessed secret coefficients.

    Rows of the matrix correspond to real values and columns to guessed
    values, following confusion_matrix(y_true, y_pred).
    """
    # Every value that occurs in either vector, in ascending order.
    labels = np.unique(np.concatenate((real_secret, guessed_secret)))
    cm = confusion_matrix(real_secret, guessed_secret, labels=labels)

    # Table header: one right-aligned column per label.
    column_titles = "".join(f"{label:>6}" for label in labels)
    header = "       |" + column_titles + " | Accuracy"
    print("Confusion Matrix:")
    print(header)
    print("-" * len(header))

    # One line per real value: counts per guessed value, then the row accuracy.
    for i, (label, row) in enumerate(zip(labels, cm)):
        total = row.sum()
        if total > 0:
            acc = row[i] / total
        else:
            acc = 0.0
        counts = "".join(f"{count:6}" for count in row)
        print(f"{label:>6} |" + counts + f" | {acc:4.1%}")

    # Precision/recall summary.
    print("\nClassification Report:")
    print(classification_report(real_secret, guessed_secret, zero_division=0))

report(real_secret, guessed_secret)

Confusion Matrix:
       |  -2.0  -1.0   0.0   1.0   2.0 | Accuracy
-------------------------------------------------
  -2.0 |     1     0     0     0     0 | 100.0%
  -1.0 |     0     6     0     0     0 | 100.0%
   0.0 |     0     0    14     0     0 | 100.0%
   1.0 |     0     0     0     8     0 | 100.0%
   2.0 |     0     0     0     0     3 | 100.0%

Classification Report:
              precision    recall  f1-score   support

          -2       1.00      1.00      1.00         1
          -1       1.00      1.00      1.00         6
           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00         8
           2       1.00      1.00      1.00         3

    accuracy                           1.00        32
   macro avg       1.00      1.00      1.00        32
weighted avg       1.00      1.00      1.00        32

