# Training Notebook

In [1]:
import sys
import os

# Add the parent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from ml_attack import check_secret, clean_secret, get_no_mod, LWEDataset, get_filename_from_params, get_default_params, get_vector_distribution
from ml_attack.utils import get_percentage_true_b, get_expected_percentage_true_b
from ml_attack.train import LinearComplex, train_until_stall

import numpy as np
import torch
import torch.nn as nn

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import HuberRegressor

  from .siever_params import SieverParams  # noqa


## Dataset creation

Training debug:

In [2]:
filename = "./../reduced_data/data_n_128_k_1_s_binary_f4818_48.pkl"

if os.path.exists(filename):
    print(f"Loading dataset from {filename}")
    dataset = LWEDataset.load_reduced(filename)
    params = dataset.params
    dataset.initialize_secret()
else:
    print(f"File {filename} does not exist.")

Loading dataset from ./../reduced_data/data_n_128_k_1_s_binary_f4818_48.pkl


In [3]:
in_candidates = 0
exact_candidates = 0
indeces = []

dataset.params["approximation_std"] = 2
dataset.approximate_b()

secret = dataset.get_secret()
A_reduced = dataset.get_A()
b_reduced = dataset.get_B()

b_real = get_no_mod(params, A_reduced, secret, b_reduced)

mask = np.zeros(A_reduced.shape[0], dtype=np.bool)
c = np.zeros(A_reduced.shape[0])

probs_fake = []
probs_real = []

for i in range(len(b_real)):
    true_b = b_real[i]

    candidates = dataset.b_candidates[i]
    probs = dataset.b_probs[i]
    c[i] = (true_b - dataset.best_b[i]) // params['q']

    if true_b in candidates:
        in_candidates += 1
        if true_b == dataset.best_b[i]:
            exact_candidates += 1
            mask[i] = True
            probs_real.append(np.max(probs))
        else:
            #print(f"Index {i}: {true_b} not the best candidate: {candidates[np.argmax(probs)]} with prob {np.max(probs)}")
            #print(f"Other candidates: {candidates} with probs {probs}")
            indeces.append(i)
            probs_fake.append(np.max(probs))
    else:
        #print(f"Index {i}: {true_b} not in set: {candidates}")
        indeces.append(i)
    
length = len(b_real)
print(f"True B in candidate set: {in_candidates} / {length} ({100 * in_candidates / length:.2f}%)")
print(f"True B is the best candidate: {exact_candidates} / {length} ({100 * exact_candidates / length:.2f}%)")
print(f"Indeces: {indeces}")
print(f"C values: {sorted(list(set(c.tolist())))}")
print("C value counts:")
unique_c, counts_c = np.unique(c, return_counts=True)
for val, count in zip(unique_c, counts_c):
    print(f"  C = {val}: {count}")
print(f"Mean prob of fake candidates: {np.mean(probs_fake):.4f} (std: {np.std(probs_fake):.4f})")
print(f"Mean prob of real candidates: {np.mean(probs_real):.4f} (std: {np.std(probs_real):.4f})")

True B in candidate set: 39887 / 39996 (99.73%)
True B is the best candidate: 27270 / 39996 (68.18%)
Indeces: [15, 17, 27, 29, 32, 33, 37, 40, 41, 43, 44, 48, 50, 55, 56, 58, 59, 67, 70, 72, 74, 76, 82, 85, 86, 88, 90, 97, 98, 99, 103, 107, 109, 113, 124, 127, 128, 134, 135, 140, 144, 146, 147, 148, 149, 152, 153, 160, 161, 163, 164, 166, 174, 175, 178, 181, 183, 187, 192, 194, 201, 202, 203, 205, 206, 207, 210, 211, 212, 221, 222, 223, 224, 229, 230, 234, 235, 242, 245, 246, 247, 257, 265, 267, 270, 280, 285, 289, 299, 303, 308, 311, 313, 315, 318, 322, 323, 324, 331, 333, 334, 336, 338, 339, 340, 342, 344, 349, 350, 354, 357, 360, 361, 367, 374, 375, 378, 379, 391, 394, 396, 398, 399, 404, 413, 414, 415, 419, 421, 422, 428, 430, 436, 438, 445, 448, 449, 450, 452, 455, 457, 465, 467, 468, 470, 474, 478, 479, 485, 497, 498, 502, 503, 507, 511, 512, 516, 517, 532, 534, 535, 536, 539, 542, 543, 544, 545, 551, 552, 554, 555, 557, 558, 564, 565, 570, 571, 573, 574, 575, 581, 583, 584, 588,

In [4]:
percentages = [0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0]
for p in percentages:
    expected = get_expected_percentage_true_b(dataset, p, verbose=True)
    actual = get_percentage_true_b(dataset, p, verbose=True)

[BEST 1% STD] Expected true B is best candidate: 72.67%
[BEST 1% STD] True B is the best candidate: 283 / 399 (70.93%)
[BEST 5% STD] Expected true B is best candidate: 70.74%
[BEST 5% STD] True B is the best candidate: 1458 / 1999 (72.94%)
[BEST 10% STD] Expected true B is best candidate: 69.43%
[BEST 10% STD] True B is the best candidate: 2871 / 3999 (71.79%)
[BEST 20% STD] Expected true B is best candidate: 68.31%
[BEST 20% STD] True B is the best candidate: 5688 / 7999 (71.11%)
[BEST 40% STD] Expected true B is best candidate: 66.99%
[BEST 40% STD] True B is the best candidate: 11293 / 15998 (70.59%)
[BEST 60% STD] Expected true B is best candidate: 66.08%
[BEST 60% STD] True B is the best candidate: 16724 / 23997 (69.69%)
[BEST 80% STD] Expected true B is best candidate: 65.31%
[BEST 80% STD] True B is the best candidate: 22061 / 31996 (68.95%)
Expected true B is best candidate: 64.46%
True B is the best candidate: 27270 / 39996 (68.18%)


In [5]:
_, _, std_B = dataset.get_b_distribution()
choosen_percentage = 0.05

num_selected = int(len(std_B) * choosen_percentage)
selected_indices = np.argsort(std_B)[:num_selected]

In [6]:
A_reduced = dataset.get_A()[selected_indices]
best_b = dataset.best_b[selected_indices]
c = c[selected_indices]

model = HuberRegressor(fit_intercept=True, max_iter=10000, alpha=0.001, epsilon=1)
raw_guessed_secret = model.fit(A_reduced, best_b).coef_

guessed_secret = clean_secret(raw_guessed_secret, params)
if check_secret(guessed_secret, dataset.A, dataset.B, params):
    print("Secret guessed correctly!")
else:
    print("Wrong secret guessed!")

Wrong secret guessed!


## Unsupervised Learning Outlier detection

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

loss_fn = nn.MSELoss()
model = LinearComplex(params).to(device)
model.guessed_secret = nn.Parameter(torch.tensor(raw_guessed_secret, dtype=torch.float).to(device), requires_grad=True)

A_reduced = torch.tensor(A_reduced, dtype=torch.float).to(device)
best_b = torch.tensor(best_b, dtype=torch.float).to(device)

model.eval()

grads = []
pred_b = model(A_reduced)

for i in range(len(A_reduced)):
    model.zero_grad()
    output = model(A_reduced[i].unsqueeze(0))
    loss = loss_fn(output, best_b[i].unsqueeze(0))
    loss.backward()

    grads.append(model.guessed_secret.grad.detach().cpu().numpy().copy())

grads = np.array(grads)  # shape: (N, D)
best_b = best_b.cpu().numpy()  # shape: (N,)

In [8]:
from sklearn.decomposition import PCA
import plotly.graph_objects as go

pca = PCA()
pca_transformed_data = pca.fit_transform(grads)

explained_variance = pca.explained_variance_ratio_

# plot explained variance and cumulative explained variance
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(1, len(explained_variance) + 1)), y=np.cumsum(explained_variance), name='Cumulative explained variance'))

# Add labels on the x-axis at every 0.1 increment on the y-axis
y_values = np.arange(0.1, 1.1, 0.1)  # Adjust the range and step as needed
x_values = np.interp(y_values, np.cumsum(explained_variance), range(1, len(explained_variance) + 1))
fig.add_trace(go.Scatter(x=x_values, y=y_values, mode='text', text=[str(int(x)) for x in x_values], textposition='bottom center', name='n_components'))

fig.update_layout(title='Cumulative explained variance')
fig.show()

In [9]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Perform PCA
pca = PCA(n_components=7)
pca_transformed_data = pca.fit_transform(grads)

# Calculate the silhouette score for different numbers of clusters
silhouette_scores = []
max_clusters = 15
for n_clusters in range(2, max_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    clusters = kmeans.fit_predict(pca_transformed_data)
    silhouette_scores.append(silhouette_score(pca_transformed_data, clusters))

# Plot the silhouette scores
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(2, max_clusters+1)), y=silhouette_scores, mode='lines+markers'))
fig.update_layout(title='Silhouette scores for different numbers of clusters', xaxis_title='Number of clusters', yaxis_title='Silhouette score')
fig.show()

In [12]:
from sklearn.manifold import TSNE

# Perform PCA to reduce dimensionality before t-SNE
pca = PCA(n_components=100)
pca_transformed_data = pca.fit_transform(grads)

# Perform K-means clustering
kmeans = KMeans(n_clusters=2, random_state=0)
clusters = kmeans.fit_predict(pca_transformed_data)

# Perform t-SNE
tsne = TSNE(n_components=2, perplexity=40)
tsne_transformed_data = tsne.fit_transform(pca_transformed_data)

In [13]:
# Create a scatter plot of the t-SNE results
fig = go.Figure()
for cluster in np.unique(clusters):
    cluster_data = tsne_transformed_data[clusters == cluster]
    fig.add_trace(go.Scatter(
        x=cluster_data[:, 0],
        y=cluster_data[:, 1],
        mode='markers',
        name=f'Cluster {cluster}',
        marker=dict(size=4)  # smaller point size
    ))
fig.update_layout(title='t-SNE visualization of K-means clusters', xaxis_title='t-SNE Component 1', yaxis_title='t-SNE Component 2')
fig.show()

# Create a scatter plot of the t-SNE results colored by c values
import plotly.express as px

fig = px.scatter(
    x=tsne_transformed_data[:, 0],
    y=tsne_transformed_data[:, 1],
    color=c,
    color_continuous_scale='Viridis',
    labels={'color': 'c value'},
    title='t-SNE visualization colored by c values'
)
fig.update_traces(marker=dict(size=4))  # smaller point size
fig.update_layout(xaxis_title='t-SNE Component 1', yaxis_title='t-SNE Component 2')
fig.show()

In [14]:
from scipy.optimize import linear_sum_assignment
import numpy as np
import pandas as pd

# Build co-occurrence matrix (rows=c values, columns=clusters)
df = pd.DataFrame({'c': c, 'cluster': clusters})
cooccurrence = pd.crosstab(df['c'], df['cluster'])  

# Convert to cost matrix (negate frequencies for maximization)
cost_matrix = -cooccurrence.values  # Hungarian minimizes cost

# Apply Hungarian Algorithm
row_ind, col_ind = linear_sum_assignment(cost_matrix)

# Map results
c_values = cooccurrence.index.values
cluster_values = cooccurrence.columns.values

mapping = {c_values[i]: cluster_values[j] for i, j in zip(row_ind, col_ind)}
print("Optimal unique mapping from c → cluster:")
for k, v in mapping.items():
    print(f"  c = {k} → cluster = {v}")

# Create a mapping from c values to cluster labels
inverse_mapping = {v: k for k, v in mapping.items()}
# Map each predicted cluster to the corresponding c value
predicted_c = np.array([inverse_mapping.get(cl, -99) for cl in clusters])  # -99 for unmatched
valid_mask = predicted_c != -99
y_true = np.array(c)[valid_mask]
y_pred = predicted_c[valid_mask]

from sklearn.metrics import confusion_matrix, adjusted_rand_score, normalized_mutual_info_score, fowlkes_mallows_score

print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

print("\nAdjusted Rand Index (ARI):", adjusted_rand_score(y_true, y_pred))
print("Normalized Mutual Information (NMI):", normalized_mutual_info_score(y_true, y_pred))
print("Fowlkes-Mallows Score (FMS):", fowlkes_mallows_score(y_true, y_pred))


Optimal unique mapping from c → cluster:
  c = -1.0 → cluster = 1
  c = 0.0 → cluster = 0
Confusion Matrix:
[[   0  255    0]
 [   1 1457    0]
 [   0  286    0]]

Adjusted Rand Index (ARI): -0.0007451180492353387
Normalized Mutual Information (NMI): 0.00040734800574767575
Fowlkes-Mallows Score (FMS): 0.7533980389104392


In [15]:
# Get indices of samples in the cluster mapped to c=0
cluster_for_c0 = mapping[0]
indices_c0 = np.where(clusters == cluster_for_c0)[0]

get_percentage_true_b(dataset, indices=indices_c0, verbose=True)

# Prepare data for retraining
A_c0 = A_reduced[indices_c0].cpu().numpy() if hasattr(A_reduced, 'cpu') else A_reduced[indices_c0]
b_c0 = best_b[indices_c0]

# Retrain HuberRegressor on the filtered data
model_c0 = HuberRegressor(fit_intercept=True, max_iter=10000, alpha=0.001, epsilon=1)
raw_guessed_secret = model_c0.fit(A_c0, b_c0).coef_

# Clean and check the new guessed secret
guessed_secret = clean_secret(raw_guessed_secret, params)
if check_secret(guessed_secret, dataset.A, dataset.B, params):
  print("Secret guessed correctly with c=0 cluster data!")
else:
  print("Wrong secret guessed with c=0 cluster data!")

True B is the best candidate: 1369 / 1998 (68.52%)
Wrong secret guessed with c=0 cluster data!


In [14]:
# Check the guessed secret
guessed_secret = clean_secret(raw_guessed_secret, params)

real_secret = dataset.get_secret()

print("Raw Guessed secret:", raw_guessed_secret)
print("Guessed secret:", guessed_secret)
print("Actual secret:", real_secret)

Raw Guessed secret: [ 0.59318646  0.25292998  0.20929181  0.58574537  0.54388688  0.05775208
  0.38917655  0.10295486  0.41686651  0.63387019  0.78582966  0.71279775
  0.64419477  0.53049803  0.21937334  0.29376306  0.52367064  0.56076761
  0.4224168   0.63741601  0.0821955   0.66760083  0.51129317  1.09838369
  0.16141902  0.50607918  0.29446124  0.05227455  0.26154396  0.4771835
  1.0590037   0.68565996  0.3561495   0.32581986  0.28140811  0.27286776
  0.23752957  0.31582034  0.76445648  0.01254398  0.20047395  1.14250411
  0.51097428  0.64389247  0.53227683  0.05310978  0.62550194  0.86304648
  0.69335635  0.13533053  0.67536787  0.58362744  0.1687727   0.30340198
  0.51433602  0.36934399  0.86701     0.29504691  0.80586994  0.5555124
  0.09104143  0.3001203  -0.06383873  0.89996187  0.6443566   0.46980288
  0.6032483   0.43891073  0.71118315  0.58693158  0.25899857  0.62211099
  0.1780037  -0.08788786  0.25048936  0.28997267  0.43969329  0.60126929
  0.86050763  0.74384986  0.25618

In [15]:
# Check the differences between the guessed and actual secret
diff = guessed_secret - real_secret
raw_diff = raw_guessed_secret[diff != 0]
raw_diff[raw_diff > params['q'] // 2] -= params['q']
diff_indices = np.nonzero(diff)
if len(diff[diff != 0]) > 0:
    print("Number of differences:", len(diff[diff != 0]))
    print("Difference:", raw_diff)
    print("real_secret:", real_secret[diff != 0])
    print("guessed_secret:", guessed_secret[diff != 0])
    print("Indices of differences:", diff_indices)

Number of differences: 62
Difference: [ 0.59318646  0.25292998  0.20929181  0.54388688  0.38917655  0.41686651
  0.71279775  0.64419477  0.4224168   0.63741601  0.05227455  0.68565996
  0.27286776  0.23752957  0.31582034  0.76445648  0.01254398  1.14250411
  0.51097428  0.53227683  0.69335635  0.13533053  0.58362744  0.30340198
  0.86701     0.29504691  0.5555124   0.6443566   0.46980288  0.58693158
  0.25899857  0.62211099  0.25048936  0.28997267  0.43969329  0.74384986
  0.25618862  0.48973759  0.38925224  0.72987642  0.71632339  0.60495492
  0.86084746  0.17797387  0.64630185  0.1906606   0.38881701  0.19531685
  0.36232077  0.15567725  0.31766912  0.21443869  0.47081835  0.66547664
  0.26045525  0.36014036  0.37519954 -0.01577054  0.8830777   0.75075901
  0.60696943  0.58184317]
real_secret: [0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 0 1 0 0 0 0 1 0 1 0 1 0 0 1 0 1 0 1 1 1 0 1
 1 1 0 0 0 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0 0]
guessed_secret: [1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 1. 0. 0. 0. 1. 0

In [16]:
close_to_integer = np.abs(raw_guessed_secret - np.round(raw_guessed_secret))
sorted_indices = np.argsort(-close_to_integer)
print("Sorted uncertain indices:", sorted_indices)
print("Sorted uncertain values:", np.round(close_to_integer[sorted_indices], 3))

if len(diff_indices[0]) > 0:
  diff_indices_in_sorted = [np.where(sorted_indices == i)[0][0] for i in diff_indices[0]]
  print("Worst case scenario:", max(diff_indices_in_sorted))

Sorted uncertain indices: [ 25  81  42  22  54 123  29  16 105  65  13 127  44 104   4 111  59  76
  17  67  18 126   8  51 109   3  69  94   0  77  66  86 124  83   6  92
 125 108  71 119  46 101  55   9  19  96 113  32  43  12  64  90 114 107
  21  33  50  99  37  31  48  53  61  57  26  15  75  93  68  11  85  34
  82  35 100  84  28 110  70  80  79   1  74 122  36  38  14 103 102  10
   2 112  40  95  58  91  72  89  52  88  24 118  98  41  78  87  47  49
  56 106 121   7  63  23  60  73  20  62  97  30   5  45  27 116 117 120
  39 115]
Sorted uncertain values: [0.494 0.49  0.489 0.489 0.486 0.483 0.477 0.476 0.471 0.47  0.47  0.469
 0.468 0.457 0.456 0.447 0.444 0.44  0.439 0.439 0.422 0.418 0.417 0.416
 0.415 0.414 0.413 0.411 0.407 0.399 0.397 0.395 0.393 0.389 0.389 0.389
 0.388 0.383 0.378 0.375 0.374 0.372 0.369 0.366 0.363 0.362 0.36  0.356
 0.356 0.356 0.356 0.354 0.338 0.335 0.332 0.326 0.325 0.318 0.316 0.314
 0.307 0.303 0.3   0.295 0.294 0.294 0.29  0.289 0.289 0.287 0.

In [17]:
from itertools import product

# Find values in raw_guessed_secret that are within ±0.1 of an integer
close_to_integer = np.abs(raw_guessed_secret - np.round(raw_guessed_secret)) < 0.2
uncertain_count = np.sum(~close_to_integer)
print("Number of uncertain values:", uncertain_count)

# Calculate the number of brute force attacks to perform
brute_force_attempts = 2 ** uncertain_count
print("Number of brute force attempts required:", brute_force_attempts)

# Get the indices of uncertain values
uncertain_indices = np.where(~close_to_integer)[0]

real_uncertain_secret = real_secret[uncertain_indices]
print("Real uncertain secret:", real_uncertain_secret)

# Perform brute force attack
raw_uncertain_secret = raw_guessed_secret[uncertain_indices]
raw_uncertain_secret[raw_uncertain_secret > params['q'] // 2] -= params['q']
raw_uncertain_secret = raw_uncertain_secret[np.abs(raw_uncertain_secret) <= params['eta']]

lower_values = np.floor(raw_uncertain_secret)
upper_values = np.ceil(raw_uncertain_secret)

#values = product(*zip(lower_values, upper_values))

#for value in values:
#    print("Trying values:", value)
    # Create a copy of the guessed secret
#    brute_force_secret = copy.deepcopy(guessed_secret)
    # Update the uncertain values with the current combination
#    for idx, val in zip(uncertain_indices, value):
#        brute_force_secret[idx] = val
    # Check if the guessed secret is correct
#    if check_secret(brute_force_secret, dataset.A, dataset.B, params):
#        print("Brute force attack successful! Guessed secret:", brute_force_secret)
#        break

Number of uncertain values: 93
Number of brute force attempts required: 0
Real uncertain secret: [0 1 1 1 0 1 1 1 1 0 0 1 0 0 1 1 1 0 1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1
 0 1 0 1 1 0 1 0 0 0 1 1 0 1 0 1 0 1 1 1 1 0 1 1 1 1 0 0 0 0 1 1 0 1 1 1 0
 1 1 1 1 0 1 0 1 0 1 1 1 1 0 0 0 1 0 1]


In [18]:
def report(real_secret, guessed_secret):
    """
    Print classification report and confusion matrix.
    """
  
    # Get unique sorted labels and compute confusion matrix
    labels = np.unique(np.concatenate((real_secret, guessed_secret)))
    cm = confusion_matrix(real_secret, guessed_secret, labels=labels)

    # Header
    header = "       |" + "".join([f"{l:>6}" for l in labels]) + " | Accuracy"
    print("Confusion Matrix:")
    print(header)
    print("-" * len(header))

    # Rows
    for i, row in enumerate(cm):
        label = f"{labels[i]:>6} |"
        values = "".join([f"{v:6}" for v in row])

        correct = row[i]
        total = row.sum()
        acc = correct / total if total > 0 else 0.0
        print(label + values + f" | {acc:4.1%}")

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(real_secret, guessed_secret, zero_division=0))

report(real_secret, guessed_secret)

Confusion Matrix:
       |   0.0   1.0 | Accuracy
-------------------------------
   0.0 |    32    28 | 53.3%
   1.0 |    34    34 | 50.0%

Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.53      0.51        60
           1       0.55      0.50      0.52        68

    accuracy                           0.52       128
   macro avg       0.52      0.52      0.52       128
weighted avg       0.52      0.52      0.52       128

