### Analyse the output file to retrive statistics on the run

In [52]:
import re
import ast
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
import numpy as np
import glob
import os

import pandas as pd

In [57]:
output_files = "../cluster/outputs/*.out"

if isinstance(output_files, str):
    output_files = sorted(glob.glob(output_files))
    if not output_files:
        print("No output files found.")
        exit(1)
  
print(f"Found {len(output_files)} output files.")

Found 160 output files.


In [58]:
def parse_output_file(output_file):
    """Parse the output file and extract relevant statistics."""
    print(f"Parsing output file: {output_file}")

    # Statistics from the output file
    with open(output_file, 'r') as file:
        content = file.readlines()
        processed_content = []
        for line in content:
            if '.-' in line:
                split_lines = line.replace('.-', '.\n-').splitlines()
                processed_content.extend(split_lines)
            else:
                processed_content.append(line)
        content = [line for line in processed_content if line.strip()]

    # Prepare lists to collect the results
    results = {
        "output_file": os.path.basename(output_file),
        "exp_perc": Counter(),
        "best_b": Counter(),
        "exp_perc_subset": defaultdict(dict),
        "best_b_subset": defaultdict(dict),
        "tour_to_time": defaultdict(),
        "block_sizes": defaultdict(list),
        "num_updates": defaultdict(list),
        "std_B": defaultdict(float),
        "start_bkz": None,
        "total_time": 0,
        "total_train_samples": 0,
        "confusion_matrix": None,
        "classification_report": None,
        "success": False,
        "parameters": {},
        "num_matrices": None
    }

    # Extract dictionary from the first line using ast.literal_eval for safety
    parameters_line = content[0]
    results['parameters'] = ast.literal_eval(parameters_line.strip().split("Parameters: ")[1])

    # Extract number of matrices from the line "Attacking X matrices"
    matrices_line = content[1]
    num_matrices_match = re.search(r"Attacking (\d+) matrices*", matrices_line)
    results['num_matrices'] = int(num_matrices_match.group(1)) if num_matrices_match else None

    # Iterate through the file content and parse relevant data
    counter = 2
    current_tour = 0
    while counter < len(content):
        line = content[counter]
        if line.startswith("Tour"):
            # Extract tour number
            match = re.search(r'Tour (\d+)', line)
            current_tour = int(match.group(1)) - 1 if match else current_tour + 1

            # Extract time from "Tour X completed after Y seconds."
            time_match = re.search(r'Tour \d+ completed after ([\d.]+) seconds', line)
            if time_match:
                tour_time = float(time_match.group(1)) / 3600 # Convert seconds to hours
                results['tour_to_time'][current_tour] = tour_time

            # Handle alternative tour line format
            alt_tour_match = re.match(
                r"Tour (\d+)\s*\|\s*Time:\s*([\d.]+)s\s*\|\s*Mean std_B:\s*([\d.]+)\s*\|\s*Performance:\s*([\d.]+)", line)
            if alt_tour_match:
                current_tour = int(alt_tour_match.group(1)) - 1
                tour_time = float(alt_tour_match.group(2)) / 3600  # Convert seconds to hours
                results['tour_to_time'][current_tour] = tour_time
                results['std_B'][current_tour] = float(alt_tour_match.group(3))
                # Optionally, store performance if needed:
                # results['performance'][current_tour] = float(alt_tour_match.group(4))

            # Handle alternative tour line format with "Reduced performance"
            alt_tour_reduced_perf = re.match(
                r"Tour (\d+)\s*\|\s*Time:\s*([\d.]+)s\s*\|\s*Reduced performance:\s*([\d.]+)\s*\|\s*Mean std_B:\s*([\d.]+)", line)
            if alt_tour_reduced_perf:
                current_tour = int(alt_tour_reduced_perf.group(1)) - 1
                tour_time = float(alt_tour_reduced_perf.group(2)) / 3600  # Convert seconds to hours
                results['tour_to_time'][current_tour] = tour_time
                # Optionally store reduced performance if needed:
                # results['reduced_performance'][current_tour] = float(alt_tour_reduced_perf.group(3))
                results['std_B'][current_tour] = float(alt_tour_reduced_perf.group(4))

            # Start scanning lines until next block or BKZ
            counter += 1
            while counter < len(content):
                subline = content[counter]

                # Match [BEST X% STD] True B in candidate set
                match_true_b = re.match(r"\[BEST (\d+)% STD\] Expected true B is best candidate: ([\d.]+)%", subline)
                if match_true_b:
                    percentile = int(match_true_b.group(1))
                    percent = float(match_true_b.group(2))
                    results['exp_perc_subset'][current_tour][percentile] = percent
                    counter += 1
                    continue

                # Match [BEST X% STD] True B is the best candidate
                match_best = re.match(r"\[BEST (\d+)% STD\] True B is the best candidate: \d+ / \d+ \(([\d.]+)%\)", subline)
                if match_best:
                    percentile = int(match_best.group(1))
                    percent = float(match_best.group(2))
                    results['best_b_subset'][current_tour][percentile] = percent
                    counter += 1
                    continue

                # Match general True B
                match_general_true_b = re.match(r"Expected true B is best candidate: ([\d.]+)%", subline)
                if match_general_true_b:
                    results['exp_perc'][current_tour] = float(match_general_true_b.group(1))
                    counter += 1
                    continue

                # Match general Best candidate
                match_general_best = re.match(r"True B is the best candidate: \d+ / (\d+) \(([\d.]+)%\)", subline)
                if match_general_best:
                    results['total_train_samples'] = int(match_general_best.group(1))
                    results['best_b'][current_tour] = float(match_general_best.group(2))
                    counter += 1
                    continue

                # Match [BEST X% STD] True B in candidate set: N / M (P%)
                match_in_candidate = re.match(r"\[BEST (\d+)% STD\] True B in candidate set: (\d+) / (\d+) \(([\d.]+)%\)", subline)
                if match_in_candidate:
                    counter += 1
                    continue

                match_in_candidate = re.match(r"True B in candidate set: (\d+) / (\d+) \(([\d.]+)%\)", subline)
                if match_in_candidate:
                    counter += 1
                    continue

                # Match "Mean overall std_B: <float>"
                match_mean_std_b = re.match(r"Mean overall std_B: ([\d.]+)", subline)
                if match_mean_std_b:
                    results['std_B'][current_tour] = float(match_mean_std_b.group(1))
                    counter += 1
                    continue

                # If no match, break to the next tour
                counter -= 1
                break
        elif line.startswith("- Running BKZ2.0"):
            if results['start_bkz'] is None:
                results['start_bkz'] = current_tour
            block_size_match = re.search(r"with block size (\d+)...", line)
            if block_size_match:
                block_size = int(block_size_match.group(1))
                results['block_sizes'][current_tour].append(block_size)
        elif line.startswith("- Updated"):
            num_updates_match = re.search(r"Updated (\d+)/", line)
            if num_updates_match:
                results['num_updates'][current_tour].append(int(num_updates_match.group(1)))
        elif line.startswith("Reduction completed "):
            total_time_match = re.search(r"Reduction completed in (\d+\.\d+) seconds", line)
            if total_time_match:
                results['total_time'] += float(total_time_match.group(1))
        elif line.startswith("Secret found after"):
            total_time_match = re.search(r"Secret found after (\d+\.\d+) seconds.", line)
            if total_time_match:
                results['total_time'] += float(total_time_match.group(1))
        elif line.startswith("Confusion Matrix"):
            # Save lines for confusion matrix until classification report
            confusion_matrix_lines = []
            counter += 1
            while counter < len(content) and not content[counter].startswith("Classification Report"):
                confusion_matrix_lines.append(content[counter])
                counter += 1
            results['confusion_matrix'] = "".join(confusion_matrix_lines)
            counter -= 1
        elif line.startswith("Classification Report"):
            classification_report_lines = []
            counter += 1
            while counter < len(content) and not content[counter].startswith("#########"):
                classification_report_lines.append(content[counter])
                counter += 1
            results['classification_report'] = "".join(classification_report_lines)
            counter -= 1
        elif line.startswith("Mean overall std_B:"):
            # Extract mean overall std_B
            match = re.search(r"Mean overall std_B: ([\d.]+)", line)
            if match:
                results['std_B'][current_tour] = float(match.group(1))
        elif line.startswith("#########################################"):
            # End of attack
            break
        counter += 1

    results['success'] = True if results['confusion_matrix'] is not None else False

    return results

Parameters used:

In [59]:
results = []
for output_file in output_files:
    result = parse_output_file(output_file)
    results.append(result)

# Collect sets for each parameter
params_to_print = ['n', 'q', 'secret_type', 'hw', 'bkz_block_sizes', 'penalty', 'matrix_config']

# Prepare a list of dicts for DataFrame construction
df_rows = []
for result in results:
    # Get the last tour for std_B, if available
    last_std_B = None
    if result['std_B']:
        last_tour = max(result['std_B'].keys())
        last_std_B = result['std_B'][last_tour]
    row = {
        "output_file": result['output_file'],
        "last_best_b": next(reversed(result['best_b'].values()), None),
        "last_std_B": last_std_B,
        "last_tour_time": next(reversed(result['tour_to_time'].values()), None),
        "total_time": result['total_time'],
        "success": result['success'],
    }
    for param in params_to_print:
        row[param] = result['parameters'].get(param, None)
    df_rows.append(row)

df_infos = pd.DataFrame(df_rows)
df_infos = df_infos.sort_values("output_file").reset_index(drop=True)


with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df_infos)

Parsing output file: ../cluster/outputs/attack_100_3329_binary_dual_4_58996086.out
Parsing output file: ../cluster/outputs/attack_100_3329_binary_original_4_58996088.out
Parsing output file: ../cluster/outputs/attack_100_3329_binary_salsa_4_58996087.out
Parsing output file: ../cluster/outputs/attack_100_3329_cbd_dual_4_58996128.out
Parsing output file: ../cluster/outputs/attack_100_3329_cbd_original_4_58996130.out
Parsing output file: ../cluster/outputs/attack_100_3329_cbd_salsa_4_58996129.out
Parsing output file: ../cluster/outputs/attack_110_3329_binary_dual_4_58996089.out
Parsing output file: ../cluster/outputs/attack_110_3329_binary_original_4_58996091.out
Parsing output file: ../cluster/outputs/attack_110_3329_binary_salsa_4_58996090.out
Parsing output file: ../cluster/outputs/attack_110_3329_cbd_dual_4_58996131.out
Parsing output file: ../cluster/outputs/attack_110_3329_cbd_original_4_58996133.out
Parsing output file: ../cluster/outputs/attack_110_3329_cbd_salsa_4_58996132.out
Pa

Unnamed: 0,output_file,last_best_b,last_std_B,last_tour_time,total_time,success,n,q,secret_type,hw,bkz_block_sizes,penalty,matrix_config
0,attack_100_3329_binary_dual_4_58996086.out,,1005.81,0.053536,195.47,True,100,3329,binary,-1,4:50:2,4,dual
1,attack_100_3329_binary_original_4_58996088.out,79.07,1283.51,0.397531,1435.11,True,100,3329,binary,-1,4:50:2,4,original
2,attack_100_3329_binary_salsa_4_58996087.out,80.1,1230.19,0.438953,1586.19,True,100,3329,binary,-1,4:50:2,4,salsa
3,attack_100_3329_cbd_dual_4_58996128.out,85.29,1150.38,1.402253,5051.53,True,100,3329,cbd,-1,4:50:2,4,dual
4,attack_100_3329_cbd_original_4_58996130.out,87.22,1072.65,2.955144,10643.88,True,100,3329,cbd,-1,4:50:2,4,original
5,attack_100_3329_cbd_salsa_4_58996129.out,86.95,1108.31,2.445858,8809.03,True,100,3329,cbd,-1,4:50:2,4,salsa
6,attack_110_3329_binary_dual_4_58996089.out,77.62,1335.59,0.070619,259.58,True,110,3329,binary,-1,4:50:2,4,dual
7,attack_110_3329_binary_original_4_58996091.out,80.39,1240.99,0.934983,3371.97,True,110,3329,binary,-1,4:50:2,4,original
8,attack_110_3329_binary_salsa_4_58996090.out,79.21,1277.6,0.952239,3433.13,True,110,3329,binary,-1,4:50:2,4,salsa
9,attack_110_3329_cbd_dual_4_58996131.out,76.48,1424.38,10.3889,0.0,False,110,3329,cbd,-1,4:50:2,4,dual


In [60]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df_infos[(df_infos['n'] == 80) & (df_infos['q'] == 113)].sort_values('last_std_B'))

Unnamed: 0,output_file,last_best_b,last_std_B,last_tour_time,total_time,success,n,q,secret_type,hw,bkz_block_sizes,penalty,matrix_config
144,attack_80_2_113_binary_original_4_58994076.out,80.4,44.79,25.266303,0.0,False,80,113,binary,9,20:40:5,4,original
146,attack_80_2_113_binary_original_4_58994079.out,80.41,45.44,25.023275,0.0,False,80,113,binary,9,20:40:5,4,original
105,attack_80_113_binary_dual_3_58994127.out,79.79,45.49,23.434844,0.0,False,80,113,binary,9,20:40:5,3,dual
137,attack_80_113_binary_salsa_3_58994129.out,82.03,45.53,23.218211,0.0,False,80,113,binary,9,20:40:5,3,salsa
116,attack_80_113_binary_original_3_58994125.out,81.9,46.14,23.727403,0.0,False,80,113,binary,9,20:40:5,3,original
119,attack_80_113_binary_original_5_58995784.out,77.95,47.7,22.315817,0.0,False,80,113,binary,9,20:45:2,5,original
118,attack_80_113_binary_original_5_58995780.out,76.75,48.38,22.921564,0.0,False,80,113,binary,9,20:45:2,5,original
121,attack_80_113_binary_original_6_58995781.out,76.61,48.75,22.805833,0.0,False,80,113,binary,9,20:45:2,6,original
120,attack_80_113_binary_original_5_58995797.out,76.92,48.85,21.022578,0.0,False,80,113,binary,9,20:45:2,5,original
122,attack_80_113_binary_original_6_58995785.out,74.12,48.95,22.033981,0.0,False,80,113,binary,9,20:45:2,6,original


In [None]:
# Filter df_infos based on n, q, secret_type, and hw
filtered = df_infos.groupby(['n', 'q', 'secret_type', 'hw'])

# For each group, select the row where attack succeeded and has the lowest total_time,
# or if no success, the row with the highest last_best_b
selected_rows = []
for _, group in filtered:
  success_rows = group[group['success']]
  if not success_rows.empty:
    # Take the one with the lowest total_time among successful
    best_row = success_rows.loc[success_rows['total_time'].idxmin()]
  else:
    # Take the one with the highest last_best_b among unsuccessful
    best_row = group.loc[group['last_best_b'].idxmax()]
  selected_rows.append(best_row)

df_selected = pd.DataFrame(selected_rows).reset_index(drop=True)
# Define the q value to filter on, e.g., q_value = 251
q_value = 113

df_selected_q = df_selected[df_selected['q'] == q_value].reset_index(drop=True)
df_selected_q

  best_row = group.loc[group['last_best_b'].idxmax()]


KeyError: nan

In [None]:
for idx, res in enumerate(results):
  if res['confusion_matrix'] is not None or res['classification_report'] is not None:
    print(f"Output file: {res['output_file']}")
    if res['confusion_matrix'] is not None:
      print("Confusion Matrix:")
      print(res['confusion_matrix'])
    if res['classification_report'] is not None:
      print("Classification Report:")
      print(res['classification_report'])
    print("-" * 40)

In [None]:
use_time = all(res['tour_to_time'] and len(res['tour_to_time']) > 0 for res in results)

x_axis_title = "Tour Number" if not use_time else "Time (hours)"

import plotly.graph_objects as go

fig = go.Figure()

for res in results:
    tours = sorted(res['best_b'].keys())
    best_b = res['best_b']
    tour_times = [res['tour_to_time'][tour] for tour in tours]

    if use_time:
        # Flatten tour_times to a list of floats
        tour_times_list = [v[0] if isinstance(v, list) else v for v in tour_times]
        x_axis = tour_times_list
        if res['start_bkz'] is not None and res['start_bkz'] < len(tour_times_list):
            start_bkz = tour_times_list[res['start_bkz']]
        else:
            start_bkz = None
    else:
        x_axis = tours
        start_bkz = res['start_bkz'] if res['start_bkz'] is not None else 0

    best_candidate_values = [best_b[tour] for tour in tours]

    fig.add_trace(go.Scatter(
        x=x_axis,
        y=best_candidate_values,
        mode='lines+markers',
        name=f"{res['output_file']}"
    ))

    # Add vertical line for BKZ start
    if start_bkz is not None:
        fig.add_vline(x=start_bkz, line_dash="dash", line_color="red")

fig.update_layout(
    title=None,
    xaxis_title=x_axis_title,
    yaxis_title="Percentage",
    legend_title="Output File",
    template="plotly_white",
    autosize=True,
    margin=dict(l=40, r=20, t=20, b=40)  # Reduce left, right, top, bottom margins
)
fig.show()

KeyError: 1

In [None]:
fig_block_sizes = go.Figure()

for res in results:
    block_sizes = res['block_sizes']
    tours = sorted(res['best_b'].keys())
    x_axis = tours if not use_time else [res['tour_to_time'][tour] for tour in tours]

    # Prepare block size values: use max if multiple in a tour, 0 if none
    block_size_values = []
    for tour in tours:
        sizes = block_sizes.get(tour, [])
        max_size = max(sizes) if sizes else 0
        block_size_values.append(max_size)

    fig_block_sizes.add_trace(go.Scatter(
        x=x_axis,
        y=block_size_values,
        mode='lines+markers',
        name=f"BKZ Block Size {res['output_file']}"
    ))
    # Add vertical line where BKZ starts
    if res['start_bkz'] is not None and res['start_bkz'] in res['tour_to_time']:
        start_bkz = res['start_bkz'] if not use_time else res['tour_to_time'][res['start_bkz']]
        fig_block_sizes.add_vline(x=start_bkz, line_dash="dash", line_color="red")

fig_block_sizes.update_layout(
    xaxis_title=x_axis_title,
    yaxis_title="Block Size",
    legend_title="Output File",
    template="plotly_white",
    autosize=True,
    margin=dict(l=40, r=20, t=40, b=40)
)
fig_block_sizes.show()


In [None]:
fig_updates = go.Figure()

for res in results:
    num_updates = res['num_updates']
    tours = sorted(res['best_b'].keys())
    x_axis = tours if not use_time else [res['tour_to_time'][tour] for tour in tours]

    # Prepare update counts
    update_counts = [sum(num_updates.get(tour, [])) for tour in tours[:-1]]

    fig_updates.add_trace(go.Scatter(
        x=list(x_axis)[:-1],
        y=update_counts,
        mode='lines+markers',
        name=f"Total Updates {res['output_file']}"
    ))
    # Add vertical line where BKZ starts
    if res['start_bkz'] is not None and res['start_bkz'] in res['tour_to_time']:
        start_bkz = res['start_bkz'] if not use_time else res['tour_to_time'][res['start_bkz']]
        fig_updates.add_vline(x=start_bkz, line_dash="dash", line_color="red")

fig_updates.update_layout(
    xaxis_title=x_axis_title,
    yaxis_title="Number of Updates",
    legend_title="Output File",
    template="plotly_white",
    width=800,
    height=600,
    autosize=True,
    margin=dict(l=40, r=20, t=20, b=40)
)
fig_updates.show()


In [None]:
fig_std = go.Figure()

for res in results:
  std_B = res['std_B']
  if std_B:
    tours = sorted(std_B.keys())
    std_B_values = [std_B[tour] for tour in tours]
    x_axis = tours if not use_time else [res['tour_to_time'][tour] for tour in tours]
    fig_std.add_trace(go.Scatter(
      x=x_axis,
      y=std_B_values,
      mode='lines+markers',
      name=f"{res['output_file']}"
    ))
    # Add vertical line for BKZ start
    if res['start_bkz'] is not None and res['start_bkz'] in res['tour_to_time']:
      start_bkz = res['start_bkz'] if not use_time else res['tour_to_time'][res['start_bkz']]
      fig_std.add_vline(x=start_bkz, line_dash="dash", line_color="red")

fig_std.update_layout(
  xaxis_title=x_axis_title,
  yaxis_title="Mean std_B",
  legend_title="Output File",
  template="plotly_white",
  autosize=True,
  width=1000,
  height=600,
  margin=dict(l=40, r=20, t=20, b=40)
)
fig_std.show()