# Context
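This notebook prepares class subsets for the training process of different models: for each subset size it picks an easy, an average, and a hard combination of classes (ranked by the mean pairwise cosine similarity of their class centroids) and saves the selection to a JSON file.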

In [1]:
# Set project's environment variables
import os
import sys
from dotenv import load_dotenv
# The env file path is relative, so it resolves against the current working
# directory of the kernel.
load_dotenv(dotenv_path="../../../project.env")
# Make the project's packages importable; raises KeyError if PYTHONPATH is
# not set in the environment at this point.
# NOTE(review): assumes PYTHONPATH holds a single directory rather than an
# os.pathsep-separated list - confirm against project.env.
sys.path.append(os.environ["PYTHONPATH"])

In [2]:
# Import project-wide and TRAIN-architecture specific variables and functions
import superheader as sup
import TRAIN.architecture.archeader as arch

In [3]:
from math import comb
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
from itertools import combinations

In [4]:
# Name of the class set in use; passed on as "class_list" below and used as
# the prefix of the JSON file name written at the end of the notebook.
TRAIN_classes = 'all-classes'
# Number of classes; bounds the subset sizes generated further down.
# NOTE(review): hard-coded - presumably must equal the number of distinct
# labels in the loaded data; confirm.
num_classes = 38
# Data-selection options handed to arch.Arch.
# NOTE(review): the meaning of the individual keys is defined by the project
# code (archeader / superheader) and is not visible in this notebook.
data_config = {
    "PH2" : False,
    "PH3" : False,
    "reducer": '',
    "kernel": '',
    "n": -1,
    "data_unit": sup.DATA_S_PV,
    "label_col": sup.class_numeric_column,
    "class_list": TRAIN_classes
    }

# Training options handed to arch.Arch.
train_config = {"arch" : "generic"}

In [5]:
# Build the project's Arch object from the two configs above. No data frame is
# supplied (df=None); the object's `.df` attribute is read in the next cell.
generic = arch.Arch(data_config=data_config, df=None, train_config=train_config)

In [6]:
# Data frame held by the Arch object, displayed for inspection.
df = generic.df
df

Unnamed: 0,class_numeric,f0_h0x,f0_h0y,f0_h0z,f0_h1x,f0_h1y,f0_h1z,f0_h2x,f0_h2y,f0_h2z,...,f11_h20z,f11_p0x,f11_p0y,f11_p0z,f11_p11x,f11_p11y,f11_p11z,f11_p12x,f11_p12y,f11_p12z
0,29,-0.792900,1.100812,-0.751818,-1.081953,1.041106,0.814337,-1.257684,0.848338,0.973066,...,1.053431,0.108433,0.948984,-0.121414,0.093744,1.779584,-0.181946,0.397396,1.402068,-0.359545
1,37,-0.672795,0.850968,0.825148,-0.549001,0.817573,-0.617562,-0.455937,0.636620,-0.208720,...,-0.843302,0.253259,1.188646,0.344296,0.187990,1.858469,0.760271,0.564365,1.862700,0.302348
2,30,-0.861860,1.056493,1.108747,-1.128709,0.951601,1.082660,-1.255844,0.789815,1.002735,...,1.489724,0.115646,0.913005,0.150163,0.031617,1.776816,0.267760,0.584477,1.517096,-0.078514
3,31,-0.959088,1.065751,-0.118277,-1.214209,0.872499,1.186723,-1.304189,0.698813,1.324967,...,1.437709,0.126987,1.087323,-0.053093,0.059566,1.778371,0.201922,0.463597,1.648508,-0.069109
4,32,-0.982582,0.902474,0.284455,-1.237670,0.737153,0.818105,-1.347797,0.588256,0.966225,...,-0.503393,0.121749,0.977534,-0.225261,0.059050,1.791173,0.019757,0.439201,1.566831,-0.237159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3896,24,1.139689,0.349096,0.425912,1.123375,0.333303,-1.449617,1.167727,0.214704,-1.094006,...,-0.258490,-0.284375,0.772177,0.933794,-0.042956,0.949019,1.352938,-0.552511,1.421335,1.058762
3897,25,1.174193,0.312433,0.518765,1.136539,0.347140,-1.402185,1.151418,0.245268,-0.829644,...,-0.033928,-0.171778,0.776293,0.822702,-0.037920,0.956870,1.303580,-0.495289,1.435401,0.962147
3898,26,1.293529,0.314785,-1.413979,1.286812,0.232945,-1.372722,1.339927,0.116292,-1.184254,...,0.237394,-0.317654,0.792136,1.065567,-0.253776,0.987524,1.360811,-0.581969,1.353457,1.198303
3899,27,1.167021,0.220154,-1.982639,1.071311,0.039063,-0.052367,1.054789,-0.085848,-0.121965,...,0.280019,-0.219322,0.563675,0.437762,-0.037496,0.827723,0.890685,-0.520828,1.163448,0.646387


In [None]:
# Per-class mean feature vectors (centroids)
def compute_class_centroids(df, label_col=sup.class_numeric_column):
    """Return the mean feature vector of every class as a 2-D array.

    Every column of ``df`` other than ``label_col`` is treated as a feature.
    The result has one row per distinct label; with the pandas ``groupby``
    default the rows follow the sorted order of the labels.
    """
    feature_names = [name for name in df.columns if name != label_col]
    class_means = df.groupby(label_col)[feature_names].mean()
    return class_means.values

# Difficulty score of a class subset: mean pairwise cosine similarity of its centroids
def compute_average_difficulty_score(centroids, class_indices):
    """Return the mean cosine similarity over all unordered pairs of classes.

    ``class_indices`` selects rows of ``centroids``. Only the strict upper
    triangle of the similarity matrix is averaged, so each pair is counted
    once and self-similarities are excluded.
    """
    selected = [centroids[idx] for idx in class_indices]
    pairwise = cosine_similarity(selected)
    rows, cols = np.triu_indices(len(class_indices), k=1)
    return np.mean(pairwise[rows, cols])

# Sample random combinations of class indices, respecting total possible
def sample_k_combinations(n_classes, k, num_samples=1000, seed=42):
    """Return up to ``num_samples`` distinct k-subsets of ``range(n_classes)``.

    If the total number of k-combinations does not exceed ``num_samples``,
    every combination is returned, in ``itertools.combinations`` order.
    Otherwise exactly ``num_samples`` distinct combinations are drawn at
    random, each as a sorted tuple of indices.

    The draw is reproducible: it uses a private generator seeded with
    ``seed + k``, so the module-level ``random`` state is left untouched.

    Raises:
        ValueError: if ``n_classes`` or ``k`` is negative (from ``math.comb``).
    """
    total_possible = comb(n_classes, k)
    if total_possible <= num_samples:
        return list(combinations(range(n_classes), k))

    # A private RNG yields the same sequence as seeding the global generator
    # with this value, but does not reseed it for the rest of the notebook.
    rng = random.Random(seed + k)
    population = range(n_classes)
    samples = set()
    # Past the early return total_possible > num_samples, so the target count
    # of distinct combinations is always reachable.
    while len(samples) < num_samples:
        samples.add(tuple(sorted(rng.sample(population, k))))

    return list(samples)

# Main function to get easy/average/hard groupings from the class centroids
def get_difficulty_based_combinations(centroids, k, num_samples=1000):
    """Pick one easy, one average and one hard k-class combination.

    Candidate combinations come from ``sample_k_combinations`` and are scored
    with ``compute_average_difficulty_score`` (higher score = harder). The
    10th, 50th and 90th percentiles of the scores serve as targets for
    'easy', 'average' and 'hard'; for each target the candidate whose score
    is closest is returned.

    Returns:
        dict mapping 'easy' / 'average' / 'hard' to a tuple of class indices.
    """
    candidates = sample_k_combinations(len(centroids), k, num_samples)
    # Ascending by score; the stable sort decides ties in the lookup below.
    ranked = sorted(
        (
            (subset, compute_average_difficulty_score(centroids, subset))
            for subset in candidates
        ),
        key=lambda pair: pair[1],
    )
    score_values = [pair[1] for pair in ranked]

    targets = {}
    for level, pct in (('easy', 10), ('average', 50), ('hard', 90)):
        targets[level] = np.percentile(score_values, pct)

    selection = {}
    for level, target in targets.items():
        nearest = min(ranked, key=lambda pair: abs(pair[1] - target))
        selection[level] = nearest[0]
    return selection


In [8]:
# Compute centroids: one mean feature vector per class, taken over every
# column of df except the label column
centroids = compute_class_centroids(df)

In [9]:
# Example: easy/average/hard class pairs (k = 2); num_samples=10 caps the
# number of candidate combinations that get scored
k = 2
results = get_difficulty_based_combinations(centroids, k, num_samples=10)

In [10]:
# Show the example selection; each value is a tuple of row indices into `centroids`
print(f"Dataset A (k={k}):", results)

Dataset A (k=2): {'easy': (20, 25), 'average': (0, 6), 'hard': (26, 33)}


In [11]:
# Easy/average/hard class subsets for every subset size, keyed by that size
subsets = dict()

# A dedicated loop name keeps the global `k` of the example cell intact, so
# re-running the example print after this cell still reports the right k.
for subset_size in range(2, num_classes):
    print(subset_size)  # progress indicator
    subsets[subset_size] = get_difficulty_based_combinations(
        centroids, subset_size, num_samples=10000
    )


# With every class selected there is only one possible subset, so all three
# difficulty levels share it.
subsets[num_classes] = dict.fromkeys(
    ('easy', 'average', 'hard'), tuple(range(num_classes))
)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


In [12]:
subsets

{2: {'easy': (19, 26), 'average': (13, 22), 'hard': (3, 15)},
 3: {'easy': (24, 33, 36), 'average': (3, 9, 22), 'hard': (6, 31, 34)},
 4: {'easy': (3, 15, 26, 27),
  'average': (6, 9, 28, 32),
  'hard': (6, 9, 25, 33)},
 5: {'easy': (1, 19, 25, 27, 30),
  'average': (0, 1, 2, 7, 24),
  'hard': (3, 8, 28, 29, 31)},
 6: {'easy': (0, 1, 8, 10, 31, 35),
  'average': (3, 7, 10, 13, 20, 21),
  'hard': (3, 9, 14, 17, 27, 34)},
 7: {'easy': (2, 21, 23, 24, 25, 26, 35),
  'average': (1, 9, 16, 25, 26, 28, 34),
  'hard': (5, 30, 31, 32, 33, 34, 36)},
 8: {'easy': (2, 6, 13, 15, 25, 27, 32, 36),
  'average': (0, 2, 11, 12, 20, 23, 26, 36),
  'hard': (2, 7, 12, 16, 18, 23, 24, 31)},
 9: {'easy': (0, 3, 5, 9, 12, 13, 23, 29, 33),
  'average': (3, 8, 11, 13, 15, 19, 21, 25, 28),
  'hard': (4, 8, 10, 12, 18, 21, 23, 24, 29)},
 10: {'easy': (11, 12, 16, 20, 22, 24, 26, 27, 28, 30),
  'average': (4, 5, 10, 16, 17, 18, 24, 27, 29, 30),
  'hard': (7, 9, 14, 15, 17, 19, 25, 27, 30, 36)},
 11: {'easy': (0,

In [13]:
import json

# Convert integer keys to strings and tuples to lists so the structure maps
# directly onto JSON objects and arrays
json_friendly = {
    str(k): {d: list(v) for d, v in v_dict.items()}
    for k, v_dict in subsets.items()
}

# Save to file: <sup.DATA_ROOT>/<TRAIN_classes>-subsets.json
# (mode "w" overwrites an existing file)
with open(os.path.join(sup.DATA_ROOT, f"{TRAIN_classes}-subsets.json"), "w") as f:
    json.dump(json_friendly, f, indent=2)


In [14]:
# Total number of stored combinations: the count of difficulty entries summed
# over every subset size
total_items = sum(len(inner) for inner in subsets.values())
print(total_items)

111
