# Context
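This notebook prepares class subsets for the training process of different models: for every subset size it picks an "easy", an "average", and a "hard" combination of classes, based on the cosine similarity between class centroids, and saves the selection to a JSON file.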

In [1]:
# Set project's environment variables
import os
import sys
from dotenv import load_dotenv

# Relative path: resolved against the current working directory of the kernel.
load_dotenv(dotenv_path="../../../project.env")

# PYTHONPATH may list several directories separated by os.pathsep, so each one is
# added as its own sys.path entry. Entries already present are skipped so that
# re-running this cell does not keep growing sys.path.
# A missing PYTHONPATH still raises KeyError, as before.
for project_path in os.environ["PYTHONPATH"].split(os.pathsep):
    if project_path and project_path not in sys.path:
        sys.path.append(project_path)

In [2]:
# Import project-wide and PH2 specific variables and functions
import superheader as sup
import TRAIN.architecture.archeader as arch

In [3]:
from math import comb
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
from itertools import combinations

In [4]:
# Which class list is used, and how many classes it is expected to contain.
# NOTE(review): 38 is assumed to match the size of 'all-classes' -- confirm.
TRAIN_classes = 'all-classes'
num_classes = 38

# Settings shared by both data variants. The meaning of each key is defined by
# arch.Arch / superheader, which are not visible in this notebook.
base_data_config = {
    "PH2": None,  # placeholder, set per variant below
    "PH3": False,
    "reducer": '',
    "kernel": '',
    "n": -1,
    "data_unit": sup.DATA_S_PV,
    "label_col": sup.class_numeric_column,
    "class_list": TRAIN_classes,
}

# The two variants differ only in the "PH2" flag: "w2" enables it, "wo2" disables it.
data_config_w2 = {**base_data_config, "PH2": True}
data_config_wo2 = {**base_data_config, "PH2": False}

train_config = {"arch": "generic"}

In [5]:
# Build one Arch instance per data variant, in the order (w2, wo2).
# No DataFrame is passed in (df=None); presumably Arch loads the data itself,
# since `.df` is read from both instances afterwards -- verify in archeader.
model_w2, model_wo2 = (
    arch.Arch(data_config=variant_config, df=None, train_config=train_config)
    for variant_config in (data_config_w2, data_config_wo2)
)

In [6]:
# Pull the DataFrame held by each Arch instance (PH2 enabled / PH2 disabled).
df_w2, df_wo2 = model_w2.df, model_wo2.df

In [7]:
# Per-class mean feature vectors ("centroids")
def compute_class_centroids(df, label_col=sup.class_numeric_column):
    """Return the mean feature vector of every class as a 2-D array.

    Every column of ``df`` other than ``label_col`` is treated as a feature.

    Args:
        df: DataFrame holding the feature columns plus ``label_col``.
        label_col: Name of the column that holds the class label.

    Returns:
        Array with one row per distinct label and one column per feature.
        Rows follow the group order produced by ``groupby`` (sorted labels by
        default). NOTE(review): callers index this array by class number, which
        is only correct if the labels are exactly 0..n-1 -- confirm.
    """
    feature_names = [name for name in df.columns if name != label_col]
    per_class = df.groupby(label_col)
    class_means = per_class[feature_names].mean()
    return class_means.values

# Difficulty of a class subset = mean pairwise centroid similarity, averaged over both centroid sets
def compute_average_difficulty_score(centroids1, centroids2, class_indices):
    """Score how similar the selected classes are to one another.

    For each centroid set, the centroids of the selected classes are compared
    pairwise with cosine similarity and the similarities of all distinct pairs
    are averaged. The final score is the mean of the two per-set averages.

    Args:
        centroids1: First set of class centroids, indexable by class index.
        centroids2: Second set of class centroids, indexable by class index.
        class_indices: Sized collection of the class indices to compare.

    Returns:
        The averaged mean pairwise cosine similarity. With fewer than two
        indices there are no pairs and the result is NaN.
    """
    # Row/column positions of the strict upper triangle: every unordered pair
    # exactly once, without the diagonal (self-similarity).
    pair_rows, pair_cols = np.triu_indices(len(class_indices), k=1)

    def mean_pairwise_similarity(centroids):
        selected = [centroids[idx] for idx in class_indices]
        similarities = cosine_similarity(selected)
        return np.mean(similarities[pair_rows, pair_cols])

    first_score = mean_pairwise_similarity(centroids1)
    second_score = mean_pairwise_similarity(centroids2)
    return (first_score + second_score) / 2

# Sample random combinations of class indices, respecting total possible
def sample_k_combinations(n_classes, k, num_samples=1000, seed=42):
    """Return distinct k-element combinations of ``range(n_classes)``.

    If there are at most ``num_samples`` combinations in total, all of them are
    returned. Otherwise ``num_samples`` distinct combinations are drawn at
    random by rejection sampling (duplicates are discarded and redrawn).

    Args:
        n_classes: Size of the index population ``0..n_classes-1``.
        k: Number of indices per combination.
        num_samples: Maximum number of combinations to return.
        seed: Base seed; the generator is seeded with ``seed + k`` so that each
            subset size gets its own reproducible stream.

    Returns:
        List of sorted index tuples in lexicographic order (both branches).
    """
    total_possible = comb(n_classes, k)
    if total_possible <= num_samples:
        return list(combinations(range(n_classes), k))

    # A private generator yields the same draws as seeding the module-level one,
    # but leaves the global random state untouched for the rest of the notebook.
    rng = random.Random(seed + k)
    population = range(n_classes)

    # Past the early return total_possible > num_samples, so the target count is
    # always reachable and the loop terminates.
    samples = set()
    while len(samples) < num_samples:
        samples.add(tuple(sorted(rng.sample(population, k))))

    # Sort so the result order does not depend on set iteration order.
    return sorted(samples)

# Main function to get easy/average/hard groupings using both centroid sets
def get_difficulty_based_combinations(centroids1, centroids2, k, num_samples=1000, seed=42):
    """Pick an easy, an average, and a hard combination of ``k`` classes.

    Candidate combinations are obtained from ``sample_k_combinations`` and
    scored with ``compute_average_difficulty_score`` (higher similarity between
    class centroids = harder). The combination whose score is closest to the
    10th, 50th, and 90th percentile of all scores is returned as 'easy',
    'average', and 'hard' respectively.

    Args:
        centroids1: First set of class centroids, one row per class.
        centroids2: Second set of class centroids, one row per class.
        k: Number of classes per combination; must be between 2 and the number
            of classes, because the score is built from class pairs.
        num_samples: Maximum number of candidate combinations to score.
        seed: Base seed forwarded to ``sample_k_combinations``.

    Returns:
        Dict with keys 'easy', 'average', 'hard', each mapping to a tuple of
        class indices.

    Raises:
        ValueError: If the centroid sets differ in length, ``k`` is out of
            range, or ``num_samples`` is less than 1.
    """
    n_classes = len(centroids1)
    if len(centroids2) != n_classes:
        raise ValueError(
            f"centroid sets must describe the same classes: got {n_classes} and {len(centroids2)} rows"
        )
    if not 2 <= k <= n_classes:
        raise ValueError(f"k must be between 2 and {n_classes}, got {k}")
    if num_samples < 1:
        raise ValueError(f"num_samples must be at least 1, got {num_samples}")

    combos = sample_k_combinations(n_classes, k, num_samples, seed)
    scored_combos = [
        (combo, compute_average_difficulty_score(centroids1, centroids2, combo))
        for combo in combos
    ]
    scored_combos.sort(key=lambda item: item[1])  # Ascending similarity; higher = harder

    scores = [score for _, score in scored_combos]
    percentiles = {
        'easy': np.percentile(scores, 10),
        'average': np.percentile(scores, 50),
        'hard': np.percentile(scores, 90),
    }

    def find_closest(target):
        # On ties the earliest (lowest-scoring) candidate wins.
        return min(scored_combos, key=lambda item: abs(item[1] - target))[0]

    return {key: find_closest(val) for key, val in percentiles.items()}


In [8]:
# Class centroids for both data variants, in the order (w2, wo2)
centroids_w2, centroids_wo2 = map(compute_class_centroids, (df_w2, df_wo2))

In [9]:
# Example: easy/average/hard subsets of 5 classes, chosen from only 10 sampled
# candidate combinations to keep this demonstration cheap.
k = 5
results = get_difficulty_based_combinations(
    centroids1=centroids_w2,
    centroids2=centroids_wo2,
    k=k,
    num_samples=10,
)

In [10]:
# Show the example selection. The result combines both centroid sets (w2 and wo2),
# not a single dataset.
print(f"Dataset A (k={k}):", results)

Dataset A (k=5): {'easy': (14, 15, 27, 33, 36), 'average': (1, 2, 6, 25, 26), 'hard': (2, 15, 22, 28, 32)}


In [11]:
subsets = dict()

# Combinations index the centroid arrays by class number, so both arrays must hold
# exactly one row per class. Fail early rather than build subsets over the wrong classes.
assert len(centroids_w2) == len(centroids_wo2) == num_classes, (
    f"expected {num_classes} class centroids, got "
    f"{len(centroids_w2)} (w2) and {len(centroids_wo2)} (wo2)"
)

# A dedicated loop variable leaves the example `k` defined earlier intact, so
# re-running the example print cell after this one still reports the right k.
for subset_size in range(2, num_classes):
    print(subset_size)  # progress indicator: up to 10,000 combinations are scored per size
    subsets[subset_size] = get_difficulty_based_combinations(
        centroids_w2, centroids_wo2, subset_size, num_samples=10000
    )

# Only one combination contains every class, so all three difficulty levels share it.
all_classes = tuple(range(num_classes))
subsets[num_classes] = {'easy': all_classes, 'average': all_classes, 'hard': all_classes}

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


In [12]:
# Display the selected class subsets for every subset size.
subsets

{2: {'easy': (27, 32), 'average': (6, 15), 'hard': (30, 33)},
 3: {'easy': (5, 17, 25), 'average': (1, 29, 34), 'hard': (1, 15, 20)},
 4: {'easy': (0, 15, 23, 33),
  'average': (1, 8, 12, 33),
  'hard': (19, 22, 23, 32)},
 5: {'easy': (18, 23, 26, 29, 34),
  'average': (13, 15, 17, 26, 36),
  'hard': (1, 26, 33, 34, 35)},
 6: {'easy': (7, 11, 12, 23, 33, 35),
  'average': (9, 11, 13, 15, 16, 33),
  'hard': (3, 4, 5, 19, 21, 35)},
 7: {'easy': (2, 4, 10, 21, 25, 29, 32),
  'average': (0, 4, 8, 9, 12, 16, 25),
  'hard': (1, 8, 12, 24, 25, 27, 36)},
 8: {'easy': (1, 2, 9, 11, 27, 28, 29, 33),
  'average': (0, 5, 6, 14, 30, 31, 32, 36),
  'hard': (0, 1, 2, 4, 8, 10, 20, 28)},
 9: {'easy': (7, 8, 17, 18, 21, 22, 29, 30, 36),
  'average': (1, 9, 10, 13, 17, 18, 29, 31, 34),
  'hard': (3, 5, 9, 10, 13, 15, 16, 18, 21)},
 10: {'easy': (0, 3, 4, 8, 17, 27, 30, 32, 34, 36),
  'average': (0, 1, 2, 3, 6, 13, 16, 19, 22, 35),
  'hard': (0, 3, 4, 5, 8, 12, 24, 25, 27, 30)},
 11: {'easy': (6, 7, 9, 1

In [13]:
import json

# JSON object keys must be strings and JSON has no tuple type, so the
# {size: {difficulty: tuple}} mapping becomes {"size": {difficulty: list}}.
json_friendly = {
    str(subset_size): {
        difficulty: list(class_ids)
        for difficulty, class_ids in by_difficulty.items()
    }
    for subset_size, by_difficulty in subsets.items()
}

# Write the selection to "<class list>-subsets.json" under the project's data root.
with open(os.path.join(sup.DATA_ROOT, f"{TRAIN_classes}-subsets.json"), "w") as f:
    f.write(json.dumps(json_friendly, indent=2))
