# Context
This notebook drives the training process for different models.

In [15]:
# Set project's environment variables
import os
import sys
from dotenv import load_dotenv
load_dotenv(dotenv_path="../../../project.env")
sys.path.append(os.environ["PYTHONPATH"])

In [16]:
# Import project-wide and PH2 specific variables and functions
import superheader as sup
import TRAIN.architecture.archeader as arch

In [17]:
from math import comb
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
from itertools import combinations

In [18]:
TRAIN_classes = 'alpha-classes'  
base_data_config = {
    "PH2" : None,
    "PH3" : False,
    "reducer": '',
    "kernel": '',
    "n": -1,
    "data_unit": sup.DATA_S_PV,
    "label_col": sup.class_numeric_column,
    "class_list": TRAIN_classes
    }

data_config_w2 = base_data_config.copy()
data_config_w2["PH2"] = True
data_config_wo2 = base_data_config.copy()
data_config_w2["PH2"] = False

train_config = {"arch" : sup.TRAIN_KNN_CODE, "k" : 1}

In [19]:
model_w2 = arch.Arch(data_config=data_config_w2, df=None, train_config=train_config)
model_wo2 = arch.Arch(data_config=data_config_wo2, df=None, train_config=train_config)

In [20]:
df_w2 = model_w2.df
df_wo2 = model_wo2.df

In [23]:
# Function to compute class centroids
def compute_class_centroids(df, label_col=sup.class_numeric_column):
    feature_cols = [col for col in df.columns if col != label_col]
    return df.groupby(label_col)[feature_cols].mean().values

# Compute average difficulty score using both centroid sets
def compute_average_difficulty_score(centroids1, centroids2, class_indices):
    def score(centroids):
        vectors = [centroids[i] for i in class_indices]
        sim_matrix = cosine_similarity(vectors)
        upper_triangle = sim_matrix[np.triu_indices(len(class_indices), k=1)]
        return np.mean(upper_triangle)
    return (score(centroids1) + score(centroids2)) / 2

# Sample random combinations of class indices, respecting total possible
def sample_k_combinations(n_classes, k, num_samples=1000, seed=42):
    total_possible = comb(n_classes, k)
    actual_samples = min(num_samples, total_possible)

    if total_possible <= num_samples:
        return list(combinations(range(n_classes), k))

    random.seed(seed + k)
    samples = set()
    while len(samples) < actual_samples:
        sample = tuple(sorted(random.sample(range(n_classes), k)))
        samples.add(sample)
    return list(samples)

# Main function to get easy/average/hard groupings using both centroid sets
def get_difficulty_based_combinations(centroids1, centroids2, k, num_samples=1000):
    combos = sample_k_combinations(len(centroids1), k, num_samples)
    scored_combos = [
        (combo, compute_average_difficulty_score(centroids1, centroids2, combo))
        for combo in combos
    ]
    scored_combos.sort(key=lambda x: x[1])  # Higher = harder

    scores = [score for _, score in scored_combos]
    percentiles = {
        'easy': np.percentile(scores, 10),
        'average': np.percentile(scores, 50),
        'hard': np.percentile(scores, 90),
    }

    def find_closest(target):
        return min(scored_combos, key=lambda x: abs(x[1] - target))[0]

    return {key: find_closest(val) for key, val in percentiles.items()}


In [24]:
# Compute centroids
centroids_w2 = compute_class_centroids(df_w2)
centroids_wo2 = compute_class_centroids(df_wo2)

In [29]:
# Example
k = 28
results = get_difficulty_based_combinations(centroids_w2, centroids_wo2, k, num_samples=10)

In [30]:
print(f"Dataset A (k={k}):", results)

Dataset A (k=28): {'easy': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), 'average': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27), 'hard': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27)}


In [43]:
subsets = dict()

for k in range(2, 29):
  subsets[k] = get_difficulty_based_combinations(centroids_w2, centroids_wo2, k, num_samples=1000)


subsets[29] = {'easy' : tuple(range(1,29+1)), 'average' : tuple(range(1,29+1)), 'hard' : tuple(range(1,29+1))}

In [44]:
subsets

{2: {'easy': (8, 12), 'average': (2, 19), 'hard': (19, 23)},
 3: {'easy': (21, 24, 27), 'average': (4, 13, 19), 'hard': (2, 18, 22)},
 4: {'easy': (0, 5, 6, 13), 'average': (6, 16, 18, 20), 'hard': (1, 3, 7, 16)},
 5: {'easy': (2, 5, 9, 11, 12),
  'average': (2, 3, 9, 24, 26),
  'hard': (10, 11, 17, 19, 22)},
 6: {'easy': (0, 3, 14, 20, 22, 27),
  'average': (0, 10, 13, 14, 20, 21),
  'hard': (4, 14, 19, 22, 24, 27)},
 7: {'easy': (0, 5, 7, 13, 16, 23, 26),
  'average': (4, 5, 6, 11, 12, 14, 22),
  'hard': (1, 6, 9, 13, 14, 25, 27)},
 8: {'easy': (0, 1, 3, 6, 10, 13, 18, 25),
  'average': (0, 7, 10, 12, 13, 20, 23, 26),
  'hard': (0, 4, 10, 11, 13, 17, 18, 22)},
 9: {'easy': (1, 7, 10, 16, 20, 22, 24, 25, 27),
  'average': (5, 6, 7, 13, 14, 20, 23, 25, 27),
  'hard': (1, 2, 3, 5, 7, 10, 14, 15, 20)},
 10: {'easy': (0, 6, 8, 11, 14, 15, 17, 20, 22, 27),
  'average': (0, 4, 7, 8, 10, 11, 12, 14, 15, 23),
  'hard': (1, 4, 5, 6, 15, 16, 17, 20, 22, 27)},
 11: {'easy': (0, 2, 4, 5, 7, 8, 13

In [46]:
import json

# Convert integer keys to strings and tuples to lists
json_friendly = {
    str(k): {d: list(v) for d, v in v_dict.items()}
    for k, v_dict in subsets.items()
}

# Save to file
with open(os.path.join(sup.DATA_ROOT, f"{TRAIN_classes}-subsets.json"), "w") as f:
    json.dump(json_friendly, f, indent=2)
