# Context
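This notebook prepares the class subsets that drive the training process for different models: for each subset size it picks an easy, an average, and a hard group of classes (ranked by the cosine similarity between class centroids) and saves them to a JSON file.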

In [1]:
# Set project's environment variables
import os
import sys
from dotenv import load_dotenv

# Read the variables declared in project.env into os.environ. The path is
# relative, so it is resolved against the kernel's working directory.
load_dotenv(dotenv_path="../../../project.env")

# Make the location named by PYTHONPATH importable (presumably where the project
# modules imported in the following cells live). The membership check keeps this
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
project_path = os.environ["PYTHONPATH"]
if project_path not in sys.path:
    sys.path.append(project_path)

In [2]:
# Import project-wide and PH2 specific variables and functions
import superheader as sup
import TRAIN.architecture.archeader as arch



Chosen class grouping: all-classes


Directory /Users/diego/Desktop/iteso/TOG/ exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/data exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/src exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/bin exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/media exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/scores exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/data/PH1/all-classes exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/data/PH2/all-classes exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/data/PH3/all-classes exists. Continuing with execution
device: mps
Directory /Users/diego/Desktop/iteso/TOG/bin/load/TRAIN/distilbert-base-uncased exists. Continuing with execution
Directory /Users/diego/Desktop/iteso/TOG/bin/load/TRAIN/prajjwal1/bert

In [3]:
from math import comb
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import random
from itertools import combinations

In [4]:
# Class grouping this run works on; also used to name the exported JSON file
TRAIN_classes = 'all-classes'

# Training settings handed to arch.Arch together with each data configuration
train_config = {"arch": sup.TRAIN_KNN_CODE, "k": 1}

# Settings shared by both data configurations; "PH2" is a placeholder here and
# is set per variant below
base_data_config = {
    "PH2": None,
    "PH3": False,
    "reducer": '',
    "kernel": '',
    "n": -1,
    "data_unit": sup.DATA_S_PV,
    "label_col": sup.class_numeric_column,
    "class_list": TRAIN_classes,
}

# Two independent copies that differ only in the "PH2" flag
# (presumably "with" / "without" the PH2 stage -- confirm against archeader)
data_config_w2 = {**base_data_config, "PH2": True}
data_config_wo2 = {**base_data_config, "PH2": False}

In [5]:
# Arguments common to both models: no pre-built DataFrame is passed in, and the
# training settings are the same
shared_kwargs = {"df": None, "train_config": train_config}

# One Arch per data configuration ("PH2": True first, then "PH2": False)
model_w2 = arch.Arch(data_config=data_config_w2, **shared_kwargs)
model_wo2 = arch.Arch(data_config=data_config_wo2, **shared_kwargs)

In [6]:
# Grab the DataFrame held by each model
df_w2, df_wo2 = model_w2.df, model_wo2.df

In [7]:
# Display the DataFrame held by model_w2 (the configuration with "PH2": True)
df_w2

Unnamed: 0,class_numeric,f0_h_v1x,f0_h_v1y,f0_h_v1z,f0_h_v2x,f0_h_v2y,f0_h_v2z,f0_h_v3x,f0_h_v3y,f0_h_v3z,...,f11_wh18z,f11_wh19x,f11_wh19y,f11_wh19z,f11_wh20x,f11_wh20y,f11_wh20z,f11_cp_h_mean_x,f11_cp_h_mean_y,f11_cp_h_mean_z
0,29,-0.951009,-0.274887,-0.001478,0.127148,-0.530356,0.703171,-0.213781,-0.711935,1.001942,...,-1.268045,0.894222,-0.102190,-1.276307,0.861770,-0.070783,-1.281769,-1.167972,0.038990,-0.803620
1,37,0.320837,-0.352668,0.387809,-0.376790,-0.518251,0.517883,0.278289,-0.125759,-0.655780,...,-1.386466,0.293177,0.730440,-1.461858,0.229640,0.704243,-1.442524,-0.426328,1.245385,-2.763109
2,30,-0.748425,-0.306723,-0.080616,0.260777,-0.503787,0.418407,0.019463,-0.824458,0.966399,...,-1.550982,0.742714,0.341162,-1.585854,0.701450,0.356758,-1.570713,-1.032909,0.679415,-2.038376
3,31,-0.371990,-0.357051,0.170788,0.497411,-0.476576,0.553372,0.093783,-0.671866,0.909241,...,-1.619154,0.867519,0.217375,-1.627450,0.804281,0.237664,-1.621564,-1.208228,0.769651,-1.692909
4,32,-0.301859,-0.357578,0.080005,0.607412,-0.428571,0.175897,0.415199,-0.758818,0.967600,...,-1.700858,0.566085,0.459268,-1.752322,0.494593,0.449874,-1.740109,-1.386027,0.647523,-1.466057
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3896,24,-0.494675,-0.355299,0.450332,0.310572,-0.514096,0.742748,-0.316291,-0.158804,-0.798248,...,1.023826,1.023748,-0.414404,1.023962,1.001136,-0.404123,1.032103,1.411898,0.655565,-1.269543
3897,25,-0.811989,-0.318500,0.702685,0.049050,-0.545354,1.119633,-0.126065,-0.289658,-0.790091,...,1.242320,0.782239,-0.648881,1.237902,0.736633,-0.618172,1.230091,1.370845,0.751699,-1.385271
3898,26,0.675647,-0.242534,-0.764577,1.836571,0.434711,-1.075466,-1.660161,0.291618,-1.682821,...,-0.503062,1.930322,0.992768,-0.482613,1.966209,0.970719,-0.493314,1.000029,0.540896,-1.124253
3899,27,-0.239845,-0.369663,0.389420,0.001163,-0.511405,0.258397,-0.756695,-0.252875,-0.235269,...,-1.107771,-0.408672,1.003720,-1.116067,-0.423738,1.020917,-1.126080,0.797288,0.701791,-0.897438


In [8]:
def compute_class_centroids(df, label_col=sup.class_numeric_column):
    """Return the mean feature vector (centroid) of every class in ``df``.

    Every column other than ``label_col`` is treated as a feature. Rows are
    grouped by ``label_col`` and averaged, so row ``i`` of the result belongs to
    the ``i``-th group in pandas' groupby order, which is not necessarily the
    class whose label value equals ``i``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the label column plus the feature columns.
    label_col : hashable
        Name of the column holding the class label.

    Returns
    -------
    numpy.ndarray
        Array with one row per class and one column per feature.
    """
    # Keep the original column order so centroids stay comparable across frames
    feature_cols = [name for name in df.columns if name != label_col]
    per_class_mean = df.groupby(label_col)[feature_cols].mean()
    return per_class_mean.to_numpy()

def compute_average_difficulty_score(centroids1, centroids2, class_indices):
    """Score how hard it is to tell the classes in ``class_indices`` apart.

    For each centroid set, the centroids of the selected classes are compared
    pairwise with cosine similarity and the similarities of all distinct pairs
    are averaged. The final score is the mean of the two per-set averages; a
    higher score means more similar centroids, i.e. a harder group.

    Parameters
    ----------
    centroids1, centroids2 : indexable of feature vectors
        Centroid sets; both are indexed with the same ``class_indices``.
    class_indices : sequence of int
        Row positions of the classes to compare.

    Returns
    -------
    float
        Average pairwise cosine similarity across both centroid sets.
    """
    # Positions of the strictly-upper triangle: each unordered pair once,
    # excluding a class compared with itself
    pair_rows, pair_cols = np.triu_indices(len(class_indices), k=1)

    def mean_pairwise_similarity(centroids):
        selected = [centroids[idx] for idx in class_indices]
        similarities = cosine_similarity(selected)
        return np.mean(similarities[pair_rows, pair_cols])

    total = mean_pairwise_similarity(centroids1) + mean_pairwise_similarity(centroids2)
    return total / 2

def sample_k_combinations(n_classes, k, num_samples=1000, seed=42):
    """Return distinct ``k``-element combinations of ``range(n_classes)``.

    When the number of possible combinations does not exceed ``num_samples``,
    every combination is returned in ``itertools.combinations`` order. Otherwise
    exactly ``num_samples`` distinct combinations are drawn by rejection
    sampling; each one is a sorted tuple, and the list follows the iteration
    order of the set that collected them.

    Draws come from a private ``random.Random(seed + k)``. This produces the
    same combinations as seeding the module-level generator with ``seed + k``,
    but leaves the global ``random`` state untouched for other code.

    Parameters
    ----------
    n_classes : int
        Size of the index range to choose from.
    k : int
        Number of indices per combination.
    num_samples : int, optional
        Maximum number of combinations to return.
    seed : int, optional
        Base seed; ``k`` is added so each subset size gets its own stream.

    Returns
    -------
    list of tuple of int
        The enumerated or sampled combinations.

    Raises
    ------
    ValueError
        If ``n_classes`` or ``k`` is negative (raised by ``math.comb``).
    """
    total_possible = comb(n_classes, k)
    if total_possible <= num_samples:
        # Few enough combinations: enumerate them all instead of sampling
        return list(combinations(range(n_classes), k))

    # From here on total_possible > num_samples, so num_samples distinct
    # combinations are guaranteed to exist and the loop terminates.
    rng = random.Random(seed + k)
    population = range(n_classes)
    samples = set()
    while len(samples) < num_samples:
        # Sorting makes the tuple canonical so the set rejects re-draws
        samples.add(tuple(sorted(rng.sample(population, k))))
    return list(samples)

def get_difficulty_based_combinations(centroids1, centroids2, k, num_samples=1000):
    """Pick an easy, an average, and a hard group of ``k`` classes.

    Candidate groups come from ``sample_k_combinations`` over
    ``len(centroids1)`` classes. Each candidate is scored with
    ``compute_average_difficulty_score`` on both centroid sets (higher score =
    harder). The 10th, 50th and 90th percentiles of the scores define the
    'easy', 'average' and 'hard' targets, and for each target the candidate
    whose score is closest is returned.

    Parameters
    ----------
    centroids1, centroids2 : indexable of feature vectors
        Centroid sets, indexed by the same class positions.
    k : int
        Number of classes per group.
    num_samples : int, optional
        Maximum number of candidate groups to score.

    Returns
    -------
    dict
        ``{'easy': combo, 'average': combo, 'hard': combo}`` where each combo
        is a tuple of class positions.
    """
    candidates = sample_k_combinations(len(centroids1), k, num_samples)

    # Ascending by score; ties between equally close candidates are resolved
    # in favour of the one that comes first in this ordering.
    ranked = sorted(
        (
            (combo, compute_average_difficulty_score(centroids1, centroids2, combo))
            for combo in candidates
        ),
        key=lambda pair: pair[1],
    )

    all_scores = [pair[1] for pair in ranked]
    targets = {
        level: np.percentile(all_scores, pct)
        for level, pct in (('easy', 10), ('average', 50), ('hard', 90))
    }

    return {
        level: min(ranked, key=lambda pair, t=target: abs(pair[1] - t))[0]
        for level, target in targets.items()
    }


In [9]:
# Compute centroids
centroids_w2 = compute_class_centroids(df_w2)
centroids_wo2 = compute_class_centroids(df_wo2)

# Both centroid sets are indexed with the same class positions when groups are
# scored, so they must describe the same number of classes.
assert len(centroids_w2) == len(centroids_wo2), (
    f"centroid sets disagree on the number of classes: "
    f"{len(centroids_w2)} vs {len(centroids_wo2)}"
)

In [10]:
# Example: a single subset size, scored on only a handful of sampled groups
k = 28
results = get_difficulty_based_combinations(
    centroids1=centroids_w2,
    centroids2=centroids_wo2,
    k=k,
    num_samples=10,
)

In [11]:
# Show the easy / average / hard groups selected for the example subset size
print(f"Dataset A (k={k}):", results)

Dataset A (k=28): {'easy': (0, 1, 2, 3, 6, 7, 8, 9, 10, 12, 15, 16, 17, 18, 19, 20, 21, 23, 24, 25, 26, 27, 29, 30, 31, 33, 34, 36), 'average': (1, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 19, 20, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 36), 'hard': (1, 3, 5, 6, 7, 8, 10, 12, 13, 14, 15, 16, 17, 19, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35)}


In [12]:
# Number of classes, taken from the centroids instead of being hard-coded
n_classes = len(centroids_w2)
# Upper bound on the groups scored per subset size; sizes with no more
# combinations than this are enumerated exhaustively
NUM_SAMPLES = 10000

subsets = dict()

# A dedicated loop variable keeps the example's `k` from being overwritten.
for subset_size in range(2, n_classes):
    print(subset_size)
    subsets[subset_size] = get_difficulty_based_combinations(
        centroids_w2, centroids_wo2, subset_size, num_samples=NUM_SAMPLES
    )

# With every class selected there is only one possible group, so all three
# difficulty levels share it.
all_classes = tuple(range(n_classes))
subsets[n_classes] = {'easy': all_classes, 'average': all_classes, 'hard': all_classes}

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


In [13]:
# Display the result: {subset size: {'easy' | 'average' | 'hard': tuple of class positions}}
subsets

{2: {'easy': (27, 32), 'average': (6, 15), 'hard': (30, 33)},
 3: {'easy': (5, 17, 25), 'average': (1, 29, 34), 'hard': (1, 15, 20)},
 4: {'easy': (0, 15, 23, 33),
  'average': (1, 8, 12, 33),
  'hard': (19, 22, 23, 32)},
 5: {'easy': (18, 23, 26, 29, 34),
  'average': (13, 15, 17, 26, 36),
  'hard': (1, 26, 33, 34, 35)},
 6: {'easy': (7, 11, 12, 23, 33, 35),
  'average': (9, 11, 13, 15, 16, 33),
  'hard': (3, 4, 5, 19, 21, 35)},
 7: {'easy': (2, 4, 10, 21, 25, 29, 32),
  'average': (0, 4, 8, 9, 12, 16, 25),
  'hard': (1, 8, 12, 24, 25, 27, 36)},
 8: {'easy': (1, 2, 9, 11, 27, 28, 29, 33),
  'average': (0, 5, 6, 14, 30, 31, 32, 36),
  'hard': (0, 1, 2, 4, 8, 10, 20, 28)},
 9: {'easy': (7, 8, 17, 18, 21, 22, 29, 30, 36),
  'average': (1, 9, 10, 13, 17, 18, 29, 31, 34),
  'hard': (3, 5, 9, 10, 13, 15, 16, 18, 21)},
 10: {'easy': (0, 3, 4, 8, 17, 27, 30, 32, 34, 36),
  'average': (0, 1, 2, 3, 6, 13, 16, 19, 22, 35),
  'hard': (0, 3, 4, 5, 8, 12, 24, 25, 27, 30)},
 11: {'easy': (6, 7, 9, 1

In [14]:
import json

# JSON object keys are strings and JSON has no tuple type, so stringify the
# subset sizes and turn every group into a list.
json_friendly = {
    str(size): {level: list(members) for level, members in by_level.items()}
    for size, by_level in subsets.items()
}

# Write the subsets under sup.DATA_ROOT, named after the class grouping
output_path = os.path.join(sup.DATA_ROOT, f"{TRAIN_classes}-subsets.json")
with open(output_path, "w") as out_file:
    out_file.write(json.dumps(json_friendly, indent=2))
