In [1]:
import sys
sys.path.append("..")
from config import *
import os
import json
from scripts.run_active_learning import *
import numpy as np
import pandas as pd
import re

In [None]:
# Local aliases for the input summaries and the output location. The
# right-hand names (and MAPPED_CENTROIDS_JSON, used directly below) are not
# defined in this cell -- presumably they come from the star-imports above.
TOP5_SUMMARY = Top5_Similarity_Summary
MIN_MAX_SUMMARY = EDX_min_max_summary
OUTPUT_DIR = DATA_CLEAN_InIT_CHOICES
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load global summaries
top5_df = pd.read_csv(TOP5_SUMMARY)
minmax_df = pd.read_csv(MIN_MAX_SUMMARY)
# Read the JSON as UTF-8 explicitly rather than relying on the platform's
# default text encoding.
with open(MAPPED_CENTROIDS_JSON, 'r', encoding='utf-8') as f:
    centroids_mapped = json.load(f)

# --- Helpers ---

def convert_to_serializable(obj):
    """Convert a NumPy array or scalar into its plain-Python equivalent.

    Used as the ``default=`` hook of ``json.dump`` in this file. Arrays become
    (nested) lists; NumPy integers, floats and booleans become ``int``,
    ``float`` and ``bool``. Any other object is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()

    scalar_casts = (
        (np.integer, int),
        (np.floating, float),
        (np.bool_, bool),
    )
    for numpy_type, builtin_type in scalar_casts:
        if isinstance(obj, numpy_type):
            return builtin_type(obj)

    return obj

def run_multiple_seeds(X, n_init=5, num_seeds=10, base_seed=42):
    """Run ``select_initial_indices`` once for each of several consecutive seeds.

    The seeds used are ``base_seed``, ``base_seed + 1``, ...,
    ``base_seed + num_seeds - 1``.

    Args:
        X: Feature data forwarded unchanged to ``select_initial_indices``.
        n_init: Forwarded as the ``n_init`` keyword argument.
        num_seeds: Number of consecutive seeds to run.
        base_seed: First seed value.

    Returns:
        A dict mapping each seed to a dict holding that seed's result, with
        every result key renamed to ``"<key>_seed_<seed>"``.
    """
    per_seed = {}
    for current_seed in range(base_seed, base_seed + num_seeds):
        picks = select_initial_indices(X, n_init=n_init, seed=current_seed)
        # Suffix each strategy name so results from different seeds stay distinct.
        per_seed[current_seed] = {
            f"{name}_seed_{current_seed}": indices
            for name, indices in picks.items()
        }
    return per_seed

# Matches a trailing "_seed_<digits>" or "_seed_default" suffix
# (case-insensitive); group 1 captures the seed token after "_seed_".
_seed_suffix_re = re.compile(r"_seed_(\d+|default)$", re.IGNORECASE)

def _dedup_keep_order(seq):
    seen, out = set(), []
    for x in seq:
        if x not in seen:
            seen.add(x)
            out.append(x)
    return out

def _norm_centroid_key(variant: str, seed_key: str) -> str:
    v = variant.strip().lower().replace(" ", "_")
    if v.startswith("centroids_"):
        v = v.split("centroids_", 1)[1]
    return f"Centroids_{v}_{seed_key}"

def filter_base_for_seed(base_init_choices: dict, cur_seed: int) -> dict:
    """Return the subset of *base_init_choices* that applies to *cur_seed*.

    Keys that do not start with ``centroids_`` (case-insensitive) are always
    kept. Centroid keys are kept only when they end in a ``_seed_<token>``
    suffix whose token equals ``str(cur_seed)``; centroid keys without such a
    suffix are dropped.

    NOTE(review): a ``_seed_default`` suffix never equals ``str(cur_seed)``
    for an integer seed, so those entries are dropped for every seed --
    confirm this is intended.
    """
    wanted = str(cur_seed)
    kept = {}
    for name, indices in base_init_choices.items():
        lowered = name.lower()
        if lowered.startswith("centroids_"):
            match = _seed_suffix_re.search(lowered)
            if match is None or match.group(1) != wanted:
                continue
        kept[name] = indices
    return kept

def _nearest_stage_indices(entries, limit=5):
    """Return up to *limit* distinct ``nearest_stage_index`` values from *entries*.

    Anything that is not a list yields an empty list; list items that are not
    dicts or lack the ``nearest_stage_index`` key are skipped. First-seen
    order is preserved.
    """
    if not isinstance(entries, list):
        return []
    nearest = [
        int(entry['nearest_stage_index'])
        for entry in entries
        if isinstance(entry, dict) and 'nearest_stage_index' in entry
    ]
    return _dedup_keep_order(nearest)[:limit]


def build_base_init_choices(folder: str) -> dict:
    """Collect the seed-independent initial-index strategies for one dataset.

    Reads the module-level ``top5_df``, ``minmax_df`` and ``centroids_mapped``.

    Args:
        folder: Dataset identifier. Rows are selected when their ``Folder``
            value starts with it, and centroid wafers when their key starts
            with ``"00" + folder``.

    Returns:
        A dict mapping strategy name to a list of indices:
        ``'Top5Similarity'``, ``'Max Comp'`` and ``'Min Comp'`` (each only
        when non-empty), plus one ``Centroids_<variant>_<seed_key>`` entry per
        centroid variant/seed, holding at most five distinct indices.

    Raises:
        ValueError: If a ``MaxIndex``/``MinIndex`` or ``nearest_stage_index``
            value cannot be converted to ``int``.
    """
    base_init_choices = {}

    # Top5 Similarity
    top5_mask = top5_df['Folder'].astype(str).str.startswith(folder)
    top5_indices = top5_df.loc[top5_mask, 'index'].tolist()
    if top5_indices:
        base_init_choices['Top5Similarity'] = top5_indices

    # Max/Min Comp: read the two columns directly instead of walking the rows
    # twice with iterrows().
    minmax_mask = minmax_df['Folder'].astype(str).str.startswith(folder)
    minmax_subset = minmax_df[minmax_mask]
    max_comp = [int(value) for value in minmax_subset['MaxIndex']]
    min_comp = [int(value) for value in minmax_subset['MinIndex']]
    if max_comp:
        base_init_choices['Max Comp'] = max_comp
    if min_comp:
        base_init_choices['Min Comp'] = min_comp

    # Centroid Mappings
    matching_wafer_id = f"00{folder}"
    for wafer_key, wafer_dict in centroids_mapped.items():
        if not str(wafer_key).startswith(matching_wafer_id):
            continue
        for variant, seeds_dict in wafer_dict.items():
            if isinstance(seeds_dict, dict):
                # Per-seed layout: {seed_key: [entry, ...]}
                for seed_key, entries in seeds_dict.items():
                    key = _norm_centroid_key(variant, seed_key)
                    base_init_choices[key] = _nearest_stage_indices(entries)
            else:
                # Layout without a seed level: the value itself is the entry
                # list (anything else yields an empty list).
                key = _norm_centroid_key(variant, "seed_default")
                base_init_choices[key] = _nearest_stage_indices(seeds_dict)

    return base_init_choices

# --- Main processing loop ---
datasets = [
    DATASET_10272_Ag_Au_Pd_RT,
    DATASET_10275_Ag_Au_Pd_Pt_Rh_RT,
    DATASET_10304_Au_Pd_Pt_Rh_RT,
    DATASET_10311_Au_Pd_Pt_Rh_Ru_RT,
    DATASET_10403_Ag_Au_Cu_Pd_Pt_RT,
    DATASET_10402_Ag_Au_Pd_Pt_RT,
    DATASET_10399_Au_Cu_Pd_Pt_RT,
    DATASET_10374_Ir_Pd_Pt_Rh_Ru,
]

# Legacy centroid key fragments and their replacements. Longest pattern first:
# each shorter pattern is a substring of the longer ones, so the first match
# must win.
legacy_renames = (
    ("Centroids_saturation_and_contrast_+++", "Centroids_saturation_high"),
    ("Centroids_saturation_and_contrast_++", "Centroids_saturation_medium"),
    ("Centroids_saturation_and_contrast_+", "Centroids_saturation_low"),
)

for dataset_path in datasets:
    # The dataset identifier is the file name up to its first underscore.
    dataset_name = os.path.basename(dataset_path).split("_")[0]
    print(f"\n Processing dataset: {dataset_name}")

    data_exp = pd.read_csv(dataset_path)
    if data_exp.empty:
        print(f" Warning: Dataset {dataset_name} is empty.")
        continue

    # Shared strategies from the global summaries, with legacy centroid keys
    # renamed on the way in.
    renamed_init_choices = {}
    for key, val in build_base_init_choices(folder=dataset_name).items():
        new_key = key
        for legacy, replacement in legacy_renames:
            if legacy in key:
                new_key = key.replace(legacy, replacement)
                break
        renamed_init_choices[new_key] = val
    base_init_choices = renamed_init_choices

    # Log-transform the target column(s) when present
    target = ["Resistance"]
    if any(t in data_exp.columns for t in target):
        data_exp[target] = np.log(data_exp[target])

    # Dataset-specific output folder (e.g. DATA_CLEAN_InIT_CHOICES/10272)
    dataset_output_dir = os.path.join(DATA_CLEAN_InIT_CHOICES, dataset_name)
    os.makedirs(dataset_output_dir, exist_ok=True)

    # Every column except the identifier, coordinates and target is a feature
    all_columns = data_exp.columns.tolist()
    non_feature_columns = {"ID", "x", "y", "Resistance"}
    features = [col for col in all_columns if col not in non_feature_columns]

    # Instantiate device and extract features
    device = Resistance(data_exp, features=features, target=target)
    X_all = device.get_features()

    # One JSON file per seed: the seed-filtered shared strategies, overlaid
    # with the strategies generated for that seed
    seed_results = run_multiple_seeds(X_all, n_init=5, num_seeds=10, base_seed=42)
    for seed, seed_strategies in seed_results.items():
        base_for_seed = filter_base_for_seed(base_init_choices, cur_seed=seed)
        combined_results = {**base_for_seed, **seed_strategies}

        seed_json_path = os.path.join(dataset_output_dir, f"{dataset_name}_seed_{seed}.json")
        with open(seed_json_path, "w") as f:
            json.dump(combined_results, f, indent=4, default=convert_to_serializable)




 Processing dataset: 10272
Headers of DataFrame:
 ['ID' 'x' 'y' 'Ag' 'Au' 'Pd' 'Resistance']

 Processing dataset: 10275
Headers of DataFrame:
 ['ID' 'x' 'y' 'Ag' 'Au' 'Pd' 'Pt' 'Rh' 'Resistance']

 Processing dataset: 10304
Headers of DataFrame:
 ['ID' 'x' 'y' 'Au' 'Pd' 'Pt' 'Rh' 'Resistance']

 Processing dataset: 10311
Headers of DataFrame:
 ['ID' 'x' 'y' 'Au' 'Pd' 'Pt' 'Rh' 'Ru' 'Resistance']

 Processing dataset: 10403
Headers of DataFrame:
 ['ID' 'x' 'y' 'Ag' 'Au' 'Cu' 'Pd' 'Pt' 'Resistance']

 Processing dataset: 10402
Headers of DataFrame:
 ['ID' 'x' 'y' 'Ag' 'Au' 'Pd' 'Pt' 'Resistance']

 Processing dataset: 10399
Headers of DataFrame:
 ['ID' 'x' 'y' 'Au' 'Cu' 'Pd' 'Pt' 'Resistance']

 Processing dataset: 10374
Headers of DataFrame:
 ['ID' 'x' 'y' 'Ru' 'Rh' 'Pd' 'Ir' 'Pt' 'Resistance']
