In [1]:
import sys
sys.path.append("..")
from config import *
import os
import json
from scripts.run_active_learning import *
import numpy as np
import pandas as pd

In [2]:
# Build one "<folder>_indices.json" file per folder, containing the
# top-8-similarity indices and the centroid-derived indices for that folder.

# Paths (these names are not defined in this notebook; presumably they come
# from the star-imported `config` module)
TOP8_SUMMARY = Top8_Similarity_Summary
MAPPED_CENTROIDS_JSON = MAPPED_CENTROIDS_JSON_8
OUTPUT_DIR = INIT_CHOICES_N8_DIR
# Create output dir
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Maximum number of centroid-derived indices kept per saturation level.
# This cell prepares the 8-point initial choices (Top8 summary, "_8" centroid
# file, N8 output directory), so the centroid strategies keep 8 points as well.
# The previous hard-coded limit of 5 looked like a leftover from a 5-point
# variant and gave the centroid strategies fewer starting points than the rest.
MAX_CENTROID_INDICES = 8

# Load inputs
top8_df = pd.read_csv(TOP8_SUMMARY)

with open(MAPPED_CENTROIDS_JSON, 'r') as f:
    centroids_mapped = json.load(f)

# Folder ids as strings, computed once instead of on every loop iteration
folder_ids = top8_df['Folder'].astype(str)

# Unique folder ids found in the summary (e.g., 10268, 10269, etc.)
short_folders = sorted(set(folder_ids))


for folder in short_folders:
    folder_indices = {}

    # Top-8 similarity indices: rows whose folder id starts with this folder
    top8_indices = top8_df[folder_ids.str.startswith(folder)]['index'].tolist()
    if top8_indices:
        folder_indices['Top8Similarity'] = top8_indices

    # Centroids mapped: wafer keys are matched on the folder id prefixed with "00"
    matching_wafer_id = f"00{folder}"
    for wafer_key, centroids_dict in centroids_mapped.items():
        if not wafer_key.startswith(matching_wafer_id):
            continue
        for sat_level, centroid_entries in centroids_dict.items():
            # Entries that are not dicts or lack the key are skipped
            nearest_indices = [
                entry['nearest_stage_index']
                for entry in centroid_entries
                if isinstance(entry, dict) and 'nearest_stage_index' in entry
            ]

            # Spaces in the level name are replaced so the key is a single token
            cleaned_key = f"Centroids_{sat_level.replace(' ', '_')}"
            folder_indices[cleaned_key] = nearest_indices[:MAX_CENTROID_INDICES]

    # Only save if we collected something
    if folder_indices:
        save_path = os.path.join(OUTPUT_DIR, f"{folder}_indices.json")
        with open(save_path, 'w') as f:
            json.dump(folder_indices, f, indent=4)
   


In [3]:
# Datasets to process, one entry per wafer. The DATASET_* constants come from
# config; later cells pass each entry to pd.read_csv.
datasets = [
    DATASET_10272_Ag_Au_Pd_RT,
    DATASET_10275_Ag_Au_Pd_Pt_Rh_RT,
    DATASET_10304_Au_Pd_Pt_Rh_RT,
    DATASET_10311_Au_Pd_Pt_Rh_Ru_RT,
    DATASET_10403_Ag_Au_Cu_Pd_Pt_RT,
    DATASET_10402_Ag_Au_Pd_Pt_RT,
    DATASET_10399_Au_Cu_Pd_Pt_RT,
    DATASET_10374_Ir_Pd_Pt_Rh_Ru,
]

# These datasets are used below to build the 8-point initial selections.

In [4]:
# Helper to convert non-serializable types
def convert_to_serializable(obj):
    """Turn a NumPy value into its plain-Python counterpart.

    Intended for use as the ``default=`` hook of ``json.dump``.

    Args:
        obj: Any object. NumPy integers, floats, arrays and booleans are
            converted; everything else is passed through.

    Returns:
        ``int`` for ``np.integer``, ``float`` for ``np.floating``, a (nested)
        ``list`` for ``np.ndarray``, ``bool`` for ``np.bool_``; otherwise
        ``obj`` itself, unchanged.
    """
    # Checked in order; the first matching NumPy type decides the conversion.
    converters = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
        (np.bool_, bool),
    )
    for numpy_type, to_builtin in converters:
        if isinstance(obj, numpy_type):
            return to_builtin(obj)
    return obj


# For every dataset: load its CSV and its "<name>_indices.json" file, add the
# strategies returned by select_initial_indices (n_init=8), rename the
# saturation/contrast centroid keys, and write the merged dict back to the
# same JSON file.

# Columns that are never used as model features.
NON_FEATURE_COLUMNS = {"ID", "x", "y", "Resistance"}

# Ordered (old, new) key fragments. The longest "+" suffix must be tested
# first, because "+" is a substring of "++" and "++" of "+++".
SATURATION_KEY_RENAMES = (
    ("Centroids_saturation_and_contrast_+++", "Centroids_saturation_high"),
    ("Centroids_saturation_and_contrast_++", "Centroids_saturation_medium"),
    ("Centroids_saturation_and_contrast_+", "Centroids_saturation_low"),
)

for dataset_path in datasets:

    # The dataset name is the part of the file name before the first "_"
    dataset_name = os.path.basename(dataset_path).split("_")[0]
    print(f"\n[INFO] Processing dataset: {dataset_name}")

    data_exp = pd.read_csv(dataset_path)

    if data_exp.empty:
        print(f"[WARNING] Dataset {dataset_name} is empty. Skipping.")
        continue

    json_path = os.path.join(INIT_CHOICES_N8_DIR, f"{dataset_name}_indices.json")

    if not os.path.exists(json_path):
        print(f"[WARNING] JSON file not found for {dataset_name}, skipping.")
        continue

    # An unreadable JSON file is reported, then treated as "no stored choices"
    try:
        with open(json_path, "r") as f:
            init_choices = json.load(f)
    except json.JSONDecodeError as e:
        print(f"[ERROR] JSON decode error in {json_path}: {e}")
        init_choices = {}

    # Features are all columns except the excluded ones; target is Resistance
    all_columns = data_exp.columns.tolist()
    features = [col for col in all_columns if col not in NON_FEATURE_COLUMNS]
    target = ["Resistance"]

    # Replace the target column by its natural logarithm
    data_exp[target] = np.log(data_exp[target])

    # Make sure the per-dataset results folder exists
    output_dir = os.path.join(UNCERTAINTY_PATH, f"{dataset_name}_results")
    os.makedirs(output_dir, exist_ok=True)

    # Generate new init strategies (n=8)
    device = Resistance(data_exp, features=features, target=target)
    X_all = device.get_features()
    y_all = device.df[target[0]].values.reshape(-1, 1)

    # Generated strategies overwrite stored entries that share the same key
    init_strategies = select_initial_indices(X_all, n_init=8)
    init_choices.update(init_strategies)

    # Rename strategies for clarity: the first matching fragment wins,
    # keys without a match are kept unchanged
    renamed_init_choices = {}
    for key, val in init_choices.items():
        new_key = key
        for old_fragment, new_fragment in SATURATION_KEY_RENAMES:
            if old_fragment in key:
                new_key = key.replace(old_fragment, new_fragment)
                break
        renamed_init_choices[new_key] = val

    init_choices = renamed_init_choices

    print(f"[INFO] Final init choices for {dataset_name} (n=8): {list(init_choices.keys())}")

    # Save updated JSON (NumPy values are converted by convert_to_serializable)
    with open(json_path, "w") as f:
        json.dump(init_choices, f, indent=4, default=convert_to_serializable)

    #print(f"[INFO] Saved updated init strategies to: {json_path}")



[INFO] Processing dataset: 10272
Headers of DataFrame:
 ['ID' 'x' 'y' 'Ag' 'Au' 'Pd' 'Resistance']
[INFO] Final init choices for 10272 (n=8): ['Top8Similarity', 'Centroids_saturation_high', 'Centroids_saturation_medium', 'Centroids_saturation_low', 'Random', 'LHS', 'K-Means', 'Farthest', 'ODAL', 'K-Center']

[INFO] Processing dataset: 10275
Headers of DataFrame:
 ['ID' 'x' 'y' 'Ag' 'Au' 'Pd' 'Pt' 'Rh' 'Resistance']
[INFO] Final init choices for 10275 (n=8): ['Top8Similarity', 'Centroids_saturation_high', 'Centroids_saturation_medium', 'Centroids_saturation_low', 'Random', 'LHS', 'K-Means', 'Farthest', 'ODAL', 'K-Center']

[INFO] Processing dataset: 10304
Headers of DataFrame:
 ['ID' 'x' 'y' 'Au' 'Pd' 'Pt' 'Rh' 'Resistance']
[INFO] Final init choices for 10304 (n=8): ['Top8Similarity', 'Centroids_saturation_high', 'Centroids_saturation_medium', 'Centroids_saturation_low', 'Random', 'LHS', 'K-Means', 'Farthest', 'ODAL', 'K-Center']

[INFO] Processing dataset: 10311
Headers of DataFrame: