In [1]:
#!/usr/bin/env python3
"""
Initialize Redis with Baseline Pre-trained Classifiers

This script:
1. Connects to Redis (clearing DBs by default)
2. Loads a pre-trained Random Forest from baseline/Classifiers-100-converted
3. Loads dataset from baseline/resources/datasets
4. Loads test samples from baseline/resources/datasets
5. Stores everything in Redis (Dataset, Forest, Endpoints, Initial Candidate)

Directory Structure:
- Classifiers: baseline/Classifiers-100-converted/<dataset_name>/*.json
- Datasets: baseline/resources/datasets/<dataset_name>/<dataset_name>.csv
- Samples: baseline/resources/datasets/<dataset_name>/<dataset_name>.samples

Usage:
    python init_baseline.py --list-datasets
    python init_baseline.py iris --class-label "0"
    python init_baseline.py sonar --class-label "1" --test-sample-index "0,5-8,20"
"""

import sys
import os
import argparse
import pandas as pd
import numpy as np
import redis
import json
import datetime
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# Shared modules
from redis_helpers.connection import connect_redis
from redis_helpers.utils import clean_all_databases
from init_utils import (
    store_forest_and_endpoints,
    initialize_seed_candidate,
    store_dataset_total_samples
)
from helpers import convert_numpy_types, parse_sample_indices
from load_rf_from_json import load_rf_from_json
from baseline.xrf import Dataset
from rf_utils import sklearn_forest_to_forest

# Constants
CLASSIFIERS_ROOT = os.path.join('baseline', 'Classifiers-100-converted')
DATASETS_ROOT = os.path.join('baseline', 'resources', 'datasets')


def list_available_datasets():
    """List all datasets with pre-trained classifiers in baseline directory."""
    print("\nAvailable Baseline Datasets:")
    print("=" * 70)

    if not os.path.exists(CLASSIFIERS_ROOT):
        print(f"[ERROR] Classifiers directory not found: {CLASSIFIERS_ROOT}")
        return

    datasets = []
    for name in sorted(os.listdir(CLASSIFIERS_ROOT)):
        dataset_dir = os.path.join(CLASSIFIERS_ROOT, name)
        if not os.path.isdir(dataset_dir):
            continue

        # Check if dataset files exist
        csv_path = os.path.join(DATASETS_ROOT, name, f"{name}.csv")
        samples_path = os.path.join(DATASETS_ROOT, name, f"{name}.samples")

        # Count JSON classifiers
        json_files = [f for f in os.listdir(dataset_dir) if f.endswith('.json')]

        status = "✓"
        notes = []
        if not os.path.exists(csv_path):
            status = "✗"
            notes.append("CSV missing")
        if not os.path.exists(samples_path):
            status = "✗"
            notes.append("samples missing")
        if len(json_files) == 0:
            status = "✗"
            notes.append("no classifiers")

        note_str = f" ({', '.join(notes)})" if notes else ""
        print(f"  {status} {name:<30} {len(json_files)} classifier(s){note_str}")

        if status == "✓":
            datasets.append(name)

    print(f"\nTotal: {len(datasets)} datasets ready to use")
    print("\nUsage: python init_baseline.py <dataset_name> --class-label <label>")


def find_classifier_json(dataset_name):
    """
    Find all classifier JSON files for a dataset.

    Returns:
        List of paths to JSON classifier files
    """
    classifier_dir = os.path.join(CLASSIFIERS_ROOT, dataset_name)

    # Try finding directory with hyphens if original not found
    if not os.path.exists(classifier_dir) and '_' in dataset_name:
        alt_name = dataset_name.replace('_', '-')
        alt_dir = os.path.join(CLASSIFIERS_ROOT, alt_name)
        if os.path.exists(alt_dir):
            classifier_dir = alt_dir
            # We don't change dataset_name here as it might be used for filtering inside


    if not os.path.exists(classifier_dir):
        return []

    json_files = [
        os.path.join(classifier_dir, fname)
        for fname in os.listdir(classifier_dir)
        if fname.endswith('.json')
    ]

    return sorted(json_files)


def load_dataset_from_baseline(dataset_name, separator=','):
    """
    Load dataset CSV and samples from baseline directory structure.

    Args:
        dataset_name: Name of the dataset
        separator: CSV separator (default: ',')

    Returns:
        (X_train, X_test, y_train, y_test, feature_names, all_classes)
    """
    # Handle potential name mismatch (underscore vs hyphen)
    actual_name = dataset_name
    dataset_dir = os.path.join(DATASETS_ROOT, dataset_name)

    if not os.path.exists(dataset_dir) and '_' in dataset_name:
        alt_name = dataset_name.replace('_', '-')
        if os.path.exists(os.path.join(DATASETS_ROOT, alt_name)):
            actual_name = alt_name

    dataset_path = os.path.join(DATASETS_ROOT, actual_name, f"{actual_name}.csv")
    samples_path = os.path.join(DATASETS_ROOT, actual_name, f"{actual_name}.samples")

    if not os.path.exists(dataset_path):
        raise FileNotFoundError(f"Dataset CSV not found: {dataset_path}")

    if not os.path.exists(samples_path):
        raise FileNotFoundError(f"Samples file not found: {samples_path}")

    if os.path.getsize(samples_path) == 0:
        raise ValueError(f"Samples file is empty: {samples_path}")

    # Load dataset using baseline Dataset class
    print(f"[INFO] Loading dataset from: {dataset_path}")
    data = Dataset(filename=dataset_path, separator=separator, use_categorical=False)

    # Get train/test split from the Dataset
    X_train_raw, X_test_raw, y_train, y_test = data.train_test_split()

    # Transform data (apply any transformations the Dataset class has)
    X_train = data.transform(X_train_raw)
    X_test = data.transform(X_test_raw)

    # Load samples - these will be our actual test samples
    print(f"[INFO] Loading samples from: {samples_path}")
    samples = np.loadtxt(samples_path, delimiter=separator)
    samples = np.atleast_2d(samples)

    # Validate sample dimensions
    expected_features = len(data.features)
    if samples.shape[1] == expected_features + 1:
        print("[INFO] Sample file includes labels; dropping last column.")
        samples = samples[:, :-1]
    elif samples.shape[1] != expected_features:
        raise ValueError(
            f"Sample file feature count mismatch: expected {expected_features}, "
            f"found {samples.shape[1]} in {samples_path}"
        )

    print(f"[INFO] Loaded {len(samples)} test samples")
    print(f"[INFO] Training set: {len(X_train)} samples")
    print(f"[INFO] Features: {data.features}")
    print(f"[INFO] Classes: {np.unique(y_train)}")

    # Convert class labels to strings for consistency (ensure int format '0' not '0.0')
    y_train = y_train.astype(int).astype(str)
    y_test = y_test.astype(int).astype(str)

    return X_train, samples, y_train, None, data.features, np.unique(y_train), data


def load_classifier_from_json(dataset_name, classifier_index=0):
    """
    Load a pre-trained classifier from JSON.

    Args:
        dataset_name: Name of the dataset
        classifier_index: Index of classifier to use if multiple exist (default: 0)

    Returns:
        (sklearn_rf, classifier_path) tuple
    """
    json_files = find_classifier_json(dataset_name)

    if not json_files:
        raise FileNotFoundError(
            f"No classifier JSON files found for dataset '{dataset_name}' "
            f"in {os.path.join(CLASSIFIERS_ROOT, dataset_name)}"
        )

    if classifier_index >= len(json_files):
        raise ValueError(
            f"Classifier index {classifier_index} out of range. "
            f"Found {len(json_files)} classifier(s)."
        )

    classifier_path = json_files[classifier_index]

    print(f"[INFO] Loading pre-trained classifier: {os.path.basename(classifier_path)}")

    # Parse parameters from filename
    filename = os.path.basename(classifier_path)
    try:
        parts = filename.split('_nbestim_')[1]
        n_estimators = int(parts.split('_maxdepth_')[0])
        max_depth = int(parts.split('_maxdepth_')[1].split('.')[0])
        print(f"[INFO] Classifier: {n_estimators} trees, max_depth={max_depth}")
    except (IndexError, ValueError):
        print(f"[WARNING] Could not parse classifier parameters from filename")

    # Load the classifier using load_rf_from_json
    sklearn_rf = load_rf_from_json(classifier_path)

    print(f"[INFO] Successfully loaded classifier with {sklearn_rf.n_estimators} trees")

    return sklearn_rf, classifier_path


In [2]:

connections, db_mapping = connect_redis()

Connected to Redis DB 0 (DATA) on port 6379
Connected to Redis DB 1 (CAN) on port 6379
Connected to Redis DB 2 (R) on port 6379
Connected to Redis DB 3 (NR) on port 6379
Connected to Redis DB 4 (CAR) on port 6379
Connected to Redis DB 5 (AR) on port 6379
Connected to Redis DB 6 (GP) on port 6379
Connected to Redis DB 7 (BP) on port 6379
Connected to Redis DB 8 (PR) on port 6379
Connected to Redis DB 9 (AP) on port 6379
Connected to Redis DB 10 (LOGS) on port 6379
Established 11 Redis connections


In [3]:
clean_all_databases(connections, db_mapping)

Cleaned database DATA (DB 0)
Cleaned database CAN (DB 1)
Cleaned database R (DB 2)
Cleaned database NR (DB 3)
Cleaned database CAR (DB 4)
Cleaned database AR (DB 5)
Cleaned database GP (DB 6)
Cleaned database BP (DB 7)
Cleaned database PR (DB 8)
Cleaned database AP (DB 9)
Cleaned database LOGS (DB 10)

Cleaned 11/11 databases


In [4]:
from rcheck_cache import rcheck_cache, saturate

DATASETS = ['ann-thyroid', 'appendicitis', 'banknote', 'biodegradation', 'ecoli', 'glass2', 'heart-c', 'ionosphere', 'iris', 'karhunen', 'letter', 'magic', 'mofn-3-7-10', 'new-thyroid', 'pendigits', 'phoneme', 'ring', 'segmentation', 'shuttle', 'sonar', 'spambase', 'spectf', 'texture', 'threeOf9', 'twonorm', 'vowel', 'waveform-21', 'waveform-40', 'wdbc', 'wine-recog', 'wpbc', 'xd6']

for dataset_name in DATASETS:
    clean_all_databases(connections, db_mapping)
    X_train, X_test_samples, y_train, _, feature_names, all_classes, data = load_dataset_from_baseline(dataset_name)
    classifier_index = 0
    sklearn_rf, classifier_path = load_classifier_from_json(dataset_name, classifier_index)
    our_forest = sklearn_forest_to_forest(sklearn_rf, feature_names)
    eu_data = store_forest_and_endpoints(connections, our_forest)
    X_test = [dict(zip(feature_names,  x_test) ) for x_test in X_test_samples]
    predictions = [our_forest.predict(x_test) for x_test in X_test ]
    validated= True
    not_validated = []
    validated_test = []
    for i in range(len(X_test)):
        nodes = []
        for tree in our_forest.trees:
            nodes.append(tree.root)
        caches = {
            'R': set(),
            'NR': set(),
            'GP': set(),
            'BP': set(),
            'AR': set(),
            'AP': set()
        }
        icf = our_forest.extract_icf(X_test[i])
        if not rcheck_cache(
                    connections=connections,
                    icf=icf,
                    label=predictions[i],
                    nodes=saturate(icf, nodes),
                    eu_data=eu_data,
                    forest=our_forest,
                    caches=caches,
                    info={}
                ):
            not_validated.append(icf)
            validated = False
        else:
            validated_test.append(X_test[i])

    print(40*"#")
    if not validated:
        print(f"{dataset_name} {len(not_validated)} over {len(X_test)} samples not validated")
    else:
        print(f"{dataset_name} FULLY VALIDATED")
    print(40*"#")



Cleaned database DATA (DB 0)
Cleaned database CAN (DB 1)
Cleaned database R (DB 2)
Cleaned database NR (DB 3)
Cleaned database CAR (DB 4)
Cleaned database AR (DB 5)
Cleaned database GP (DB 6)
Cleaned database BP (DB 7)
Cleaned database PR (DB 8)
Cleaned database AP (DB 9)
Cleaned database LOGS (DB 10)

Cleaned 11/11 databases
[INFO] Loading dataset from: baseline\resources\datasets\ann-thyroid\ann-thyroid.csv
c nof features: 21
c nof classes: 3
c nof samples: 7129
[INFO] Loading samples from: baseline\resources\datasets\ann-thyroid\ann-thyroid.samples
[INFO] Loaded 720 test samples
[INFO] Training set: 5703 samples
[INFO] Features: ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16', 'A17', 'A18', 'A19', 'A20', 'A21']
[INFO] Classes: [1. 2. 3.]
[INFO] Loading pre-trained classifier: ann-thyroid_nbestim_100_maxdepth_4.mod.json
[INFO] Classifier: 100 trees, max_depth=4
[INFO] Successfully loaded classifier with 100 trees
Storing Random F

In [5]:
dataset_name = "karhunen"

In [6]:
X_train, X_test_samples, y_train, _, feature_names, all_classes, data = load_dataset_from_baseline(dataset_name)

[INFO] Loading dataset from: baseline\resources\datasets\karhunen\karhunen.csv
c nof features: 64
c nof classes: 10
c nof samples: 1994
[INFO] Loading samples from: baseline\resources\datasets\karhunen\karhunen.samples
[INFO] Loaded 200 test samples
[INFO] Training set: 1595 samples
[INFO] Features: ['att1', 'att2', 'att3', 'att4', 'att5', 'att6', 'att7', 'att8', 'att9', 'att10', 'att11', 'att12', 'att13', 'att14', 'att15', 'att16', 'att17', 'att18', 'att19', 'att20', 'att21', 'att22', 'att23', 'att24', 'att25', 'att26', 'att27', 'att28', 'att29', 'att30', 'att31', 'att32', 'att33', 'att34', 'att35', 'att36', 'att37', 'att38', 'att39', 'att40', 'att41', 'att42', 'att43', 'att44', 'att45', 'att46', 'att47', 'att48', 'att49', 'att50', 'att51', 'att52', 'att53', 'att54', 'att55', 'att56', 'att57', 'att58', 'att59', 'att60', 'att61', 'att62', 'att63', 'att64']
[INFO] Classes: [0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]


In [None]:
X_test_samples

In [7]:
classifier_index = 0

In [8]:
sklearn_rf, classifier_path = load_classifier_from_json(dataset_name, classifier_index)

[INFO] Loading pre-trained classifier: karhunen_nbestim_100_maxdepth_5.mod.json
[INFO] Classifier: 100 trees, max_depth=5
[INFO] Successfully loaded classifier with 100 trees


In [9]:

print("[INFO] Converting classifier to internal format...")

our_forest = sklearn_forest_to_forest(sklearn_rf, feature_names)

[INFO] Converting classifier to internal format...


In [None]:
our_forest.trees[0].root

In [10]:


print("\n[INFO] Storing forest and computing endpoints...")
eu_data = store_forest_and_endpoints(connections, our_forest)

# 7. Process Test Samples
print("\n[INFO] Processing test samples...")

X_test = [dict(zip(feature_names,  x_test) ) for x_test in X_test_samples]

# Since we don't have ground truth labels for samples,
# we'll predict them and filter by target class

predictions = [our_forest.predict(x_test) for x_test in X_test ]

predictions


[INFO] Storing forest and computing endpoints...
Storing Random Forest in DATA['RF']...
Successfully stored forest with 100 trees in Redis key 'RF'
[OK] Forest saved successfully
Extracting feature thresholds...
Extracted thresholds for 64 features
Storing endpoints universe in DATA['EU']...
Successfully stored dictionary with 64 keys in Redis key 'EU'
[OK] Endpoints universe saved successfully

[INFO] Processing test samples...


['5.0',
 '6.0',
 '1.0',
 '3.0',
 '2.0',
 '1.0',
 '7.0',
 '9.0',
 '9.0',
 '8.0',
 '4.0',
 '9.0',
 '8.0',
 '8.0',
 '4.0',
 '6.0',
 '6.0',
 '5.0',
 '2.0',
 '9.0',
 '9.0',
 '5.0',
 '1.0',
 '6.0',
 '0.0',
 '5.0',
 '7.0',
 '0.0',
 '7.0',
 '0.0',
 '4.0',
 '2.0',
 '1.0',
 '7.0',
 '6.0',
 '5.0',
 '6.0',
 '0.0',
 '9.0',
 '1.0',
 '3.0',
 '3.0',
 '6.0',
 '7.0',
 '8.0',
 '2.0',
 '6.0',
 '7.0',
 '6.0',
 '4.0',
 '1.0',
 '6.0',
 '7.0',
 '8.0',
 '8.0',
 '5.0',
 '8.0',
 '8.0',
 '8.0',
 '4.0',
 '2.0',
 '7.0',
 '6.0',
 '3.0',
 '0.0',
 '2.0',
 '1.0',
 '0.0',
 '6.0',
 '6.0',
 '7.0',
 '2.0',
 '4.0',
 '1.0',
 '3.0',
 '7.0',
 '1.0',
 '5.0',
 '7.0',
 '8.0',
 '9.0',
 '1.0',
 '7.0',
 '8.0',
 '3.0',
 '7.0',
 '2.0',
 '1.0',
 '6.0',
 '6.0',
 '8.0',
 '0.0',
 '2.0',
 '3.0',
 '6.0',
 '3.0',
 '0.0',
 '7.0',
 '6.0',
 '0.0',
 '2.0',
 '1.0',
 '1.0',
 '9.0',
 '0.0',
 '4.0',
 '7.0',
 '5.0',
 '9.0',
 '3.0',
 '9.0',
 '2.0',
 '6.0',
 '2.0',
 '5.0',
 '8.0',
 '6.0',
 '4.0',
 '0.0',
 '3.0',
 '7.0',
 '9.0',
 '4.0',
 '7.0',
 '2.0',


In [11]:
from rcheck_cache import rcheck_cache, saturate

validated= True
not_validated = []
validated_test = []
for i in range(len(X_test)):
    nodes = []
    for tree in our_forest.trees:
        nodes.append(tree.root)
    caches = {
        'R': set(),
        'NR': set(),
        'GP': set(),
        'BP': set(),
        'AR': set(),
        'AP': set()
    }
    icf = our_forest.extract_icf(X_test[i])
    if not rcheck_cache(
                connections=connections,
                icf=icf,
                label=predictions[i],
                nodes=saturate(icf, nodes),
                eu_data=eu_data,
                forest=our_forest,
                caches=caches,
                info={}
            ):
        not_validated.append(icf)
        validated = False
    else:
        validated_test.append(X_test[i])

if not validated:
    print(f"{len(not_validated)} samples not validated")

X_test = validated_test

1 samples not validated


In [None]:
not_validated

In [12]:
clean_all_databases(connections, db_mapping)

Cleaned database DATA (DB 0)
Cleaned database CAN (DB 1)
Cleaned database R (DB 2)
Cleaned database NR (DB 3)
Cleaned database CAR (DB 4)
Cleaned database AR (DB 5)
Cleaned database GP (DB 6)
Cleaned database BP (DB 7)
Cleaned database PR (DB 8)
Cleaned database AP (DB 9)
Cleaned database LOGS (DB 10)

Cleaned 11/11 databases


In [17]:
label = sorted(list(set(predictions)))[0]

In [18]:
label_samples = [sample for sample, pred in zip(X_test, predictions) if pred == label]

print("\n[INFO] Storing forest and computing endpoints...")
eu_data = store_forest_and_endpoints(connections, our_forest)

store_dataset_total_samples(connections, len(label_samples))


[INFO] Storing forest and computing endpoints...
Storing Random Forest in DATA['RF']...
Successfully stored forest with 100 trees in Redis key 'RF'
[OK] Forest saved successfully
Extracting feature thresholds...
Extracted thresholds for 64 features
Storing endpoints universe in DATA['EU']...
Successfully stored dictionary with 64 keys in Redis key 'EU'
[OK] Endpoints universe saved successfully
[OK] Total test samples stored: 16


True

In [19]:
from icf_eu_encoding import bitmap_mask_to_string, icf_to_bitmap_mask
from redis_helpers.samples import store_sample


def process_all_classified_samples(
    connections,
    dataset_name,
    class_label,
    our_forest,
    X_test,
    eu_data,
    dataset_type='generic',
):
    """
    Process all test samples that are classified with the specified class label.
    Store samples in DATA and their ICF representations in R.

    Args:
        connections: Redis connections dict
        dataset_name: Name of the dataset
        class_label: Target class label to filter
        our_forest: Custom Forest object
        X_test: Test features array
        y_test: Test labels array
        feature_names: List of feature names
        eu_data: Endpoints universe data
        sample_percentage: Optional percentage of samples to process
        dataset_type: Type of dataset ('uci', 'pmlb', 'openml', 'baseline', etc.)

    Returns:
        tuple: (stored_samples list, summary dict)
    """
    print(f"\n=== Processing All Samples Classified as '{class_label}' ===")

    # Find all test samples that are classified as the target class
    target_samples_data = []
    current_time = datetime.datetime.now().isoformat()

    # Apply sample percentage filtering if specified


    for i, sample in enumerate(X_test):
        predicted_label = our_forest.predict(sample)

        # Store ALL samples classified with the target label (regardless of correctness)
        if predicted_label == class_label:
            target_samples_data.append({
                'test_index': i,
                'sample_dict': sample,
                'predicted_label': predicted_label,
            })

    print(f"Found {len(target_samples_data)} samples classified as '{class_label}'")

    if len(target_samples_data) == 0:
        print("[WARNING] No samples classified with the target label!")
        return [], {}

    # Store all samples and their ICF representations
    stored_samples = []
    correct_predictions = 0

    for idx, sample_data in enumerate(target_samples_data):
        sample_key = f"sample_{dataset_name}_{class_label}_{idx}"

        # Store sample in DATA with full metadata
        data_entry = {
            'sample_dict': sample_data['sample_dict'],
            'predicted_label': sample_data['predicted_label'],
            'test_index': sample_data['test_index'],
            'dataset_name': dataset_name,
            'dataset_type': dataset_type,
            'timestamp': current_time,
        }

        # Store sample using our helper function
        if store_sample(connections['DATA'], sample_key, sample_data['sample_dict']):
            # Also store full metadata separately
            connections['DATA'].set(f"{sample_key}_meta", json.dumps(data_entry))

        # Calculate ICF and store in R
        try:
            sample_icf = our_forest.extract_icf(sample_data['sample_dict'])
            icf_bitmap = bitmap_mask_to_string(icf_to_bitmap_mask(sample_icf, eu_data))

            # Store ICF bitmap in R with metadata
            icf_metadata = {
                'sample_key': sample_key,
                'dataset_name': dataset_name,
                'dataset_type': dataset_type,
                'class_label': class_label,
                'test_index': sample_data['test_index'],
                'timestamp': current_time
            }

            connections['R'].set(icf_bitmap, json.dumps(icf_metadata))

            stored_samples.append({
                'sample_key': sample_key,
                'icf_bitmap': icf_bitmap,
                'test_index': sample_data['test_index']
            })

        except Exception as e:
            print(f"[WARNING] Failed to process sample {idx}: {e}")
            continue

    # Store summary information
    summary = {
        'dataset_name': dataset_name,
        'dataset_type': dataset_type,
        'target_class_label': class_label,
        'total_samples_processed': len(stored_samples),
        'total_test_samples': len(X_test),
        'samples_with_target_label': len(target_samples_data),
        'timestamp': current_time,
        'sample_keys': [s['sample_key'] for s in stored_samples]
    }

    connections['DATA'].set(f"summary_{dataset_name}_{class_label}", json.dumps(summary))

    print(f"[OK] Stored {len(stored_samples)} samples in DATA")
    print(f"[OK] Stored {len(stored_samples)} ICF representations in R")
    print(f"[OK] Summary stored in DATA['summary_{dataset_name}_{class_label}']")

    return stored_samples, summary

In [20]:
stored_samples, summary = process_all_classified_samples(
            connections,
            dataset_name,
            label,
            our_forest,
            label_samples,
            eu_data,
            dataset_type='baseline'
        )


=== Processing All Samples Classified as '0.0' ===
Found 11 samples classified as '0.0'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_0'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_1'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_2'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_3'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_4'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_5'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_6'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_7'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_8'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_9'
Successfully stored sample with 64 features in Redis key 'sample_karhunen_0.0_10'
[OK] Stored 11 samp

In [21]:
print("\n[INFO] Initializing seed candidates...")
for s in stored_samples:
        meta_json = connections['DATA'].get(f"{s['sample_key']}_meta")
        if meta_json:
            meta = json.loads(meta_json)
            initialize_seed_candidate(connections, meta, our_forest, eu_data)
        else:
            print("[WARNING] No samples processed, cannot initialize seed candidate.")


[INFO] Initializing seed candidates...
Generating initial ICF and storing in CAN and PR...
ICF calculated for 44 features
Generated bitmap with 3096 bits
[OK] Stored initial candidate in CAN
[OK] Stored initial candidate in PR
Generating initial ICF and storing in CAN and PR...
ICF calculated for 45 features
Generated bitmap with 3096 bits
[OK] Stored initial candidate in CAN
[OK] Stored initial candidate in PR
Generating initial ICF and storing in CAN and PR...
ICF calculated for 47 features
Generated bitmap with 3096 bits
[OK] Stored initial candidate in CAN
[OK] Stored initial candidate in PR
Generating initial ICF and storing in CAN and PR...
ICF calculated for 42 features
Generated bitmap with 3096 bits
[OK] Stored initial candidate in CAN
[OK] Stored initial candidate in PR
Generating initial ICF and storing in CAN and PR...
ICF calculated for 45 features
Generated bitmap with 3096 bits
[OK] Stored initial candidate in CAN
[OK] Stored initial candidate in PR
Generating initial I

In [22]:
# 9. Store target label for worker compatibility
connections['DATA'].set('label', label)
print(f"[INFO] Target label '{label}' set for worker processing")

# 10. Store classifier metadata
metadata = {
            'dataset': dataset_name,
            'classifier_path': classifier_path,
            'n_estimators': sklearn_rf.n_estimators,
            'max_depth': sklearn_rf.max_depth,
            'n_features': sklearn_rf.n_features_in_,
            'classes': list(sklearn_rf.classes_.astype(str)),
            'timestamp': datetime.datetime.now().isoformat()
}
connections['DATA'].set('classifier_metadata', json.dumps(metadata))

print(f"\n[SUCCESS] Successfully initialized {dataset_name}")
print(f"[SUCCESS] Pre-trained classifier loaded from: {os.path.basename(classifier_path)}")
print(f"[SUCCESS] Ready for worker processing with {len(stored_samples)} samples")


[INFO] Target label '0.0' set for worker processing

[SUCCESS] Successfully initialized karhunen
[SUCCESS] Pre-trained classifier loaded from: karhunen_nbestim_100_maxdepth_5.mod.json
[SUCCESS] Ready for worker processing with 11 samples
