In [2]:
import numpy as np
import pandas as pd
import onnxruntime as rt

from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [3]:
# Read the synthetic training data (path is relative to the working directory)
data = pd.read_csv('data/synth_data_for_training.csv')

# Append a second copy of every row with checked == 1 to the original frame.
# NOTE(review): the duplication happens before the train/test split below, so
# copies of the same row can end up on both sides of the split — confirm this
# is intended.
data_biased = pd.concat([data, data.loc[data['checked'] == 1]], axis=0)
data_reduced = data_biased

# Class proportions of the target; the result is neither stored nor displayed
# because this is not the last expression of the cell.
data_reduced['checked'].value_counts(normalize=True)

# Target vector and float32 feature matrix
y = data_reduced['checked']
X = data_reduced.drop(columns=['checked']).astype(np.float32)

# Stratified, shuffled 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=42,
)

# Oversample the training portion only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Fit a random forest on the resampled training data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_smote, y_train_smote)

# Tabulate the forest's feature importances next to the column names
feature_importances = clf.feature_importances_
features = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Rank by importance (highest first) and keep only the 50 top-ranked columns
features_sorted = features.sort_values(by='Importance', ascending=False)
top_feature_names = features_sorted['Feature'].head(50)
X_train_smote = X_train_smote[top_feature_names]
X_test = X_test[top_feature_names]

FileNotFoundError: [Errno 2] No such file or directory: 'data/synth_data_for_training.csv'

In [4]:
# Load the model: open an ONNX Runtime inference session on the exported model
# file. The path is relative to the notebook's working directory; the
# constructor raises if the file does not exist there.
new_session = rt.InferenceSession("model/bad_model.onnx")

def get_predict_label_and_confidence(session, solution):
    """Classify one sample and return its label with ranked confidences.

    Parameters
    ----------
    session : object with ``run(output_names, input_feed)``
        Called as ``session.run(None, {'X': ...})``, so the model input is
        expected to be named ``'X'``.
    solution : numpy.ndarray
        Sample to classify; it is cast to ``float32`` before inference.
        Only the first prediction of the result is read, so this is assumed
        to hold a single row — TODO confirm against callers.

    Returns
    -------
    tuple
        ``(label, confidence_top1_label, confidence_top2_label)`` where
        ``confidence_top1_label >= confidence_top2_label``.
    """
    y_pred_solution = session.run(None, {'X': solution.astype(np.float32)})

    # Predict checked or not checked
    label = y_pred_solution[0][0]

    # Only have two classes: checked and not checked.
    # The second output is read at entries 0 and 1 of its first element
    # (presumably the scores of class 0 and class 1 — confirm against the
    # exported model). Those entries are in a fixed order, not ranked, so they
    # are ordered here; otherwise "top1" would be the smaller score whenever
    # the second entry wins and the top1 - top2 margin would come out negative.
    sample_scores = y_pred_solution[1][0]
    first_score, second_score = sample_scores[0], sample_scores[1]
    confidence_top1_label = max(first_score, second_score)
    confidence_top2_label = min(first_score, second_score)

    return (label, confidence_top1_label, confidence_top2_label)


def get_mean_and_variance(X_test):
    """Return ``(mean, variance)`` of ``X_test`` as computed by NumPy.

    The input is handed to ``np.mean`` and ``np.var`` without an explicit
    axis. For a plain array that reduces over every element.

    NOTE(review): for a pandas DataFrame the reduction is dispatched to
    pandas, and whether it is per column or over all values appears to depend
    on the installed pandas version — confirm which one is wanted.
    """
    return np.mean(X_test), np.var(X_test)


def get_fitness(y_true, y_pred):
    """Score a prediction against the reference label.

    Parameters
    ----------
    y_true
        Reference label for the sample.
    y_pred : tuple
        ``(label, confidence_top1_label, confidence_top2_label)``.

    Returns
    -------
    ``-confidence_top1_label`` when the predicted label differs from
    ``y_true``; otherwise ``confidence_top1_label - confidence_top2_label``.
    """
    predicted_label, top1_confidence, top2_confidence = y_pred

    # Prediction differs from the reference: negated top-1 confidence
    if predicted_label != y_true:
        return -top1_confidence

    # Prediction agrees with the reference: margin between the two confidences
    return top1_confidence - top2_confidence


def random_gaussian_mutation(solution, mutation_rate, mean, variance):
    """Return a copy of ``solution`` with Gaussian noise added to a few features.

    Features live on the last axis, so both a flat vector of shape ``(n,)``
    and a single-row batch of shape ``(1, n)`` are handled. Counting features
    with ``len()`` would report 1 for a ``(1, n)`` input and the noise would
    then be added to the whole row instead of to a random subset.

    Parameters
    ----------
    solution : array-like
        Sample to mutate; it is copied and never modified in place.
    mutation_rate : float
        Fraction of the features to mutate. At least one mutation is always
        applied when the sample has any feature.
    mean, variance : scalar or array-like of length ``n``
        Mean and variance of the noise. A scalar is shared by all features;
        a vector supplies one value per feature.

    Returns
    -------
    numpy.ndarray
        Mutated copy with the same shape and dtype as ``solution``.

    Raises
    ------
    ValueError
        If ``mean`` or ``variance`` cannot be broadcast to ``n`` features.
    """
    mutated_solution = np.array(solution, copy=True)
    num_features = mutated_solution.shape[-1] if mutated_solution.ndim else 0
    if num_features == 0:
        # Nothing to mutate; np.random.randint(0) would raise.
        return mutated_solution

    num_mutations = max(1, int(mutation_rate * num_features))

    # Expand the noise parameters to one entry per feature so that scalar and
    # per-feature statistics go through the same code path.
    feature_means = np.broadcast_to(np.asarray(mean, dtype=float), (num_features,))
    feature_stds = np.broadcast_to(np.sqrt(np.asarray(variance, dtype=float)), (num_features,))

    # Mutate a random subset of the features (indices drawn with replacement,
    # so the same feature may be hit more than once)
    for _ in range(num_mutations):
        feature_index = np.random.randint(num_features)
        mutation_value = np.random.normal(loc=feature_means[feature_index],
                                          scale=feature_stds[feature_index])
        mutated_solution[..., feature_index] += mutation_value

    return mutated_solution


def stochastic_hill_climber(initial_solution, session, y_test_sample, mean, variance, num_iterations=10000, num_neighbors=10, mutation_rate=0.1):
    """Search for a mutated sample that minimizes the fitness value.

    Each iteration generates ``num_neighbors`` mutations of the current
    solution, scores all of them, and moves to the lowest-scoring one only if
    it is strictly lower than the current fitness.

    Parameters
    ----------
    initial_solution
        Starting sample; passed to the model and to the mutation routine.
    session
        Inference session forwarded to ``get_predict_label_and_confidence``.
    y_test_sample
        Reference label used when computing the fitness.
    mean, variance
        Noise parameters forwarded to ``random_gaussian_mutation``.
    num_iterations : int
        Number of search iterations.
    num_neighbors : int
        Number of mutated candidates generated per iteration.
    mutation_rate : float
        Forwarded to ``random_gaussian_mutation``.

    Returns
    -------
    tuple
        ``(solution, fitness)`` for the lowest-fitness sample found.
    """

    def score(candidate):
        # Fitness of one candidate under the model held by ``session``
        return get_fitness(y_test_sample, get_predict_label_and_confidence(session, candidate))

    best_solution = initial_solution
    best_fitness = score(best_solution)

    for _step in range(num_iterations):
        # Generate every candidate first, then score them all
        candidates = []
        for _k in range(num_neighbors):
            candidates.append(random_gaussian_mutation(best_solution, mutation_rate, mean, variance))
        candidate_scores = [score(candidate) for candidate in candidates]

        # Minimize the objective function
        winner = np.argmin(candidate_scores)
        winner_score = candidate_scores[winner]

        # Accept the move only when it is a strict improvement
        if winner_score < best_fitness:
            best_solution, best_fitness = candidates[winner], winner_score

    return best_solution, best_fitness

# One (original sample, label, mutated sample, fitness) tuple per processed sample
best_solutions_scores = []
global_best_score = None

# Draw 100 distinct row positions from the test set and select those rows
subset_indices = np.random.choice(len(X_test), size=100, replace=False)
X_test_subset, y_test_subset = X_test.iloc[subset_indices], y_test.iloc[subset_indices]

# Mutation noise statistics are computed from the whole test set
mean, variance = get_mean_and_variance(X_test)

for X_test_sample, y_test_sample in zip(X_test_subset.values, y_test_subset.values):

    # Each selected test sample is its own starting point, shaped as one row
    initial_solution = X_test_sample.reshape(1, -1)

    # Run the algorithm
    final_solution, best_score = stochastic_hill_climber(
        initial_solution,
        new_session,
        y_test_sample,
        mean,
        variance,
        num_iterations=1000,
        num_neighbors=10,
        mutation_rate=0.2,
    )

    # Keep the original sample next to the outcome of the search
    best_solutions_scores.append((X_test_sample, y_test_sample, final_solution, best_score))

NoSuchFile: [ONNXRuntimeError] : 3 : NO_SUCHFILE : Load model from model/bad_model.onnx failed:Load model model/bad_model.onnx failed. File doesn't exist

In [None]:
# Print every recorded result and track the lowest fitness seen
for x_sample, y_sample, solution, o_solution in best_solutions_scores:
    print("Original solution:", x_sample)
    print("Original label:", y_sample)
    print("Mutated solution:", solution)
    print("Fitness:", o_solution)

    if global_best_score is None:
        global_best_score = o_solution
    else:
        global_best_score = min(global_best_score, o_solution)

print("Global best score:", global_best_score)
    