In [27]:
import numpy as np
import pandas as pd
import onnxruntime as rt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [28]:
# Load the dataset
data = pd.read_csv('data/synth_data_for_training.csv')
data_biased = pd.concat([data, data[data['checked'] == 1]], axis=0)
data_reduced = data_biased

# Check how imbalanced the dataset is
data_reduced['checked'].value_counts(normalize=True)
y = data_reduced['checked']
X = data_reduced.drop(['checked'], axis=1)
X = X.astype(np.float32)
# X_test = X
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)

# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler(random_state=42)
# X_test, y_test = rus.fit_resample(X, y)
X_test = X
y_test = y


In [29]:
print(f'{type(X_test)}: {X_test.shape} \n {X_test}')
print(f'{type(X_test.values)}: {X_test.values.shape} \n {X_test.values}')

<class 'pandas.core.frame.DataFrame'>: (13910, 315) 
        adres_aantal_brp_adres  adres_aantal_verschillende_wijken  \
0                         6.0                                3.0   
1                         4.0                                2.0   
2                         4.0                                2.0   
3                         3.0                                2.0   
4                         3.0                                3.0   
...                       ...                                ...   
12609                     2.0                                2.0   
12618                     1.0                                1.0   
12629                     3.0                                3.0   
12634                     5.0                                3.0   
12638                     3.0                                2.0   

       adres_aantal_verzendadres  adres_aantal_woonadres_handmatig  \
0                            1.0                           

In [30]:
# Load the model
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import Manager

def predict(session, solution):
    y_pred_solution = session.run(None, {'X': solution.reshape(1, -1).astype(np.float32)})

    # Predict checked or not checked
    label = y_pred_solution[0][0]
    
    # Only have two classes: checked and not checked
    confidence_top1_label = y_pred_solution[1][0][0]
    confidence_top2_label = y_pred_solution[1][0][1]
    
    return (label, confidence_top1_label, confidence_top2_label)

def get_mean_and_variance(feature):
    mean = feature.mean()
    variance = feature.var()
    
    return mean, variance

def calculate_objective_value(y_true, y_pred): 
    # Get the predicted label and confidence
    label_original = y_true
    label_mutated, confidence_top1_label, confidence_top2_label = y_pred
    
    # If the prediction is different from the true label: fitness = - confidence_top1_label
    # If the prediction is the same as the true label: fitness = confidence_top1_label - confidence_top2_label    
    return -confidence_top1_label if label_mutated != label_original else confidence_top1_label - confidence_top2_label

def mutate(solution, mutation_rate):
    mutated_solution = solution.copy()
    num_features = len(mutated_solution)-1
    mutation_rate = np.random.randint(10, 50)/100
    num_mutations = max(1, int(mutation_rate * num_features))

    # Mutate a random subset of the features
    for _ in range(num_mutations):
        feature_index = np.random.randint(num_features)
        mean, variance = get_mean_and_variance(X_test[X_test.columns[feature_index]])
        mutation_value = np.random.normal(loc=mean, scale=np.sqrt(variance))
        mutated_solution[feature_index] += mutation_value if np.random.rand() < 0.5 else -mutation_value

    return mutated_solution #.reshape(1, -1)

def crossover(solution1, solution2):
    # Crossover by taking first column feature values of solution1 and second part of solution2
    # The crossover point is randomly selected to be in the last 10% of the features
    # Assumes that the label stays the same
    crossover_point = np.random.randint(len(solution1)//10, len(solution1)//3)
    new_solution1 = np.concatenate([solution1[:crossover_point], solution2[crossover_point:]])
    new_solution2 = np.concatenate([solution2[:crossover_point], solution1[crossover_point:]])
    
    return new_solution1, new_solution2

def tournament_selection(population, objective_values, tournament_size=2):
    # Randomly select tournament_size individuals from the population
    tournament_indices = np.random.choice(len(population)-1, size=tournament_size, replace=False)
    tournament_objective_values = [objective_values[i] for i in tournament_indices]
    
    # Select the best individual from the tournament
    best_index = tournament_indices[np.argmin(tournament_objective_values)] if np.random.rand() >= 0.4 else np.random.randint(len(population)-1)
    return population[best_index]

def genetic_algorithm(population, num_generations, session, population_labels, mutation_rate=0.1, crossover_rate=0.5):
    if(session == 1):
        # session = rt.InferenceSession("../../Group3/model_1.onnx")
        session = rt.InferenceSession("model/model_1.onnx")
    elif(session == 2):
        # session = rt.InferenceSession("../../Group3/model_2.onnx")
        session = rt.InferenceSession("model/model_2.onnx")
    else:
        return "Invalid session"
    # population = [X_test.reshape(1, -1) for _ in range(population_size)]
    population = pd.concat(
        [population, population_labels], axis=1).values
    
    population_size = len(population)
    objective_values = [calculate_objective_value(individual[-1], predict(session, individual[:len(individual)-1])) for individual in population]

    generations = []
    for g in range(num_generations):
        new_population = np.empty((0, population.shape[1]))
        
        # Elitism: keep the best individual in the population
        # best_index = np.argmin(objective_values)
        # new_population = np.append(new_population, [population[best_index]], axis=0)
        # new_population.append(population[best_index].reshape(1, -1))
        
        while len(new_population) < population_size:
            # Select parents using tournament selection
            parent1 = tournament_selection(population, objective_values)
            parent2 = tournament_selection(population, objective_values)
            
            # Perform crossover
            if np.random.rand() < crossover_rate:
                offspring1, offspring2 = crossover(parent1, parent2)
            else:
                offspring1, offspring2 = parent1, parent2
            
            # Perform mutation
            offspring1 = mutate(offspring1, mutation_rate)
            offspring2 = mutate(offspring2, mutation_rate)
            
            new_population = np.append(new_population, [offspring1, offspring2], axis=0)
            # new_population.extend([offspring1, offspring2])
        
        # Update the population
        population = new_population
        objective_values = [calculate_objective_value(individual[-1], predict(session, individual[:len(individual)-1])) for individual in population]
        
        # # To output statistics
        # best_per_generation_index = np.argmin(objective_values)
        # # best_per_generation_objective_value = objective_values[best_per_generation_index]
        # best_per_generation_individual = population[best_per_generation_index]
        
        # generations.append(best_per_generation_individual)
    
    best_index = np.argmin(objective_values)
    
    # return best across all generations, best objective value, and a list of generations = pairs (best individual, best individual label) per generation
    return population

In [31]:
import multiprocessing

pool = multiprocessing.Pool(processes=2)

results1 = pool.apply_async(genetic_algorithm, [X_test, 2, 1, y_test, 0.2])
generations_m1 = results1.get()

results2 = pool.apply_async(genetic_algorithm, [X_test, 2, 2, y_test, 0.2])
generations_m2 = results2.get()

In [32]:
from tabulate import tabulate
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    if(len(np.unique(y_true)) > 1):
        roc_auc = roc_auc_score(y_true, y_pred)
    else:
        roc_auc = 0.0
    
    return accuracy, precision, recall, f1, roc_auc

y_true = y_test.values

# MODEL 1
new_session_m1 = rt.InferenceSession("../../Group3/model_1.onnx")
y_pred_original_m1 = [predict(new_session_m1, sample)[0] for sample in X_test.values]

y_true_best_per_gen_m1 = [solution[-1] for solution in generations_m1]
y_pred_best_per_gen_m1 = [predict(new_session_m1, solution[:len(solution)-1])[0] for solution in generations_m1]

# MODEL 2
new_session_m2 = rt.InferenceSession("../../Group3/model_2.onnx")
y_pred_original_m2 = [predict(new_session_m2, sample)[0] for sample in X_test.values]

y_true_best_per_gen_m2 = [solution[-1] for solution in generations_m2]
y_pred_best_per_gen_m2 = [predict(
    new_session_m2, solution[:len(solution)-1])[0] for solution in generations_m2]


# Calculate metrics
metrics_original_m1 = calculate_metrics(y_true, y_pred_original_m1)
metrics_gen_m1 = calculate_metrics(y_true_best_per_gen_m1, y_pred_best_per_gen_m1)

metrics_original_m2 = calculate_metrics(y_true, y_pred_original_m2)
metrics_gen_m2 = calculate_metrics(y_true_best_per_gen_m2, y_pred_best_per_gen_m2)

# Metrics for two models
model_metrics_1 = [
    ("Original", {"accuracy": metrics_original_m1[0],
     "precision": metrics_original_m1[1], "recall": metrics_original_m1[2], "f1": metrics_original_m1[3], "roc_auc": metrics_original_m1[4]}),
    ("Mutated", {"accuracy": metrics_gen_m1[0],
     "precision": metrics_gen_m1[1], "recall": metrics_gen_m1[2], "f1": metrics_gen_m1[3], "roc_auc": metrics_gen_m1[4]})
]

model_metrics_2 = [ 
    ("Original", {"accuracy": metrics_original_m2[0],
        "precision": metrics_original_m2[1], "recall": metrics_original_m2[2], "f1": metrics_original_m2[3], "roc_auc": metrics_original_m2[4]}),
    ("Mutated", {"accuracy": metrics_gen_m2[0],
        "precision": metrics_gen_m2[1], "recall": metrics_gen_m2[2], "f1": metrics_gen_m2[3], "roc_auc": metrics_gen_m2[4]})
]

# Print the metrics

table_data_1 = [[model] + [metrics[metric] for metric in ["accuracy",
                                                        "precision", "recall", "f1", "roc_auc"]] for model, metrics in model_metrics_1]
table_data_2 = [[model] + [metrics[metric] for metric in ["accuracy",
                                                        "precision", "recall", "f1", "roc_auc"]] for model, metrics in model_metrics_2]

table_1_str = tabulate(table_data_1, headers=[
                       "Data", "Accuracy", "Precision", "Recall", "F1", "ROC"], tablefmt="grid")
table_2_str = tabulate(table_data_2, headers=[
                       "Data", "Accuracy", "Precision", "Recall", "F1", "ROC"], tablefmt="grid")

table_1_str_with_title = "Metrics model 1:\t\t\t\t\t\t\t\n" + table_1_str
table_2_str_with_title = "Metrics model 2:\n" + table_2_str

table_1_lines = table_1_str_with_title.split('\n')
table_2_lines = table_2_str_with_title.split('\n')

merged_lines = [f"{table_1_line}\t\t\t{table_2_line}" for table_1_line,
                table_2_line in zip(table_1_lines, table_2_lines)]

merged_table_str = '\n'.join(merged_lines)

print(merged_table_str)

# Print the confusion matrices
print("\n")

cm_original_1 = confusion_matrix(y_true, y_pred_original_m1)
cm_mutated_1 = confusion_matrix(y_true_best_per_gen_m1, y_pred_best_per_gen_m1)

# Confusion matrices for model 2
cm_original_2 = confusion_matrix(y_true, y_pred_original_m2)
cm_mutated_2 = confusion_matrix(y_true_best_per_gen_m2, y_pred_best_per_gen_m2)

# Constructing tables for confusion matrices
table_cm_original_model1 = [["Negative", cm_original_1[0][0], cm_original_1[0][1]],
                            ["Positive", cm_original_1[1][0], cm_original_1[1][1]]]

table_cm_mutated_model1 = [["Negative", cm_mutated_1[0][0], cm_mutated_1[0][1]],
                           ["Positive", cm_mutated_1[1][0], cm_mutated_1[1][1]]]

table_cm_original_model2 = [["Negative", cm_original_2[0][0], cm_original_2[0][1]],
                            ["Positive", cm_original_2[1][0], cm_original_2[1][1]]]

table_cm_mutated_model2 = [["Negative", cm_mutated_2[0][0], cm_mutated_2[0][1]],
                           ["Positive", cm_mutated_2[1][0], cm_mutated_2[1][1]]]

# Generating tabulated strings for confusion matrices
table_cm_original_model1_str = tabulate(table_cm_original_model1, headers=["Actual\\Predicted",
                                        "Negative", "Positive"], tablefmt="grid")
table_cm_mutated_model1_str = tabulate(table_cm_mutated_model1, headers=["Actual\\Predicted",
                                       "Negative", "Positive"], tablefmt="grid")
table_cm_original_model2_str = tabulate(table_cm_original_model2, headers=["Actual\\Predicted",
                                        "Negative", "Positive"], tablefmt="grid")
table_cm_mutated_model2_str = tabulate(table_cm_mutated_model2, headers=["Actual\\Predicted",
                                       "Negative", "Positive"], tablefmt="grid")

# Adding titles to the confusion matrices
table_cm_original_model1_str_with_title = "Model 1 Confusion matrix Original data:\n" + \
    table_cm_original_model1_str
table_cm_mutated_model1_str_with_title = "Model 1 Confusion matrix Mutated data:\n" + \
    table_cm_mutated_model1_str
table_cm_original_model2_str_with_title = "Model 2 Confusion matrix Original data:\n" + \
    table_cm_original_model2_str
table_cm_mutated_model2_str_with_title = "Model 2 Confusion matrix Mutated data:\n" + \
    table_cm_mutated_model2_str

# Splitting tabulated strings by newline character
table_cm_original_model1_lines = table_cm_original_model1_str_with_title.split(
    '\n')
table_cm_mutated_model1_lines = table_cm_mutated_model1_str_with_title.split(
    '\n')
table_cm_original_model2_lines = table_cm_original_model2_str_with_title.split(
    '\n')
table_cm_mutated_model2_lines = table_cm_mutated_model2_str_with_title.split(
    '\n')

# Merging corresponding lines from all tables
merged_cm_lines = [f"{cm_original_model1_line}\t\t{cm_mutated_model1_line}\t\t{cm_original_model2_line}\t\t{cm_mutated_model2_line}"
                   for cm_original_model1_line, cm_mutated_model1_line, cm_original_model2_line, cm_mutated_model2_line
                   in zip(table_cm_original_model1_lines, table_cm_mutated_model1_lines, table_cm_original_model2_lines, table_cm_mutated_model2_lines)]

# Joining merged lines back into a single string
merged_cm_table_str = '\n'.join(merged_cm_lines)

# Printing the merged confusion matrices
print(merged_cm_table_str)

# save the terminal output to a file
with open('genetic_algorithm_results_our_models.txt', 'w') as f:
    f.write(merged_table_str)
    f.write("\n")
    f.write(merged_cm_table_str)

Metrics model 1:										Metrics model 2:
+----------+------------+-------------+----------+----------+---------+			+----------+------------+-------------+----------+----------+----------+
| Data     |   Accuracy |   Precision |   Recall |       F1 |     ROC |			| Data     |   Accuracy |   Precision |   Recall |       F1 |      ROC |
| Original |   0.866786 |    0.923655 | 0.2917   | 0.443376 | 0.64317 |			| Original |   0.792092 |    0.460873 | 0.842688 | 0.595864 | 0.811766 |
+----------+------------+-------------+----------+----------+---------+			+----------+------------+-------------+----------+----------+----------+
| Mutated  |   0.600288 |    0.492695 | 0.561163 | 0.524705 | 0.5934  |			| Mutated  |   0.564845 |    0.466224 | 0.787594 | 0.585723 | 0.604839 |
+----------+------------+-------------+----------+----------+---------+			+----------+------------+-------------+----------+----------+----------+


Model 1 Confusion matrix Original data:		Model 1 Confusion matrix Mutated

In [33]:
print(len(y_true_best_per_gen_m1))
print(sum(y_true_best_per_gen_m1))

print(len(y_true_best_per_gen_m2))
print(sum(y_true_best_per_gen_m2))

13910
5469.0
13910
5433.0
