In [1]:
"""
This script test the performance of GA search by finding on average how many compuations (calling rkf_validator) required to 
find one of the top 10 chromosome and the best chromosome
that is found by Grid Search (ground truth)

Acknowledgement:
This script was referenced from Lab 7
ChatGPT has been used for debugging and previded ideas for writing

"""

import numpy as np
import pandas as pd
from casper_test import rkf_validator
from utilities import import_data
from utilities import set_seed
from sklearn.model_selection import train_test_split
from hypers_grid_search import HYPERPARAMETERS_CANDIDATES
from hypers_grid_search import SEARCH_SPACE_SIZE
from hypers_grid_search import chromosome_to_hyperparameters
from hypers_grid_search import create_table

# Use the result from grid search when calculate the score of a chromosome
FAST_MODE = True

DEVICE = 'cpu'

# set seed
SEED = 4660
if SEED != None:
    set_seed(SEED)

# Number of training for each hyperparameter combination
n_splits = 10
n_repeats = 4

# define GA settings
DNA_SIZE = 12             # number of bits in DNA
POP_SIZE = 100           # population size
CROSS_RATE = 0.8          # DNA crossover probability
MUTATION_RATE = 0.05      # mutation probability
   
TOURNAMENT_SIZE = 3       
N_ELITES = 5  # Number of elite individuals to carry forward

# The scores for all 4096 possible chromosomes
FITNESS_TABLE = pd.read_csv('results/fitness_table.csv', index_col=0, dtype={0: str})


def initialize_population():
    """
    Create the initial population
    output: a matrix (POP_SIZE x DNA_SIZE)
    """
    return np.random.randint(2, size=(POP_SIZE, DNA_SIZE))

def compute_fitness_all(population, train_data):
    """
    Compute the fitness of all individuals in a population
    return a fitness table containing fitness score for all individual in the population
    """
    if FAST_MODE:
        chromosome_strings = [''.join(map(str, row)) for row in population] # convert into a list of string
        return FITNESS_TABLE.loc[chromosome_strings]
    else:
        fitness_table = create_table()
        for individual in population:
            chromosome = ''.join(map(str, individual)) # convert into a string
            hyperparameters = chromosome_to_hyperparameters(chromosome)
            MSE = rkf_validator(train_data, hyperparameters, n_splits, n_repeats, device=DEVICE, fast_mode=True, verbose=False)
            fitness_table.loc[chromosome] = MSE
        return fitness_table

def tournament_selection(population, fitness_table):
    selected_indices = np.random.choice(len(population), TOURNAMENT_SIZE, replace=False)
    selected = population[selected_indices]
    fitnesses = np.array(fitness_table.iloc[selected_indices,0])
    best_index = np.argmin(fitnesses)
    return selected[best_index]

def crossover(parent1, parent2):
    """
    Perform single point crossover based on CROSS_RATE
    """
    if np.random.rand() < CROSS_RATE:
        point = np.random.randint(DNA_SIZE)
        child1 = np.hstack([parent1[:point], parent2[point:]])
        child2 = np.hstack([parent2[:point], parent1[point:]])
        return child1, child2
    else:
        return parent1.copy(), parent2.copy()

def mutate(child):
    """
    Mutate a child based on MUTATION_RATE
    """
    for point in range(DNA_SIZE):
        if np.random.rand() < MUTATION_RATE:
            child[point] = 1 if child[point] == 0 else 0
    return child

def select_elites(population, n_elites, fitness_table):
    """
    Get the top n_elites individual from the polupation
    """
    fitnesses = np.array(fitness_table.iloc[:,0])
    elite_indices = np.argsort(fitnesses)[:n_elites]
    return population[elite_indices]

def get_the_best_individual(fitness_table):
    """
    Get the best individual in a population given a individual-score mapping
    """
    best_individual = fitness_table.index[0]
    best_score = fitness_table.iloc[0,0]
    return best_individual, best_score

def genetic_algorithm_test(train_data, verbose = False):
    """
    Compute a population of hyperparameter combinations after N_GENERATIONS generations
    """

    top_10s = FITNESS_TABLE.sort_values(by='MSE', ascending=True).head(10)

    population = initialize_population() # population of the first generation

    num_gen_to_get_the_best = 0
    num_gen_to_get_top_10 = 0
    top_10_obtained = False

    # for generation in range(1, N_GENERATIONS + 1):

    generation = 1
    while(True):

        # Calculate the fitness for all individual after the i th generation
        fitness_table = compute_fitness_all(population, train_data)

        # Get the best chromosome and the score
        best_chromosome, best_score = get_the_best_individual(fitness_table)
        if verbose:
            print(f'Generation: {generation}   Best Hyperparameters: {chromosome_to_hyperparameters(best_chromosome)}   MSE: {best_score}')

        if not top_10_obtained and best_chromosome in top_10s.index:
            top_10_obtained = True
            num_gen_to_get_top_10 = generation
        
        if best_chromosome == top_10s.index[0]:
            num_gen_to_get_the_best = generation
            break

        # Start creating the (i + 1) th generation
        elites = select_elites(population, N_ELITES, fitness_table)

        new_population = elites.tolist()

        while len(new_population) < POP_SIZE:
            parent1 = tournament_selection(population, FITNESS_TABLE)
            parent2 = tournament_selection(population, FITNESS_TABLE)

            child1, child2 = crossover(parent1, parent2)

            child1 = mutate(child1)
            child2 = mutate(child2)

            new_population.extend([child1, child2])

        population = np.array(new_population[:POP_SIZE])
        generation += 1

    return num_gen_to_get_the_best, num_gen_to_get_top_10

In [2]:
# import data
data, _, _ = import_data()
train_data, test_data, _, _ = train_test_split(data, data.iloc[:,0], test_size=0.2, random_state=SEED)

# Run GA search 2000 times
NUM_TEST = 2000
result = []
for i in range(1, NUM_TEST + 1):
    num_gen_to_get_the_best, num_gen_to_get_top_10 = genetic_algorithm_test(train_data)
    result.append([num_gen_to_get_the_best, num_gen_to_get_top_10])
    if i % 200 == 0:
        print(f'{i}/{NUM_TEST}   Number of Gen to get the best chromosome: {num_gen_to_get_the_best}   Number of Gen to get one of the top 10s: {num_gen_to_get_top_10}')
result = np.array(result)

num_gen_to_get_the_best = result[:, 0]
num_gen_to_get_top_10 = result[:, 1]

print()
print("Number of Gen to get the best chromosome:")
print(f'Mean: {np.mean(num_gen_to_get_the_best)}')
print(f'Median: {np.median(num_gen_to_get_the_best)}')
print(f'Standard Deviation: {np.std(num_gen_to_get_the_best)}')
print()
print("Number of Gen to get one of the top 10s:")
print(f'Mean: {np.mean(num_gen_to_get_top_10)}')
print(f'Median: {np.median(num_gen_to_get_top_10)}')
print(f'Standard Deviation: {np.std(num_gen_to_get_top_10)}')

200/2000   Number of Gen to get the best chromosome: 25   Number of Gen to get one of the top 10s: 5
400/2000   Number of Gen to get the best chromosome: 243   Number of Gen to get one of the top 10s: 7
600/2000   Number of Gen to get the best chromosome: 20   Number of Gen to get one of the top 10s: 4
800/2000   Number of Gen to get the best chromosome: 69   Number of Gen to get one of the top 10s: 4
1000/2000   Number of Gen to get the best chromosome: 36   Number of Gen to get one of the top 10s: 8
1200/2000   Number of Gen to get the best chromosome: 46   Number of Gen to get one of the top 10s: 2
1400/2000   Number of Gen to get the best chromosome: 12   Number of Gen to get one of the top 10s: 11
1600/2000   Number of Gen to get the best chromosome: 5   Number of Gen to get one of the top 10s: 4
1800/2000   Number of Gen to get the best chromosome: 9   Number of Gen to get one of the top 10s: 4
2000/2000   Number of Gen to get the best chromosome: 1   Number of Gen to get one of 