In [8]:
import os
import pandas as pd
from IPython.display import display
import numpy as np
from numba import jit, prange, njit, types
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
from numba.core.errors import NumbaWarning
import warnings


from numba.typed import Dict, List

from scripts import *
from tqdm.notebook import tqdm

#load_and_print_csvs_from_folders()
#test_n_creation()


In [None]:
# Run the sanity checks for the fitness, mutation and crossover routines
# before any evolution code below is exercised.
# NOTE(review): test_fitness / test_mutation / test_crossover are not defined
# in this notebook; presumably they come from `from scripts import *` — confirm.
test_fitness()
test_mutation()
test_crossover()

In [10]:
def mutate_genotype(parent_genotype, grammar, probability, empty_genotype):
    """Build a mutated offspring from a copy of ``parent_genotype``.

    The parent is copied with ``deep_copy_genotype`` and only the copy is
    handed to ``mutate_genotype_inplace``. The parent's hash is taken before
    and after the mutation and both values are passed to
    ``assert_equality_of_hashes`` as a guard against the copy sharing state
    with the parent (presumably that helper raises on mismatch — it is
    defined outside this notebook).

    Args:
        parent_genotype: genotype to derive the offspring from.
        grammar: grammar forwarded to ``mutate_genotype_inplace``.
        probability: mutation probability forwarded to
            ``mutate_genotype_inplace``.
        empty_genotype: second argument forwarded to ``deep_copy_genotype``.

    Returns:
        The mutated copy.
    """
    hash_before = genotype_hash(parent_genotype)

    child = deep_copy_genotype(parent_genotype, empty_genotype)
    child_hash = genotype_hash(child)
    print(f"pre mutation: Parent hash: {hash_before} To be mutated hash: {child_hash}")

    # Only the copy is mutated.
    mutate_genotype_inplace(child, grammar, probability)

    child_hash = genotype_hash(child)
    hash_after = genotype_hash(parent_genotype)
    print(f"post mutation: Parent hash: {hash_after} Mutated hash: {child_hash}")

    # The parent must hash to the same value it had before the mutation.
    assert_equality_of_hashes(hash_before, hash_after)
    return child

def next_generation(genotype_list,
                    grammar: dict,
                    p_mutation: float,
                    p_crossover: float,
                    elite_percentage: float,
                    variables_values: np.ndarray,
                    y_values: np.ndarray,
                    tournament_size: int,
                    print_run_logs=False):
    """Evolve one generation: elitism + tournament selection + variation.

    Fitness is the RMSE-like value returned by the project's fitness helpers;
    lower is better (the best individual is the minimum).

    Steps:
      1. Evaluate every genotype.
      2. Keep the ``int(population_size * elite_percentage)`` best unchanged.
      3. Pick survivors with ``selection_tournament``.
      4. Fill the remaining slots by drawing a survivor and either mutating it
         (probability ``p_mutation``), crossing it with another survivor
         (probability ``p_crossover``) or copying it (otherwise).
      5. Crossover adds two children at once, so the population may overshoot
         by one; the worst surplus offspring are then dropped.

    Args:
        genotype_list: current population.
        grammar: grammar forwarded to the fitness and mutation helpers.
        p_mutation: probability of choosing mutation; also forwarded to
            ``mutate_genotype`` as its mutation probability.
        p_crossover: probability of choosing crossover; also forwarded to
            ``crossover_numba``.
        elite_percentage: fraction of the population carried over unchanged.
        variables_values: input samples used for fitness evaluation.
        y_values: target values used for fitness evaluation.
        tournament_size: size forwarded to ``selection_tournament``.
        print_run_logs: when true, print per-offspring progress messages.

    Returns:
        ``(next_gen, variables)`` where ``next_gen`` is the new population
        (elites first) and ``variables`` is a numpy array holding, in order:
        best_fitness, worst_fitness, avg_fitness, better_than_father,
        worse_than_father, num_repeated, best_elite, worst_elite, avg_elite,
        num_elite.
    """
    population_size = len(genotype_list)

    def calculate_single_fitness(genotype):
        rmse, _ = calculate_fitness(variables_values, y_values, genotype, grammar, NODE_TYPE, GENOTYPE_TYPE)
        return rmse

    # Calculate fitness for all genotypes
    fitness_list = calculate_all_fitnesses(genotype_list, variables_values, y_values, grammar, NODE_TYPE, GENOTYPE_TYPE)

    # Number of individuals whose fitness duplicates another individual's.
    _, counts = np.unique(fitness_list, return_counts=True)
    num_repeated = np.sum(counts) - len(counts)

    # Population-wide metrics (lower fitness is better).
    best_fitness = np.min(fitness_list)
    worst_fitness = np.max(fitness_list)
    avg_fitness = np.mean(fitness_list)

    # Select the elite individuals
    num_elite = int(population_size * elite_percentage)
    elite_indices = best_n_items(fitness_list, num_elite)
    elite_genotypes = [genotype_list[idx] for idx in elite_indices]
    elite_fitness = [fitness_list[idx] for idx in elite_indices]

    if len(elite_fitness) > 0:
        best_elite = np.min(elite_fitness)
        worst_elite = np.max(elite_fitness)
        avg_elite = np.mean(elite_fitness)
    else:
        # A small population or elite_percentage can yield zero elites;
        # np.min/np.max raise ValueError on an empty sequence.
        best_elite = worst_elite = avg_elite = np.nan

    # Tournament selection for survivors. The returned values are indices
    # into genotype_list / fitness_list.
    survivors_indices = selection_tournament(fitness_list, tournament_size)
    survivor_hashes = [genotype_hash(genotype_list[idx]) for idx in survivors_indices]
    survivors_genotypes = [genotype_list[idx] for idx in survivors_indices]

    worse_than_father = 0
    better_than_father = 0

    print("Survivors: ", len(survivors_genotypes))

    # Reproduce, mutate, and crossover to create new offspring
    new_genotypes = []
    num_offspring_needed = population_size - num_elite
    while len(new_genotypes) < num_offspring_needed:
        if print_run_logs:
            print("\n\nPopulation size: ", len(new_genotypes))
        r = np.random.random()
        parent_idx = np.random.randint(0, len(survivors_genotypes))
        parent_genotype = survivors_genotypes[parent_idx]
        if print_run_logs:
            print(f"Hash for parent: {survivor_hashes[parent_idx]}")

        if r < p_mutation:
            if print_run_logs:
                print("Mutating...")
            offspring_genotype = mutate_genotype(parent_genotype, grammar, p_mutation, GENOTYPE_TYPE)
        elif r < p_mutation + p_crossover:
            if print_run_logs:
                print("Crossover...")
            parent2_idx = np.random.randint(0, len(survivors_genotypes))

            # Retry a bounded number of times to get a second, different survivor.
            attempts = 0
            max_attempts = 10
            while parent2_idx == parent_idx and attempts < max_attempts:
                parent2_idx = np.random.randint(0, len(survivors_genotypes))
                attempts += 1

            # Decide on the actual outcome rather than the retry counter, so a
            # distinct partner found on the very last retry is still used.
            if parent2_idx == parent_idx:
                print("Couldn't find a different parent to crossover, just mutating")
                offspring_genotype = mutate_genotype(parent_genotype, grammar, p_mutation, GENOTYPE_TYPE)
            else:
                if print_run_logs:
                    print(f"Hash for parent 2: {survivor_hashes[parent2_idx]}")

                parent2_genotype = survivors_genotypes[parent2_idx]
                offspring_1, offspring_2 = crossover_numba(parent_genotype, parent2_genotype, p_crossover, GENOTYPE_TYPE)

                # parent_idx / parent2_idx are positions in the survivors list;
                # map them back to population indices before reading fitness_list.
                parent1_fitness = fitness_list[survivors_indices[parent_idx]]
                parent2_fitness = fitness_list[survivors_indices[parent2_idx]]
                parents_mean_fitness = (parent1_fitness + parent2_fitness) / 2

                offspring_1_fitness = calculate_single_fitness(offspring_1)
                offspring_2_fitness = calculate_single_fitness(offspring_2)
                if print_run_logs:
                    print("Created offspring with fitness: ", offspring_1_fitness, " and ", offspring_2_fitness)
                for child_fitness in (offspring_1_fitness, offspring_2_fitness):
                    if child_fitness < parents_mean_fitness:
                        better_than_father += 1
                    else:
                        worse_than_father += 1

                # Both children join the population: the second one here, the
                # first one through the common append below.
                offspring_genotype = offspring_1
                new_genotypes.append(offspring_2)
        else:
            if print_run_logs:
                print("Reproducing...")
            offspring_genotype = deep_copy_genotype(parent_genotype, GENOTYPE_TYPE)

        new_genotypes.append(offspring_genotype)

    # Combine elite and new offspring to create the next generation
    next_gen = elite_genotypes + new_genotypes

    # Take the extra genotypes out: drop exactly the surplus, choosing the
    # worst (highest fitness) offspring and never touching the elites.
    excess = len(next_gen) - population_size
    if excess > 0:
        fitness_list_next_gen = np.asarray(
            calculate_all_fitnesses(next_gen, variables_values, y_values, grammar, NODE_TYPE, GENOTYPE_TYPE)
        )
        num_protected = len(elite_genotypes)
        # argsort is ascending (NaN last), so the tail holds the worst offspring.
        offspring_order = np.argsort(fitness_list_next_gen[num_protected:])
        # Delete from the highest index down so earlier deletions do not shift
        # the positions still to be removed.
        worst_individuals = sorted(
            (int(pos) + num_protected for pos in offspring_order[-excess:]),
            reverse=True,
        )
        print("Worst individuals for removal: ", worst_individuals)
        for idx in worst_individuals:
            del next_gen[idx]

    stat_names = (
        "best_fitness",
        "worst_fitness",
        "avg_fitness",
        "better_than_father",
        "worse_than_father",
        "num_repeated",
        "best_elite",
        "worst_elite",
        "avg_elite",
        "num_elite",
    )
    variables = np.array([
        best_fitness,
        worst_fitness,
        avg_fitness,
        better_than_father,
        worse_than_father,
        num_repeated,
        best_elite,
        worst_elite,
        avg_elite,
        num_elite
    ])
    for name, value in zip(stat_names, variables):
        print(f"{name}: {value}")

    print("Next gen length: ", len(next_gen))
    return next_gen, variables


# Smoke run: build a small random population (20 individuals, depth 3,
# 2 input variables) and evolve it for a single generation.
genotypes, grammar = create_n_genotypes(20, 3, 2)

# Uncomment to dump the initial population:
# for i, genotype in enumerate(genotypes):
#     print("\nGenotype: ", i)
#     print_resumed_genotype(genotype)
# print("\n\n\n")

# Five hand-written samples with two variables each, and their targets.
test_data = np.array([
    [-1.23592861, -1.36410559],
    [-0.60259712, -0.60758157],
    [2.80419539, 2.66919459],
    [-0.22628393, -2.97797806],
    [2.0402239, -0.59282888],
])
test_y = np.array([
    6.51571868,
    1.14283484,
    40.67709954,
    7.42636336,
    9.6026114,
])

next_gen, _ = next_generation(
    genotypes,
    grammar,
    p_mutation=0.3,
    p_crossover=0.6,
    elite_percentage=0.1,
    variables_values=test_data,
    y_values=test_y,
    tournament_size=2,
    print_run_logs=True,
)

Before tree creation: 1g4mr27i026hf and copy 1g4mr27i026hf and empty 0
After tree creation: 1g4mr27i026hf and copy 144ikh5rcuqdy and empty 0
After fitness: 1g4mr27i026hf and copy 144ikh5rcuqdy and empty 0
After fitness: 1g4mr27i026hf and copy 144ikh5rcuqdy and empty 0
After fitness: 1g4mr27i026hf and copy 144ikh5rcuqdy and empty 0
After fitness: 1g4mr27i026hf and copy 144ikh5rcuqdy and empty 0
After fitness: 1g4mr27i026hf and copy 144ikh5rcuqdy and empty 0
Before tree creation: fv44t8v8myi2 and copy fv44t8v8myi2 and empty 0
After tree creation: fv44t8v8myi2 and copy 14z8ly4q6o3x3 and empty 0
After fitness: fv44t8v8myi2 and copy 14z8ly4q6o3x3 and empty 0
After fitness: fv44t8v8myi2 and copy 14z8ly4q6o3x3 and empty 0
After fitness: fv44t8v8myi2 and copy 14z8ly4q6o3x3 and empty 0
After fitness: fv44t8v8myi2 and copy 14z8ly4q6o3x3 and empty 0
After fitness: fv44t8v8myi2 and copy 14z8ly4q6o3x3 and empty 0
Before tree creation: 1ottymshwykf2 and copy 1ottymshwykf2 and empty 0
After tree crea

In [None]:
def parse_df(case_df: pd.DataFrame):
    """Split a dataset frame into a feature matrix and a target vector.

    The last column is the target; every other column is a feature. The
    split is positional, so it stays correct when column labels repeat
    (dropping by label would remove every column sharing the last label).

    Args:
        case_df: frame whose last column holds the target values.

    Returns:
        ``(variable_matrix, y)`` — a 2-D numpy array with all columns but the
        last, and a 1-D numpy array with the last column.
    """
    # y is the last column of the frame.
    y = case_df.iloc[:, -1].to_numpy()

    # Everything before the last column is X. copy=True keeps the result
    # independent of the frame's data, as the original drop-based version did.
    variable_matrix = case_df.iloc[:, :-1].to_numpy(copy=True)

    return variable_matrix, y

def genetic_programming(num_generations, population_size, max_depth, p_mutation, p_crossover, elite_percentage, variables_values, y_values, tournament_size, print_run_logs=True):
    """Run the evolutionary loop for a fixed number of generations.

    Creates an initial population with ``create_n_genotypes`` and repeatedly
    replaces it with the result of ``next_generation``.

    Args:
        num_generations: number of generations to evolve.
        population_size: number of genotypes in the initial population.
        max_depth: depth argument forwarded to ``create_n_genotypes``.
        p_mutation: mutation probability forwarded to ``next_generation``.
        p_crossover: crossover probability forwarded to ``next_generation``.
        elite_percentage: elite fraction forwarded to ``next_generation``.
        variables_values: 2-D collection of input samples; the length of the
            first row is taken as the number of variables.
        y_values: target values, one per sample.
        tournament_size: tournament size forwarded to ``next_generation``.
        print_run_logs: forwarded to ``next_generation``; defaults to True,
            which was previously hard-coded.

    Returns:
        ``(current_genotypes, stats)`` — the final population and a
        ``(num_generations, 10)`` float64 array with one row of statistics
        per generation, in the order produced by ``next_generation``.
    """
    num_variables = len(variables_values[0])
    print("Number of variables: ", num_variables)

    # Create the grammar and the initial population.
    current_genotypes, grammar = create_n_genotypes(population_size, max_depth, num_variables)

    # One row per generation. The column count must match the length of the
    # statistics array returned by next_generation (10 entries).
    stats_columns = 10
    stats = np.zeros(shape=(num_generations, stats_columns), dtype=np.float64)

    # Run the algorithm for the specified number of generations
    for generation in tqdm(range(num_generations), desc="Generations"):
        print(f"\n=== Generation {generation + 1} ===")
        current_genotypes, current_stats = next_generation(
            current_genotypes,
            grammar,
            p_mutation,
            p_crossover,
            elite_percentage,
            variables_values,
            y_values,
            tournament_size,
            print_run_logs,
        )

        # Store the statistics for the current generation
        stats[generation, :] = current_stats

    return current_genotypes, stats

# Load the synth1 training set from synth1/synth1-train.csv; the file has no
# header row, so columns get positional integer labels.
df = pd.read_csv("synth1/synth1-train.csv", header=None)
display(df.head())

# Last column is the target, the rest are the input variables.
variable_matrix, y = parse_df(df)

final_genotype, stats = genetic_programming(
    num_generations=10,
    population_size=50,
    max_depth=3,
    p_mutation=0.3,
    p_crossover=0.6,
    elite_percentage=0.1,
    variables_values=variable_matrix,
    y_values=y,
    tournament_size=10,
)
