In [1]:
# """Uncomment if running this file independently"""
# data_file = '../combined_data.csv' # Set this variable so data_loader can locate the CSV file on disk.
# %run ../data_loader.ipynb 

'Uncomment if running this file independently'

In [2]:
# https://deap.readthedocs.io/en/master/index.html
# https://deap.readthedocs.io/en/master/tutorials/basic/part1.html
# %pip install deap

In [3]:
import random
from math import sqrt

import numpy as np
from deap import base, creator, tools, algorithms
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Define the problem for DEAP. The single objective is RMSE, which we want to
# minimize, hence weights=(-1.0,).
# creator.create() emits a RuntimeWarning when the class name already exists
# (e.g. when this cell is re-run in the same kernel), so drop any stale
# definitions first; the classes are then always rebuilt from the code below.
for _stale_class in ("FitnessMin", "Individual"):
    if hasattr(creator, _stale_class):
        delattr(creator, _stale_class)
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)

# An individual is a list of X.shape[1] random 0/1 genes, one per column of X.
# NOTE(review): X is not defined in this cell -- presumably it is provided by
# data_loader (see the first cell); confirm before running standalone.
toolbox = base.Toolbox()
toolbox.register("attr_bool", np.random.randint, 0, 2)  # upper bound is exclusive: yields 0 or 1
toolbox.register("individual", tools.initRepeat, creator.Individual,
                 toolbox.attr_bool, n=X.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Define the fitness function for regression
def evalModel(individual):
    """Score a feature subset by the hold-out RMSE of a random forest.

    Parameters
    ----------
    individual : sequence of 0/1 flags
        One flag per column of the module-level ``X``; a column is used
        when its flag equals 1.

    Returns
    -------
    tuple
        One-element tuple ``(rmse,)``, matching the single-weight fitness.
        ``(inf,)`` is returned as a penalty when no feature is selected.
    """
    selected_columns = [i for i, bit in enumerate(individual) if bit == 1]

    # Nothing to train on: bail out before slicing or splitting, so the
    # penalty path never touches the data or fits a model.
    if not selected_columns:
        return (float("inf"),)  # Infinite RMSE as a penalty

    # NOTE(review): positional slicing assumes X is a 2-D NumPy array (a
    # DataFrame would need .iloc) -- confirm against data_loader.
    X_selected = X[:, selected_columns]
    # Fixed random_state so every individual is scored on the same split.
    X_train, X_test, y_train, y_test = train_test_split(
        X_selected, y, test_size=0.2, random_state=42)

    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    rmse = sqrt(mean_squared_error(y_test, predictions))  # Calculate RMSE
    return (rmse,)

# Genetic operators used by eaSimple.
toolbox.register("evaluate", evalModel)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)  # each gene flips with probability 0.05
toolbox.register("select", tools.selTournament, tournsize=3)

# Parameters for GA
population_size = 39
crossover_probability = 0.5
mutation_probability = 0.2
number_of_generations = 4
random_seed = 42

# Seed both generators so a fresh run reproduces the same search:
# the initial population is drawn with np.random (attr_bool), while DEAP's
# selection/crossover/mutation operators draw from Python's `random` module.
random.seed(random_seed)
np.random.seed(random_seed)

pop = toolbox.population(n=population_size)
hof = tools.HallOfFame(1)  # keeps the single best individual seen during the run
# Run the Genetic Algorithm
result, logbook = algorithms.eaSimple(
    pop, toolbox,
    cxpb=crossover_probability,
    mutpb=mutation_probability,
    ngen=number_of_generations,
    verbose=True,
    halloffame=hof,
)