## Importación de Bibliotecas y Configuración Inicial

Importamos todas las librerías necesarias para la ejecución del programa.

In [13]:
!pip install deap

import random
import operator
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from deap import base, creator, tools, gp



---
## Carga del Conjunto de Datos

Cargamos el conjunto de datos desde un archivo CSV.

In [14]:
df = pd.read_csv('Datasets/lungcancer_binario.csv')

---
## Preprocesamiento de Datos

Definimos una función para convertir los atributos del DataFrame a variables binarias o aplicar one-hot encoding, sin modificar los atributos que ya son binarios

In [15]:
def convert_to_binary(df):
    """
    Convierte los atributos de un DataFrame a binarios o one-hot,
    sin modificar los atributos que ya son binarios.
    
    Parámetros:
    df : pd.DataFrame
        El DataFrame a convertir.

    Retorna:
    pd.DataFrame
        Un nuevo DataFrame con las columnas convertidas a formato binario o one-hot.
    """
    df_binary = df.copy()

    for column in df_binary.columns:
        if df_binary[column].dtype in ['int64', 'float64']:
            if df_binary[column].nunique() == 2:
                df_binary[column] = df_binary[column].astype(int)  
            elif df_binary[column].nunique() > 3:
                df_binned = pd.cut(df_binary[column], bins=4, labels=False)
                df_one_hot = pd.get_dummies(df_binned, prefix=column)
                df_binary = pd.concat([df_binary, df_one_hot], axis=1)
                df_binary.drop(column, axis=1, inplace=True)
            else:
                df_one_hot = pd.get_dummies(df_binary[column], prefix=column)
                df_binary = pd.concat([df_binary, df_one_hot], axis=1)
                df_binary.drop(column, axis=1, inplace=True)
        elif df_binary[column].dtype == 'object':
            df_one_hot = pd.get_dummies(df_binary[column], prefix=column)
            df_binary = pd.concat([df_binary, df_one_hot], axis=1)
            df_binary.drop(column, axis=1, inplace=True)

    return df_binary

In [16]:
df_bin = convert_to_binary(df)

---
## División de Datos en Entrenamiento y Prueba

Dividimos los datos en conjuntos de entrenamiento y prueba para evaluar el rendimiento del modelo

In [17]:
X = df_bin.drop(['CancerPulmon'], axis=1)
Y = df_bin['CancerPulmon']

X = X.values.astype(int)
Y = Y.values.astype(int)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y)

---
## Definición de Funciones Lógicas

Definimos las funciones lógicas que se usarán en el algoritmo genético.

In [18]:
def logical_and(a, b):
    return int(a and b)

def logical_or(a, b):
    return int(a or b)

def logical_not(a):
    return int(not a)

---
## Configuración del Algoritmo Genético

Establecemos las primitivas, la estructura de fitness y el toolbox para el algoritmo genético.

In [19]:
num_vars = Xtrain.shape[1]
var_names = ['x' + str(i+1) for i in range(num_vars)]

pset = gp.PrimitiveSet("MAIN", num_vars)
pset.renameArguments(**{'ARG' + str(i): var_names[i] for i in range(num_vars)})

pset.addPrimitive(logical_and, 2)
pset.addPrimitive(logical_or, 2)
pset.addPrimitive(logical_not, 1)

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", gp.PrimitiveTree, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
toolbox.register("expr", gp.genHalfAndHalf, pset=pset, min_=1, max_=3)
toolbox.register("individual", tools.initIterate,
                 creator.Individual, toolbox.expr)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("compile", gp.compile, pset=pset)



---
## Evaluación de Individuos

Definimos la función de evaluación basada en la métrica F1.

In [20]:
def eval_individual(individual):
    func = toolbox.compile(expr=individual)
    predictions = []
    for i in range(len(Xtrain)):
        args = tuple(Xtrain[i])
        pred = func(*args)
        predictions.append(pred)
    y_pred = np.array(predictions)
    y_true = Ytrain

    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / \
        (precision + recall) if (precision + recall) > 0 else 0
    return f1,

---
## Registro de Funciones Genéticas

In [21]:
toolbox.register("evaluate", eval_individual)

toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("mate", gp.cxOnePoint)
toolbox.register("expr_mut", gp.genFull, min_=0, max_=2)
toolbox.register("mutate", gp.mutUniform, expr=toolbox.expr_mut, pset=pset)

---
## Control de Crecimiento del Árbol

Limitamos el crecimiento del árbol para evitar el bloat.

In [22]:
MAX_HEIGHT = 5
MAX_SIZE = 50
toolbox.decorate("mate", gp.staticLimit(
    key=operator.attrgetter("height"), max_value=MAX_HEIGHT))
toolbox.decorate("mate", gp.staticLimit(key=len, max_value=MAX_SIZE))
toolbox.decorate("mutate", gp.staticLimit(
    key=operator.attrgetter("height"), max_value=MAX_HEIGHT))
toolbox.decorate("mutate", gp.staticLimit(key=len, max_value=MAX_SIZE))

---
## Función Main
Definimos la función main del algoritmo que ejecuta las generaciones.

In [23]:
def main():
    random.seed(42)
    pop = toolbox.population(n=300)
    hof = tools.HallOfFame(1)

    stats_fit = tools.Statistics(lambda ind: ind.fitness.values[0])
    stats_size = tools.Statistics(len)
    mstats = tools.MultiStatistics(fitness=stats_fit, size=stats_size)
    mstats.register("avg", np.mean)
    mstats.register("std", np.std)
    mstats.register("min", np.min)
    mstats.register("max", np.max)

    toolbox.register("map", map)

    fitnesses = list(toolbox.map(toolbox.evaluate, pop))
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit

    hof.update(pop)

    ngen = 150
    for gen in range(ngen):
        offspring = toolbox.select(pop, len(pop) - 1)
        offspring = list(map(toolbox.clone, offspring))
        
        for child1, child2 in zip(offspring[::2], offspring[1::2]):
            if random.random() < 0.5:
                toolbox.mate(child1, child2)
                del child1.fitness.values
                del child2.fitness.values
        for mutant in offspring:
            if random.random() < 0.2:
                toolbox.mutate(mutant)
                del mutant.fitness.values
                
        invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
        fitnesses = toolbox.map(toolbox.evaluate, invalid_ind)
        for ind, fit in zip(invalid_ind, fitnesses):
            ind.fitness.values = fit
            
        pop[:] = offspring + [hof[0]]
        
        hof.update(pop)
        
        record = mstats.compile(pop)
        print(f"Gen {gen}: Max F1 Score = {record['fitness']['max']:.4f}")
    return pop, hof


---
## Evaluación en el Conjunto de Prueba

Definimos la función para evaluar el mejor individuo en el conjunto de prueba.

In [24]:
def evaluate_on_test_set(individual):
    func = toolbox.compile(expr=individual)
    predictions = []
    for i in range(len(Xtest)):
        args = tuple(Xtest[i])
        pred = func(*args)
        predictions.append(pred)
    y_pred = np.array(predictions)
    y_true = Ytest
    tp = np.sum((y_true == 1) & (y_pred == 1))
    fp = np.sum((y_true == 0) & (y_pred == 1))
    fn = np.sum((y_true == 1) & (y_pred == 0))
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / \
        (precision + recall) if (precision + recall) > 0 else 0
    return f1


---
## Ejecución del Programa y Visualización de Resultados

Ejecutamos el programa y mostramos los resultados.

In [25]:
if __name__ == "__main__":
    pop, hof = main()
    best_individual = hof[0]
    print("\nBest individual:")
    print(best_individual)
    print("\nBest training F1 Score:", best_individual.fitness.values[0])
    test_f1 = evaluate_on_test_set(best_individual)
    print("Test F1 Score:", test_f1)


Gen 0: Max F1 Score = 0.9429
Gen 1: Max F1 Score = 0.9487
Gen 2: Max F1 Score = 0.9487
Gen 3: Max F1 Score = 0.9487
Gen 4: Max F1 Score = 0.9502
Gen 5: Max F1 Score = 0.9533
Gen 6: Max F1 Score = 0.9554
Gen 7: Max F1 Score = 0.9554
Gen 8: Max F1 Score = 0.9554
Gen 9: Max F1 Score = 0.9554
Gen 10: Max F1 Score = 0.9554
Gen 11: Max F1 Score = 0.9557
Gen 12: Max F1 Score = 0.9557
Gen 13: Max F1 Score = 0.9557
Gen 14: Max F1 Score = 0.9557
Gen 15: Max F1 Score = 0.9557
Gen 16: Max F1 Score = 0.9557
Gen 17: Max F1 Score = 0.9576
Gen 18: Max F1 Score = 0.9578
Gen 19: Max F1 Score = 0.9578
Gen 20: Max F1 Score = 0.9580
Gen 21: Max F1 Score = 0.9602
Gen 22: Max F1 Score = 0.9604
Gen 23: Max F1 Score = 0.9604
Gen 24: Max F1 Score = 0.9604
Gen 25: Max F1 Score = 0.9604
Gen 26: Max F1 Score = 0.9628
Gen 27: Max F1 Score = 0.9628
Gen 28: Max F1 Score = 0.9628
Gen 29: Max F1 Score = 0.9628
Gen 30: Max F1 Score = 0.9628
Gen 31: Max F1 Score = 0.9628
Gen 32: Max F1 Score = 0.9630
Gen 33: Max F1 Score