In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from joblib import Parallel, delayed

# --- Genetic-algorithm settings -------------------------------------------
population_size = 50          # individuals kept alive after each generation
crossover_probability = 0.6   # chance that a parent pair is recombined
mutation_probability = 0.05   # per-gene chance of a bit flip
tournament_size = 5           # candidates drawn for each parent tournament
# Each entry is used as a generation count by genetic_algorithm_optimized;
# one progress line is printed after each entry has been processed.
n_runs = [100, 500, 1000, 2000]

# --- Data -----------------------------------------------------------------
dataset = pd.read_csv('diabetes.csv')

# 'Outcome' is the label column; every other column is a candidate feature.
y = dataset['Outcome']
X = dataset.drop(columns='Outcome')

# Fixed 80/20 split so every fitness evaluation sees the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def calculate_fitness_v2(individual, X_train, X_test, y_train, y_test):
    """Score one feature-selection mask by random-forest accuracy.

    Args:
        individual: Sequence of 0/1 genes, one per column of ``X_train``;
            a gene equal to 1 keeps the corresponding column.
        X_train: DataFrame used to fit the classifier.
        X_test: DataFrame used to score the classifier (same columns as
            ``X_train``).
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Returns:
        Accuracy of the fitted classifier on ``X_test``/``y_test``, or
        ``0.0`` when the mask selects no column at all.
    """
    # np.asarray lets plain lists work too: ``[0, 1] == 1`` would otherwise
    # evaluate to a single bool instead of an element-wise mask.
    selected = np.asarray(individual) == 1

    # An all-zero chromosome leaves nothing to train on and the classifier
    # would raise; give it the worst possible score so selection discards it
    # instead of aborting the whole search.
    if not selected.any():
        return 0.0

    X_train_selected = X_train.iloc[:, selected]
    X_test_selected = X_test.iloc[:, selected]

    # Fixed seed: the same mask always receives the same score.
    model = RandomForestClassifier(random_state=48)
    model.fit(X_train_selected, y_train)

    y_pred = model.predict(X_test_selected)
    return accuracy_score(y_test, y_pred)

def parallel_fitness(population, X_train, X_test, y_train, y_test):
    """Evaluate every individual of ``population`` through joblib.

    Each individual is passed, together with the four data splits, to
    ``calculate_fitness_v2``. The scores come back as a NumPy array in the
    same order as ``population``.
    """
    score = delayed(calculate_fitness_v2)
    jobs = (
        score(candidate, X_train, X_test, y_train, y_test)
        for candidate in population
    )
    # n_jobs=-1: let joblib use every available CPU.
    scores = Parallel(n_jobs=-1)(jobs)
    return np.array(scores)

def genetic_algorithm_optimized(X_train, X_test, y_train, y_test, population_size, crossover_probability, mutation_probability, tournament_size, n_runs):
    """Search for a good feature subset with a (mu + lambda) genetic algorithm.

    One random 0/1 population is created and then evolved through every
    entry of ``n_runs`` in turn; the population is NOT re-initialised between
    entries, so each entry continues from where the previous one stopped.
    After each entry the best surviving individual and its fitness are
    printed.

    Args:
        X_train, X_test, y_train, y_test: Data splits forwarded to
            ``parallel_fitness``.
        population_size: Number of individuals kept after each generation.
        crossover_probability: Forwarded to ``crossover``.
        mutation_probability: Forwarded to ``mutation``.
        tournament_size: Forwarded to ``tournament_selection``.
        n_runs: Iterable of generation counts.

    Returns:
        None. Results are reported on stdout only.
    """
    # NOTE(review): fitness is measured on X_test/y_test, so the reported
    # accuracy is the quantity being optimised, not a held-out estimate.
    population = np.random.randint(2, size=(population_size, X_train.shape[1]))

    # Score the initial population once. From here on the fitness array is
    # carried along with the survivors, so it always describes the current
    # ``population`` row for row (the reported best used to be read from
    # scores that belonged to the previous generation's population).
    fitness_values = parallel_fitness(population, X_train, X_test, y_train, y_test)

    for n_run in n_runs:
        for _ in range(n_run):
            parents = tournament_selection(population, fitness_values, tournament_size)
            children = crossover(parents, crossover_probability)
            children = mutation(children, mutation_probability)

            # Only the children are new. calculate_fitness_v2 builds its model
            # with a fixed random_state, so re-scoring the unchanged parents
            # is expected to reproduce the values already in hand.
            children_fitness = parallel_fitness(children, X_train, X_test, y_train, y_test)

            new_population = np.vstack((population, children))
            new_fitness_values = np.concatenate((fitness_values, children_fitness))

            survivors = fitness_based_selection(new_population, new_fitness_values, population_size)

            population = new_population[survivors]
            fitness_values = new_fitness_values[survivors]

        best_individual_index = np.argmax(fitness_values)
        best_individual = population[best_individual_index]
        best_accuracy = fitness_values[best_individual_index]

        print(f"Run {n_run}: Best individual: {best_individual}, Accuracy: {best_accuracy}")

def tournament_selection(population, fitness_values, tournament_size):
    """Pick ``len(population)`` parents by repeated tournaments.

    For each parent slot, ``tournament_size`` distinct individuals are drawn
    at random and the one with the highest fitness wins.

    Args:
        population: 2-D array-like, one individual per row.
        fitness_values: 1-D array-like aligned with ``population``.
        tournament_size: Number of contenders per tournament. Values larger
            than the population are clamped to the population size, because
            sampling without replacement cannot draw more than that.

    Returns:
        Array of the winning individuals (copies), one row per tournament.
        An empty population yields an empty array.

    Raises:
        ValueError: If ``tournament_size`` is smaller than 1.
    """
    population = np.asarray(population)
    fitness_values = np.asarray(fitness_values)
    n_individuals = len(population)

    if n_individuals == 0:
        return np.array([])
    if tournament_size < 1:
        raise ValueError("tournament_size must be at least 1")

    contenders_per_round = min(tournament_size, n_individuals)

    winners = []
    for _ in range(n_individuals):
        contenders = np.random.choice(n_individuals, contenders_per_round, replace=False)
        winners.append(contenders[np.argmax(fitness_values[contenders])])

    # Fancy indexing copies the rows, so later in-place mutation of the
    # result cannot touch ``population``.
    return population[winners]

def crossover(parents, crossover_probability):
    """Produce children by single-point crossover of consecutive parent pairs.

    Parents are paired as (0, 1), (2, 3), ... With probability
    ``crossover_probability`` a pair swaps tails at a random cut point;
    otherwise both parents are copied unchanged. When the number of parents
    is odd, the last parent has no partner and is copied through as-is.

    Args:
        parents: 2-D array, one parent per row.
        crossover_probability: Probability in [0, 1] of recombining a pair.

    Returns:
        Array of children with one row per parent.
    """
    children = []
    n_parents = len(parents)

    # Stop before a trailing unpaired parent so parents[i + 1] always exists.
    for i in range(0, n_parents - 1, 2):
        parent1, parent2 = parents[i], parents[i + 1]

        if np.random.rand() < crossover_probability:
            # The cut point may be 0, in which case the two children are
            # simply the parents in swapped order.
            point = np.random.randint(len(parent1))
            child1 = np.hstack((parent1[:point], parent2[point:]))
            child2 = np.hstack((parent2[:point], parent1[point:]))
        else:
            child1, child2 = parent1.copy(), parent2.copy()

        children.append(child1)
        children.append(child2)

    if n_parents % 2:
        children.append(parents[-1].copy())

    return np.array(children)

def mutation(children, mutation_probability):
    """Flip each gene independently with probability ``mutation_probability``.

    Args:
        children: NumPy array of 0/1 genes (as returned by ``crossover``).
            It is modified in place.
        mutation_probability: Per-gene flip probability in [0, 1].

    Returns:
        The same ``children`` array, after mutation.
    """
    # One uniform draw per gene, generated in a single call; the array is
    # filled in row-major order, i.e. the order a row-by-row, gene-by-gene
    # loop would visit the genes.
    flip = np.random.rand(*children.shape) < mutation_probability
    children[flip] = 1 - children[flip]
    return children

def fitness_based_selection(population, fitness_values, new_population_size):
    """Return the indices of the ``new_population_size`` fittest individuals.

    Args:
        population: Unused; kept so existing callers keep working.
        fitness_values: 1-D array-like of fitness scores.
        new_population_size: How many indices to return. Values larger than
            ``len(fitness_values)`` return every index.

    Returns:
        Index array ordered by ascending fitness (the best individual last).

    Raises:
        ValueError: If ``new_population_size`` is negative.
    """
    if new_population_size < 0:
        raise ValueError("new_population_size must be non-negative")

    ranking = np.argsort(fitness_values)
    # Slice from an explicit start offset: ``ranking[-0:]`` would return the
    # whole array when zero survivors are requested.
    first_kept = max(len(ranking) - new_population_size, 0)
    return ranking[first_kept:]

# Launch the search with the module-level data splits and settings.
genetic_algorithm_optimized(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    population_size=population_size,
    crossover_probability=crossover_probability,
    mutation_probability=mutation_probability,
    tournament_size=tournament_size,
    n_runs=n_runs,
)


Run 100: Best individual: [1 1 1 0 0 1 1 1], Accuracy: 0.7792207792207793
Run 500: Best individual: [1 1 1 0 1 1 1 1], Accuracy: 0.7792207792207793
Run 1000: Best individual: [1 1 1 0 1 1 1 1], Accuracy: 0.7792207792207793
Run 2000: Best individual: [1 1 1 0 0 1 1 1], Accuracy: 0.7792207792207793
