In [41]:
import pandas as pd
import random
from copy import deepcopy
import numpy as np

In [42]:
import import_ipynb 
from accuracy import calc_accuracy

In [43]:
# Read the two X_* CSV files first, then the two y_* CSV files, from data/.
X_train, X_validation = pd.read_csv('data/X_train.csv'), pd.read_csv('data/X_validation.csv')

y_train, y_validation = pd.read_csv('data/y_train.csv'), pd.read_csv('data/y_validation.csv')

In [44]:
class Individual:
    """A candidate solution: an encoded solution (``code``) plus its fitness.

    Args:
        code: The encoded solution; stored as given, without copying.
        fitness: Score attached to ``code``. Defaults to negative infinity,
            i.e. lower than any finite score.
    """

    def __init__(self, code, fitness=float('-inf')):
        self.code, self.fitness = code, fitness

In [45]:
class GA:
    """Genetic algorithm that searches for a good boolean column mask.

    Every individual carries a mask with one entry per column of ``X_train``.
    Its fitness is::

        alpha * acc + (1 - alpha) * (1 - selected / total)

    where ``acc`` is whatever ``calc_accuracy`` returns for the mask
    (presumably a validation accuracy -- confirm in the ``accuracy`` module),
    ``selected`` is the number of True entries and ``total`` is the number of
    columns of ``X_train``. A mask with no True entry scores ``-inf``.
    """

    def __init__(self, X_train :pd.DataFrame, y_train :pd.DataFrame, X_validation :pd.DataFrame, y_validation:pd.DataFrame, population_size:int,
                 num_generations:int, tournament_size:int, mutation_probability:float, alpha :float,elitism_size:int, patience: int,
                 initial_selection_probability: float = 0.25
                ):
        """Store the configuration and draw a random initial population.

        Args:
            X_train, y_train, X_validation, y_validation: Data frames handed
                unchanged to ``calc_accuracy`` on every fitness evaluation.
            population_size: Number of individuals in each generation.
            num_generations: Upper bound on the number of generations.
            tournament_size: Number of contenders drawn per parent selection.
            mutation_probability: Per-gene probability of flipping a bit.
            alpha: Weight of the accuracy term; ``1 - alpha`` weights the
                term that rewards masks with fewer True entries.
            elitism_size: Number of top individuals copied unchanged into the
                next generation.
            patience: Stop after this many consecutive generations without a
                new best fitness.
            initial_selection_probability: Probability that each gene of an
                initial individual is True (default 0.25).
        """
        self.X_train = X_train
        self.y_train = y_train
        self.X_validation = X_validation
        self.y_validation = y_validation
        self.all_column_names = X_train.columns
        self.population_size = population_size
        self.num_generations = num_generations
        self.tournament_size = tournament_size
        self.mutation_probability = mutation_probability
        self.alpha = alpha
        self.elitism_size = elitism_size
        self.patience = patience

        num_columns = X_train.shape[1]
        self.population = [
            Individual(code=(np.random.rand(num_columns) < initial_selection_probability))
            for _ in range(self.population_size)
        ]
        self.best_fitness = -float('inf')
        self.best_solution = None
        self.history = []
        self.no_improve = 0

    def calc_fitness(self,index):
        """Evaluate ``self.population[index]`` and store the score on it.

        A mask without any True entry gets ``-inf`` and ``calc_accuracy`` is
        not called for it.
        """
        individual = self.population[index]
        num_features = int(np.count_nonzero(individual.code))
        if num_features == 0:
            individual.fitness = -float('inf')
            return

        # The boolean mask itself is passed on; column names are not resolved here.
        acc = calc_accuracy(individual.code,self.X_train,self.y_train,self.X_validation,self.y_validation)

        # alpha * accuracy + (1 - alpha) * (1 - selected_features / total_features)
        # Open idea from the original author: maybe subtract the second term instead.
        sparsity = 1 - num_features / self.X_train.shape[1]
        individual.fitness = self.alpha * acc + (1 - self.alpha) * sparsity

    def crossover(self,parent1,parent2):
        """One-point crossover; returns two new, unevaluated children.

        The cut point is drawn from ``1 .. len(code) - 1`` so both parents
        contribute at least one gene. Masks shorter than two genes have no
        valid cut point, so the children are plain copies of the parents.
        """
        length = len(parent1.code)
        if length < 2:
            return Individual(np.copy(parent1.code)), Individual(np.copy(parent2.code))

        cut = random.randrange(1, length)
        child1 = Individual(np.concatenate([parent1.code[:cut], parent2.code[cut:]]))
        child2 = Individual(np.concatenate([parent2.code[:cut], parent1.code[cut:]]))

        return child1,child2

    def mutation(self,child):
        """Flip each gene of ``child`` in place with ``mutation_probability``.

        Returns the same ``child`` object.
        """
        for i in range(len(child.code)):
            if random.random() < self.mutation_probability:
                child.code[i] = not child.code[i]
        return child

    def tournament_selection(self):
        """Return the population index of the fittest of a random sample.

        The sample size is ``tournament_size``, capped at the population size
        because ``random.sample`` cannot draw more items than exist.
        """
        sample_size = min(self.tournament_size, len(self.population))
        contenders = random.sample(range(len(self.population)), sample_size)
        return max(contenders, key=lambda i: self.population[i].fitness)

    def run(self):
        """Evolve the population and return the best result found.

        Each generation evaluates every individual, updates the best
        solution, applies the patience check and then builds the next
        population from elites plus mutated crossover children. The
        generation that triggers early stopping is not appended to
        ``history``.

        Returns:
            ``(best_solution, best_fitness, history)`` where ``history`` is a
            list of ``(generation, average_fitness, best_fitness,
            best_solution)`` tuples. ``average_fitness`` ignores ``-inf``
            individuals and is ``-inf`` when no individual has another score.
        """
        for generation in range(self.num_generations):

            for i in range(self.population_size):
                self.calc_fitness(i)

            generation_best = max(self.population,key = lambda ind:ind.fitness)

            if generation_best.fitness  > self.best_fitness:
                self.best_fitness = generation_best.fitness
                self.best_solution = generation_best.code.copy()
                self.no_improve = 0
            else:
                self.no_improve+=1

            if self.no_improve >= self.patience:
                print(f"No improvement for {self.patience} generations")
                break

            valid = [ind.fitness for ind in self.population if ind.fitness != -float('inf')]
            # Guard against a generation in which every mask is empty.
            average_fitness = sum(valid) / len(valid) if valid else -float('inf')

            self.history.append((generation,average_fitness,self.best_fitness,self.best_solution))
            new_population = []
            sorted_population = sorted(self.population,key = lambda ind:ind.fitness,reverse=True)

            # Keep the elite count inside [0, population size] so indexing stays
            # valid and the population size stays constant.
            elite_count = max(0, min(self.elitism_size, len(sorted_population)))
            for i in range(elite_count):
                new_population.append(deepcopy(sorted_population[i]))

            for i in range(elite_count,self.population_size,2):
                parent1 = self.population[self.tournament_selection()]
                parent2 = self.population[self.tournament_selection()]
                child1, child2 = self.crossover(parent1, parent2)
                new_population.append(self.mutation(child1))
                # The second child is dropped when only one slot remains.
                if len(new_population) < self.population_size:
                    new_population.append(self.mutation(child2))
            # No deepcopy needed: a fresh new_population list is built every generation.
            self.population = new_population

        return self.best_solution, self.best_fitness, self.history
    

In [46]:
# Seed both RNGs the GA draws from (numpy for the initial population, the
# stdlib `random` module for selection, crossover and mutation) so the run is
# repeatable after a kernel restart.
# NOTE(review): calc_accuracy may use its own randomness -- confirm it is seeded too.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

ga = GA(
    X_train,
    y_train,
    X_validation,
    y_validation,
    population_size=20,
    num_generations=20,
    tournament_size=4,
    mutation_probability=0.05,
    alpha=0.5,
    elitism_size=2,
    patience=5,
)
selected_features, fitness, history = ga.run()

No improvement for 5 generations


In [47]:
# Best fitness value returned by ga.run() in the cell above.
fitness

0.8766306634871651

In [48]:
# Per-generation log: (generation, average_fitness, best_fitness, best_solution) tuples.
ga.history

[(0,
  0.8260204797306775,
  0.8766306634871651,
  array([False, False,  True, ..., False, False, False])),
 (1,
  0.8270669682517419,
  0.8766306634871651,
  array([False, False,  True, ..., False, False, False])),
 (2,
  0.8187453242624025,
  0.8766306634871651,
  array([False, False,  True, ..., False, False, False])),
 (3,
  0.8109423949127976,
  0.8766306634871651,
  array([False, False,  True, ..., False, False, False])),
 (4,
  0.8194446392668443,
  0.8766306634871651,
  array([False, False,  True, ..., False, False, False]))]