# Genetics

In [1]:
import random
from deap import base
from deap import creator
from deap import tools
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize

In [2]:
pred = pd.read_csv('data/full/logreg_train.csv')

In [3]:
#creator.create("FitnessMaxMin", base.Fitness, weights=(1.0,-1))
creator.create("FitnessMax", base.Fitness, weights=(1.0, ))
creator.create("Individual", list, fitness=creator.FitnessMax)

In [4]:
IND_SIZE = 15

toolbox = base.Toolbox()
# Attribute generator 
toolbox.register("attr_float", random.uniform, 0,5)
# Structure initializers
toolbox.register("individual", tools.initRepeat, creator.Individual,
                 toolbox.attr_float, n=IND_SIZE)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

In [5]:
def compute_candidate(individual, elem):
    
    bon = np.array(elem[1:6]) * np.array(individual).squeeze()[:5]
    nb = np.array(elem[6:11]) * np.array(individual).squeeze()[5:10]
    rf = np.array(elem[11:16]) * np.array(individual).squeeze()[10:]
    
    candidate = (bon + nb + rf)
    
    if candidate.sum() == 0:
        candidate = np.array([0.2, 0.2, 0.2, 0.2, 0.2])
    else:
        candidate = candidate/candidate.sum()
    
    return candidate

In [6]:
def evalOneMax(individual):
    
    # Empty confusion matrix generation
    conf_mat = np.zeros(shape=(5,5))
    
    # Initialize loss (avg number of over-confident predictions)
    loss = 0
    
    # For all rows in the dataset, store the actual target and apply the chromosomes (weights)
    # to combine random forest predictions with naive bayes ones
    for index,row in pred.iterrows():
        actual = int(row['target'])
        candidate = compute_candidate(individual, row)
        
        # Store the predicted targett
        predicted = np.argmax(candidate)
        # If too much confident, update loss
        if predicted >= 0.8:
            loss += 1 
        
        # Update confusion matrix: add 1 in the right cell
        conf_mat[actual,predicted] += 1
    
    # Compute average loss
    loss = loss/len(pred)
    
    # Initialize f1_macro_score
    f1_macro = 0
    
    # Check confusion matrix to calculate tp,fp,fn, one class at a time
    for i in range (0,5):
        
        # TPs coincide with diagonal values [actual = predicted]
        tp = conf_mat[i,i]
        # Exclude TPs from FP and FN computations
        conf_mat[i,i] = 0
        # FPs coincide with the sum of the column for that class
        fp = conf_mat[:,i].sum()
        # FNs coincide with the sum of the row for that class
        fn = conf_mat[i,:].sum()
        # F1 = 2TP / (2TP + FP +FN)
        f1_macro += 2*tp / (2*tp + fp + fn)
        
    # Averaging F1 with regards to all the classes
    f1_macro = f1_macro/5
    
    return f1_macro, loss

In [7]:
def evalMax(individual):
    
    # Empty confusion matrix generation
    conf_mat = np.zeros(shape=(5,5))
    
    # Initialize loss (avg number of over-confident predictions)
    loss = 0
    
    # For all rows in the dataset, store the actual target and apply the chromosomes (weights)
    # to combine random forest predictions with naive bayes ones
    for index,row in pred.iterrows():
        actual = int(row['target'])
        candidate = compute_candidate(individual, row)
        
        # Store the predicted targett
        predicted = np.argmax(candidate)
        # If too much confident, update loss
        if predicted >= 0.8:
            loss += 1 
        
        # Update confusion matrix: add 1 in the right cell
        conf_mat[actual,predicted] += 1
    
    # Compute average loss
    loss = loss/len(pred)
    
    # Initialize f1_macro_score
    f1_macro = 0
    
    # Check confusion matrix to calculate tp,fp,fn, one class at a time
    for i in range (0,5):
        
        # TPs coincide with diagonal values [actual = predicted]
        tp = conf_mat[i,i]
        # Exclude TPs from FP and FN computations
        conf_mat[i,i] = 0
        # FPs coincide with the sum of the column for that class
        fp = conf_mat[:,i].sum()
        # FNs coincide with the sum of the row for that class
        fn = conf_mat[i,:].sum()
        # F1 = 2TP / (2TP + FP +FN)
        f1_macro += 2*tp / (2*tp + fp + fn)
        
    # Averaging F1 with regards to all the classes
    f1_macro = f1_macro/5
    
    fitness = 3*f1_macro - loss
    
    return (fitness, )

In [8]:
def crossover(ind1, ind2):
    
    fit1 = ind1.fitness.values
    fit2 = ind2.fitness.values
    
    fit = np.array([fit1, fit2])

    fit = fit/fit.sum()
    
    child1 = np.array(ind1) * np.array(fit[0]) + np.array(ind2) * np.array(fit[1])
    child2 = np.array(ind1) * 0.5 + np.array(ind2) * 0.5
    
    return (child1, child2)

In [9]:
def mutate(mutant):
    
    choice = np.random.choice(10, 3)
    
    mutant[choice] = 0
    
    return mutant

In [10]:
#toolbox.register("evaluate", evalOneMax)
toolbox.register("mate", crossover)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)
toolbox.register("elitism", tools.selBest)
toolbox.register("slacks", tools.selWorst)
toolbox.register("evaluateOneLoss", evalMax)

In [11]:
pop = toolbox.population(n=200)

In [12]:
# Evaluate the entire population
fitnesses = list(map(toolbox.evaluateOneLoss, pop))
for ind, fit in zip(pop, fitnesses):
    ind.fitness.values = fit

In [13]:
CXPB, MUTPB = 0.95, 0.45

In [14]:
# Extracting all the fitnesses of
fits = [ind.fitness.values for ind in pop]

In [15]:
# Generations
g = 0
    
# Begin the evolution
while g < 50:
    # A new generation
    g = g + 1
    print("-- Generation %i --" % g)
    # Select the next generation individuals
    offspring = toolbox.select(pop, len(pop)) + toolbox.elitism(pop, 3)
    # Clone the selected individuals
    offspring = list(map(toolbox.clone, offspring))
    # Apply crossover and mutation on the offspring
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < CXPB:
            toolbox.mate(child1, child2)
            del child1.fitness.values
            del child2.fitness.values

    for mutant in offspring:
        if random.random() < MUTPB:
            toolbox.mutate(mutant)
            del mutant.fitness.values
    # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = map(toolbox.evaluateOneLoss, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit
        
    pop[:] = offspring
    
    # Gather all the fitnesses in one list and print the stats
    fits = [ind.fitness.values for ind in pop]

    best = toolbox.elitism(pop, 1)
    
    print("  Best individual with new loss:")
    print(list(best))
    print("  -- Fit: --")
    print("     %s" % max(fits))
    print("  -- F1 - Loss: --")
    print("     %s  -  %s" % (evalOneMax(best)[0], evalOneMax(best)[1]))
    print("    Population of %s individuals" % len(offspring))

-- Generation 1 --
  Best individual with new loss:
[[4.10911503780933, 0.2340688221492937, 4.181915090492482, 4.145025774160452, 2.6685081788031746, 4.867916911159929, 2.8087542061870447, 4.680651902001779, 2.800568004461698, 2.3259421644305305, 4.462306771553116, 2.1292457216351646, 2.1627039831805654, 2.317301796533924, 0.16322313438788927]]
  -- Fit: --
     2.1712928827507856
  -- F1 - Loss: --
     0.9571374523546027  -  0.7001194743130227
    Population of 203 individuals
-- Generation 2 --
  Best individual with new loss:
[[4.10911503780933, 0.2340688221492937, 4.181915090492482, 4.145025774160452, 2.6685081788031746, 4.867916911159929, 2.8087542061870447, 4.680651902001779, 2.800568004461698, 2.3259421644305305, 4.462306771553116, 2.1292457216351646, 2.1627039831805654, 2.317301796533924, 0.16322313438788927]]
  -- Fit: --
     2.1712928827507856
  -- F1 - Loss: --
     0.9571374523546027  -  0.7001194743130227
    Population of 206 individuals
-- Generation 3 --
  Best indivi