In [1]:
import random, os
import numpy as np
import pandas as pd
from functools import reduce
import seaborn as sns
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import model_selection
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

In [2]:
from deap import base
from deap import creator
from deap import tools
from deap import algorithms



In [3]:
from fastai.imports import *
from fastai.structured import *

  return f(*args, **kwds)


In [4]:
df_raw = pd.read_feather('tmp/bulldozers-raw')
df_raw.SalePrice = np.log(df_raw.SalePrice)
df, y, nas = proc_df(df_raw, 'SalePrice')
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df, y, test_size=0.99, random_state=0)

In [5]:
params_default = {
         'n_estimators': list(range(10,100,50)),
         'max_depth': [None, 3, 5, 10],
         'min_samples_split': [2, 4, 6, 8],
         'max_features': [0.2, 0.5, 0.8 , 1]}

In [6]:
def get_random(individual='q', params=params_default):
    ind = individual()
    for key in params.keys():
        ind[key]=(random.choice(params[key]))
    return ind

In [7]:
def evalOneMax(individual):
    clf = RandomForestRegressor(**individual, n_jobs=8)
    clf.fit(X_train, y_train)
    return clf.score(X_test, y_test),

In [8]:
IND_SIZE = 5

creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", dict, fitness=creator.FitnessMin)
toolbox = base.Toolbox()

toolbox.register("Individual", get_random, creator.Individual, params_default)
toolbox.register("population", tools.initRepeat, list, toolbox.Individual)

In [9]:
pop = toolbox.population(n=20)

In [10]:
def mutate_params(individual, indpb):
    for key in params_default.keys():
        if random.random()<0.5:
            individual[key]=(random.choice(params_default[key]))
    return individual,

def mate_individuals(ind1, ind2):
    for key in params_default.keys():
        if random.random()<0.5:
            ind1[key] = ind1[key]
            ind2[key] = ind2[key]
        else:
            ind1[key] = ind2[key]
            ind2[key] = ind1[key]
    return (ind1, ind2)

In [11]:
toolbox.register("evaluate", evalOneMax)
toolbox.register("mate", mate_individuals)
toolbox.register("mutate", mutate_params, indpb=0.5)
toolbox.register("select", tools.selTournament, tournsize=3)

In [12]:
fitnesses = list(map(toolbox.evaluate, pop))
for ind, fit in zip(pop, fitnesses):
    ind.fitness.values = fit

In [13]:
hof = tools.HallOfFame(1)
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)
    
pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.1, ngen=5, 
                                   stats=stats, halloffame=hof, verbose=True)

gen	nevals	avg     	std     	min     	max     
0  	0     	0.605723	0.134321	0.420942	0.808843
1  	16    	0.481651	0.10333 	0.351042	0.74581 
2  	9     	0.416402	0.0471809	0.270102	0.464577
3  	9     	0.382422	0.119121 	0.144942	0.787272
4  	13    	0.356949	0.0909149	0.144942	0.612405
5  	8     	0.289642	0.065792 	0.144942	0.37269 


In [None]:
acc = []
evals =[]
for cv in tqdm(range(0,100)):
    pop = toolbox.population(n=5)
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.2, mutpb=0.1, ngen=100, 
                                       stats=stats, halloffame=hof)
    acc.append(log[5]['min'])
    evals.append(np.sum([x['nevals'] for x in log]))

  0%|          | 0/100 [00:00<?, ?it/s]gen	nevals	avg     	std     	min     	max    
0  	5     	0.635818	0.160567	0.352706	0.78898
1  	2     	0.589098	0.121944	0.352706	0.675251
2  	2     	0.438723	0.123163	0.311368	0.590847
3  	1     	0.341844	0.0160449	0.311368	0.352706
4  	0     	0.325308	0.0175592	0.311368	0.352706
5  	2     	0.327484	0.0198628	0.311368	0.355184
6  	4     	0.38608 	0.104263 	0.311368	0.592583
7  	0     	0.333364	0.0109979	0.311368	0.338863
8  	0     	0.327865	0.0134697	0.311368	0.338863
9  	1     	0.319719	0.0110508	0.311368	0.338863
10 	3     	0.337055	0.0284584	0.311368	0.379462
11 	0     	0.321624	0.0205115	0.311368	0.362647
12 	0     	0.321624	0.0205115	0.311368	0.362647
13 	1     	0.35431 	0.0858833	0.311368	0.526077
14 	1     	0.318707	0.0146766	0.311368	0.34806 
15 	2     	0.320196	0.0132113	0.311368	0.345443
16 	0     	0.311368	0        	0.311368	0.311368
17 	2     	0.320607	0.0122462	0.311368	0.341872
18 	0     	0.311368	0        	0.311368	0.311368
19 	0  

In [None]:
np.mean(acc)

In [None]:
np.median(acc)

In [None]:
np.mean(evals)

In [None]:
# Variable keeping track of the number of generations
g = 0
CXPB, MUTPB = 0.5, 0.2    
# Begin the evolution
while g < 10:
    # A new generation
    g = g + 1
    print("-- Generation %i --" % g)
    offspring = toolbox.select(pop, len(pop))
    offspring = list(map(toolbox.clone, offspring))
    
    for child1, child2 in zip(offspring[::2], offspring[1::2]):
        if random.random() < CXPB:
            toolbox.mate(child1, child2)
            del child1.fitness.values
            del child2.fitness.values

    for mutant in offspring:
        if random.random() < MUTPB:
            toolbox.mutate(mutant)
            del mutant.fitness.values
            # Evaluate the individuals with an invalid fitness
    invalid_ind = [ind for ind in offspring if not ind.fitness.valid]
    fitnesses = map(toolbox.evaluate, invalid_ind)
    for ind, fit in zip(invalid_ind, fitnesses):
        ind.fitness.values = fit
    pop[:] = offspring
    
    
fits = [ind.fitness.values[0] for ind in pop]   
length = len(pop)
mean = sum(fits) / length
sum2 = sum(x*x for x in fits)
std = abs(sum2 / length - mean**2)**0.5
        
print("  Min %s" % min(fits))
print("  Max %s" % max(fits))
print("  Avg %s" % mean)
print("  Std %s" % std)