# Loading an imbalanced dataset

In [86]:
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter

In [115]:
ecoli = fetch_datasets()['abalone']
X, y = ecoli.data, ecoli.target


In [116]:
# from sklearn.datasets import load_breast_cancer

# data = load_breast_cancer()
# X, y = data.data, data.target

In [117]:
default_classes = np.unique(y)
print(default_classes)

[-1  1]


In [118]:
maj_class = -1
min_class = 1
if sum(y == default_classes[0]) > sum(y == default_classes[1]):
#     maj_class = default_classes[0]
#     min_class = default_classes[1]
    y[y==default_classes[0]] = maj_class
    y[y==default_classes[1]] = min_class
else:
#     maj_class = default_classes[1]
#     min_class = default_classes[0]
    y[y==default_classes[1]] = maj_class
    y[y==default_classes[0]] = min_class
    
print("There are {} instances for the majoritary class".format(sum(y == maj_class)))
print("There are {} instanes for the minoritary class".format(sum(y == min_class)))
classes = [maj_class,min_class]

There are 3786 instances for the majoritary class
There are 391 instanes for the minoritary class


In [119]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,random_state=0)
#number of features of the dataset 
D = X_train.shape[1]

print("There are {} instances for the majoritary class".format(sum(y_train == maj_class)))
print("There are {} instanes for the minoritary class".format(sum(y_train == min_class)))

There are 2839 instances for the majoritary class
There are 293 instanes for the minoritary class


# DE-guided UNDERSAMPLING of the majority class instances

In [120]:
import random
import array

from deap import base
from deap import benchmarks
from deap import creator
from deap import tools


Functions

In [121]:
# function to generate the initial random population from the training set
def load_individuals(X,y,creator,n):
    maj_samples = X[y == maj_class]
    min_samples = X[y == min_class]
    individuals = []
    for i in range(n):
        random_maj = maj_samples[random.randint(0,maj_samples.shape[0]-1)]
        random_min = min_samples[random.randint(0,min_samples.shape[0]-1)]
        individual = np.asarray(np.concatenate((random_maj,random_min)))
        
        individual = creator(individual)
        individuals.append(individual)
    return individuals

# returns the euclidean distance between two points
def euclidean(v1, v2):
    return sum((p-q)**2 for p, q in zip(v1, v2)) ** .5

#returns the sum of the distances from each sample in X_train to the closest center
#we are interested in minimizing this sum of distances
def evaluate(X,individual):
    S = 0
    for x in X:
        dist = dist_to_closest_center(x,individual[:D],individual[D:])
        S += dist
        
    return S,

#computes the euclidean distance for both centers and returns the shortest one
def dist_to_closest_center(x,maj_center,min_center):
    dist_majcenter = euclidean(x,maj_center)
    dist_mincenter = euclidean(x,min_center)
    return min(dist_majcenter,dist_mincenter)


In [122]:
NDIM = D*2

creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
#creator.create("Individual", list, fitness=creator.FitnessMin)
creator.create("Individual", array.array, typecode='d', fitness=creator.FitnessMin)
#creator.create("Individual", np.ndarray, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
#toolbox.register("attr_float", random.uniform, -3, 3)
#toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, NDIM)
#toolbox.register("individual", selectRandomSamplesOneForEachClass, creator.Individual)
toolbox.register("population",load_individuals, X_train, y_train, creator.Individual)
#toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("select", tools.selRandom, k=3)

toolbox.register("evaluate", evaluate, X_train)



In [123]:
def DE_clustering(CR,F,POP_SIZE,NGEN):
    # Differential evolution parameters
    #CR = 0.25
    #F = 1  
    #MU = 300
    #NGEN = 200    
    
    pop = toolbox.population(n=POP_SIZE);
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)
    
    logbook = tools.Logbook()
    logbook.header = "gen", "evals", "std", "min", "avg", "max"
    
    # Evaluate the individuals
    fitnesses = toolbox.map(toolbox.evaluate, pop)
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
    
    record = stats.compile(pop)
    logbook.record(gen=0, evals=len(pop), **record)
    print(logbook.stream)
    
    for g in range(1, NGEN):
        for k, agent in enumerate(pop):
            a,b,c = toolbox.select(pop)
            y = toolbox.clone(agent)
            index = random.randrange(NDIM)
            for i, value in enumerate(agent):
                if i == index or random.random() < CR:
                    y[i] = a[i] + F*(b[i]-c[i])
            y.fitness.values = toolbox.evaluate(y)
            if y.fitness > agent.fitness:
                pop[k] = y
            #print(pop[k].fitness)
        hof.update(pop)
        record = stats.compile(pop)
        logbook.record(gen=g, evals=len(pop), **record)
        print(logbook.stream)

    print("Best individual is ", hof[0], hof[0].fitness.values[0])
    return hof[0]

Compute H clustering processes and obtain one pair of cluster centers for each of them

In [124]:
H = 6
clustering_centers = []
for i in range(H):
    clustering_centers.append(DE_clustering(0.6,0.5,10,10))

gen	evals	std    	min    	avg    	max    
0  	10   	487.217	2322.35	2871.19	3721.12
1  	10   	389.46 	2322.35	2786.17	3539.06
2  	10   	360.201	2322.35	2761.56	3539.06
3  	10   	325.04 	2322.35	2744.36	3366.97
4  	10   	325.04 	2322.35	2744.36	3366.97
5  	10   	320.38 	2322.35	2670.62	3366.97
6  	10   	324.397	2319.88	2666.69	3366.97
7  	10   	326.33 	2319.88	2649.22	3366.97
8  	10   	326.33 	2319.88	2649.22	3366.97
9  	10   	300.871	2319.88	2637.19	3246.66
Best individual is  Individual('d', [0.0, 0.0, 1.0, 0.6, 0.48, 0.17, 1.0575, 0.582, 0.2365, 0.33849999999999997, 0.0, 1.0, 0.0, 0.4, 0.29874999999999996, 0.105, 0.3415, 0.176, 0.13099999999999998, 0.15175]) 2319.8783515474347
gen	evals	std    	min    	avg    	max    
0  	10   	499.713	2279.32	2759.47	3866.05
1  	10   	482.712	2279.32	2740.99	3783.74
2  	10   	292.441	2279.32	2609.25	3271.21
3  	10   	292.623	2187.41	2485.98	3271.21
4  	10   	110.704	2187.41	2371.02	2637.72
5  	10   	89.7953	2187.41	2350.84	2548.86
6  	10   	88.8756	

Take majority samples and compute for each of them their cluster stability

In [125]:
#classifies sample x to the class which center is closer to
def classify(x,centers):
    dist_majcenter = euclidean(x,centers[:len(x)])
    dist_mincenter = euclidean(x,centers[len(x):])
    return np.argmin([dist_majcenter,dist_mincenter])

In [126]:
majority_samples = X_train[y_train==maj_class]

cluster_stabilities = []
for sample in majority_samples:
    
    S = 0
    for clustering in clustering_centers:
        c = classes[classify(sample,clustering)]
        if c==maj_class:
            S += 1
    cluster_stabilities.append(S/H)

print(cluster_stabilities)

[1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.6666666666666666, 0.0, 0.6666666666666666, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.6666666666666666, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.6666666666666666, 0.0, 0.8333333333333334, 1.0, 0.6666666666666666, 1.0, 0.6666666666666666, 1.0, 0.6666666666666666, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.8333333333333334, 1.0, 1.0, 0.0, 0.6666666666666666, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.6666666666666666, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.6666666666666666, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.6666666666666666, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.6666666666666666, 0.6666666666666666, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6666666666666666,

Samples in the majority class with the clustering stability value Ci higher than a given threshold alpha are non-bounday samples.
We take alpha as 80% of the total clustering times (H???????????)

In [127]:
#alpha = H*0.8 ????????????????????
alpha = 0.8
boundary_points = majority_samples[np.array(cluster_stabilities)<=alpha]
non_boundary_points = majority_samples[np.array(cluster_stabilities)>alpha]

print(boundary_points.shape)
print(non_boundary_points.shape)

(1012, 10)
(1827, 10)


We randomly under-sample the non boundary points of the majority class, giving more importance to the data distribution information in the under-sample process. (RUS puro o ponderando los ejemplos en base a su Ci?????????)

QUÉ PROPORCIÓN DE LOS NON-BOUNDARY SELECCIONAMOS??????????'

In [128]:
#RUS PURO
RUSsize = int(non_boundary_points.shape[0]*0.4)
indices = np.random.randint(non_boundary_points.shape[0], size=RUSsize)
nbp_us = non_boundary_points[indices]
print(nbp_us.shape)

(730, 10)


In [129]:
#RUS ponderado por el cluster stability de cada ejemplo
C_non_boundary = np.array(cluster_stabilities)[np.array(cluster_stabilities)>alpha]
indices = np.random.choice(np.arange(non_boundary_points.shape[0]),size=RUSsize,
                           p = C_non_boundary/sum(C_non_boundary))
nbp_us = non_boundary_points[indices]
print(nbp_us.shape)

(730, 10)


In [130]:
new_majorityclass_training = np.vstack((boundary_points,nbp_us))
print(new_majorityclass_training.shape)


(1742, 10)


Resumen del undersampling

In [131]:
print("Conjunto de entrenamiento original de tamaño: {}".format(X_train.shape[0]))
n_may,n_min = sum(y_train == maj_class),sum(y_train == min_class)
print("De los cuales:\n \t nº de ejemplos clase MAYORITARIA: {}\n \t nº de ejemplos clase MINORITARIA: {}"
      .format(n_may,n_min))
print("CONJUNTO DE DATOS NO-BALANCEADO")
print("IR = {}".format(n_may/n_min))

print("nº de ejemplos clase MAYORITARIA tras aplicar DE-guided UNDERSAMPLING: {}".format(new_majorityclass_training.shape[0]))
print("Conjunto de entrenamiento actual de tamaño: {}".format(new_majorityclass_training.shape[0]+n_min))

Conjunto de entrenamiento original de tamaño: 3132
De los cuales:
 	 nº de ejemplos clase MAYORITARIA: 2839
 	 nº de ejemplos clase MINORITARIA: 293
CONJUNTO DE DATOS NO-BALANCEADO
IR = 9.689419795221843
nº de ejemplos clase MAYORITARIA tras aplicar DE-guided UNDERSAMPLING: 1742
Conjunto de entrenamiento actual de tamaño: 2035


# DE-guided OVERSAMPLING of the minority class instances

# ADA-BOOST combined with DE-guided resampling

In [132]:
from imblearn.over_sampling import SMOTE
import math

In [133]:
minority_samples = X_train[y_train==min_class]
#prepare adaboost training set joining the undersampled majority class instances with the minority class instances
X_US = np.vstack((new_majorityclass_training,minority_samples))
y_US = np.hstack((np.full(new_majorityclass_training.shape[0],maj_class),
                  np.full(minority_samples.shape[0],min_class)))
print(X_US.shape,y_US.shape)

(2035, 10) (2035,)


In [134]:
#generate N = n_majority+n_minority weights
#--> then we cant train DT in fitnesses with weigths bc syn samples dont have
N = X_US.shape[0]
weights = np.full(N,1/N)

#print(weights)

In [135]:
#generate N = n_majority + 2*n_minority
#N = minority_samples.shape[0]*2+new_majorityclass_training.shape[0]
#w = np.full(N,1/N)

#print(weights)

Apply SMOTE to the original undersampled set

In [136]:
def compute_synthetics(X,y,maj_class,min_class):
    n_majority_samples = sum(y==maj_class)
    n_minority_samples = sum(y==min_class)
    print('Original dataset shape %s' % Counter(y))
    sm = SMOTE(sampling_strategy = {maj_class: n_majority_samples,min_class: n_minority_samples*2})
    X_after_SMOTE, y_after_SMOTE = sm.fit_resample(X, y)
    print('Resampled dataset shape %s' % Counter(y_after_SMOTE))
    synthetic_samples = X_after_SMOTE[y_after_SMOTE==min_class][n_minority_samples:]
    #print('Synthetic samples: ',synthetic_samples)
    return synthetic_samples

In [137]:
# individual = np.random.randint(0,2,size=minority_samples.shape[0])
# print(individual)
# print(sum(individual))

In [138]:
# selected_syn = synthetic_samples[individual>0]
# print(selected_syn.shape)
# Xtr = np.vstack((X_US,selected_syn))
# print(Xtr.shape)
# ytr = np.hstack((y_US,np.full(selected_syn.shape[0],min_class)))
# print(ytr)
# print(sum(ytr==min_class))
# print(sum(ytr==maj_class))

# #NO TENGO WEIGHTS PARA LOS SINTÉTICOS!!!!!!
# select_w = np.hstack((np.full(X_US.shape[0],1),individual))


# Xtr, Xtst, ytr, ytst = train_test_split(Xtr, ytr, test_size=0.3, random_state=1) # 70% training and 30% test
# weights = np.full(Xtr.shape[0],1/Xtr.shape[0])
# #dt = trainDT(Xtr,ytr,w[select_w>0])
# dt = trainDT(Xtr,ytr)
# #test??????????????????????????
# G = Gmean(dt,Xtst,ytst)

## DE to select the best synthetics

In [139]:
from sklearn.model_selection import train_test_split # Import train_test_split function
# Xtr, Xtst, ytr, ytst = train_test_split(X_US, y_US, test_size=0.3, random_state=1) # 70% training and 30% test
# weights = np.full(Xtr.shape[0],1/Xtr.shape[0])
# dt = trainDT(Xtr,ytr,weights)
# Gmean(dt,Xtst,ytst)

In [143]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from imblearn.metrics import geometric_mean_score
def compute_fitness(p,X_before_SMOTE,y_before_SMOTE,maj_class,min_class,synthetic_samples, individual):
    selected_syn = []
    for i, value in enumerate(individual):
        if individual[i]>0:
            selected_syn.append(synthetic_samples[i-1])
    #selected_syn = synthetic_samples[individual>0]
    selected_syn = np.array(selected_syn)
    Xtr = np.vstack((X_before_SMOTE,selected_syn))
    ytr = np.hstack((y_before_SMOTE,np.full(selected_syn.shape[0],min_class)))
    #realizar splitTest??????????????''
    Xtr, Xtst, ytr, ytst = train_test_split(Xtr, ytr, test_size=0.3)
    #dt = trainDT(Xtr,ytr,weights)
    dt = trainDT(Xtr,ytr)
    #test??????????????????????????
    #G = Gmean(dt,X_before_SMOTE,y_before_SMOTE)
    G = Gmean(dt,Xtst,ytst)
    n_minority = len(individual)+sum(individual)
    n_majority = X_before_SMOTE[y_before_SMOTE==maj_class].shape[0]
    f = G - abs(1-(n_minority/n_majority*p))
    #print("f: ",f)
    return f,

def trainDT(X_train,y_train,w=None):
    clf = DecisionTreeClassifier()
    clf = clf.fit(X_train,y_train,sample_weight=w)
    return clf

def Gmean(clf,X_test,y_test):
    y_pred = clf.predict(X_test)
    gmean = geometric_mean_score(y_test, y_pred)
    #print("Gmean:",gmean)
    return gmean
    

In [144]:
def DE_oversampling(CR,F_0,POP_SIZE,NGEN):
    # Differential evolution parameters
    #CR = 0.25
    #F = 1  
    #MU = 300
    #NGEN = 200    
    
    pop = toolbox.population(n=POP_SIZE);
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)
    
    logbook = tools.Logbook()
    logbook.header = "gen", "evals", "std", "min", "avg", "max"
    #print(pop)
    # Evaluate the individuals
    fitnesses = toolbox.map(toolbox.evaluate, pop)
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
    
    record = stats.compile(pop)
    logbook.record(gen=0, evals=len(pop), **record)
    print(logbook.stream)
    
    for g in range(1, NGEN):
        for k, agent in enumerate(pop):
            a,b,c = toolbox.select(pop)
            #we adopt a self-adaptative operator
            l = math.exp(1-(NGEN/(NGEN+1-g)))
            F = F_0*(2**l)
            d = toolbox.clone(agent) #donor vector
            sig_d = toolbox.clone(agent)
            y = toolbox.clone(agent)
            index = random.randrange(NDIM_DE_SMOTE)
            for i, value in enumerate(agent):
                d[i] = a[i] + F*(b[i]-c[i]) #donor vector
                #the mutated donor is mapped to binary space by a sigmoid function with displacement
                sig_d[i] = round(1/(1+math.exp(-(d[i]))))
                if i == index or random.random() < CR:
                    #y[i] = a[i] + F*(b[i]-c[i])
                    y[i] = sig_d[i]
            y.fitness.values = toolbox.evaluate(y)
            if y.fitness > agent.fitness:
                pop[k] = y
            #print(pop[k].fitness)
        hof.update(pop)
        record = stats.compile(pop)
        logbook.record(gen=g, evals=len(pop), **record)
        print(logbook.stream)

    print("Best individual is ", hof[0], hof[0].fitness.values[0])
    return hof[0]

In [145]:
NDIM_DE_SMOTE = minority_samples.shape[0]
n_majority = new_majorityclass_training.shape[0]
p = 0.2

creator.create("FitnessMax", base.Fitness, weights=(1.0,))
creator.create("Individual", array.array, typecode='d', fitness=creator.FitnessMax, clf=None)

toolbox = base.Toolbox()
toolbox.register("attr_int", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_int, NDIM_DE_SMOTE)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("select", tools.selRandom, k=3)

toolbox.register("evaluate", compute_fitness, p,
                 X_US,y_US,maj_class,min_class,syn)

T = 10
clf_weights = []
clfs = []
for t in range(T):
    #generate synthetic samples from minority class
    syn = compute_synthetics(X_US,y_US,maj_class,min_class)
    
    #compute DE and obtain the best subset of synthetics to train with
    selection_mask = DE_oversampling(0.6,0.5,10,100)
    selected_syn = []
    for i, value in enumerate(selection_mask):
        if selection_mask[i]>0:
            selected_syn.append(syn[i])
    selected_syn = np.array(selected_syn)
    Xtr = np.vstack((X_US,selected_syn))
    ytr = np.hstack((y_US,np.full(selected_syn.shape[0],min_class)))
    
    clf = trainDT(Xtr,ytr)
    
    y_pred = clf.predict(X_US)
    e = sum(weights*(y_pred!=y_US))
    
    w_clf = 0.5*math.log((1-e)/e)
    
    Z = 2*math.sqrt(e*(1-e))
    weights = weights*np.exp(-w_clf*y_US*y_pred)/Z
    weights /= sum(weights)
    
    clf_weights.append(w_clf)
    clfs.append(clf)
    



Original dataset shape Counter({-1: 1742, 1: 293})
Resampled dataset shape Counter({-1: 1742, 1: 586})
gen	evals	std      	min      	avg      	max      
0  	10   	0.0230694	-0.314525	-0.286835	-0.239861
1  	10   	0.0224066	-0.294243	-0.269607	-0.233976
2  	10   	0.021026 	-0.28717 	-0.259444	-0.229162
3  	10   	0.0183381	-0.282709	-0.255357	-0.229162
4  	10   	0.0219576	-0.282709	-0.250443	-0.212905
5  	10   	0.0226972	-0.27772 	-0.238958	-0.203848
6  	10   	0.0172979	-0.262446	-0.230631	-0.203848
7  	10   	0.014299 	-0.241771	-0.228499	-0.203848
8  	10   	0.0137334	-0.241771	-0.226041	-0.203848
9  	10   	0.012832 	-0.241438	-0.223666	-0.203848
10 	10   	0.0125894	-0.239861	-0.215227	-0.198118
11 	10   	0.013182 	-0.239861	-0.213551	-0.198118
12 	10   	0.0129094	-0.234136	-0.211285	-0.197741
13 	10   	0.0114715	-0.234136	-0.210332	-0.197741
14 	10   	0.0114394	-0.234136	-0.210058	-0.197741
15 	10   	0.0130583	-0.234136	-0.207939	-0.189727
16 	10   	0.0118761	-0.234136	-0.205803	-0.1897

  w_clf = 0.5*math.log((1-e)/e)
  weights = weights*np.exp(-w_clf*y_US*y_pred)/Z


1  	10   	0.0211041	-0.293089	-0.270882	-0.220084
2  	10   	0.0167434	-0.277198	-0.253708	-0.220084
3  	10   	0.0189899	-0.277198	-0.23635 	-0.20584 
4  	10   	0.0203004	-0.277198	-0.231367	-0.20584 
5  	10   	0.0140627	-0.24424 	-0.224931	-0.20584 
6  	10   	0.0125128	-0.241232	-0.221459	-0.200364
7  	10   	0.00991314	-0.234162	-0.215883	-0.200364
8  	10   	0.00815484	-0.224276	-0.212914	-0.200364
9  	10   	0.00815484	-0.224276	-0.212914	-0.200364
10 	10   	0.0108693 	-0.224276	-0.211253	-0.187779
11 	10   	0.00963744	-0.220084	-0.209678	-0.187779
12 	10   	0.00963744	-0.220084	-0.209678	-0.187779
13 	10   	0.00951415	-0.220084	-0.208022	-0.187779
14 	10   	0.00951415	-0.220084	-0.208022	-0.187779
15 	10   	0.0111478 	-0.220084	-0.204717	-0.184285
16 	10   	0.0121751 	-0.220084	-0.204109	-0.181691
17 	10   	0.0115193 	-0.217441	-0.203527	-0.181691
18 	10   	0.0115193 	-0.217441	-0.203527	-0.181691
19 	10   	0.0115193 	-0.217441	-0.203527	-0.181691
20 	10   	0.0115193 	-0.217441	-0.203

31 	10   	0.0106595	-0.204497	-0.182643	-0.167855
32 	10   	0.0106595	-0.204497	-0.182643	-0.167855
33 	10   	0.0106595	-0.204497	-0.182643	-0.167855
34 	10   	0.010829 	-0.204497	-0.182524	-0.166666
35 	10   	0.010829 	-0.204497	-0.182524	-0.166666
36 	10   	0.010829 	-0.204497	-0.182524	-0.166666
37 	10   	0.0108058	-0.204497	-0.182452	-0.166666
38 	10   	0.0108058	-0.204497	-0.182452	-0.166666
39 	10   	0.0108058	-0.204497	-0.182452	-0.166666
40 	10   	0.0108058	-0.204497	-0.182452	-0.166666
41 	10   	0.0108058	-0.204497	-0.182452	-0.166666
42 	10   	0.0122649	-0.204497	-0.181268	-0.161729
43 	10   	0.0122649	-0.204497	-0.181268	-0.161729
44 	10   	0.0149738	-0.204497	-0.179   	-0.152526
45 	10   	0.0144051	-0.201888	-0.178349	-0.152526
46 	10   	0.013967 	-0.201888	-0.177733	-0.152526
47 	10   	0.0127339	-0.200276	-0.174825	-0.152526
48 	10   	0.0127339	-0.200276	-0.174825	-0.152526
49 	10   	0.0111167	-0.191262	-0.173923	-0.152526
50 	10   	0.0106735	-0.191262	-0.173362	-0.152526


62 	10   	0.00570587	-0.184786	-0.178492	-0.166937
63 	10   	0.00570587	-0.184786	-0.178492	-0.166937
64 	10   	0.00570587	-0.184786	-0.178492	-0.166937
65 	10   	0.00570587	-0.184786	-0.178492	-0.166937
66 	10   	0.00570587	-0.184786	-0.178492	-0.166937
67 	10   	0.00580544	-0.184786	-0.177337	-0.166937
68 	10   	0.00694783	-0.184786	-0.17582 	-0.164191
69 	10   	0.00694783	-0.184786	-0.17582 	-0.164191
70 	10   	0.00694783	-0.184786	-0.17582 	-0.164191
71 	10   	0.00694783	-0.184786	-0.17582 	-0.164191
72 	10   	0.00694783	-0.184786	-0.17582 	-0.164191
73 	10   	0.00694783	-0.184786	-0.17582 	-0.164191
74 	10   	0.0068167 	-0.184786	-0.175475	-0.164191
75 	10   	0.0068167 	-0.184786	-0.175475	-0.164191
76 	10   	0.00675628	-0.184786	-0.175421	-0.164191
77 	10   	0.00675628	-0.184786	-0.175421	-0.164191
78 	10   	0.00675628	-0.184786	-0.175421	-0.164191
79 	10   	0.00649151	-0.184786	-0.175127	-0.164191
80 	10   	0.00636906	-0.184786	-0.175018	-0.164191
81 	10   	0.00636906	-0.184786	

90 	10   	0.00841687	-0.190783	-0.175287	-0.161187
91 	10   	0.00841687	-0.190783	-0.175287	-0.161187
92 	10   	0.00841687	-0.190783	-0.175287	-0.161187
93 	10   	0.00841687	-0.190783	-0.175287	-0.161187
94 	10   	0.00841687	-0.190783	-0.175287	-0.161187
95 	10   	0.00841687	-0.190783	-0.175287	-0.161187
96 	10   	0.0078304 	-0.184503	-0.172185	-0.15976 
97 	10   	0.0078304 	-0.184503	-0.172185	-0.15976 
98 	10   	0.0078304 	-0.184503	-0.172185	-0.15976 
99 	10   	0.0078304 	-0.184503	-0.172185	-0.15976 
Best individual is  Individual('d', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0

gen	evals	std      	min      	avg      	max      
0  	10   	0.0338489	-0.349223	-0.297038	-0.241108
1  	10   	0.0246539	-0.324672	-0.276139	-0.241108
2  	10   	0.018748 	-0.288514	-0.252153	-0.225017
3  	10   	0.0307779	-0.288514	-0.237806	-0.174551
4  	10   	0.0271264	-0.262008	-0.226681	-0.174551
5  	10   	0.025768 	-0.262008	-0.225073	-0.174551
6  	10   	0.0229334	-0.254377	-0.220827	-0.174551
7  	10   	0.0210917	-0.254377	-0.21493 	-0.174551
8  	10   	0.0201007	-0.251098	-0.21355 	-0.174551
9  	10   	0.0201007	-0.251098	-0.21355 	-0.174551
10 	10   	0.0198355	-0.232422	-0.208181	-0.174551
11 	10   	0.0198355	-0.232422	-0.208181	-0.174551
12 	10   	0.0188969	-0.232422	-0.204909	-0.174551
13 	10   	0.0188969	-0.232422	-0.204909	-0.174551
14 	10   	0.0166065	-0.229451	-0.201035	-0.174551
15 	10   	0.0133374	-0.216383	-0.197799	-0.174551
16 	10   	0.0133374	-0.216383	-0.197799	-0.174551
17 	10   	0.0118914	-0.215737	-0.195275	-0.174551
18 	10   	0.0118914	-0.215737	-0.195275	-0.174551


29 	10   	0.0119837 	-0.205299	-0.18475 	-0.166003
30 	10   	0.0112212 	-0.200486	-0.184269	-0.166003
31 	10   	0.0112212 	-0.200486	-0.184269	-0.166003
32 	10   	0.0112212 	-0.200486	-0.184269	-0.166003
33 	10   	0.0112091 	-0.200486	-0.184258	-0.166003
34 	10   	0.0112091 	-0.200486	-0.184258	-0.166003
35 	10   	0.0111943 	-0.200486	-0.183898	-0.166003
36 	10   	0.0111943 	-0.200486	-0.183898	-0.166003
37 	10   	0.0111943 	-0.200486	-0.183898	-0.166003
38 	10   	0.0111943 	-0.200486	-0.183898	-0.166003
39 	10   	0.0111943 	-0.200486	-0.183898	-0.166003
40 	10   	0.0111943 	-0.200486	-0.183898	-0.166003
41 	10   	0.0111943 	-0.200486	-0.183898	-0.166003
42 	10   	0.0125383 	-0.200486	-0.180895	-0.162071
43 	10   	0.0119791 	-0.200486	-0.180426	-0.162071
44 	10   	0.0119791 	-0.200486	-0.180426	-0.162071
45 	10   	0.0119791 	-0.200486	-0.180426	-0.162071
46 	10   	0.0119791 	-0.200486	-0.180426	-0.162071
47 	10   	0.0119791 	-0.200486	-0.180426	-0.162071
48 	10   	0.0119791 	-0.200486	

57 	10   	0.0100853 	-0.186108	-0.174877	-0.156455
58 	10   	0.0105932 	-0.186108	-0.17298 	-0.156455
59 	10   	0.0105932 	-0.186108	-0.17298 	-0.156455
60 	10   	0.0101578 	-0.186108	-0.171167	-0.156455
61 	10   	0.0101578 	-0.186108	-0.171167	-0.156455
62 	10   	0.0101578 	-0.186108	-0.171167	-0.156455
63 	10   	0.0101578 	-0.186108	-0.171167	-0.156455
64 	10   	0.0101578 	-0.186108	-0.171167	-0.156455
65 	10   	0.0098128 	-0.186108	-0.170458	-0.156455
66 	10   	0.0098128 	-0.186108	-0.170458	-0.156455
67 	10   	0.0098128 	-0.186108	-0.170458	-0.156455
68 	10   	0.00954056	-0.186108	-0.168942	-0.156455
69 	10   	0.00954056	-0.186108	-0.168942	-0.156455
70 	10   	0.00954056	-0.186108	-0.168942	-0.156455
71 	10   	0.00954056	-0.186108	-0.168942	-0.156455
72 	10   	0.00954056	-0.186108	-0.168942	-0.156455
73 	10   	0.00954056	-0.186108	-0.168942	-0.156455
74 	10   	0.00954056	-0.186108	-0.168942	-0.156455
75 	10   	0.00954056	-0.186108	-0.168942	-0.156455
76 	10   	0.00954056	-0.186108	

86 	10   	0.0154584 	-0.185743	-0.164142	-0.135688
87 	10   	0.0154584 	-0.185743	-0.164142	-0.135688
88 	10   	0.0154584 	-0.185743	-0.164142	-0.135688
89 	10   	0.0136892 	-0.184079	-0.161567	-0.135688
90 	10   	0.0136892 	-0.184079	-0.161567	-0.135688
91 	10   	0.0136892 	-0.184079	-0.161567	-0.135688
92 	10   	0.0134781 	-0.184079	-0.16042 	-0.135688
93 	10   	0.0134781 	-0.184079	-0.16042 	-0.135688
94 	10   	0.0134781 	-0.184079	-0.16042 	-0.135688
95 	10   	0.0134781 	-0.184079	-0.16042 	-0.135688
96 	10   	0.0134781 	-0.184079	-0.16042 	-0.135688
97 	10   	0.0134781 	-0.184079	-0.16042 	-0.135688
98 	10   	0.0134781 	-0.184079	-0.16042 	-0.135688
99 	10   	0.0137906 	-0.184079	-0.157601	-0.135688
Best individual is  Individual('d', [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0,

In [146]:
print(clfs)
print(clf_weights)

[DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier(), DecisionTreeClassifier()]
[inf, nan, nan, nan, nan, nan, nan, nan, nan, nan]


In [147]:
def DERS_BOOST_classification(x,clfs,clf_weights):
    S = 0
    for t in range(len(clfs)):
        y = clfs[t].predict(x)
        S += clf_weights[t]*y
    if S>=0:
        return 1
    else:
        return -1

In [148]:
clf = trainDT(X_US,y_US)

y_pred = clf.predict(X_US)
y_pred[-2:] = -1
fails = (y_pred!=y_US)
e = sum(weights*fails)
w_clf = 0.5*math.log((1-e)/e)
Z = 2*math.sqrt(e*(1-e))
b = -(w_clf*y_US*y_pred)
a = np.exp(b)
w = weights*a/Z
print(w)
w /= sum(w)
print(w)
print(sum(w))

[nan nan nan ... nan nan nan]
[nan nan nan ... nan nan nan]
nan


In [150]:
DERS_BOOST_classification(X_test[0].reshape(1,-1),clfs,clf_weights)

-1

In [154]:
y_pred = []
for x in X_test:
    y_pred.append(DERS_BOOST_classification(x.reshape(1,-1),clfs,clf_weights))

print(sum(y_pred==y_test))
print(y_test.shape)
    

947
(1045,)
