# Loading an imbalanced dataset

In [97]:
from imblearn.datasets import fetch_datasets
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter

In [179]:
ecoli = fetch_datasets()['abalone']
X, y = ecoli.data, ecoli.target


In [180]:
# from sklearn.datasets import load_breast_cancer

# data = load_breast_cancer()
# X, y = data.data, data.target

In [181]:
default_classes = np.unique(y)
print(default_classes)

[-1  1]


In [182]:
maj_class = -1
min_class = 1
if sum(y == default_classes[0]) > sum(y == default_classes[1]):
#     maj_class = default_classes[0]
#     min_class = default_classes[1]
    y[y==default_classes[0]] = maj_class
    y[y==default_classes[1]] = min_class
else:
#     maj_class = default_classes[1]
#     min_class = default_classes[0]
    y[y==default_classes[1]] = maj_class
    y[y==default_classes[0]] = min_class
    
print("There are {} instances for the majoritary class".format(sum(y == maj_class)))
print("There are {} instanes for the minoritary class".format(sum(y == min_class)))
classes = [maj_class,min_class]

There are 3786 instances for the majoritary class
There are 391 instanes for the minoritary class


In [183]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y,random_state=0)
#number of features of the dataset 
D = X_train.shape[1]

print("There are {} instances for the majoritary class".format(sum(y_train == maj_class)))
print("There are {} instanes for the minoritary class".format(sum(y_train == min_class)))

There are 2839 instances for the majoritary class
There are 293 instanes for the minoritary class


# DE-guided UNDERSAMPLING of the majority class instances

In [184]:
import random
import array

from deap import base
from deap import creator
from deap import tools


Functions

In [185]:
# function to generate the initial random population from the training set
def load_individuals(X,y,creator,n):
    maj_samples = X[y == maj_class]
    min_samples = X[y == min_class]
    individuals = []
    for i in range(n):
        random_maj = maj_samples[random.randint(0,maj_samples.shape[0]-1)]
        random_min = min_samples[random.randint(0,min_samples.shape[0]-1)]
        individual = np.asarray(np.concatenate((random_maj,random_min)))
        
        individual = creator(individual)
        individuals.append(individual)
    return individuals

# returns the euclidean distance between two points
def euclidean(v1, v2):
    return sum((p-q)**2 for p, q in zip(v1, v2)) ** .5

#returns the sum of the distances from each sample in X_train to the closest center
#we are interested in minimizing this sum of distances
def evaluate(X,individual):
    S = 0
    for x in X:
        dist = dist_to_closest_center(x,individual[:D],individual[D:])
        S += dist
        
    return S,

#computes the euclidean distance for both centers and returns the shortest one
def dist_to_closest_center(x,maj_center,min_center):
    dist_majcenter = euclidean(x,maj_center)
    dist_mincenter = euclidean(x,min_center)
    return min(dist_majcenter,dist_mincenter)


In [186]:
NDIM = D*2

creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
#creator.create("Individual", list, fitness=creator.FitnessMin)
creator.create("Individual", array.array, typecode='d', fitness=creator.FitnessMin)
#creator.create("Individual", np.ndarray, fitness=creator.FitnessMin)

toolbox = base.Toolbox()
#toolbox.register("attr_float", random.uniform, -3, 3)
#toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_float, NDIM)
#toolbox.register("individual", selectRandomSamplesOneForEachClass, creator.Individual)
toolbox.register("population",load_individuals, X_train, y_train, creator.Individual)
#toolbox.register("population", tools.initRepeat, list, toolbox.individual)
toolbox.register("select", tools.selRandom, k=3)

toolbox.register("evaluate", evaluate, X_train)



In [187]:
def DE_clustering(CR,F,POP_SIZE,NGEN):
    # Differential evolution parameters
    #CR = 0.25
    #F = 1  
    #MU = 300
    #NGEN = 200    
    
    pop = toolbox.population(n=POP_SIZE);
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)
    
    logbook = tools.Logbook()
    logbook.header = "gen", "evals", "std", "min", "avg", "max"
    
    # Evaluate the individuals
    fitnesses = toolbox.map(toolbox.evaluate, pop)
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
    
    record = stats.compile(pop)
    logbook.record(gen=0, evals=len(pop), **record)
    print(logbook.stream)
    
    for g in range(1, NGEN):
        for k, agent in enumerate(pop):
            a,b,c = toolbox.select(pop)
            y = toolbox.clone(agent)
            index = random.randrange(NDIM)
            for i, value in enumerate(agent):
                if i == index or random.random() < CR:
                    y[i] = a[i] + F*(b[i]-c[i])
            y.fitness.values = toolbox.evaluate(y)
            if y.fitness > agent.fitness:
                pop[k] = y
            #print(pop[k].fitness)
        hof.update(pop)
        record = stats.compile(pop)
        logbook.record(gen=g, evals=len(pop), **record)
        print(logbook.stream)

    print("Best individual is ", hof[0], hof[0].fitness.values[0])
    return hof[0]

Compute H clustering processes and obtain one pair of cluster centers for each of them

In [188]:
H = 6
clustering_centers = []
for i in range(H):
    clustering_centers.append(DE_clustering(0.6,0.5,10,100))

gen	evals	std    	min   	avg    	max    
0  	10   	352.144	2501.5	3081.23	3672.92
1  	10   	231.476	2501.5	2959.84	3369.8 
2  	10   	223.349	2501.5	2933.46	3369.8 
3  	10   	246.685	2501.5	2887.23	3314.08
4  	10   	195.36 	2501.5	2842.45	3092.24
5  	10   	181.487	2501.5	2784.8 	3053.21
6  	10   	197.498	2457.66	2743.07	2995.12
7  	10   	202.089	2434.03	2711.84	2995.12
8  	10   	228.302	2268.23	2693.62	2995.12
9  	10   	206.387	2268.23	2666.47	2911.9 
10 	10   	189.164	2268.23	2614.05	2911.9 
11 	10   	149.754	2268.23	2558.91	2820.2 
12 	10   	141.776	2268.23	2548.68	2820.2 
13 	10   	102.416	2268.23	2488.9 	2712.75
14 	10   	120.701	2268.23	2459.16	2712.75
15 	10   	85.9301	2268.23	2415.31	2513.01
16 	10   	63.3863	2268.23	2365.28	2479.25
17 	10   	57.2633	2268.23	2341.93	2444.12
18 	10   	40.6711	2268.23	2314.37	2402.89
19 	10   	38.3853	2266.31	2303.85	2402.89
20 	10   	23.5224	2260.89	2289.71	2327.57
21 	10   	20.2218	2233.43	2270.46	2304.43
22 	10   	17.8152	2217.97	2257.72	2275.78

80 	10   	0.0142989	2143.48	2143.51	2143.53
81 	10   	0.0138198	2143.48	2143.51	2143.53
82 	10   	0.0122204	2143.48	2143.5 	2143.53
83 	10   	0.0140173	2143.48	2143.5 	2143.53
84 	10   	0.0133222	2143.48	2143.49	2143.53
85 	10   	0.00864053	2143.48	2143.49	2143.5 
86 	10   	0.00864016	2143.48	2143.48	2143.5 
87 	10   	0.00773163	2143.47	2143.48	2143.5 
88 	10   	0.00908411	2143.46	2143.48	2143.49
89 	10   	0.00889026	2143.46	2143.48	2143.49
90 	10   	0.00747306	2143.45	2143.47	2143.48
91 	10   	0.00680785	2143.45	2143.46	2143.48
92 	10   	0.0063119 	2143.44	2143.46	2143.46
93 	10   	0.00701999	2143.44	2143.45	2143.46
94 	10   	0.00636827	2143.43	2143.45	2143.46
95 	10   	0.00516832	2143.43	2143.44	2143.45
96 	10   	0.00534634	2143.43	2143.44	2143.45
97 	10   	0.00491173	2143.43	2143.44	2143.44
98 	10   	0.00751415	2143.42	2143.43	2143.44
99 	10   	0.00785865	2143.42	2143.43	2143.44
Best individual is  Individual('d', [0.5, 0.0, 0.530647697840041, 0.5646170898701318, 0.44874169541332143

52 	10   	15.4307	2157.68	2174.8 	2207.63
53 	10   	11.7828	2157.68	2170.48	2199.67
54 	10   	7.49234	2156.33	2167.24	2180.28
55 	10   	6.68566	2156.33	2164.74	2176.87
56 	10   	6.32007	2156.33	2163.71	2175.36
57 	10   	6.95856	2155.62	2162.44	2175.36
58 	10   	5.89257	2153.81	2159.05	2173.72
59 	10   	3.56202	2149.01	2155.39	2163.09
60 	10   	4.36171	2149.01	2153.92	2163.09
61 	10   	3.6099 	2149.01	2153.04	2158.96
62 	10   	2.91278	2149.01	2152.43	2157.28
63 	10   	1.69758	2149.01	2151.1 	2154.53
64 	10   	1.68242	2149.01	2151.04	2154.53
65 	10   	1.91979	2147.93	2150.71	2154.53
66 	10   	1.18199	2147.44	2149.47	2151.07
67 	10   	1.46827	2147.21	2148.78	2151.07
68 	10   	1.26114	2146.49	2147.96	2151.07
69 	10   	0.856371	2145.91	2147.21	2149.21
70 	10   	0.557859	2145.91	2146.92	2147.8 
71 	10   	0.689825	2145.66	2146.71	2147.8 
72 	10   	0.664247	2145.66	2146.55	2147.8 
73 	10   	0.614823	2145.44	2146.12	2147.09
74 	10   	0.541411	2145.37	2145.98	2147.09
75 	10   	0.402713	2144.74	2

23 	10   	174.538	2249.94	2418.19	2753.44
24 	10   	174.538	2249.94	2418.19	2753.44
25 	10   	151.678	2231.36	2362.6 	2753.44
26 	10   	156.67 	2199.79	2348.4 	2753.44
27 	10   	154.648	2199.79	2327.43	2753.44
28 	10   	157.101	2173.87	2307.19	2753.44
29 	10   	155.695	2173.87	2287.29	2742.4 
30 	10   	157.012	2173.87	2280.12	2742.4 
31 	10   	157.971	2173.87	2274.96	2742.4 
32 	10   	158.251	2173.87	2274.07	2742.4 
33 	10   	161.428	2173.87	2261.67	2742.4 
34 	10   	162.338	2173.87	2259.19	2742.4 
35 	10   	163.642	2173.87	2253.35	2742.4 
36 	10   	164.687	2173.87	2249.61	2742.4 
37 	10   	166.169	2163.87	2245.8 	2742.4 
38 	10   	100.946	2163.87	2218.54	2518.23
39 	10   	86.7676	2163.87	2211.49	2469.28
40 	10   	15.5052	2163.87	2183.81	2222.55
41 	10   	13.3902	2161.95	2178.7 	2213.29
42 	10   	11.1362	2161.95	2173.75	2198.87
43 	10   	8.59959	2160.21	2169.59	2184.4 
44 	10   	6.79236	2157.1 	2166.2 	2179.6 
45 	10   	5.69709	2157.1 	2164.92	2174.27
46 	10   	6.37106	2154.94	2162.93	

Take majority samples and compute for each of them their cluster stability

In [189]:
#classifies sample x to the class which center is closer to
def classify(x,centers):
    dist_majcenter = euclidean(x,centers[:len(x)])
    dist_mincenter = euclidean(x,centers[len(x):])
    return np.argmin([dist_majcenter,dist_mincenter])

In [190]:
majority_samples = X_train[y_train==maj_class]

cluster_stabilities = []
for sample in majority_samples:
    
    S = 0
    for clustering in clustering_centers:
        c = classes[classify(sample,clustering)]
        if c==maj_class:
            S += 1
    cluster_stabilities.append(S/H)

print(cluster_stabilities)

[0.8333333333333334, 0.8333333333333334, 0.16666666666666666, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.16666666666666666, 0.8333333333333334, 0.16666666666666666, 0.8333333333333334, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.8333333333333334, 0.8333333333333334, 0.16666666666666666, 0.8333333333333334, 0.8333333333333334, 0.16666666666666666, 0.16666666666666666, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.16666666666666666, 0.16666666666666666, 0.8333333333333334, 0.8333333333333334, 0.16666666666666666, 0.8333333333333334, 0.8333333333333334, 0.16666666666666666, 0.16666666666666666, 0.8333333333333334, 0.16666666666666666, 0.8333333333333334, 0.16666666666666666, 0.16666666666666666, 0.8333333333333334, 0.8333333333333334, 0.8333333333333334, 0.

Samples in the majority class with the clustering stability value Ci higher than a given threshold alpha are non-bounday samples.
We take alpha as 80% of the total clustering times (H???????????)

In [191]:
#alpha = H*0.8 ????????????????????
alpha = 0.8
boundary_points = majority_samples[np.array(cluster_stabilities)<=alpha]
non_boundary_points = majority_samples[np.array(cluster_stabilities)>alpha]

print(boundary_points.shape)
print(non_boundary_points.shape)

(813, 10)
(2026, 10)


We randomly under-sample the non boundary points of the majority class, giving more importance to the data distribution information in the under-sample process. (RUS puro o ponderando los ejemplos en base a su Ci?????????)

QUÉ PROPORCIÓN DE LOS NON-BOUNDARY SELECCIONAMOS??????????'

In [192]:
#RUS PURO
RUSsize = int(non_boundary_points.shape[0]*0.4)
indices = np.random.randint(non_boundary_points.shape[0], size=RUSsize)
nbp_us = non_boundary_points[indices]
print(nbp_us.shape)

(810, 10)


In [193]:
#RUS ponderado por el cluster stability de cada ejemplo
C_non_boundary = np.array(cluster_stabilities)[np.array(cluster_stabilities)>alpha]
indices = np.random.choice(np.arange(non_boundary_points.shape[0]),size=RUSsize,
                           p = C_non_boundary/sum(C_non_boundary))
nbp_us = non_boundary_points[indices]
print(nbp_us.shape)

(810, 10)


In [194]:
new_majorityclass_training = np.vstack((boundary_points,nbp_us))
print(new_majorityclass_training.shape)


(1623, 10)


Resumen del undersampling

In [195]:
print("Conjunto de entrenamiento original de tamaño: {}".format(X_train.shape[0]))
n_may,n_min = sum(y_train == maj_class),sum(y_train == min_class)
print("De los cuales:\n \t nº de ejemplos clase MAYORITARIA: {}\n \t nº de ejemplos clase MINORITARIA: {}"
      .format(n_may,n_min))
print("CONJUNTO DE DATOS NO-BALANCEADO")
print("IR = {}".format(n_may/n_min))

print("nº de ejemplos clase MAYORITARIA tras aplicar DE-guided UNDERSAMPLING: {}".format(new_majorityclass_training.shape[0]))
print("Conjunto de entrenamiento actual de tamaño: {}".format(new_majorityclass_training.shape[0]+n_min))

Conjunto de entrenamiento original de tamaño: 3132
De los cuales:
 	 nº de ejemplos clase MAYORITARIA: 2839
 	 nº de ejemplos clase MINORITARIA: 293
CONJUNTO DE DATOS NO-BALANCEADO
IR = 9.689419795221843
nº de ejemplos clase MAYORITARIA tras aplicar DE-guided UNDERSAMPLING: 1623
Conjunto de entrenamiento actual de tamaño: 1916


In [None]:

toolbox.unregister("population")
toolbox.unregister("select")

toolbox.unregister("evaluate")

# DE-guided OVERSAMPLING of the minority class instances

# ADA-BOOST combined with DE-guided resampling

In [196]:
from imblearn.over_sampling import SMOTE
import math

In [197]:
minority_samples = X_train[y_train==min_class]
#prepare adaboost training set joining the undersampled majority class instances with the minority class instances
X_US = np.vstack((new_majorityclass_training,minority_samples))
y_US = np.hstack((np.full(new_majorityclass_training.shape[0],maj_class),
                  np.full(minority_samples.shape[0],min_class)))
print(X_US.shape,y_US.shape)

(1916, 10) (1916,)


Apply SMOTE to the original undersampled set

In [198]:
def compute_synthetics(X,y,n_maj,n_min,maj_class,min_class): 
    print('Original dataset shape %s' % Counter(y))
    sm = SMOTE(sampling_strategy = {maj_class: n_maj,min_class: n_min*2})
    X_after_SMOTE, y_after_SMOTE = sm.fit_resample(X, y)
    print('Resampled dataset shape %s' % Counter(y_after_SMOTE))
    synthetic_samples = X_after_SMOTE[y_after_SMOTE==min_class][n_min:]
    #print('Synthetic samples: ',synthetic_samples)
    return synthetic_samples

In [199]:
# individual = np.random.randint(0,2,size=minority_samples.shape[0])
# print(individual)
# print(sum(individual))

## DE to select the best synthetics

In [248]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from imblearn.metrics import geometric_mean_score

def Gmean(clf,X_test,y_test):
    y_pred = clf.predict(X_test)
    gmean = geometric_mean_score(y_test, y_pred)
    print("Gmean:",gmean)
    return gmean
    
def compute_fitness(p,X_before_SMOTE,y_before_SMOTE,maj_class,min_class,synthetic_samples, individual):
    #print(synthetic_samples[0])
    selected_syn = []
    for i, value in enumerate(individual):
        if individual[i]>0:
            selected_syn.append(synthetic_samples[i-1])
    #selected_syn = synthetic_samples[individual>0]
    selected_syn = np.array(selected_syn)
    X = np.vstack((X_before_SMOTE,selected_syn))
    y = np.hstack((y_before_SMOTE,np.full(selected_syn.shape[0],min_class)))
    #realizar splitTest??????????????''
    Xtr, Xtst, ytr, ytst = train_test_split(X, y, test_size=0.3)
    #dt = trainDT(Xtr,ytr,weights)
    dt = trainDT(Xtr,ytr)
    #test??????????????????????????
    #G = Gmean(dt,X_before_SMOTE,y_before_SMOTE)
    G = Gmean(dt,Xtst,ytst)
    n_minority = len(individual)+sum(individual)
    n_majority = X_before_SMOTE[y_before_SMOTE==maj_class].shape[0]
    #f = G - abs(1-(n_minority/n_majority*p))
    #print("f: ",f)
    return G,

def trainDT(X_train,y_train,w=None):
    clf = DecisionTreeClassifier()
    clf = clf.fit(X_train,y_train,sample_weight=w)
    return clf



In [249]:
def DE_oversampling(CR,F_0,POP_SIZE,NGEN):
    # Differential evolution parameters
    #CR = 0.25
    #F = 1  
    #MU = 300
    #NGEN = 200    
    
    pop = toolbox.population(n=POP_SIZE);
    hof = tools.HallOfFame(1)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("std", np.std)
    stats.register("min", np.min)
    stats.register("max", np.max)
    
    logbook = tools.Logbook()
    logbook.header = "gen", "evals", "std", "min", "avg", "max"
    #print(pop)
    # Evaluate the individuals
    fitnesses = toolbox.map(toolbox.evaluate, pop)
    for ind, fit in zip(pop, fitnesses):
        ind.fitness.values = fit
    
    record = stats.compile(pop)
    logbook.record(gen=0, evals=len(pop), **record)
    print(logbook.stream)
    
    for g in range(1, NGEN):
        for k, agent in enumerate(pop):
            a,b,c = toolbox.select(pop)
            #we adopt a self-adaptative operator
            #l = math.exp(1-(NGEN/(NGEN+1-g)))
            #F = F_0*(2**l)
            #d = toolbox.clone(agent) #donor vector
            #sig_d = toolbox.clone(agent)
            y = toolbox.clone(agent)
            index = random.randrange(NDIM_DE_SMOTE)
            for i, value in enumerate(agent):
                #d[i] = a[i] + F*(b[i]-c[i]) #donor vector
                #the mutated donor is mapped to binary space by a sigmoid function with displacement
                #sig_d[i] = round(1/(1+math.exp(-(d[i]))))
                if i == index or random.random() < CR:
                    y[i] = a[i] + F*(b[i]-c[i])
#                     y[i] = sig_d[i]
            y.fitness.values = toolbox.evaluate(y)
            if y.fitness > agent.fitness:
                pop[k] = y
            #print(pop[k].fitness)
        hof.update(pop)
        record = stats.compile(pop)
        logbook.record(gen=g, evals=len(pop), **record)
        print(logbook.stream)

    print("Best individual is ", hof[0], hof[0].fitness.values[0])
    return hof[0]

In [250]:
# NDIM_DE_SMOTE = minority_samples.shape[0]
# n_majority = new_majorityclass_training.shape[0]
# p = 0.2

# creator.create("FitnessMax", base.Fitness, weights=(1.0,))
# creator.create("Individual", array.array, typecode='d', fitness=creator.FitnessMax, clf=None)

# toolbox = base.Toolbox()
# toolbox.register("attr_int", random.randint, 0, 1)
# toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_int, NDIM_DE_SMOTE)
# toolbox.register("population", tools.initRepeat, list, toolbox.individual)
# toolbox.register("select", tools.selRandom, k=3)

# syn = np.empty(X_US.shape)
# toolbox.register("evaluate", compute_fitness, p,
#                  X_US,y_US,maj_class,min_class,syn)


In [251]:
def DERS_BOOST_classification(x,clfs,clf_weights):
    S = 0
    for t in range(len(clfs)):
        y = clfs[t].predict(x)
        S += clf_weights[t]*y
    if S>=0:
        return 1
    else:
        return -1

In [252]:
def DERS_Boost_train(X,y,maj_class,min_class,T=10,CR=0.6,F=0.5,POP_SIZE=10,NGEN=100):
    N = X.shape[0]
    minority_samples = X[y==min_class]
    majority_samples = X[y==maj_class]
    n_maj = majority_samples.shape[0]
    n_min = minority_samples.shape[0]
    
    Xweights = np.full(N,1/N)
    
    #lista para almacenar el peso de cada clasificador débil
    clf_weights = []
    #lista para almacenar los clasifiadores débiles
    clfs = []
    
    #configuration for the DE algorithm
    p = 0.2
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", array.array, typecode='d', fitness=creator.FitnessMax, clf=None)

    toolbox = base.Toolbox()
    toolbox.register("attr_int", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_int, n_min)
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    toolbox.register("select", tools.selRandom, k=3)
    
    for i in range(T):
        
        #generar n_min instancias sintéticas de la clase minoritaria con SMOTE
        syn = compute_synthetics(X,y,n_maj,n_min,maj_class,min_class)
        
        toolbox.register("evaluate", compute_fitness, p,
                     X,y,maj_class,min_class,syn)
        toolbox.unregister("evaluate")
        #DE to select the best synthetics to train the base classifier
        selection_mask = DE_oversampling(CR,F,POP_SIZE,NGEN)
        selected_syn = []
        for i, value in enumerate(selection_mask):
            if selection_mask[i]>0:
                selected_syn.append(syn[i])
        selected_syn = np.array(selected_syn)
        Xtr = np.vstack((X,selected_syn))
        ytr = np.hstack((y,np.full(selected_syn.shape[0],min_class)))
        
        #asignar pesos para las instancias sintéticas
        syn_weights = np.full(selected_syn.shape[0],1/N)
        weights = np.hstack((Xweights,syn_weights))
        #weights = weights.astype('longdouble')

        #entrenar clasificador débil con los pesos
        print(weights)
        clf = trainDT(Xtr,ytr,weights)
        Gmean(clf,X_test,y_test)
        #calcular el error como la suma de los pesos de los ejemplos mal clasificados
        #considerar sólo los pesos de los ejemplos del cjto. original (no syn)
        y_pred = clf.predict(X)
        e = sum(Xweights*(y_pred!=y))
        print(e)
        
        if e==1:
            #return clfs,clf_weights
            w_clf = -2.3
        elif e!=0:
            #peso del clasificador débil
            w_clf = 0.5*math.log((1-e)/e)
            #actualización de pesos
#             Z = 2*math.sqrt(e*(1-e))
#             Xweights = Xweights*np.exp(-w_clf*y*y_pred)/Z
            Xweights = Xweights*np.exp(-w_clf*y*y_pred)
            Xweights /= sum(Xweights)
        else:
            w_clf = 2.3

        clf_weights.append(w_clf)
        clfs.append(clf)
        
    clf_weights
    return clfs,clf_weights

In [253]:
clfs,clf_weights = DERS_Boost_train(X_US,y_US,maj_class,min_class,NGEN=2)



Original dataset shape Counter({-1: 1623, 1: 293})
Resampled dataset shape Counter({-1: 1623, 1: 586})
gen	evals	std    	min    	avg    	max    
0  	10   	582.499	2424.77	3070.79	4013.01


NameError: name 'F' is not defined

In [213]:
DERS_BOOST_classification(X_test[0].reshape(1,-1),clfs,clf_weights)
y_test[0]
print(clf_weights)

[2.3, 2.3, 2.3, 2.3, 2.3, 2.3, 2.3, 2.3, 2.3, 2.3]


In [242]:
y_pred = []
for i in range(X_test.shape[0]):
    c = DERS_BOOST_classification(X_test[i].reshape(1,-1),clfs,clf_weights)
#     print("{} vs. {}".format(c,y_test[i]))
    if c!=y_test[i]:
        print("error")
    y_pred.append(c)

print(sum(y_pred==y_test))
print(y_test.shape)
    

error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
erro

In [235]:
G=Gmean(clfs[0],X_test,y_test)
minority_samples = X_US[y_US==min_class]
majority_samples = X_US[y_US==maj_class]
n_maj = majority_samples.shape[0]
n_min = minority_samples.shape[0]
print(n_min)
print(n_maj)
n_minority = n_min+n_min/2
n_majority = n_maj
f = G - abs(1-(n_minority/n_majority*p))
print(f)

Gmean: 0.5284403338836017
293
1623
-0.41740070123654616
