In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
import math
import numpy as np
import scipy.stats
import copy
import random

In [13]:
class Chromossome:
    def __init__(self, algorithm, **hyperparemeter_range):
        self.hyperparemeter_range = hyperparemeter_range
        self.classifier = algorithm()
        self.mutate()
        
    def fit(self, X, y):
        self.classifier.fit(X, y)
        
    def predict(self, X):
        return self.classifier.predict(X)
    
    def mutate(self):
        param = {}        
        for hyperparameter, h_range in self.hyperparemeter_range.items():
            if isinstance(h_range[0], str):
                param[hyperparameter] = random.choice(h_range)
            elif isinstance(h_range[0], float):
                param[hyperparameter] = random.uniform(h_range[0], h_range[1])
            else:
                param[hyperparameter] = random.randint(h_range[0], h_range[1])
        
        self.classifier.set_params(**param)

In [56]:
class DiversityEnsembleClassifier:
    def __init__(self, algorithms, population_size = 100, max_epochs = 100, random_state=None):
        self.population_size = population_size
        self.max_epochs = max_epochs
        self.population = []
        self.random_state = random_state
        random.seed(random_state)
        for algorithm, hyperparameters in algorithms.items():
            for i in range(0, math.ceil(population_size/len(algorithms.keys()))):
                self.population.append(Chromossome(algorithm, **hyperparameters))
        
    def fit(self, X, y):
        print('Running epoch ', end='')
        
        max_diversity = float('-inf')
        
        for epoch in range(self.max_epochs):
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=epocj)
            random.seed(self.random_state)
            print(epoch, end='...')
            predictions = np.empty([2*self.population_size, y_val.shape[0]])
            
            for i in range(0, self.population_size):
                new_chromossome = copy.deepcopy(self.population[i])
                new_chromossome.mutate()
                self.population.append(new_chromossome)
                
            for i in range(2*self.population_size):
                chromossome = self.population[i]
                chromossome.fit(X_train, y_train)
                predictions[i] = np.logical_and(chromossome.predict(X_val), y_val)
                 
            distances = np.zeros(2*self.population_size)
            target_chromossome = np.argmax(predictions.sum(axis=1))
            new_population = [self.population[target_chromossome]]            
            
            for i in range(0, self.population_size-1):
                distances[target_chromossome] = float('-inf')
                d_i = np.logical_xor(predictions, predictions[target_chromossome]).sum(axis=1)
                d_i[d_i == 0] = -10
                distances += d_i
                target_chromossome = np.argmax(distances)
                new_population.append(self.population[target_chromossome])
                if epoch == self.max_epochs -1:
                    print(predictions[target_chromossome])           
            diversity = distances[distances>float('-inf')].sum()
            print(diversity)
            if diversity > max_diversity:
                self.population =new_population
                max_diversity = diversity              
            
        for chromossome in self.population:
            chromossome.fit(X, y)
            
    def predict(self, X):
        predictions = np.empty([self.population_size, X.shape[0]])
        for i in range(0, self.population_size):
            predictions[i] = self.population[i].predict(X)
        return scipy.stats.mode(predictions, axis=0).mode
        

In [57]:
wine = datasets.load_wine()

In [58]:
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.2, random_state=42)
alg = {
            KNeighborsClassifier: {'n_neighbors':[1, 120]},
            SVC: {'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'], 
                  'degree' : [0, 3]
                  },
            DecisionTreeClassifier: {'min_samples_leaf':[1, 100], 'max_depth':[3, 20]}
      }
dec = DiversityEnsembleClassifier(alg, population_size=50, max_epochs=100, random_state=42)
dec.fit(X_train,  y_train)
print('Accuracy', np.logical_and(dec.predict(X_test), y_test).sum()/y_test.shape[0])


Running epoch 0...-18615.0
1...-14331.0
2...-20502.0
3...-7633.0
4...-5350.0
5...-1123.0
6...-5385.0
7...-7104.0
8...-5406.0
9...544.0
10...-435.0
11...-7293.0
12...-3493.0
13...-1026.0
14...-4947.0
15...-15810.0
16...-7089.0
17...-11471.0
18...-2889.0
19...-17.0
20...-7065.0
21...-8517.0
22...-3565.0
23...2383.0
24...781.0
25...-2422.0
26...6063.0
27...-24990.0
28...1456.0
29...6939.0
30...1530.0
31...2499.0
32...-2403.0
33...-78.0
34...-21063.0
35...-3567.0
36...-1527.0
37...-3693.0
38...-255.0
39...-491.0
40...-6600.0
41...-3585.0
42...-760.0
43...-147.0
44...3372.0
45...1776.0
46...-8568.0
47...-14433.0
48...-2981.0
49...-3570.0
50...-24990.0
51...-2589.0
52...3789.0
53...-4114.0
54...4647.0
55...-2634.0
56...-7854.0
57...-24990.0
58...-1867.0
59...3276.0
60...-17136.0
61...-24990.0
62...393.0
63...2600.0
64...-3921.0
65...-2014.0
66...-6600.0
67...1104.0
68...-10956.0
69...-3156.0
70...2915.0
71...-9041.0
72...-833.0
73...2142.0
74...548.0
75...-10242.0
76...-9537.0
77...-11405.0


In [46]:
print([x.classifier for x in dec.population])

[SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=0, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False), DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=9,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=47, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=60, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'), SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_funct

In [45]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
print('Accuracy', np.logical_and(rf.predict(X_test), y_test).sum()/y_test.shape[0])


Accuracy 0.6111111111111112
