In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import math
import numpy as np
import scipy.stats
import copy
import random
import operator

In [14]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [117]:
class Chromossome:
    def __init__(self, algorithm, random_state=None, **hyperparameter_range):
        self.hyperparameter_range = hyperparameter_range
        try:
            self.classifier = algorithm(random_state=random_state)
        except:
            self.classifier = algorithm()
        self.mutate()
        self.fitness = 0
        self.is_fitted = False
        
    def fit(self, X, y):
        is_fitted = True
        self.classifier.fit(X, y)
        
    def predict(self, X):
        return self.classifier.predict(X)
    
    def check_is_fitted(self):
        return self.is_fitted
    
    def mutate(self, n_positions=None):
        param = {}        
        if not n_positions or n_positions>len(self.hyperparameter_range):
            n_positions = len(self.hyperparameter_range)
        mutation_positions = random.sample(range(0, len(self.hyperparameter_range)), n_positions)
        i = 0
        for hyperparameter, h_range in self.hyperparameter_range.items():
            if i in mutation_positions:
                if isinstance(h_range[0], str):
                    param[hyperparameter] = random.choice(h_range)
                elif isinstance(h_range[0], float):
                    param[hyperparameter] = random.uniform(h_range[0], h_range[1]+1)
                else:
                    param[hyperparameter] = random.randint(h_range[0], h_range[1]+1)
            i+= 1
        
        self.classifier.set_params(**param)

In [129]:
class DiversityEnsembleClassifier:
    def __init__(self, algorithms, population_size = 100, max_epochs = 100, random_state=None):
        self.population_size = population_size
        self.max_epochs = max_epochs
        self.population = []
        self.random_state = random_state
        for algorithm, hyperparameters in algorithms.items():
            for i in range(0, math.ceil(population_size/len(algorithms.keys()))):
                self.population.append(Chromossome(algorithm, random_state=random_state, **hyperparameters))
    
    def generate_offspring(self):
        for i in range(0, self.population_size):
            new_chromossome = copy.deepcopy(self.population[i])
            new_chromossome.mutate(1)
            self.population.append(new_chromossome)
            
    def fit_predict_population(self, kfolds, X, y):
        predictions = np.empty([2*self.population_size, y.shape[0]])
        for i in range(2*self.population_size):
            chromossome = self.population[i]
            for train, val in kfolds.split(X):
                if not chromossome.check_is_fitted(): chromossome.fit(X[train], y[train])
                predictions[i][val] = np.equal(chromossome.predict(X[val]), y[val])
        return predictions 
    
    def diversity_selection(self, predictions):
        distances = np.zeros(2*self.population_size)
        pop_fitness = predictions.sum(axis=1)
        target_chromossome = np.argmin(pop_fitness)
        new_population = []            
        selected = [target_chromossome]
        diversity  = np.zeros(2*self.population_size)
        for i in range(0, self.population_size-1):
            distances[target_chromossome] = float('-inf')
            d_i = np.logical_xor(predictions, predictions[target_chromossome]).sum(axis=1)
            distances += d_i
            diversity += d_i
            target_chromossome = np.argmax(distances)
            selected.append(target_chromossome)
            self.population[target_chromossome].fitness = pop_fitness[target_chromossome]
        for x in selected:
            new_population.append(self.population[x]) 
        print(diversity[selected].sum())
        return new_population
    
    def fit(self, X, y):
        print('Running epoch ', end='')        
        kf = KFold(n_splits=5, random_state=self.random_state)      
        
        for epoch in range(self.max_epochs):             
            print(epoch, end='...')            
            self.generate_offspring()                
            predictions = self.fit_predict_population(kf, X, y)            
            self.population = self.diversity_selection(predictions)
            
        for chromossome in self.population:
            chromossome.fit(X, y)
            
    def predict(self, X):
        predictions = np.empty((self.population_size, len(X)))
        y = np.empty(len(X))
        for chromossome in range(0, self.population_size):
            predictions[chromossome] = self.population[chromossome].predict(X)
        for i in range(0, len(X)):
            pred = {}
            for j in range(0, self.population_size):
                if predictions[j][i] in pred:
                    pred[predictions[j][i]] += self.population[j].fitness
                else: 
                    pred[predictions[j][i]]  = self.population[j].fitness
            y[i] = max(pred.items(), key=operator.itemgetter(1))[0]
        return y
        """
        predictions = np.empty([self.population_size, X.shape[0]])
        for i in range(0, self.population_size):
            predictions[i] = self.population[i].predict(X)
        return scipy.stats.mode(predictions, axis=0).mode[0]
        """

In [130]:
wine = datasets.load_wine()

In [131]:
X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size=0.2, random_state=42)

In [134]:
alg = {
            KNeighborsClassifier: {'n_neighbors':[1, 107]},
            SVC: {'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'], 
                  'degree' : [0, 3]
                  },
            DecisionTreeClassifier: {'min_samples_leaf':[1, 100], 'max_depth':[3, 20]},
            RandomForestClassifier: {'min_samples_leaf':[1, 100], 'max_depth':[3, 20],
                                     'n_estimators':[100, 100]},
            GaussianNB: {},
            LinearDiscriminantAnalysis: {}
    
      }
dec = DiversityEnsembleClassifier(alg, population_size=50, max_epochs=100, random_state=42)
dec.fit(X_train,  y_train)
print(accuracy_score(y_test, dec.predict(X_test)))

Running epoch 0...123773.0
1...126653.0
2...128511.0
3...130551.0
4...131607.0
5...131848.0
6...132259.0
7...132845.0
8...133965.0
9...134682.0
10...134736.0
11...134736.0
12...134843.0
13...134935.0
14...135094.0
15...135094.0
16...135094.0
17...135377.0
18...135621.0
19...135621.0
20...135826.0
21...136170.0
22...136304.0
23...136304.0
24...136394.0
25...136553.0
26...136559.0
27...136559.0
28...136559.0
29...136559.0
30...136559.0
31...136822.0
32...136988.0
33...137018.0
34...137079.0
35...137079.0
36...137079.0
37...137079.0
38...137079.0
39...137086.0
40...137086.0
41...137086.0
42...137086.0
43...137159.0
44...137159.0
45...137159.0
46...137159.0
47...137159.0
48...137159.0
49...137222.0
50...137222.0
51...137243.0
52...137243.0
53...137243.0
54...137243.0
55...137243.0
56...137231.0
57...137236.0
58...137237.0
59...137237.0
60...137237.0
61...137237.0
62...137237.0
63...137237.0
64...137237.0
65...137237.0
66...137237.0
67...137237.0
68...137237.0
69...137237.0
70...137237.0
71

In [122]:
#Inspect the classifiers
print([x.classifier for x in dec.population])

[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=87, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'), DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=32, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'), KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'), DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            m

In [123]:
#Check number of unique classifiers
unique = set([str(x.classifier.get_params()) for x in dec.population])
len(unique)

20

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
print(accuracy_score(y_test, rf.predict(X_test)))

In [None]:
np.empty()