In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection

import evopipe
import steps
import warnings

from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the train and test splits. In both CSV files the first column is used
# as the class label and every remaining column as a feature.
filename = 'wilt-train.csv'
test_filename = 'wilt-test.csv'

train_data = pd.read_csv(filename, sep=',')
train_X = train_data[train_data.columns[1:]]
train_Y = train_data[train_data.columns[0]]

test_data = pd.read_csv(test_filename, sep=',')
test_X = test_data[test_data.columns[1:]]
test_Y = test_data[test_data.columns[0]]

# Fit the label encoder on the training labels only and reuse the SAME fitted
# encoder for the test labels. Fitting a second encoder on the test split can
# assign different integer codes to the same class (e.g. when a class is
# missing from one split), which silently corrupts every test score.
# `transform` raises ValueError if the test split contains an unseen label.
le = preprocessing.LabelEncoder()
train_Y = pd.Series(le.fit_transform(train_Y), index=train_Y.index)
test_Y = pd.Series(le.transform(test_Y), index=test_Y.index)

# The parameter grids are built from the number of feature columns.
params = steps.get_params(train_X.shape[1])

In [3]:
# Baseline: evaluate every classifier in `steps.clfs` on its own, first with
# default settings and then tuned by an exhaustive grid search over the grid
# stored for it in `params`.
print("Format:")
print("------------")
print("Classifier")
print("score")
print("cross-validation score")
print("GridSearch best parameters")
print("GridSearch score")
print("------------")
print()

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for name, cls in steps.clfs.items():
        classif = cls()
        print(type(classif).__name__)

        # Default-parameter model: train on the training split, score on test.
        classif.fit(train_X, train_Y)
        print(classif.score(test_X, test_Y))

        # Cross-validate on the TRAINING data; the test split must stay
        # held out, otherwise models are fitted on folds of the test set.
        cv_scores = model_selection.cross_val_score(classif, train_X, train_Y)
        print(cv_scores.mean())

        # Grid search over this classifier's parameter grid, scored on test.
        pipe_params = params[name]
        gs = model_selection.GridSearchCV(classif, pipe_params, n_jobs=-1, verbose=5)
        gs.fit(train_X, train_Y)

        print(gs.best_params_)
        print(gs.score(test_X, test_Y))
        print()

Format:
------------
Classifier
score
cross-validation score
GridSearch score
------------

SVC


KeyboardInterrupt: 

In [None]:
# Evolve pipelines built from `steps.preproc` and `steps.clfs`, scored with
# quadratic-weighted Cohen's kappa.
scorer = metrics.make_scorer(metrics.cohen_kappa_score, weights='quadratic')
clf = evopipe.EvoPipeClassifier(steps.preproc, steps.clfs, params, mutpb=0.5, swap_mutpb=0.5, param_mutpb=0.85,
                                ind_mutpb=0.8, scorer=scorer)
# NOTE(review): the test split is passed to fit(); presumably it is only used
# for the "train_test" logbook statistics and not for selection - confirm.
clf.fit(train_X, train_Y, test_X, test_Y)

score = clf.score(test_X, test_Y)
print("\nBest pipeline test score: {}\n".format(score))

best_pipes = clf.best_pipelines()

# Use a distinct loop variable so the overall `score` computed above is not
# overwritten by the per-pipeline values.
for pipe, pipe_score in best_pipes:
    # Refit on the training data so each returned pipeline is left fitted.
    pipe.fit(train_X, train_Y)

    step_names = [step_name for step_name, _ in pipe.steps]
    print("Score: {}, Pipe: {}".format(pipe_score, step_names))

Evolution starting...

Gen 6:

Hall of fame:
[('QDA', OrderedDict([('reg_param', 0.5), ('tol', 0.01)]))]
[('kBest', OrderedDict([('k', 5)])), ('gaussianNB', OrderedDict())]
[('gaussianNB', OrderedDict())]
[('kBest', OrderedDict([('k', 5)])), ('LDA', OrderedDict([('shrinkage', 0.5), ('solver', 'eigen')]))]
[('LDA', OrderedDict([('shrinkage', 0.5), ('solver', 'eigen')]))]

Gen 11:

Hall of fame:
[('QDA', OrderedDict([('reg_param', 0.5), ('tol', 0.01)]))]
[('PCA', OrderedDict([('n_components', 5), ('whiten', False)])), ('gaussianNB', OrderedDict())]
[('PCA', OrderedDict([('n_components', 5), ('whiten', True)])), ('PAC', OrderedDict([('C', 10), ('loss', 'squared_hinge')]))]
[('kBest', OrderedDict([('k', 5)])), ('gaussianNB', OrderedDict())]
[('gaussianNB', OrderedDict())]

Gen 16:

Hall of fame:
[('kBest', OrderedDict([('k', 5)])), ('QDA', OrderedDict([('reg_param', 0.5), ('tol', 0.01)]))]
[('QDA', OrderedDict([('reg_param', 0.5), ('tol', 0.01)]))]
[('PCA', OrderedDict([('n_components', 5)

In [None]:
# Show the raw evolution log, then plot the per-generation maximum and
# average of the "fitness" and "train_test" logbook chapters.
print(clf.logbook)

gen = clf.logbook.select("gen")
# Only the statistics that are plotted are selected; the previous version also
# bound an unused `vars`, which shadowed the Python builtin of the same name.
avgs, maxs = clf.logbook.chapters["fitness"].select("avg", "max")
avgs_tt, maxs_tt = clf.logbook.chapters["train_test"].select("avg", "max")

sns.set()

fig, ax1 = plt.subplots()
ax1.plot(gen, maxs, label='Maximum Fitness')
ax1.plot(gen, avgs, label='Average Fitness')
ax1.plot(gen, maxs_tt, label='Maximum Test score')
ax1.plot(gen, avgs_tt, label='Average Test score')

ax1.set_xlabel("Generation")
ax1.set_ylabel("Fitness")
# The legend entries come from the `label=` given to each line above.
ax1.legend(loc="best")

plt.show()