In [None]:
from sklearn import tree
from sklearn.tree.export import export_text
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.model_selection import KFold
from sklearn.base import clone
import numpy as np

def cross_validate_tests(X, Y, clf, n_splits = 4):
    """Run K-fold cross-validation and keep the estimator of the best fold.

    Parameters
    ----------
    X : array-like
        Feature rows; converted with ``np.array`` so the fold index arrays
        can be used to select rows.
    Y : array-like
        Targets aligned with ``X``; converted the same way.
    clf : estimator
        Estimator template exposing ``fit`` and ``score``. It is cloned for
        every fold, so the object passed in is never fitted or modified.
    n_splits : int, default 4
        Number of folds handed to ``KFold``.

    Returns
    -------
    scores : numpy.ndarray
        One ``score`` value per fold, in fold order.
    best : estimator
        The fitted clone from the fold with the highest score (the earliest
        such fold when several tie).

    Notes
    -----
    Prints the mean and the standard deviation of the fold scores.
    """
    X = np.array(X)
    Y = np.array(Y)
    kf = KFold(n_splits = n_splits)

    best = None
    bestAcc = None
    scores = []
    for train_index, test_index in kf.split(X):
        # Clone *before* fitting so every fold starts from an unfitted copy
        # and the caller's estimator is left untouched.
        model = clone(clf)
        model.fit(X[train_index], Y[train_index])
        score = model.score(X[test_index], Y[test_index])
        scores.append(score)

        # Track the running maximum; the first fold always seeds it so a
        # fitted estimator is returned even when every fold scores 0.
        if best is None or score > bestAcc:
            bestAcc = score
            best = model

    scores = np.array(scores)
    print(scores.mean(), scores.std())

    return scores, best
        
    

In [None]:
def doTree(data, gini = True, no_decr=True, maxDepth = 4, randomS = 1234):
    """Cross-validate a decision tree and export the selected tree as text.

    Parameters
    ----------
    data : mapping
        Must provide ``'data'``, ``'target'`` and ``'feature_names'`` entries.
    gini : bool, default True
        Truthy keeps the classifier's default criterion; falsy switches to
        ``"entropy"`` and additionally caps the tree at three leaf nodes.
    no_decr : bool, default True
        Falsy adds a ``min_impurity_decrease`` threshold (0.1 for the gini
        variant, 0.2 for the entropy variant).
    maxDepth : int, default 4
        Passed as ``max_depth``.
    randomS : int, default 1234
        Passed as ``random_state``.

    Returns
    -------
    tuple
        ``(mean score, estimator, score std, per-fold scores, text export)``.
    """
    print("Using a Decision Tree")

    settings = {"max_depth": maxDepth, "random_state": randomS}
    if not gini:
        # Entropy variants are also limited to three leaves.
        settings["criterion"] = "entropy"
        settings["max_leaf_nodes"] = 3
    if not no_decr:
        # The impurity-decrease threshold depends on the criterion in use.
        settings["min_impurity_decrease"] = 0.1 if gini else 0.2
    model = tree.DecisionTreeClassifier(**settings)

    fold_scores, chosen = cross_validate_tests(data['data'], data['target'], model)
    rules = export_text(chosen, feature_names=data['feature_names'])
    return (fold_scores.mean(), chosen, fold_scores.std(), fold_scores, rules)

In [None]:
def doNN(data, lbfgs=True, size =(5,2), randomS = 1234):
    """Cross-validate a multi-layer perceptron classifier.

    Parameters
    ----------
    data : mapping
        Must provide ``'data'`` and ``'target'`` entries.
    lbfgs : bool, default True
        Truthy selects ``solver='lbfgs'``; falsy leaves the solver at the
        classifier's default and sets ``alpha=1e-5`` instead.
    size : tuple, default (5, 2)
        Passed as ``hidden_layer_sizes``.
    randomS : int, default 1234
        Passed as ``random_state``.

    Returns
    -------
    tuple
        ``(mean score, estimator, score std, per-fold scores)``.
    """
    print("Using a Neural Network")

    # lbfgs is better for small data sets; the alternative branch relies on
    # the default solver (adam) with a small alpha.
    solver_args = {"solver": 'lbfgs'} if lbfgs else {"alpha": 1e-5}
    shared_args = {"hidden_layer_sizes": size, "max_iter": 350, "random_state": randomS}
    network = MLPClassifier(**solver_args, **shared_args)

    fold_scores, chosen = cross_validate_tests(data['data'], data['target'], network)
    return (fold_scores.mean(), chosen, fold_scores.std(), fold_scores)

In [None]:
def doKneigh(data, kn=3, randomS = 1234):
    """Cross-validate a k-nearest-neighbours classifier.

    Parameters
    ----------
    data : mapping
        Must provide ``'data'`` and ``'target'`` entries.
    kn : int, default 3
        Passed as ``n_neighbors``.
    randomS : int, default 1234
        Accepted for signature parity with the other helpers; not used here.

    Returns
    -------
    tuple
        ``(mean score, estimator, score std, per-fold scores)``.
    """
    print("Using the K-neighbours algorithm")
    knn = KNeighborsClassifier(n_neighbors=kn)
    fold_scores, chosen = cross_validate_tests(data['data'], data['target'], knn)
    return (fold_scores.mean(), chosen, fold_scores.std(), fold_scores)

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def doSVC(data, kernel ="rbf", randomS = 1234) :
    """Cross-validate a support-vector classifier behind a standard scaler.

    Parameters
    ----------
    data : mapping
        Must provide ``'data'`` and ``'target'`` entries.
    kernel : str, default "rbf"
        Passed as the SVC ``kernel``.
    randomS : int, default 1234
        Passed as the SVC ``random_state``.

    Returns
    -------
    tuple
        ``(mean score, pipeline, score std, per-fold scores)``.
    """
    print("using SVC")

    # Scaling lives inside the pipeline, so it is re-fitted on each training
    # split together with the classifier.
    scaler = StandardScaler()
    classifier = SVC(kernel = kernel, gamma='auto', random_state=randomS)
    pipeline = make_pipeline(scaler, classifier)

    fold_scores, chosen = cross_validate_tests(data['data'], data['target'], pipeline)
    return (fold_scores.mean(), chosen, fold_scores.std(), fold_scores)

In [None]:
from sklearn.ensemble import RandomForestClassifier

def doRandomForest(data, n_est= 10, gini=True, randomS = 1234):
    """Cross-validate a depth-3 random forest classifier.

    Parameters
    ----------
    data : mapping
        Must provide ``'data'`` and ``'target'`` entries.
    n_est : int, default 10
        Passed as ``n_estimators``.
    gini : bool, default True
        Truthy keeps the classifier's default criterion; falsy switches to
        ``"entropy"``.
    randomS : int, default 1234
        Passed as ``random_state``.

    Returns
    -------
    tuple
        ``(mean score, estimator, score std, per-fold scores)``.
    """
    print("using random forest")

    forest_args = {"n_estimators": n_est, "max_depth": 3, "random_state": randomS}
    if not gini:
        forest_args["criterion"] = "entropy"
    forest = RandomForestClassifier(**forest_args)

    fold_scores, chosen = cross_validate_tests(data['data'], data['target'], forest)
    return (fold_scores.mean(), chosen, fold_scores.std(), fold_scores)

In [2]:
from sklearn.tree import export_graphviz
from pydotplus import graph_from_dot_data
from IPython.display import display, Image

def visualize_tree(tree, columns):
    """Render a single fitted tree as an inline PNG.

    Parameters
    ----------
    tree : estimator
        Fitted tree handed to ``export_graphviz``.
    columns : sequence
        Passed as ``feature_names`` for the node labels.

    The two classes are always labelled 'Do not choose' and 'Choose'.
    Returns nothing; the image is shown through ``display``.
    """
    class_labels = ['Do not choose', 'Choose']
    dot_source = export_graphviz(
        tree,
        feature_names = columns,
        class_names = class_labels,
        filled = True,
        rounded = True,
    )
    png_bytes = graph_from_dot_data(dot_source).create_png()
    display(Image(png_bytes))
    
    
def visualize_random_forest(forest, columns):
    """Draw every member of ``forest.estimators_`` with ``visualize_tree``.

    ``columns`` is forwarded unchanged as the feature names for each tree.
    """
    for member in forest.estimators_:
        visualize_tree(member, columns)
        
def visualize_grad_boost(grad, columns):
    """Draw the first tree of every entry in ``grad.estimators_``.

    Each entry is indexed with ``[0]`` before being handed to
    ``visualize_tree``; ``columns`` is forwarded as the feature names.
    """
    for stage in grad.estimators_:
        first_tree = stage[0]
        visualize_tree(first_tree, columns)

In [None]:
def doAll(data):
    """Evaluate every model configuration of the notebook on ``data``.

    Each configuration is announced with ``print``, evaluated through the
    matching ``do*`` helper, and stored in the returned dictionary under a
    descriptive key.

    Parameters
    ----------
    data : mapping
        Forwarded to the ``do*`` helpers; ``data['target']`` is also used to
        decide which neighbour counts are attempted.

    Returns
    -------
    dict
        Maps a configuration label to whatever the helper returned. Neighbour
        counts that are skipped because there are too few samples are stored
        as the placeholder ``(0, None)``.
    """
    answers = {}

    def record(label, run, key=None):
        # Announce the configuration, evaluate it, then file the result.
        print(label)
        answers[key or label] = run()

    # Decision trees at the default depth.
    record('tree with gini', lambda: doTree(data))
    record('tree with gini and impurity decrease', lambda: doTree(data, True, False))
    record('tree with entropy', lambda: doTree(data, False))
    record('tree with entropy and impurity decrease', lambda: doTree(data, False, False))

    # Decision trees limited to depth 3. Labels and keys keep their existing
    # spelling ('deth') because they are part of the returned dictionary.
    record('tree with gini with deth of 3', lambda: doTree(data, maxDepth = 3))
    record('tree with gini and impurity decrease with depth of 3',
           lambda: doTree(data, True, False, maxDepth = 3))
    record('tree with entropy with depth of 3',
           lambda: doTree(data, False, maxDepth = 3),
           key='tree with entropy with deth of 3')
    record('tree with entropy and impurity decrease with depth of 3',
           lambda: doTree(data, False, False, maxDepth = 3))

    # Neural networks trained with lbfgs.
    record('small NN with lbfgs', lambda: doNN(data))
    record('medium NN with lbfgs', lambda: doNN(data, True, (10, 10, 10)))
    record('mixed NN with lbfgs', lambda: doNN(data, True, (20, 5, 20, 5)))
    record('big NN with lbfgs', lambda: doNN(data, True, (100, 50, 20, 10, 5)))

    # The same layouts with the non-lbfgs branch of doNN.
    record('small NN with adam', lambda: doNN(data, False))
    record('medium NN with adam', lambda: doNN(data, False, (10, 10, 10)))
    record('mixed NN with adam', lambda: doNN(data, False, (20, 5, 20, 5)))
    record('big NN with adam', lambda: doNN(data, False, (100, 50, 20, 10, 5)))

    # K nearest neighbours: k=2 always runs, larger k only when the number
    # of samples exceeds the paired threshold.
    record('KNeigh of 2', lambda: doKneigh(data, 2))
    for k, min_samples in ((3, 5), (5, 10), (8, 20)):
        label = 'KNeigh of %d' % k
        if len(data['target']) > min_samples:
            record(label, lambda: doKneigh(data, k))
        else:
            answers[label] = (0, None)

    # Support vector classifiers.
    record('linear division svc', lambda: doSVC(data, 'linear'))
    record('poly division svc', lambda: doSVC(data, 'poly'))
    record('rbf division svc', lambda: doSVC(data))

    # Random forests: gini first, then entropy, each with 3, 5 and 10 trees.
    for criterion, extra_args in (('gini', ()), ('entropy', (False,))):
        for n_trees in (3, 5, 10):
            record('random forest with %s and %d trees' % (criterion, n_trees),
                   lambda: doRandomForest(data, n_trees, *extra_args))

    return answers

In [None]:
def finalTests(data):
    """Evaluate the reduced set of model configurations on ``data``.

    Covers decision trees (default depth and depth 3, both criteria), the
    'poly' and default-kernel SVCs, and random forests with 3 and 10 trees
    for both criteria. Each configuration is announced with ``print`` before
    it is evaluated.

    Parameters
    ----------
    data : mapping
        Forwarded unchanged to the ``do*`` helpers.

    Returns
    -------
    dict
        Maps a configuration label to whatever the helper returned.
    """
    # (printed label, storage key or None to reuse the label, helper,
    #  extra positional args, extra keyword args)
    plan = [
        ('tree with gini', None, doTree, (), {}),
        ('tree with entropy', None, doTree, (False,), {}),
        ('tree with gini with deth of 3', None, doTree, (), {'maxDepth': 3}),
        # Printed label and stored key differ for this entry; both are kept
        # exactly as they were.
        ('tree with entropy with depth of 3', 'tree with entropy with deth of 3',
         doTree, (False,), {'maxDepth': 3}),
        ('poly division svc', None, doSVC, ('poly',), {}),
        ('rbf division svc', None, doSVC, (), {}),
        ('random forest with gini and 3 trees', None, doRandomForest, (3,), {}),
        ('random forest with gini and 10 trees', None, doRandomForest, (10,), {}),
        ('random forest with entropy and 3 trees', None, doRandomForest, (3, False), {}),
        ('random forest with entropy and 10 trees', None, doRandomForest, (10, False), {}),
    ]

    answers = {}
    for label, key, helper, extra_args, extra_kwargs in plan:
        print(label)
        answers[key or label] = helper(data, *extra_args, **extra_kwargs)
    return answers