In [60]:
# helpers
import ast
import numpy as np
import pandas as pd
from time import time
from collections import defaultdict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.utils import compute_sample_weight
from sklearn.tree import DecisionTreeClassifier as dtclf
import sklearn.model_selection as ms
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt


def balanced_accuracy(truth, pred):
    """Accuracy of ``pred`` against ``truth`` using sklearn's 'balanced' sample weights.

    The per-sample weights are derived from ``truth`` only and are handed to
    ``accuracy_score`` as ``sample_weight``.
    """
    class_balancing_weights = compute_sample_weight('balanced', truth)
    return accuracy_score(truth, pred, sample_weight=class_balancing_weights)

# Scorer built from balanced_accuracy; passed as ``scoring`` by the helpers below.
scorer = make_scorer(balanced_accuracy)

# Plots learning curve
def plot_learning_curve(title,
                        train_sizes,
                        train_scores, test_scores, ylim=None,
                        cv=None,
                        n_jobs=None):
    """Draw a learning curve (mean score with a +/- 1 std band) on a new figure.

    title        -- figure title.
    train_sizes  -- x coordinates, one per row of the score tables.
    train_scores -- 2-D scores; mean and std are taken along axis 1.
    test_scores  -- same layout as ``train_scores``.
    ylim         -- optional (low, high) pair forwarded to ``plt.ylim``.
    cv, n_jobs   -- accepted but not used by this function.

    Returns the ``matplotlib.pyplot`` module so the caller can save/close the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # (scores, colour, legend label) for each curve, training curve first.
    series = (
        (train_scores, "r", "Training score"),
        (test_scores, "g", "Test score"),
    )
    summaries = [(np.mean(scores, axis=1), np.std(scores, axis=1), colour, label)
                 for scores, colour, label in series]
    plt.grid()

    # Shaded bands first, then the lines, in the same order for both passes.
    for centre, spread, colour, _ in summaries:
        plt.fill_between(train_sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    for centre, _, colour, label in summaries:
        plt.plot(train_sizes, centre, 'o-', color=colour, label=label)

    plt.legend(loc="best")

    return plt

# Grid search + learning curve
def basicResults(clfObj, trgX, trgY, tstX, tstY, params, clf_type=None, dataset=None):
    """Grid-search ``clfObj`` on the training data, save the results and a learning curve.

    Runs a 5-fold ``GridSearchCV`` over ``params`` scored with the module-level
    ``scorer`` and writes, under ``./output`` (the directories are not created here):

    * ``{clf_type}_{dataset}_reg.csv``  -- the full ``cv_results_`` table,
    * ``{clf_type}_{dataset}_best.csv`` -- train/test score, timings and best params,
    * ``{clf_type}_{dataset}_LC_train.csv`` / ``_LC_test.csv`` -- learning-curve scores,
    * ``images/{clf_type}_{dataset}_LC.png`` -- the learning-curve figure.

    Parameters
    ----------
    clfObj : estimator or pipeline handed to ``GridSearchCV``.
    trgX, trgY : training features / labels. ``trgX[0].reshape(1, -1)`` is evaluated,
        so ``trgX`` must support NumPy-style row indexing.
    tstX, tstY : held-out features / labels, used only for the reported test score.
    params : parameter grid passed as ``param_grid``.
    clf_type, dataset : labels used in the log line and in the output file names.
        Both are required despite having ``None`` defaults.

    Returns
    -------
    The fitted ``GridSearchCV`` object.

    Raises
    ------
    ValueError
        If ``clf_type`` or ``dataset`` is None.
    """
    # Validate before any side effect. The previous bare ``raise`` had no active
    # exception to re-raise and surfaced as an uninformative RuntimeError.
    if clf_type is None or dataset is None:
        raise ValueError('basicResults requires both clf_type and dataset')
    np.random.seed(55)
    print('Calculating basic results for', dataset)

    # Saves results of hyperparameters grid search
    cv = ms.GridSearchCV(clfObj,
                         n_jobs=-1,
                         param_grid=params,
                         verbose=False,
                         cv=5,
                         scoring=scorer,
                         return_train_score=True)
    cv.fit(trgX, trgY)

    # Time one more fit of the best estimator on the full training set ...
    st = time()
    cv.best_estimator_.fit(trgX, trgY)
    train_time = time() - st
    # ... and the prediction of a single training row.
    st = time()
    cv.predict(trgX[0].reshape(1, -1))
    predict_time = time() - st

    regTable = pd.DataFrame(cv.cv_results_)
    regTable.to_csv('./output/{}_{}_reg.csv'.format(clf_type, dataset), index=False)
    train_score = cv.score(trgX, trgY)
    test_score = cv.score(tstX, tstY)

    # Saves best iteration
    results = pd.DataFrame({'classifier': [clf_type],
                            'dataset': [dataset],
                            'train_score': [train_score],
                            'test_score': [test_score],
                            'train_time': [train_time],
                            'predict_time': [predict_time],
                            'params': [cv.best_params_]})

    results.to_csv('./output/{}_{}_best.csv'.format(clf_type, dataset))

    # Saves learning curve
    N = trgY.shape[0]
    train_sizes = [50, 100] + [int(N * x / 10) for x in range(1, 8)]
    np.random.seed(55)
    curve = ms.learning_curve(cv.best_estimator_,
                              trgX, trgY,
                              cv=5,
                              train_sizes=train_sizes,
                              verbose=False,
                              scoring=scorer)

    # curve[0] holds the training-set sizes actually used; it indexes both score
    # tables and is also used as the x axis so the plot always matches the data rows.
    used_sizes = curve[0]
    curve_train_scores = pd.DataFrame(index=used_sizes, data=curve[1])
    curve_test_scores = pd.DataFrame(index=used_sizes, data=curve[2])

    curve_train_scores.to_csv('./output/{}_{}_LC_train.csv'.format(clf_type, dataset))
    curve_test_scores.to_csv('./output/{}_{}_LC_test.csv'.format(clf_type, dataset))

    lc_plot = plot_learning_curve('Learning Curve: {} - {}'.format(clf_type, dataset),
                                  used_sizes,
                                  curve_train_scores, curve_test_scores, ylim=None,
                                  cv=None,
                                  n_jobs=None)

    lc_plot.savefig('./output/images/{}_{}_LC.png'.format(clf_type, dataset), format='png', dpi=150)
    lc_plot.close()
    return cv

def iterationLC(clfObj, trgX, trgY, tstX, tstY, params, clf_type=None, dataset=None):
    """Sweep one hyperparameter and record train/test balanced accuracy per value.

    Only the first entry of ``params`` is used: its key is the parameter name
    (as accepted by ``clfObj.set_params``) and its value the sequence to sweep.
    For every value the estimator is fitted on the training data and scored with
    ``balanced_accuracy`` on both the training and the test data. The table is
    written to ``./output/{clf_type}_{dataset}_ILC.csv`` with columns
    ``param_<name>``, ``train acc`` and ``test acc``.

    ``clfObj`` is modified in place and is left configured with (and fitted for)
    the last swept value.

    Parameters
    ----------
    clfObj : estimator or pipeline supporting ``set_params`` / ``fit`` / ``predict``.
    trgX, trgY, tstX, tstY : training and test features / labels.
    params : mapping whose first item is ``name -> iterable of values``.
    clf_type, dataset : labels used in the log line and the output file name.
        Both are required despite having ``None`` defaults.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``clf_type`` or ``dataset`` is None.
    """
    # Validate first; the previous bare ``raise`` had no active exception to
    # re-raise and surfaced as an uninformative RuntimeError.
    if clf_type is None or dataset is None:
        raise ValueError('iterationLC requires both clf_type and dataset')
    print('Calculating iteration learning curve for ',dataset)
    np.random.seed(55)

    name, values = list(params.items())[0]
    d = defaultdict(list)
    for value in values:
        d['param_{}'.format(name)].append(value)
        clfObj.set_params(**{name: value})
        # Fit once and score that same fitted model on both splits, so the two
        # accuracies are comparable and no redundant training run is paid for.
        clfObj.fit(trgX, trgY)
        d['train acc'].append(balanced_accuracy(trgY, clfObj.predict(trgX)))
        d['test acc'].append(balanced_accuracy(tstY, clfObj.predict(tstX)))

    d = pd.DataFrame(d)
    d.to_csv('./output/{}_{}_ILC.csv'.format(clf_type, dataset), index=False)
    return
    
def add_noise(y, frac=0.1):
    """Return a copy of ``y`` in which a random ``frac`` of the entries is replaced by ``1 - value``.

    The global NumPy RNG is reseeded with 456 on every call, so the same positions
    are chosen for a given length and fraction. ``y`` itself is not modified.
    """
    np.random.seed(456)
    total = y.shape[0]
    flip_count = int(total * frac)
    # Distinct positions to alter (sampling without replacement).
    flip_positions = np.random.choice(np.arange(total), size=flip_count, replace=False)
    noisy = y.copy()
    noisy[flip_positions] = 1 - noisy[flip_positions]
    return noisy

# Timing curve
def plot_timing_curve(title, data_sizes, fit_times, predict_times, ylim=None):
    """Plot fit and predict times (mean with a +/- 1 std band) on a new figure.

    title         -- figure title.
    data_sizes    -- x coordinates, one per row of the timing tables.
    fit_times     -- 2-D timings; mean and std are taken along axis 1.
    predict_times -- same layout as ``fit_times``.
    ylim          -- optional (low, high) pair forwarded to ``plt.ylim``.

    Returns the ``matplotlib.pyplot`` module so the caller can save/close the figure.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Testing Size (% of total)")
    plt.ylabel("Time (s)")

    fit_centre, fit_spread = np.mean(fit_times, axis=1), np.std(fit_times, axis=1)
    predict_centre, predict_spread = np.mean(predict_times, axis=1), np.std(predict_times, axis=1)
    plt.grid()

    # Bands are drawn fit-then-predict ...
    for centre, spread in ((fit_centre, fit_spread), (predict_centre, predict_spread)):
        plt.fill_between(data_sizes, centre - spread, centre + spread, alpha=0.2)
    # ... while the lines are drawn predict-then-fit.
    for centre, label in ((predict_centre, "Predict time"), (fit_centre, "Fit time")):
        plt.plot(data_sizes, centre, 'o-', linewidth=1, markersize=4,
                 label=label)

    plt.legend(loc="best")
    return plt

def make_timing_curve(X, Y, clf, clf_name, dataset):
    """Time ``clf.fit`` / ``clf.predict`` for test fractions 0.1 .. 0.9 and save table + plot.

    For each fraction the data is split with ``train_test_split(random_state=42)``;
    the fit is timed on the training part and the prediction on the test part.
    Results go to ``./output/{clf_name}_{dataset}_timing.csv`` and
    ``./output/images/{clf_name}_{dataset}_TC.png``. ``clf`` is refitted in place.
    """
    print('Making timing curve for', dataset)
    fracs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    elapsed = defaultdict(dict)  # elapsed['train' | 'test'][fraction] -> seconds
    for test_fraction in fracs:
        X_train, X_test, y_train, y_test = ms.train_test_split(
            X, Y, test_size=test_fraction, random_state=42)

        started = time()
        np.random.seed(55)  # the reseed is inside the timed section
        clf.fit(X_train, y_train)
        elapsed['train'][test_fraction] = time() - started

        started = time()
        clf.predict(X_test)
        elapsed['test'][test_fraction] = time() - started

    timing_table = pd.DataFrame(elapsed)
    timing_table.to_csv('./output/{}_{}_timing.csv'.format(clf_name, dataset))

    # One-column frames, so the plot's axis=1 mean/std reduce to the value and 0.
    train_df = pd.DataFrame(timing_table['train'], index=fracs)
    test_df = pd.DataFrame(timing_table['test'], fracs)
    chart = plot_timing_curve('Timing Curve: {} - {}'.format(clf_name, dataset),
                              np.array(fracs) * 100,
                              train_df, test_df)

    chart.savefig('./output/images/{}_{}_TC.png'.format(clf_name, dataset), format='png', dpi=150)
    chart.close()
    return

def plot_complexity_curve(title, param, classifier, dataset, ylim=None):
    """Plot mean train/test CV score against one grid-searched parameter.

    Reads ``./output/{classifier}_{dataset}_best.csv`` and ``..._reg.csv``, keeps
    only the grid rows whose other parameters equal the best parameters, and plots
    the mean scores (with +/- 1 std bands) against ``{classifier}__{param}``.

    Returns the ``matplotlib.pyplot`` module so the caller can save/close the figure.
    """
    plt.figure()
    plt.title(title)

    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel(param)
    plt.ylabel("Score")

    swept_key = '{}__{}'.format(classifier, param)
    swept_column = 'param_{}__{}'.format(classifier, param)

    best = pd.read_csv('./output/{}_{}_best.csv'.format(classifier, dataset))
    # The params cell is the text form of a dict; parse it back into a dict.
    held_fixed = ast.literal_eval(best.loc[0, 'params'])
    grid = pd.read_csv('./output/{}_{}_reg.csv'.format(classifier, dataset))
    held_fixed.pop(swept_key)

    # Pin every other parameter to its best value.
    for key, best_value in held_fixed.items():
        column = 'param_' + key
        if isinstance(best_value, tuple):
            # Tuple-valued cells are parsed from text before comparing.
            grid[column] = grid[column].apply(ast.literal_eval)
        grid = grid.loc[grid[column] == best_value]

    curve = grid[[swept_column,
                  'mean_test_score',
                  'std_test_score',
                  'mean_train_score',
                  'std_train_score']].sort_values(by=swept_column)
    param_values = curve[swept_column]
    bands = (
        (curve['mean_train_score'], curve['std_train_score'], "Train"),
        (curve['mean_test_score'], curve['std_test_score'], "Test"),
    )
    plt.grid()

    for centre, spread, _ in bands:
        plt.fill_between(param_values, centre - spread, centre + spread, alpha=0.2)
    for centre, _, label in bands:
        plt.plot(param_values, centre, 'o-', linewidth=1, markersize=4,
                 label=label)

    plt.legend(loc="best")
    return plt

def make_complexity_curve(clf_name, dataset, param):
    """Build the complexity curve for ``param`` and save it as a PNG.

    The figure is written to ``./output/images/{clf_name}_{dataset}_CC_{param}.png``
    and then closed. Returns None.
    """
    print('Making complexity curve for', dataset)
    chart_title = 'Complexity Curve: {} - {} - {}'.format(clf_name, dataset, param)
    image_path = './output/images/{}_{}_CC_{}.png'.format(clf_name, dataset, param)

    chart = plot_complexity_curve(chart_title, param, clf_name, dataset, ylim=None)
    chart.savefig(image_path, format='png', dpi=150)
    chart.close()
    return

def plot_iteration_learning_curve(title, param, classifier, dataset, ylim=None):
    """Plot train/test accuracy against a swept parameter from the saved ILC table.

    Reads ``./output/{classifier}_{dataset}_ILC.csv`` and plots its ``train acc``
    and ``test acc`` columns against ``param_{classifier}__{param}``.

    Returns the ``matplotlib.pyplot`` module so the caller can save/close the figure.
    """
    plt.figure()
    plt.title(title)

    if ylim is not None:
        plt.ylim(*ylim)

    plt.xlabel(param)
    plt.ylabel("Accuracy")

    history = pd.read_csv('./output/{}_{}_ILC.csv'.format(classifier, dataset))
    param_values = history['param_{}__{}'.format(classifier, param)]
    train_scores, test_scores = history['train acc'], history['test acc']

    plt.grid()
    for scores, label in ((train_scores, "Train"), (test_scores, "Test")):
        plt.plot(param_values, scores,
                 'o-', linewidth=1, markersize=4,
                 label=label)

    plt.legend(loc="best")
    return plt

def make_iteration_learning_curve(clf_name, dataset, param):
    """Build the iteration learning curve for ``param`` and save it as a PNG.

    The figure is written to ``./output/images/{clf_name}_{dataset}_ILC.png``
    and then closed. Returns None.
    """
    print('Making iteration learning curve for', dataset)
    chart_title = 'Iteration Learning Curve: {} - {}'.format(clf_name, dataset)
    image_path = './output/images/{}_{}_ILC.png'.format(clf_name, dataset)

    chart = plot_iteration_learning_curve(chart_title, param, clf_name, dataset, ylim=None)
    chart.savefig(image_path, format='png', dpi=150)
    chart.close()
    return

# Modified decision tree classifier that performs post-pruning
class dtclf_pruned(dtclf):
    """Decision tree classifier that post-prunes itself against a held-out split.

    ``fit`` holds out a stratified 20% of the data it is given, grows the tree on
    the remaining 80%, and then calls ``prune``. ``prune`` collapses an internal
    node whenever doing so keeps the validation score at or above
    ``(1 - alpha)`` times the best score seen so far.

    Constructor parameters are forwarded to the parent class, plus:

    alpha : pruning tolerance. Values ``<= -1`` disable pruning altogether.
    """

    def __init__(self,
                 criterion="gini",
                 splitter="best",
                 max_depth=None,
                 min_samples_split=2,
                 min_samples_leaf=1,
                 min_weight_fraction_leaf=0.,
                 max_features=None,
                 random_state=None,
                 max_leaf_nodes=None,
                 min_impurity_decrease=1e-7,
                 class_weight=None,
                 presort=False,
                 alpha = 0):
        super(dtclf_pruned, self).__init__(
            criterion=criterion,
            splitter=splitter,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            min_weight_fraction_leaf=min_weight_fraction_leaf,
            max_features=max_features,
            max_leaf_nodes=max_leaf_nodes,
            class_weight=class_weight,
            random_state=random_state,
            min_impurity_decrease=min_impurity_decrease,
            presort=presort)
        self.alpha = alpha

    def remove_subtree(self,root):
        '''Clean up: mark ``root`` and every node reachable from it as childless.'''
        tree = self.tree_
        visited,stack= set(),[root]
        while stack:
            v = stack.pop()
            visited.add(v)
            left =tree.children_left[v]
            right=tree.children_right[v]
            if left >=0:
                stack.append(left)
            if right >=0:
                stack.append(right)
        for node in visited:
            tree.children_left[node] = -1
            tree.children_right[node] = -1
        return

    def prune(self):
        '''Collapse internal nodes that the validation split does not need.

        Reads ``self.valX`` / ``self.valY`` (set by ``fit``) and edits
        ``self.tree_`` in place. Returns ``self``.
        '''
        C = 1-self.alpha
        if self.alpha <= -1: # Early exit
            return self
        tree = self.tree_
        bestScore = self.score(self.valX,self.valY)
        candidates = np.flatnonzero(tree.children_left>=0)
        for candidate in reversed(candidates): # Go backwards/leaves up
            if tree.children_left[candidate]==tree.children_right[candidate]: # leaf node. Ignore
                continue
            left = tree.children_left[candidate]
            right = tree.children_right[candidate]
            # Tentatively turn the candidate into a leaf and re-score.
            tree.children_left[candidate]=tree.children_right[candidate]=-1
            score = self.score(self.valX,self.valY)
            if score >= C*bestScore:
                bestScore = score
                # The candidate's own links are already -1, so walking from the
                # candidate would find nothing. Clear the two detached subtrees via
                # the saved child ids instead; otherwise their internal nodes keep
                # their links and are still counted by numNodes().
                self.remove_subtree(left)
                self.remove_subtree(right)
            else:
                # Pruning hurt too much: restore the original links.
                tree.children_left[candidate]=left
                tree.children_right[candidate]=right
        assert (self.tree_.children_left>=0).sum() == (self.tree_.children_right>=0).sum()
        return self

    def fit(self,X,Y,sample_weight=None,check_input=True, X_idx_sorted=None):
        '''Grow the tree on a stratified 80% of (X, Y) and prune it on the other 20%.

        ``sample_weight`` defaults to all ones. The split, the held-out part and the
        matching weights are stored on the instance as ``trgX``/``trgY``/``trgWts``
        and ``valX``/``valY``/``valWts``. Returns ``self``.
        '''
        if sample_weight is None:
            sample_weight = np.ones(X.shape[0])
        self.trgX = X.copy()
        self.trgY = Y.copy()
        self.trgWts = sample_weight.copy()
        sss = ms.StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=123)
        # n_splits=1, so this loop body runs once.
        for train_index, test_index in sss.split(self.trgX,self.trgY):
            self.valX = self.trgX[test_index]
            self.valY = self.trgY[test_index]
            self.trgX = self.trgX[train_index]
            self.trgY = self.trgY[train_index]
            self.valWts = sample_weight[test_index]
            self.trgWts = sample_weight[train_index]
        super().fit(self.trgX,self.trgY,self.trgWts,check_input,X_idx_sorted)
        self.prune()
        return self

    def numNodes(self):
        '''Number of nodes that still have a left child, i.e. internal nodes.'''
        assert (self.tree_.children_left>=0).sum() == (self.tree_.children_right>=0).sum()
        return  (self.tree_.children_left>=0).sum()

In [2]:
# Data load and preprocessing

# Mushroom dataset: the file has no header row, so the 23 column names are set explicitly
mushroom = pd.read_csv('mushroom.txt', header=None)
print('Dataset shape: ' + str(mushroom.shape))
mushroom.columns = ['type', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor', 'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape', 'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number', 'ring-type', 'spore-print-color','population', 'habitat']
# Categorical copy of the target, used for the class summary and for the integer codes
mushroom['type_label'] = mushroom['type'].astype('category')
print('Mushroom types: ', str(mushroom['type_label'].cat.categories))
print('Labels balance: \n', mushroom['type_label'].value_counts()/mushroom['type_label'].size)
# The target is replaced by its category code: with the categories printed above
# ('e', 'p') that is 0 for 'e' and 1 for 'p', i.e. code 1 is for poisonous mushrooms

mushroom['type'] = mushroom['type_label'].cat.codes

# There are no null values; missing data is written as '?' and only for stalk-root
print('Missing values for stalk root: '+ str(sum(mushroom['stalk-root']=='?')))

# We remove the stalk-root feature because of its high number of missing values,
# drop the helper label column, and dummy-encode what is left
mushroom = pd.get_dummies(mushroom.drop(columns=['type_label','stalk-root']))
# Features as float, target back to int
mushroom = mushroom.astype(float)
mushroom.type = mushroom.type.astype(int)

Dataset shape: (8124, 23)
Mushroom types:  Index(['e', 'p'], dtype='object')
Labels balance: 
 e    0.517971
p    0.482029
Name: type_label, dtype: float64
Missing values for stalk root: 2480


In [3]:
# Wine dataset (semicolon-separated file)

wine = pd.read_csv('winequality-white.csv', sep = ';')
print('Dataset shape: ', str(wine.shape))

# There are no null values in the dataset

# Binary target: a wine is 'good' when its quality is greater than or equal to 7, else 'bad'
wine['quality_label'] = 'bad'
wine.loc[wine['quality']>=7, 'quality_label'] = 'good'
wine['quality_label'] = wine['quality_label'].astype('category')

print('Quality values: ', str(wine['quality_label'].cat.categories))
print('Labels balance: \n', wine['quality_label'].value_counts()/wine['quality_label'].size)

# Keep the original score in a helper column, then overwrite 'quality' with the category
# code (with the categories printed above: 'bad' -> 0, 'good' -> 1)
wine['quality_int'] = wine['quality']
wine['quality'] = wine['quality_label'].cat.codes

# Both helper columns are dropped again; features as float, target back to int
wine = wine.drop(['quality_label', 'quality_int'], axis=1)
wine = wine.astype(float)
wine.quality = wine.quality.astype(int)

Dataset shape:  (4898, 12)
Quality values:  Index(['bad', 'good'], dtype='object')
Labels balance: 
 bad     0.783585
good    0.216415
Name: quality_label, dtype: float64


In [4]:
# Split each frame into a feature matrix (X) and a target vector (Y) as NumPy arrays
mushroomX = mushroom.drop('type', axis=1).values
mushroomY = mushroom['type'].values

wineX = wine.drop('quality', axis=1).values
wineY = wine['quality'].values

# Stratified 70/30 train/test split with a fixed seed for both datasets
mushroom_trgX, mushroom_tstX, mushroom_trgY, mushroom_tstY = ms.train_test_split(mushroomX, mushroomY, test_size=0.3, random_state=0, stratify=mushroomY)     
wine_trgX, wine_tstX, wine_trgY, wine_tstY = ms.train_test_split(wineX, wineY, test_size=0.3, random_state=0, stratify=wineY)

In [73]:
# Decision trees

def DTpruningVSnodes(clf, alphas, trgX, trgY, dataset):
    '''Dump table of pruning alpha vs. # of internal nodes.

    For each alpha the pipeline's ``DT__alpha`` is set, the pipeline is refitted on
    (trgX, trgY), and ``numNodes()`` of its last step is recorded. The table is
    written to ``./output/DT_{dataset}_nodecounts.csv``. Returns None.
    '''
    print('Dumping table of pruning vs nodes', dataset)
    node_counts = {}
    for alpha in alphas:
        clf.set_params(DT__alpha=alpha)
        clf.fit(trgX, trgY)
        final_step = clf.steps[-1][-1]  # estimator object of the last (name, estimator) pair
        node_counts[alpha] = final_step.numNodes()
    table = pd.Series(node_counts, name='Number of Internal Nodes')
    table.index.name = 'alpha'
    table.to_csv('./output/DT_{}_nodecounts.csv'.format(dataset))
    return

# Search for good alphas: ten pruning tolerances, roughly -0.006 to 0.003 in steps of 0.001
alphas = [-0.007 + 0.001*(i+1) for i in range(10)]

# Mushroom pipeline: the pruned tree on its own
pipeM = Pipeline([('DT', dtclf_pruned(random_state=55))])

# Wine pipeline: features are standardised before the pruned tree
pipeW = Pipeline([('Scale', StandardScaler()),                 
                 ('DT', dtclf_pruned(random_state=55))])

# Grid shared by both datasets: split criterion x pruning alpha
params = {'DT__criterion':['gini', 'entropy'], 
          'DT__alpha': alphas}

# Grid search + learning curve; writes the DT_<dataset>_* files under ./output
mushroom_clf = basicResults(pipeM, mushroom_trgX, mushroom_trgY, mushroom_tstX, mushroom_tstY, 
                            params,'DT','mushroom')        

wine_clf = basicResults(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY, 
                        params, 'DT', 'wine')

mushroom_final_params = mushroom_clf.best_params_
wine_final_params = wine_clf.best_params_
# Complexity curves are built from the CSV files saved by basicResults above
make_complexity_curve('DT', 'mushroom', 'alpha')
make_complexity_curve('DT', 'wine', 'alpha')

# Apply the best parameters to the pipeline objects, then time them on the full datasets
pipeM.set_params(**mushroom_final_params)
make_timing_curve(mushroomX, mushroomY, pipeM, 'DT', 'mushroom')

pipeW.set_params(**wine_final_params)
make_timing_curve(wineX, wineY, pipeW, 'DT', 'wine')

# Node counts per alpha; the other parameters stay at the best values set above
DTpruningVSnodes(pipeM, alphas, mushroom_trgX, mushroom_trgY, 'mushroom')
DTpruningVSnodes(pipeW, alphas, wine_trgX, wine_trgY, 'wine')

Calculating basic results for mushroom
Calculating basic results for wine
Making complexity curve for mushroom
Making complexity curve for wine
Making timing curve for mushroom
Making timing curve for wine
Dumping table of pruning vs nodes mushroom
Dumping table of pruning vs nodes wine


In [74]:
# boosting
from sklearn.ensemble import AdaBoostClassifier

# Same ten pruning tolerances as in the decision-tree cell
alphas = [-0.007 + 0.001*(i+1) for i in range(10)]

# Base learners: class-balanced pruned trees, gini for mushroom and entropy for wine
mushroom_base = dtclf_pruned(criterion='gini', class_weight='balanced', random_state=55)                
wine_base = dtclf_pruned(criterion='entropy', class_weight='balanced', random_state=55)

# Grids: number of boosting rounds x pruning alpha of the base tree (identical for both datasets)
paramsM= {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
          'Boost__base_estimator__alpha': alphas}

paramsW = {'Boost__n_estimators': [1, 2, 5, 10, 20, 30, 45, 60, 80, 100],
           'Boost__base_estimator__alpha': alphas}
                                   
         
mushroom_booster = AdaBoostClassifier(algorithm='SAMME', 
                                      learning_rate=1, 
                                      base_estimator=mushroom_base, 
                                      random_state=55)

wine_booster = AdaBoostClassifier(algorithm='SAMME', 
                                  learning_rate=1, 
                                  base_estimator=wine_base, 
                                  random_state=55)

# Note: pipeM / pipeW are rebound here and replace the decision-tree pipelines
pipeM = Pipeline([('Boost', mushroom_booster)])

# Wine features are standardised before boosting
pipeW = Pipeline([('Scale', StandardScaler()),                
                 ('Boost', wine_booster)])

# Grid search + learning curve; writes the Boost_<dataset>_* files under ./output
mushroom_clf = basicResults(pipeM, mushroom_trgX, mushroom_trgY, mushroom_tstX, mushroom_tstY, 
                            paramsM, 'Boost', 'mushroom') 

wine_clf = basicResults(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY, 
                        paramsW, 'Boost', 'wine')        

mushroom_final_params = mushroom_clf.best_params_
wine_final_params = wine_clf.best_params_

# One complexity curve per searched parameter, built from the saved CSV files
make_complexity_curve('Boost', 'mushroom', 'n_estimators')
make_complexity_curve('Boost', 'wine', 'n_estimators')
make_complexity_curve('Boost', 'mushroom', 'base_estimator__alpha')
make_complexity_curve('Boost', 'wine', 'base_estimator__alpha')

# Apply the best parameters, then time fit/predict on the full datasets
pipeM.set_params(**mushroom_final_params)
pipeW.set_params(**wine_final_params)
make_timing_curve(mushroomX, mushroomY, pipeM, 'Boost', 'mushroom')
make_timing_curve(wineX, wineY, pipeW, 'Boost', 'wine')

Calculating basic results for mushroom
Calculating basic results for wine
Making complexity curve for mushroom
Making complexity curve for wine
Making complexity curve for mushroom
Making complexity curve for wine
Making timing curve for mushroom
Making timing curve for wine


In [83]:
#Neural networks
from sklearn.neural_network import MLPClassifier

# Mushroom pipeline: MLP with early stopping, no scaling step
pipeM = Pipeline([('MLP', MLPClassifier(max_iter=2000, 
                                        early_stopping=True, 
                                        random_state=55))])

# Wine pipeline: features are standardised before the MLP
pipeW = Pipeline([('Scale', StandardScaler()),
                 ('MLP', MLPClassifier(max_iter=2000,
                                       early_stopping=True, 
                                       random_state=55))])

# Wine hidden layouts: 1 or 2 layers of width d, d//2 or 2*d, with d = number of wine features
d = wineX.shape[1]
hiddens_wine = [(h,)*l for l in [1,2] for h in [d,d//2,d*2]]
# L2 penalty grid: 10**1 down to 10**-5 in half-decade steps
alphas = [10**-x for x in np.arange(-1,5.01,1/2)]
# NOTE(review): alphasM is not referenced again in this cell - both grids below use
# `alphas`; confirm whether the mushroom grid was meant to use it
alphasM = [10**-x for x in np.arange(-1,9.01,1/2)]

# Mushroom hidden layouts: same scheme, but d is the feature count divided by 16
d = mushroomX.shape[1]
d = d//(2**4)
hiddens_mushroom = [(h,)*l for l in [1,2] for h in [d,d//2,d*2]]

params_wine = {'MLP__activation': ['relu', 'logistic'], 
               'MLP__alpha': alphas, 
               'MLP__hidden_layer_sizes': hiddens_wine}

params_mushroom = {'MLP__activation': ['relu', 'logistic'], 
                   'MLP__alpha': alphas, 
                   'MLP__hidden_layer_sizes': hiddens_mushroom}

# Grid search + learning curve; writes the MLP_<dataset>_* files under ./output
mushroom_clf = basicResults(pipeM, mushroom_trgX, mushroom_trgY, mushroom_tstX, mushroom_tstY, 
                            params_mushroom, 'MLP', 'mushroom')  
wine_clf = basicResults(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY, 
                        params_wine, 'MLP', 'wine')        
# print(wine_clf.best_estimator_.named_steps['MLP'].n_iter_)

# Complexity curves are built from the CSV files saved by basicResults above
make_complexity_curve('MLP', 'mushroom', 'alpha')
make_complexity_curve('MLP', 'wine', 'alpha')

mushroom_final_params = mushroom_clf.best_params_
wine_final_params = wine_clf.best_params_

# Apply the best parameters, then time fit/predict on the full datasets
pipeM.set_params(**mushroom_final_params)
make_timing_curve(mushroomX, mushroomY, pipeM, 'MLP', 'mushroom')

pipeW.set_params(**wine_final_params)
make_timing_curve(wineX, wineY, pipeW, 'MLP', 'wine')

# Iteration curves: early stopping is switched off and max_iter is swept
# over 1, 2, 4, ..., 2048 and then 2100 ... 3000
pipeM.set_params(**{'MLP__early_stopping': False})               
iterationLC(pipeM, mushroom_trgX, mushroom_trgY, mushroom_tstX, mushroom_tstY, 
            {'MLP__max_iter': [2**x for x in range(12)] + [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]}, 
            'MLP', 'mushroom')
make_iteration_learning_curve('MLP', 'mushroom', 'max_iter')

pipeW.set_params(**{'MLP__early_stopping': False})    
iterationLC(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY, 
            {'MLP__max_iter': [2**x for x in range(12)] + [2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000]}, 
            'MLP', 'wine')
make_iteration_learning_curve('MLP', 'wine', 'max_iter')

Calculating basic results for mushroom
Calculating basic results for wine
Making complexity curve for mushroom
Making complexity curve for wine
Making timing curve for mushroom
Making timing curve for wine




1




2
4




8




16
32
64
128
256
512
1024
2048
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
Making iteration learning curve for mushroom
1




2
4
8




16




32




64




128
256
512
1024
2048
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
Making iteration learning curve for wine


In [43]:
# SVM
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.metrics import euclidean_distances
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.linear_model import SGDClassifier

class primalSVM_RBF(BaseEstimator, ClassifierMixin):
    '''Hinge-loss SGD classifier trained on an explicit RBF kernel matrix.

    Estimator layout follows
    http://scikit-learn.org/stable/developers/contributing.html

    alpha      -- regularisation strength passed to SGDClassifier.
    gamma_frac -- multiplier applied to the median squared pairwise distance
                  before it is inverted to give the RBF gamma.
    max_iter   -- iteration limit passed to SGDClassifier.
    '''

    def __init__(self, alpha=1e-9, gamma_frac=0.1, max_iter=2000):
        self.alpha = alpha
        self.gamma_frac = gamma_frac
        self.max_iter = max_iter

    def fit(self, X, y):
        '''Compute the training kernel matrix and fit the SGD classifier on it.'''
        # Check that X and y have correct shape
        X, y = check_X_y(X, y)

        # gamma = 1 / (median squared distance * gamma_frac); the distance matrix
        # is released as soon as its median is known
        squared_distances = euclidean_distances(X, squared=True)
        bandwidth = np.median(squared_distances)
        del squared_distances
        bandwidth *= self.gamma_frac
        self.gamma = 1/bandwidth

        # Kernel of the training set against itself
        self.X_ = X
        self.classes_ = unique_labels(y)
        self.kernels_ = rbf_kernel(X, None, self.gamma)
        self.y_ = y

        self.clf = SGDClassifier(loss='hinge', penalty='l2', alpha=self.alpha,
                                 l1_ratio=0, fit_intercept=True, verbose=False,
                                 average=False, learning_rate='optimal',
                                 class_weight='balanced', max_iter=self.max_iter,
                                 random_state=55)
        self.clf.fit(self.kernels_, self.y_)

        # Return the classifier
        return self

    def predict(self, X):
        '''Predict labels from the kernel between ``X`` and the stored training set.'''
        # Check that fit has been called
        check_is_fitted(self, ['X_', 'y_','clf','kernels_'])
        # Input validation
        X = check_array(X)
        kernel_to_training = rbf_kernel(X, self.X_, self.gamma)
        return self.clf.predict(kernel_to_training)

# Training-set sizes, used below to derive max_iter as int(1e6 / N)
N_wine = wine_trgX.shape[0]
N_mushroom = mushroom_trgX.shape[0]
# Regularisation grid: 10**-1 down to 10**-9 in half-decade steps
alphas = [10**-x for x in np.arange(1, 9.01, 1/2)]

#Linear SVM: hinge-loss SGD with an L2 penalty and balanced class weights
svm_mushroom = SGDClassifier(loss='hinge',
                             l1_ratio=0,
                             penalty='l2',
                             class_weight='balanced',
                             random_state=55,
                             tol=None,
                             max_iter=1000)

svm_wine = SGDClassifier(loss='hinge',
                         l1_ratio=0,
                         penalty='l2',
                         class_weight='balanced',
                         random_state=55,
                         tol=None,
                         max_iter=1000)

pipeM = Pipeline([('SVM_Lin', svm_mushroom)])

# Wine features are standardised before the classifier
pipeW = Pipeline([('Scale', StandardScaler()),                
                ('SVM_Lin', svm_wine)])

# Only alpha is searched; max_iter is a single-value grid entry
params_mushroom = {'SVM_Lin__alpha': alphas, 
                   'SVM_Lin__max_iter': [int(1e6/N_mushroom)]}

params_wine = {'SVM_Lin__alpha': alphas, 
               'SVM_Lin__max_iter': [int(1e6/N_wine)]}
                                                 
# Grid search + learning curve; writes the SVM_Lin_<dataset>_* files under ./output
mushroom_clf = basicResults(pipeM, mushroom_trgX, mushroom_trgY, mushroom_tstX, mushroom_tstY, 
                            params_mushroom, 'SVM_Lin', 'mushroom') 
wine_clf = basicResults(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY, 
                        params_wine, 'SVM_Lin', 'wine')

make_complexity_curve('SVM_Lin', 'mushroom', 'alpha')
make_complexity_curve('SVM_Lin', 'wine', 'alpha')

mushroom_final_params = mushroom_clf.best_params_
wine_final_params = wine_clf.best_params_

# Apply the best parameters, then time fit/predict on the full datasets
pipeM.set_params(**mushroom_final_params)
make_timing_curve(mushroomX, mushroomY, pipeM, 'SVM_Lin', 'mushroom')

pipeW.set_params(**wine_final_params)
make_timing_curve(wineX, wineY, pipeW, 'SVM_Lin', 'wine')

# Iteration curves: best parameters are re-applied, then max_iter is swept
# (30..240 for mushroom, 50..400 for wine)
pipeM.set_params(**mushroom_final_params)
iterationLC(pipeM, mushroom_trgX, mushroom_trgY, mushroom_tstX, mushroom_tstY, 
            {'SVM_Lin__max_iter': [30*(x+1) for x in range(8)]}, 
            'SVM_Lin', 'mushroom')  
make_iteration_learning_curve('SVM_Lin', 'mushroom', 'max_iter')

pipeW.set_params(**wine_final_params)
iterationLC(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY, 
            {'SVM_Lin__max_iter': [50*(x+1) for x in range(8)]}, 
            'SVM_Lin', 'wine')
make_iteration_learning_curve('SVM_Lin', 'wine', 'max_iter')

#RBF SVM: same workflow with the kernelised estimator defined above

# gamma_frac grids differ per dataset
gamma_fracsM = np.arange(0.05, 1.01, 0.1)
gamma_fracsW = np.arange(0.2, 2.1, 0.2)

# Note: pipeM / pipeW are rebound here and replace the linear-SVM pipelines
pipeM = Pipeline([('SVM_RBF', primalSVM_RBF())])

pipeW = Pipeline([('Scale', StandardScaler()),
                 ('SVM_RBF', primalSVM_RBF())])

# alpha and gamma_frac are searched; max_iter is again a single-value grid entry
params_mushroom = {'SVM_RBF__alpha': alphas, 
                   'SVM_RBF__max_iter': [int(1e6/N_mushroom)],
                   'SVM_RBF__gamma_frac': gamma_fracsM}

params_wine = {'SVM_RBF__alpha': alphas, 
               'SVM_RBF__max_iter': [int(1e6/N_wine)],
               'SVM_RBF__gamma_frac': gamma_fracsW}
                                            
# Grid search + learning curve; writes the SVM_RBF_<dataset>_* files under ./output
mushroom_clf = basicResults(pipeM, mushroom_trgX, mushroom_trgY, mushroom_tstX, mushroom_tstY, 
                            params_mushroom, 'SVM_RBF', 'mushroom')
wine_clf = basicResults(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY, 
                        params_wine, 'SVM_RBF', 'wine')        

mushroom_final_params = mushroom_clf.best_params_
wine_final_params = wine_clf.best_params_

make_complexity_curve('SVM_RBF', 'mushroom', 'alpha')
make_complexity_curve('SVM_RBF', 'wine', 'alpha')

# Apply the best parameters, then time fit/predict on the full datasets
pipeM.set_params(**mushroom_final_params)                     
make_timing_curve(mushroomX, mushroomY, pipeM, 'SVM_RBF', 'mushroom')

pipeW.set_params(**wine_final_params)
make_timing_curve(wineX, wineY, pipeW, 'SVM_RBF', 'wine')

# Iteration curves for the RBF models, same max_iter sweeps as for the linear ones
pipeM.set_params(**mushroom_final_params)
iterationLC(pipeM, mushroom_trgX, mushroom_trgY, mushroom_tstX, mushroom_tstY, 
            {'SVM_RBF__max_iter': [30*(x+1) for x in range(8)]}, 
            'SVM_RBF', 'mushroom')
make_iteration_learning_curve('SVM_RBF', 'mushroom', 'max_iter')

pipeW.set_params(**wine_final_params)
iterationLC(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY, 
            {'SVM_RBF__max_iter': [50*(x+1) for x in range(8)]},
            'SVM_RBF', 'wine')
make_iteration_learning_curve('SVM_RBF', 'wine', 'max_iter')

Calculating basic results for mushroom
Calculating basic results for wine
Making complexity curve for mushroom
Making complexity curve for wine
Making timing curve for mushroom
Making timing curve for wine
Calculating iteration learning curve for  mushroom
Making iteration learning curve for mushroom
Calculating iteration learning curve for  wine
Making iteration learning curve for wine
Calculating basic results for mushroom
Calculating basic results for wine
Making complexity curve for mushroom
Making complexity curve for wine
Making timing curve for mushroom
Making timing curve for wine
Calculating iteration learning curve for  mushroom
Making iteration learning curve for mushroom
Calculating iteration learning curve for  wine
Making iteration learning curve for wine


In [37]:
# knn
from sklearn.neighbors import KNeighborsClassifier as knnC

# Mushroom pipeline: k-nearest neighbours on the raw features
pipeM = Pipeline([('KNN', knnC())])  

# Wine pipeline: features are standardised before the neighbour search
pipeW = Pipeline([('Scale', StandardScaler()),                
                 ('KNN', knnC())])  

# Identical grids for both datasets: distance metric x k (1, 4, ..., 49) x vote weighting
params_mushroom= {'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'], 
                  'KNN__n_neighbors': np.arange(1,51,3), 
                  'KNN__weights': ['uniform', 'distance']}

params_wine= {'KNN__metric': ['manhattan', 'euclidean', 'chebyshev'], 
              'KNN__n_neighbors': np.arange(1,51,3), 
              'KNN__weights': ['uniform', 'distance']}

# Grid search + learning curve; writes the KNN_<dataset>_* files under ./output
mushroom_clf = basicResults(pipeM, mushroom_trgX, mushroom_trgY, mushroom_tstX, mushroom_tstY, 
                            params_mushroom, 'KNN', 'mushroom')        
wine_clf = basicResults(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY, 
                        params_wine, 'KNN', 'wine')        

# Complexity curves are built from the CSV files saved by basicResults above
make_complexity_curve('KNN', 'mushroom', 'n_neighbors')
make_complexity_curve('KNN', 'wine', 'n_neighbors')

mushroom_final_params = mushroom_clf.best_params_
wine_final_params = wine_clf.best_params_

# Apply the best parameters, then time fit/predict on the full datasets
pipeM.set_params(**mushroom_final_params)
make_timing_curve(mushroomX, mushroomY, pipeM, 'KNN', 'mushroom')
pipeW.set_params(**wine_final_params)
make_timing_curve(wineX, wineY, pipeW, 'KNN', 'wine')

Calculating basic results for mushroom


KeyboardInterrupt: 