In [1]:
import ast
from time import time
from collections import defaultdict
from math import ceil

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.utils import compute_sample_weight
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
import matplotlib.pyplot as plt
%matplotlib inline
import mlrose

In [59]:
# helpers
def balanced_accuracy(truth, pred):
    """Accuracy score in which each sample carries sklearn's 'balanced' weight.

    Parameters
    ----------
    truth : array-like
        True labels; the sample weights are derived from these.
    pred : array-like
        Predicted labels, aligned with ``truth``.

    Returns
    -------
    The value of ``accuracy_score`` computed with those sample weights.
    """
    balanced_weights = compute_sample_weight('balanced', truth)
    weighted_score = accuracy_score(truth, pred, sample_weight=balanced_weights)
    return weighted_score


# Scorer object wrapping the metric above; the helpers below pass it as `scoring=`.
scorer = make_scorer(balanced_accuracy)

# Plots learning curve
def plot_learning_curve(title,
                        train_sizes,
                        train_scores, test_scores, ylim=None,
                        cv=None,
                        n_jobs=None):
    """Draw train/test learning curves as a mean line with a +/- one std band.

    Parameters
    ----------
    title : str
        Figure title.
    train_sizes : sequence
        X coordinates shared by both curves.
    train_scores, test_scores : 2-D array-like
        Score tables reduced along ``axis=1`` with ``np.mean`` / ``np.std``
        (presumably one row per training size and one column per CV fold —
        confirm against the caller).
    ylim : tuple, optional
        Unpacked into ``plt.ylim`` when provided.
    cv, n_jobs :
        Accepted but not used inside this function.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure current so the
    caller can save and close it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    # (score table, colour, legend label) for each curve, in drawing order.
    curves = (
        (train_scores, "r", "Training score"),
        (test_scores, "g", "Test score"),
    )
    summaries = [(np.mean(table, axis=1), np.std(table, axis=1))
                 for table, _colour, _label in curves]
    plt.grid()

    # Shaded bands first ...
    for (centre, spread), (_table, colour, _label) in zip(summaries, curves):
        plt.fill_between(train_sizes, centre - spread, centre + spread,
                         alpha=0.1, color=colour)
    # ... then the mean lines, which are the only artists that carry a legend label.
    for (centre, _spread), (_table, colour, label) in zip(summaries, curves):
        plt.plot(train_sizes, centre, 'o-', color=colour, label=label)

    plt.legend(loc="best")

    return plt

# Grid search + learning curve
def _schedule_to_str(schedule):
    """Return a short label for a decay schedule, derived from its ``str()``.

    Gives 'geom_decay', 'exp_decay' or 'arith_decay' when the text contains
    'Geom', 'Exp' or 'Arith' respectively (checked in that order); otherwise
    returns None.
    """
    text = str(schedule)
    for marker, label in (('Geom', 'geom_decay'),
                          ('Exp', 'exp_decay'),
                          ('Arith', 'arith_decay')):
        if marker in text:
            return label
    return None


def _to_builtin(value):
    """Turn a numpy scalar into the equivalent Python scalar; pass anything else through."""
    return value.item() if isinstance(value, np.generic) else value


def basicResults(clfObj, trgX, trgY, tstX, tstY, params, clf_type=None, dataset=None):
    """Grid-search ``clfObj`` and persist the search table, best-model summary and learning curve.

    Files written under ``./output`` (directories are created if missing):
      * ``{clf_type}_{dataset}_reg.csv``       - full ``cv_results_`` table
      * ``{clf_type}_{dataset}_best.csv``      - one-row summary of the best configuration
      * ``{clf_type}_{dataset}_LC_train.csv``  - learning-curve train scores
      * ``{clf_type}_{dataset}_LC_test.csv``   - learning-curve test scores
      * ``images/{clf_type}_{dataset}_LC.png`` - learning-curve plot

    Parameters
    ----------
    clfObj : estimator
        Estimator (here a Pipeline) handed to ``GridSearchCV``.
    trgX, trgY : array-like
        Training data. ``trgX[0].reshape(1, -1)`` is used for the prediction
        timing, so ``trgX`` must support numpy-style indexing.
    tstX, tstY : array-like
        Held-out data, used only for the reported test score.
    params : dict
        Parameter grid for ``GridSearchCV``.
    clf_type, dataset : str
        Labels used in log messages and output file names; both are required.

    Returns
    -------
    GridSearchCV
        The fitted search object.

    Raises
    ------
    ValueError
        If ``clf_type`` or ``dataset`` is missing.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    if clf_type is None or dataset is None:
        # A bare `raise` here would only produce "No active exception to reraise".
        raise ValueError('basicResults requires both clf_type and dataset')

    np.random.seed(55)
    print('Calculating basic results for', dataset)

    # Create the output folders up front so a missing directory cannot
    # discard the results of a long grid search.
    os.makedirs('./output/images', exist_ok=True)

    # Saves results of hyperparameters grid search
    cv = GridSearchCV(clfObj,
                      n_jobs=-1,
                      param_grid=params,
                      verbose=False,
                      cv=5,
                      scoring=scorer,
                      return_train_score=True)
    cv.fit(trgX, trgY)

    # Time one more fit of the winning configuration on the full training set ...
    st = time()
    cv.best_estimator_.fit(trgX, trgY)
    train_time = time() - st
    # ... and a single-sample prediction.
    st = time()
    cv.predict(trgX[0].reshape(1, -1))
    predict_time = time() - st

    # numpy scalars are converted to builtins so the stringified dict in
    # *_best.csv can always be read back with ast.literal_eval.
    best_params = {name: _to_builtin(value) for name, value in cv.best_params_.items()}
    regTable = pd.DataFrame(cv.cv_results_)

    if 'NN_sa__schedule' in params:
        # Schedule objects are replaced by short labels so they survive the CSV round trip.
        regTable['param_NN_sa__schedule'] = regTable['param_NN_sa__schedule'].apply(_schedule_to_str)
        best_params['NN_sa__schedule'] = _schedule_to_str(best_params['NN_sa__schedule'])

    regTable.to_csv('./output/{}_{}_reg.csv'.format(clf_type, dataset), index=False)
    train_score = cv.score(trgX, trgY)
    test_score = cv.score(tstX, tstY)

    # Saves best iteration
    results = pd.DataFrame({'classifier': [clf_type],
                            'dataset': [dataset],
                            'train_score': [train_score],
                            'test_score': [test_score],
                            'train_time': [train_time],
                            'predict_time': [predict_time],
                            'params': [best_params]})

    results.to_csv('./output/{}_{}_best.csv'.format(clf_type, dataset))

    # Saves learning curve
    N = trgY.shape[0]
    train_sizes = [50, 100] + [int(N * x / 10) for x in range(1, 8)]
    np.random.seed(55)
    sizes_used, lc_train, lc_test = learning_curve(cv.best_estimator_,
                                                   trgX, trgY,
                                                   cv=5,
                                                   train_sizes=train_sizes,
                                                   verbose=False,
                                                   scoring=scorer)

    curve_train_scores = pd.DataFrame(index=sizes_used, data=lc_train)
    curve_test_scores = pd.DataFrame(index=sizes_used, data=lc_test)

    curve_train_scores.to_csv('./output/{}_{}_LC_train.csv'.format(clf_type, dataset))
    curve_test_scores.to_csv('./output/{}_{}_LC_test.csv'.format(clf_type, dataset))

    # Plot against the sizes learning_curve actually used: it sorts and
    # de-duplicates the requested ticks, so they may differ from `train_sizes`.
    plt = plot_learning_curve('Learning Curve: {} - {}'.format(clf_type, dataset),
                              sizes_used,
                              curve_train_scores, curve_test_scores, ylim=None,
                              cv=None,
                              n_jobs=None)

    plt.savefig('./output/images/{}_{}_LC.png'.format(clf_type, dataset), format='png', dpi=150)
    plt.close()
    return cv

def iterationLC(clfObj, trgX, trgY, tstX, tstY, params, clf_type=None, dataset=None):
    """Sweep one parameter and record train/test balanced accuracy for every value.

    Only the first entry of ``params`` is swept. For each value the estimator
    is fitted once on the training data and that same fitted model is scored
    on both the training and the test set. The table is written to
    ``./output/{clf_type}_{dataset}_ILC.csv`` with the columns
    ``param_<name>``, ``train acc`` and ``test acc``.

    Parameters
    ----------
    clfObj : estimator
        Object providing ``set_params``, ``fit`` and ``predict``; it is mutated in place.
    trgX, trgY, tstX, tstY : array-like
        Training and test data.
    params : dict
        ``{parameter_name: iterable_of_values}``; must not be empty.
    clf_type, dataset : str
        Labels used in the output file name; both are required.

    Raises
    ------
    ValueError
        If ``clf_type``/``dataset`` is missing or ``params`` is empty.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    if clf_type is None or dataset is None:
        # A bare `raise` here would only produce "No active exception to reraise".
        raise ValueError('iterationLC requires both clf_type and dataset')
    if not params:
        raise ValueError('iterationLC requires a non-empty params mapping')

    print('Calculating iteration learning curve for ',dataset)
    np.random.seed(55)
    os.makedirs('./output', exist_ok=True)

    name, values = next(iter(params.items()))
    d = defaultdict(list)
    for value in values:
        d['param_{}'.format(name)].append(value)
        clfObj.set_params(**{name: value})
        # Fit once and score the same model on both sets. Refitting before the
        # test prediction would advance the global RNG and compare two
        # different models.
        clfObj.fit(trgX, trgY)
        d['train acc'].append(balanced_accuracy(trgY, clfObj.predict(trgX)))
        d['test acc'].append(balanced_accuracy(tstY, clfObj.predict(tstX)))

    d = pd.DataFrame(d)
    d.to_csv('./output/{}_{}_ILC.csv'.format(clf_type, dataset), index=False)
    return
    
# Timing curve
def plot_timing_curve(title, data_sizes, fit_times, predict_times, ylim=None):
    """Plot fit and predict times (mean line with a +/- one std band) against ``data_sizes``.

    Parameters
    ----------
    title : str
        Figure title.
    data_sizes : sequence
        X coordinates; the axis is labelled "Testing Size (% of total)".
    fit_times, predict_times : 2-D array-like
        Timing tables reduced along ``axis=1`` with ``np.mean`` / ``np.std``.
    ylim : tuple, optional
        Unpacked into ``plt.ylim`` when provided.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure current.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Testing Size (% of total)")
    plt.ylabel("Time (s)")

    fit_centre, fit_spread = np.mean(fit_times, axis=1), np.std(fit_times, axis=1)
    predict_centre, predict_spread = (np.mean(predict_times, axis=1),
                                      np.std(predict_times, axis=1))
    plt.grid()

    # Bands are drawn fit-first, lines predict-first; colours come from the
    # default cycle, so the call order is part of the figure's appearance.
    for centre, spread in ((fit_centre, fit_spread), (predict_centre, predict_spread)):
        plt.fill_between(data_sizes, centre - spread, centre + spread, alpha=0.2)

    line_style = dict(linewidth=1, markersize=4)
    for centre, label in ((predict_centre, "Predict time"), (fit_centre, "Fit time")):
        plt.plot(data_sizes, centre, 'o-', label=label, **line_style)

    plt.legend(loc="best")
    return plt

def make_timing_curve(X, Y, clf, clf_name, dataset):
    """Time ``clf.fit`` and ``clf.predict`` for test fractions 0.1 .. 0.9 and save table and plot.

    For every fraction the data is split with ``train_test_split`` (fixed
    ``random_state=42``), the classifier is fitted on the training part and
    asked to predict the test part, and both wall-clock durations are
    recorded. Results go to ``./output/{clf_name}_{dataset}_timing.csv`` and
    ``./output/images/{clf_name}_{dataset}_TC.png``.

    Parameters
    ----------
    X, Y : array-like
        Full feature matrix and labels.
    clf : estimator
        Object providing ``fit`` and ``predict``; it is refitted on every split.
    clf_name, dataset : str
        Labels used in the plot title and file names.
    """
    print('Making timing curve for', dataset)
    fracs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    timings = defaultdict(dict)

    for test_fraction in fracs:
        X_train, X_test, y_train, _y_test = train_test_split(
            X, Y, test_size=test_fraction, random_state=42)

        started = time()
        np.random.seed(55)  # seeded inside the timed window, before the fit
        clf.fit(X_train, y_train)
        timings['train'][test_fraction] = time() - started

        started = time()
        clf.predict(X_test)
        timings['test'][test_fraction] = time() - started

    timings = pd.DataFrame(timings)
    timings.to_csv('./output/{}_{}_timing.csv'.format(clf_name, dataset))

    # One-column frames so plot_timing_curve can reduce along axis=1.
    fit_table = pd.DataFrame(timings['train'], index=fracs)
    predict_table = pd.DataFrame(timings['test'], index=fracs)
    figure_api = plot_timing_curve('Timing Curve: {} - {}'.format(clf_name, dataset),
                                   np.array(fracs) * 100,
                                   fit_table, predict_table)

    figure_api.savefig('./output/images/{}_{}_TC.png'.format(clf_name, dataset),
                       format='png', dpi=150)
    figure_api.close()
    return

def plot_complexity_curve(title, param, classifier, dataset, ylim=None):
    """Plot mean train/test CV score against one hyper-parameter, the others fixed at their best value.

    Reads ``./output/{classifier}_{dataset}_best.csv`` (its ``params`` cell is
    parsed with ``ast.literal_eval``) and
    ``./output/{classifier}_{dataset}_reg.csv``, keeps the grid-search rows
    whose other parameters equal the best configuration, and plots
    ``mean_*_score`` with a +/- ``std_*_score`` band against
    ``param_{classifier}__{param}``.

    Parameters
    ----------
    title : str
        Figure title.
    param : str
        Parameter name without the ``{classifier}__`` prefix.
    classifier, dataset : str
        Labels identifying the CSV files to read.
    ylim : tuple, optional
        Unpacked into ``plt.ylim`` when provided.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure current.

    Raises
    ------
    KeyError
        If ``{classifier}__{param}`` is not among the stored best parameters.
    """
    plt.figure()
    plt.title(title)

    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel(param)
    plt.ylabel("Score")

    best = pd.read_csv('./output/{}_{}_best.csv'.format(classifier, dataset))
    best_params = ast.literal_eval(best.loc[0, 'params'])
    grid_search = pd.read_csv('./output/{}_{}_reg.csv'.format(classifier, dataset))

    swept_column = 'param_{}__{}'.format(classifier, param)
    best_params.pop('{}__{}'.format(classifier, param))

    for fixed_name, value in best_params.items():
        column = grid_search['param_' + fixed_name]
        if isinstance(value, tuple):
            # Tuples are stored as text in the CSV; parse each cell and compare
            # it as a whole, so pandas never treats the tuple as a list-like
            # to broadcast against the column.
            keep = column.apply(ast.literal_eval).apply(lambda cell: cell == value).astype(bool)
        elif isinstance(value, float):
            # Floats that went through a CSV may differ from the literal in the
            # last bits, so match with a tight relative tolerance instead of ==.
            keep = np.isclose(column.astype(float), value, rtol=1e-9, atol=0.0)
        else:
            keep = column == value
        grid_search = grid_search.loc[keep]

    df = grid_search[[swept_column,
                      'mean_test_score',
                      'std_test_score',
                      'mean_train_score',
                      'std_train_score']].sort_values(by=swept_column)
    param_values = df[swept_column]
    train_scores_mean = df['mean_train_score']
    train_scores_std = df['std_train_score']
    test_scores_mean = df['mean_test_score']
    test_scores_std = df['std_test_score']
    plt.grid()

    plt.fill_between(param_values, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2)
    plt.fill_between(param_values, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2)
    plt.plot(param_values, train_scores_mean, 'o-', linewidth=1, markersize=4,
             label="Train")
    plt.plot(param_values, test_scores_mean, 'o-', linewidth=1, markersize=4,
             label="Test")

    plt.legend(loc="best")
    return plt

def make_complexity_curve(clf_name, dataset, param):
    """Render the complexity curve for ``param`` and save it as a PNG.

    The figure is produced by ``plot_complexity_curve`` and written to
    ``./output/images/{clf_name}_{dataset}_CC_{param}.png``.
    """
    print('Making complexity curve for', dataset)

    figure_title = 'Complexity Curve: {} - {} - {}'.format(clf_name, dataset, param)
    image_path = './output/images/{}_{}_CC_{}.png'.format(clf_name, dataset, param)

    # plot_complexity_curve hands back the pyplot module with the figure current.
    figure_api = plot_complexity_curve(figure_title, param, clf_name, dataset, ylim=None)
    figure_api.savefig(image_path, format='png', dpi=150)
    figure_api.close()
    return

def plot_iteration_learning_curve(title, param, classifier, dataset, ylim=None):
    """Plot train and test accuracy against a swept parameter from a saved ILC table.

    Reads ``./output/{classifier}_{dataset}_ILC.csv`` and plots its
    ``train acc`` and ``test acc`` columns against
    ``param_{classifier}__{param}``.

    Parameters
    ----------
    title : str
        Figure title.
    param : str
        Parameter name without the ``{classifier}__`` prefix; also the x-axis label.
    classifier, dataset : str
        Labels identifying the CSV file to read.
    ylim : tuple, optional
        Unpacked into ``plt.ylim`` when provided.

    Returns
    -------
    The ``matplotlib.pyplot`` module, with the new figure current.
    """
    plt.figure()
    plt.title(title)

    if ylim is not None:
        plt.ylim(*ylim)

    plt.xlabel(param)
    plt.ylabel("Accuracy")

    table = pd.read_csv('./output/{}_{}_ILC.csv'.format(classifier, dataset))
    x_values = table['param_{}__{}'.format(classifier, param)]

    plt.grid()
    # (CSV column, legend label) for each line, in drawing order.
    for column, label in (('train acc', "Train"), ('test acc', "Test")):
        plt.plot(x_values, table[column],
                 'o-', linewidth=1, markersize=4,
                 label=label)

    plt.legend(loc="best")
    return plt

def make_iteration_learning_curve(clf_name, dataset, param):
    """Render the iteration learning curve for ``param`` and save it as a PNG.

    The figure is produced by ``plot_iteration_learning_curve`` and written to
    ``./output/images/{clf_name}_{dataset}_ILC.png``.
    """
    print('Making iteration learning curve for', dataset)

    figure_title = 'Iteration Learning Curve: {} - {}'.format(clf_name, dataset)
    image_path = './output/images/{}_{}_ILC.png'.format(clf_name, dataset)

    # plot_iteration_learning_curve hands back the pyplot module with the figure current.
    figure_api = plot_iteration_learning_curve(figure_title, param, clf_name, dataset, ylim=None)
    figure_api.savefig(image_path, format='png', dpi=150)
    figure_api.close()
    return

In [3]:
# Wine dataset

wine = pd.read_csv('winequality-white.csv', sep=';')
print('Dataset shape: ', str(wine.shape))

# There are no null values in the dataset

# Binary target: a wine is 'good' when its quality score is 7 or higher,
# otherwise 'bad'.
wine['quality_label'] = (
    (wine['quality'] >= 7)
    .map({True: 'good', False: 'bad'})
    .astype('category')
)

print('Quality values: ', str(wine['quality_label'].cat.categories))
print('Labels balance: \n', wine['quality_label'].value_counts() / wine['quality_label'].size)

# Overwrite the raw score with the category code of the label, drop the helper
# column, then store every feature as float and the target as int.
wine['quality'] = wine['quality_label'].cat.codes
wine = (
    wine
    .drop(columns=['quality_label'])
    .astype(float)
    .astype({'quality': int})
)

Dataset shape:  (4898, 12)
Quality values:  Index(['bad', 'good'], dtype='object')
Labels balance: 
 bad     0.783585
good    0.216415
Name: quality_label, dtype: float64


In [4]:
# Load Data       
# Features are every column except the target; both are taken as numpy arrays.
wineX = wine.drop(columns='quality').values
wineY = wine['quality'].values

# 70/30 split, stratified on the label, with a fixed random_state.
wine_trgX, wine_tstX, wine_trgY, wine_tstY = train_test_split(
    wineX, wineY,
    test_size=0.3,
    random_state=0,
    stratify=wineY,
)

In [17]:
# Neural Networks
class NeuralNetworkF(mlrose.NeuralNetwork):
    """``mlrose.NeuralNetwork`` with the ``get_params`` / ``set_params`` pair scikit-learn tools rely on.

    scikit-learn copies an estimator by calling
    ``type(est)(**est.get_params(deep=False))``, so anything ``get_params``
    leaves out falls back to the constructor default in every copy. To keep
    copies faithful, this class:

    * reports the parameters listed in ``_PARAM_ATTRS`` from the live attributes;
    * also remembers and reports every other keyword given to the constructor
      (e.g. ``algorithm``, ``activation``) or to ``set_params``
      (e.g. ``max_attempts``).

    NOTE(review): extra keys given to ``set_params`` are passed back to the
    constructor when the estimator is cloned, so they are assumed to be valid
    ``mlrose.NeuralNetwork`` constructor keywords — confirm for any new key.
    """

    # Public parameter name -> instance attribute that holds its value.
    # Only ``learning_rate`` differs: it is read from ``lr``.
    _PARAM_ATTRS = {
        'hidden_nodes': 'hidden_nodes',
        'max_iters': 'max_iters',
        'bias': 'bias',
        'is_classifier': 'is_classifier',
        'learning_rate': 'lr',
        'early_stopping': 'early_stopping',
        'clip_max': 'clip_max',
        'schedule': 'schedule',
        'pop_size': 'pop_size',
        'mutation_prob': 'mutation_prob',
    }

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``mlrose.NeuralNetwork`` and remember the extra keywords."""
        super().__init__(*args, **kwargs)
        # Keywords that get_params cannot read back from a mapped attribute.
        self._extra_params = {name: value for name, value in kwargs.items()
                              if name not in self._PARAM_ATTRS}

    def get_params(self, deep=True):
        """Return the estimator's parameters as a ``{name: value}`` dict.

        ``deep`` is accepted for scikit-learn compatibility (default ``True``,
        as in scikit-learn); this estimator has no nested estimators, so the
        flag does not change the result.
        """
        params = dict(getattr(self, '_extra_params', {}))
        for name, attribute in self._PARAM_ATTRS.items():
            params[name] = getattr(self, attribute)
        return params

    def set_params(self, **parameters):
        """Set parameters by name and return ``self``.

        Names in ``_PARAM_ATTRS`` are written to their mapped attribute, so
        ``learning_rate`` updates ``lr``, the attribute ``get_params`` reports.
        Any other name is set as an attribute of the same name and remembered
        so that it is reported by ``get_params`` as well.
        """
        if not hasattr(self, '_extra_params'):
            self._extra_params = {}
        for parameter, value in parameters.items():
            setattr(self, self._PARAM_ATTRS.get(parameter, parameter), value)
            if parameter not in self._PARAM_ATTRS:
                self._extra_params[parameter] = value
        return self

# Constructor keywords shared by every NeuralNetworkF built in the cells below;
# each cell adds its own `algorithm`.
nn_params = dict(
    hidden_nodes=[22],
    activation='relu',
    bias=True,
    is_classifier=True,
    early_stopping=True,
    learning_rate=0.1,
)

In [151]:
# Neural Networks random hill climb
pipeW = Pipeline([
    ('Scale', StandardScaler()),
    ('NN_rhc', NeuralNetworkF(algorithm='random_hill_climb', **nn_params)),
])

# Learning rates 10**-4 .. 10**-0.5, half a decade apart.
learning_rate = [10**exponent for exponent in np.arange(-4, 0, 1/2)]

params_wine = dict(
    NN_rhc__max_iters=[5000],
    NN_rhc__max_attempts=[50],
    NN_rhc__learning_rate=learning_rate,
)

# Grid search, best-model summary and learning curve.
wine_clf = basicResults(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                        params_wine, 'NN_rhc', 'wine')

make_complexity_curve('NN_rhc', 'wine', 'learning_rate')

# Timing curve with the best configuration found above.
wine_final_params = wine_clf.best_params_
pipeW.set_params(**wine_final_params)
make_timing_curve(wineX, wineY, pipeW, 'NN_rhc', 'wine')

# Accuracy versus iteration budget (1 .. 2**13), with early stopping switched off.
max_iters = [2**power for power in range(14)]
pipeW.set_params(NN_rhc__early_stopping=False)
iterationLC(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
            {'NN_rhc__max_iters': max_iters},
            'NN_rhc', 'wine')
make_iteration_learning_curve('NN_rhc', 'wine', 'max_iters')

Calculating basic results for wine


KeyboardInterrupt: 

In [None]:
# Neural Networks simulated annealing
pipeW = Pipeline([
    ('Scale', StandardScaler()),
    ('NN_sa', NeuralNetworkF(algorithm='simulated_annealing', **nn_params)),
])

# Learning rates 10**-4 .. 10**0, half a decade apart.
learning_rate = [10**exponent for exponent in np.arange(-4, 1/2, 1/2)]
# Temperature decay schedules to compare.
schedule = [mlrose.GeomDecay(), mlrose.ArithDecay(), mlrose.ExpDecay()]

params_wine = dict(
    NN_sa__max_iters=[5000],
    NN_sa__max_attempts=[50],
    NN_sa__learning_rate=learning_rate,
    NN_sa__schedule=schedule,
)

# Grid search, best-model summary and learning curve.
wine_clf = basicResults(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                        params_wine, 'NN_sa', 'wine')

make_complexity_curve('NN_sa', 'wine', 'learning_rate')

# Timing curve with the best configuration found above.
wine_final_params = wine_clf.best_params_
pipeW.set_params(**wine_final_params)
make_timing_curve(wineX, wineY, pipeW, 'NN_sa', 'wine')

# Accuracy versus iteration budget (1 .. 2**11), with early stopping switched off.
# (Alternative, wider sweep: range(14).)
max_iters = [2**power for power in range(12)]
pipeW.set_params(NN_sa__early_stopping=False)
iterationLC(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
            {'NN_sa__max_iters': max_iters},
            'NN_sa', 'wine')
make_iteration_learning_curve('NN_sa', 'wine', 'max_iters')

Calculating basic results for wine


In [None]:
# Neural Networks Genetic Algorithm
pipeW = Pipeline([
    ('Scale', StandardScaler()),
    ('NN_ga', NeuralNetworkF(algorithm='genetic_alg', **nn_params)),
])

# Half-decade grids; the float values are rounded to 8 decimals.
learning_rate = [round(10**exponent, 8) for exponent in np.arange(-4, 0, 1/2)]
mutation_prob = [round(10**exponent, 8) for exponent in np.arange(-4, 0, 1/2)]
# Population sizes 10**1 .. 10**3, rounded to whole numbers.
pop_size = [round(10**exponent) for exponent in np.arange(1, 3.5, 1/2)]

params_wine = dict(
    NN_ga__max_iters=[2000],
    NN_ga__max_attempts=[20],
    NN_ga__learning_rate=learning_rate,
    NN_ga__pop_size=pop_size,
    NN_ga__mutation_prob=mutation_prob,
)

# Grid search, best-model summary and learning curve.
wine_clf = basicResults(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
                        params_wine, 'NN_ga', 'wine')

make_complexity_curve('NN_ga', 'wine', 'learning_rate')

# Timing curve with the best configuration found above.
wine_final_params = wine_clf.best_params_
pipeW.set_params(**wine_final_params)
make_timing_curve(wineX, wineY, pipeW, 'NN_ga', 'wine')

# Accuracy versus iteration budget (1 .. 2**11), with early stopping switched off.
max_iters = [2**power for power in range(12)]
pipeW.set_params(NN_ga__early_stopping=False)
iterationLC(pipeW, wine_trgX, wine_trgY, wine_tstX, wine_tstY,
            {'NN_ga__max_iters': max_iters},
            'NN_ga', 'wine')
make_iteration_learning_curve('NN_ga', 'wine', 'max_iters')

Calculating basic results for wine


In [None]:
# Max 1's (One Max)
# Knapsack problem
# Travelling salesman problem
# Four Peaks problem
# Max k-color