# Setup, globals

In [2]:
import copy
import pickle

import numpy as np
import pandas as pd
import scipy.stats
from matplotlib import pyplot as plt

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler, LabelBinarizer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR, LinearSVR, LinearSVC
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_validate, cross_val_predict, cross_val_score, train_test_split, KFold, StratifiedKFold
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.metrics import classification_report, make_scorer, accuracy_score, f1_score 
from sklearn.utils import resample
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression

In [4]:
# Notebook-wide configuration.
random_state = 42

# Keep the search computationally feasible: with constrained resources a random
# search is more likely to find one of the best specifications.
N_FOLDS = 5          # number of cross-validation splits
N_RANDOMSEARCH = 75  # number of candidates drawn per random search

DATA_PATH = 'data/'
RESULTS_PATH = 'results/'

# Show floats with three decimals in displayed tables.
pd.set_option('display.float_format', '{:.3f}'.format)

In [5]:
# Read the prepared wine data, then separate the target from the features.
wine = pd.read_csv(DATA_PATH + 'wines_transformed.csv')

y = wine['quality']

# Two feature sets: one keeps the 'red' column, the other drops it as well.
x_inclRed = wine.drop(columns=['quality'])
x = wine.drop(columns=['quality', 'red'])

## Models
Set up the models and their hyperparameter search spaces.  
The assignment asks for the problem to be run both as a classification and as a regression task.

In [6]:
# Registry of all models. Every entry holds:
#   'classifier' / 'regressor': unfitted estimator for the task (None = model not used for it)
#   'standardise': whether the pipeline's scaler centres and scales the features
#   'clf_1hot':    whether the target is label-binarised for classification
#   'param_grid':  search space; keys are prefixed with '<model name>__' so that they
#                  address the model step of the pipeline
# Stochastic estimators are seeded with `random_state` to make reruns reproducible.
clfs = {}
clfs['RandomForest'] = {'classifier': RandomForestClassifier(random_state=random_state),
                         'regressor': RandomForestRegressor(random_state=random_state),
                         'standardise': False,
                         'clf_1hot': False,
                         'param_grid':
                              {'RandomForest__n_estimators': [10, 50, 100],
                               # 'auto' was removed in scikit-learn 1.3 (it meant 'sqrt' for the
                               # classifier and all features for the regressor); None = all features.
                               'RandomForest__max_features': [None, 'sqrt'],
                               'RandomForest__max_depth': scipy.stats.randint(10, 100),
                               'RandomForest__min_samples_split': scipy.stats.randint(2, 12),
                               'RandomForest__min_samples_leaf': scipy.stats.randint(1, 5),
                               # 'RandomForest__class_weight' is deliberately not searched.
                              }}

clfs['LogisticReg'] = {'classifier': LogisticRegression(),
                         'regressor': None,
                         'standardise': True,
                         'clf_1hot': False,
                         'param_grid':
                              {
                                  'LogisticReg__class_weight': [None, 'balanced']
                              }}
clfs['LinearReg'] = {'classifier': None,
                         'regressor': LinearRegression(),
                         'standardise': True,
                         'clf_1hot': False,
                         # LinearRegression has no class_weight parameter (searching over it
                         # raises "Invalid parameter"), so there is nothing to tune.
                         'param_grid':
                              {}}
clfs['NaiveBayes'] = {'classifier': GaussianNB(),
                         'regressor': None,
                         'standardise': True,
                         'clf_1hot': False,
                         'param_grid':
                              {}}
clfs['KNN'] = {'classifier': KNeighborsClassifier(),
                         'regressor': KNeighborsRegressor(),
                         'standardise': True,
                         'clf_1hot': False,
                         'param_grid':
                              {'KNN__n_neighbors': scipy.stats.randint(1, 10),  # draws 1..9
                               'KNN__weights': ['uniform', 'distance']}}

# Hidden-layer layouts with 2 or 3 layers of 64 or 128 units each.
# A third-layer choice of None means "no third layer", i.e. a 2-layer network.
layer_sizes = [[l1, l2] if l3 is None else [l1, l2, l3]
               for l1 in [64, 128] for l2 in [64, 128] for l3 in [None, 64, 128]]
clfs['MLP'] = {'classifier': MLPClassifier(early_stopping=True, random_state=random_state), # early stopping reduces runtime
                'regressor': MLPRegressor(early_stopping=True, random_state=random_state),
                'standardise': True,
                'clf_1hot': False,
                'param_grid':
                      {'MLP__hidden_layer_sizes': layer_sizes,
                       # scipy's uniform(loc, scale) samples from [loc, loc + scale].
                       # NOTE(review): this puts almost all mass on very large alphas; a
                       # log-uniform prior may have been intended - confirm.
                       'MLP__alpha':  scipy.stats.uniform(10**(-4), 10**3),
                      }}
# Benchmarks without hyperparameters. Both use the mean predictor for regression.
clfs['Dummy_strat'] = {'classifier': DummyClassifier(strategy='stratified', random_state=random_state),
                         'regressor': DummyRegressor(strategy='mean'),
                         'standardise': True,
                         'clf_1hot': False,
                         'param_grid':
                             {}}
clfs['Dummy_majority'] = {'classifier': DummyClassifier(strategy='most_frequent'),
                         'regressor': DummyRegressor(strategy='mean'),
                         'standardise': True,
                         'clf_1hot': False,
                         'param_grid':
                             {}}

# The SVM is split by kernel to speed things up, as certain parameters are kernel-specific.
cache_size = 5000  # kernel cache in MB; speeds things up if enough RAM is available (default: 200)
C = np.logspace(-5, 15, num=30, base=2)

clfs['SVM_rbf'] = dict(
    classifier=SVC(kernel='rbf', cache_size=cache_size),
    regressor=SVR(kernel='rbf', cache_size=cache_size),
    standardise=True,
    clf_1hot=False,
    param_grid={
        'SVM_rbf__C': C,
        'SVM_rbf__gamma': np.logspace(-15, 3, num=19, base=2),  # only used by rbf
        'SVM_rbf__class_weight': [None, 'balanced'],
    },
)

# A polynomial-kernel variant ('SVM_poly', degrees 2-4, same C grid) was dropped
# because its runtime was excessive.

# Linear SVM: more efficient than SVC / SVR with a linear kernel.
# dual=False solves the primal problem, which suits #obs >> #features;
# the regressor then needs the l2 ('squared_epsilon_insensitive') loss.
clfs['SVM_linear'] = dict(
    classifier=LinearSVC(dual=False),
    regressor=LinearSVR(dual=False, loss='squared_epsilon_insensitive'),
    standardise=True,
    clf_1hot=False,
    param_grid={
        # the hyperparameter space needs to be large enough for the random search
        'SVM_linear__C': np.logspace(-5, 15, num=N_RANDOMSEARCH, base=2),
        'SVM_linear__class_weight': [None, 'balanced'],
    },
)


# Custom scores for the variance of the error (used for confidence intervals), so that
# predictions do not have to be obtained explicitly via cross_val_predict.
def mean_variance_mse(y, y_pred, **kwargs):
    """Estimate the variance of the mean squared error on one evaluation set.

    Computes ``Var(SE) / (n - 1)``, where ``Var`` is the population variance
    (``ddof=0``) of the per-observation squared errors and ``n`` their count.

    Parameters
    ----------
    y, y_pred : array-like of equal length
        True and predicted targets.
    **kwargs
        Accepted for scorer compatibility and ignored.

    Returns
    -------
    float
        The variance estimate as a plain Python float; ``nan`` if fewer than
        two observations are given.
    """
    # Work on float64 ndarrays: np.float128 does not exist on every platform and
    # scores stored with that dtype cannot be unpickled where it is missing.
    # Converting first also keeps np.var independent of any pandas dispatch.
    squared_errors = (np.asarray(y, dtype=np.float64) - np.asarray(y_pred, dtype=np.float64)) ** 2
    n_obs = squared_errors.shape[0]
    if n_obs < 2:
        return float('nan')
    return float(np.var(squared_errors, ddof=0) / (n_obs - 1))

def mean_variance_acc(y, y_pred, **kwargs):
    """Estimate the variance of the accuracy on one evaluation set.

    Computes ``Var(hit) / (n - 1)``, where ``hit`` is the 0/1 indicator of a
    correct prediction, ``Var`` its population variance (``ddof=0``) and ``n``
    the number of observations.

    Parameters
    ----------
    y, y_pred : array-like of equal length
        True and predicted labels.
    **kwargs
        Accepted for scorer compatibility and ignored.

    Returns
    -------
    float
        The variance estimate as a plain Python float; ``nan`` if fewer than
        two observations are given.
    """
    # float64 instead of np.float128: the latter is platform-dependent and makes
    # stored scores impossible to unpickle where the dtype is missing.
    hits = (np.asarray(y) == np.asarray(y_pred)).astype(np.float64)
    n_obs = hits.shape[0]
    if n_obs < 2:
        return float('nan')
    return float(np.var(hits, ddof=0) / (n_obs - 1))

def mean_variance_acc_reg(y, y_pred, **kwargs):
    """Estimate the variance of the accuracy of rounded regression predictions.

    Predictions are rounded with ``np.round`` and cast to int, the targets are
    cast to int, and ``Var(hit) / (n - 1)`` is computed from the resulting 0/1
    hit indicator (population variance, ``ddof=0``).

    Parameters
    ----------
    y : array-like
        True targets.
    y_pred : array-like
        Real-valued predictions of the same length.
    **kwargs
        Accepted for scorer compatibility and ignored.

    Returns
    -------
    float
        The variance estimate as a plain Python float; ``nan`` if fewer than
        two observations are given.
    """
    preds = np.round(np.asarray(y_pred), 0).astype(int)
    # float64 instead of np.float128: the latter is platform-dependent and makes
    # stored scores impossible to unpickle where the dtype is missing.
    hits = (np.asarray(y).astype(int) == preds).astype(np.float64)
    n_obs = hits.shape[0]
    if n_obs < 2:
        return float('nan')
    return float(np.var(hits, ddof=0) / (n_obs - 1))

def regression_acc(y, y_pred, **kwargs):
    """Accuracy of a regressor once its predictions are rounded (np.round) to integers.

    Extra keyword arguments are accepted and ignored.
    """
    y_true_int = y.astype(int)
    y_pred_int = np.round(y_pred, decimals=0).astype(int)
    return accuracy_score(y_true_int, y_pred_int)

def regression_f1_macro(y, y_pred, **kwargs):
    """Macro-averaged F1 of a regressor once its predictions are rounded (np.round) to integers.

    Extra keyword arguments are accepted and ignored.
    """
    y_true_int = y.astype(int)
    y_pred_int = np.round(y_pred, decimals=0).astype(int)
    return f1_score(y_true_int, y_pred_int, average='macro')

# Wrap the custom metrics as sklearn scorers. With greater_is_better=False the
# scorer reports the negated metric, so 'var_mse' scores come out negative.
mean_variance_score_mse = make_scorer(mean_variance_mse, greater_is_better=False)
mean_variance_score_acc = make_scorer(mean_variance_acc, greater_is_better=True)
mean_variance_score_acc_reg = make_scorer(mean_variance_acc_reg, greater_is_better=True)
regression_acc_score = make_scorer(regression_acc, greater_is_better=True)
regression_f1_macro_score = make_scorer(regression_f1_macro, greater_is_better=True)

# One scoring dictionary per task: regression ...
scoring_reg = dict(
    mse='neg_mean_squared_error',
    mae='neg_mean_absolute_error',
    acc_reg=regression_acc_score,
    f1_macro=regression_f1_macro_score,
    var_mse=mean_variance_score_mse,
    var_acc=mean_variance_score_acc_reg,
)
# ... and classification ('micro' averages are calculated globally).
scoring_clf = dict(
    acc='accuracy',
    precision_macro='precision_macro',
    precision_weighted='precision_weighted',
    recall_macro='recall_macro',
    recall_weighted='recall_weighted',
    f1_micro='f1_micro',
    f1_macro='f1_macro',
    f1_weighted='f1_weighted',
    var_acc=mean_variance_score_acc,
)
# Result-key prefixes, in the order the datasets are evaluated.
prefixes = ['x_', 'x_inclRed_']

Run a valid model-validation setup (nested cross-validation):  
an inner loop tunes the hyperparameters and an outer loop estimates the scores.

In [None]:
def _n_search_iterations(grid, max_iter):
    """Return how many candidates a random search over `grid` should draw.

    A grid made up solely of finite sequences cannot yield more distinct
    candidates than the product of their lengths (1 for an empty grid), so the
    budget is capped there. As soon as one entry is a distribution (it has an
    `rvs` method) the full budget `max_iter` is used.
    """
    if any(hasattr(values, 'rvs') for values in grid.values()):
        return max_iter
    n_candidates = 1
    for values in grid.values():
        n_candidates *= len(values)
    return min(max_iter, n_candidates)


# Nested cross-validation: the inner splits tune the hyperparameters, the outer
# splits score the tuned model. Different seeds give different splits.
inner_cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=random_state)
outer_cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=random_state+1)

for prefix, X in zip(prefixes, [x, x_inclRed]):
    print('\nSTART NEW DATASET-LOOP\n')

    for classification in [True, False]:
        if classification:
            estimator = 'classifier'
            scoring = scoring_clf
            suffix = '_clf'
        else:
            estimator = 'regressor'
            scoring = scoring_reg
            suffix = '_reg'

        for model, d in clfs.items():
            print('CURRENT MODEL:', model, '- classification', classification)
            if d[estimator] is None:
                print('No estimator')
                continue

            # never actually needed, turns out sklearn always handles this
            if classification and d['clf_1hot']:
                lb = LabelBinarizer(sparse_output=False)
                Y = lb.fit_transform(y)
            else:
                Y = y

            pipe = Pipeline([
                        ('scaler', RobustScaler(with_centering=d['standardise'], with_scaling=d['standardise'])),
                        (model, d[estimator])
            ])

            # The grid is shared between the classifier and the regressor of a model,
            # but some entries (e.g. class_weight) only exist on the classifier.
            # Keep only what the current pipeline accepts; anything else would make
            # the search raise an "Invalid parameter" error.
            supported = pipe.get_params()
            grid = {param: values for param, values in d['param_grid'].items() if param in supported}
            ignored = sorted(set(d['param_grid']) - set(grid))
            if ignored:
                print('Ignoring unsupported hyperparameters:', ignored)

            # Benchmarks without hyperparameters run a single iteration; small finite
            # grids are capped at their number of combinations.
            iters = _n_search_iterations(grid, N_RANDOMSEARCH)

            # run with n_jobs=-1 to utilize all resources (outside of notebook as it needs protection by a
            # if __name__ == "__main__" clause if run on windows)
            # random_state makes the sampled candidates reproducible.
            gs = RandomizedSearchCV(pipe, grid, cv=inner_cv, n_iter=iters, random_state=random_state)
            score = cross_validate(gs, X=X, y=Y, cv=outer_cv, scoring=scoring, return_train_score=False)

            # Results are stored next to the model definition, e.g. under 'x_score_clf'.
            d[prefix + 'score' + suffix] = score

            if classification:
                print('Acc: {:.3f}'.format(np.mean(score['test_acc'])))
            else:
                # 'test_mse' holds the negated MSE
                print('MSE: {:.3f}'.format(- np.mean(score['test_mse'])))

## Display results 

In [8]:
def create_df(clfs, name, show_std=False):
    '''
    Summarise the scores stored under `name` for every model in `clfs`.

    clfs: dict mapping a model name to a dict that may contain `name`
    name: key of the score dict to summarise, e.g. 'x_score_clf'; its values are
          the per-fold score arrays
    show_std: add +- 1.96 * std of the scores. This is only to get an idea of the variance,
    not in any way a valid confidence interval (correlated runs and way too few observations)!

    Returns a DataFrame with one column per model and one row per score key.
    Rows whose key contains 'var_acc' / 'var_mse' are rendered as
    "<mean accuracy / MSE> $\\pm$ <1.96 * sqrt(mean variance)>"; all other rows
    hold the absolute mean score. Models without an entry `name` are reported
    with a message and left out.

    Raises KeyError if a variance entry has no matching accuracy / MSE scores.
    '''
    template = '{:.3f} $\\pm$ {:.3f}'
    results = {}
    for clf, entry in clfs.items():
        # Only a missing result set is expected (e.g. a model without a regressor);
        # any other problem should surface instead of being silently dropped.
        if name not in entry:
            print('no results:', clf)
            continue

        scores = entry[name]
        summary = {}
        for k, v in scores.items():
            # np.abs() because 'mse' and 'mae' are defined as their negative
            if 'var_acc' in k:
                acc_var = next((key for key in ('test_acc', 'test_acc_reg') if key in scores), None)
                if acc_var is None:
                    raise KeyError("{!r} of {!r} needs 'test_acc' or 'test_acc_reg' scores".format(k, clf))
                summary[k] = template.format(np.abs(np.mean(scores[acc_var])),
                                             1.96 * np.sqrt(np.mean(v)))
            elif 'var_mse' in k:
                # the MSE variance is stored negated, hence the minus sign
                summary[k] = template.format(np.abs(np.mean(scores['test_mse'])),
                                             1.96 * np.sqrt(-np.mean(v)))
            elif show_std:
                summary[k] = template.format(np.abs(np.mean(v)), 1.96*np.std(v))
            else:
                summary[k] = np.abs(np.mean(v))
        results[clf] = summary
    return pd.DataFrame(results)

def prettify_table(df, regression=False):
    '''
    Reduce a score table to the reported metrics and give it readable labels.

    df: DataFrame with score keys (e.g. 'test_f1_macro') as index and model
        names as columns
    regression: label the dummy benchmarks as 'Mean' instead of 'Majority' predictor

    Returns a new DataFrame; `df` itself is left untouched. Rows that are not
    part of the reported metrics are dropped. Model names without a pretty
    label keep their original name.
    '''
    # Only these rows are reported. The *_micro / *_weighted variants and
    # 'test_acc_reg' are deliberately left out.
    name_changes_idx = {
        'test_f1_macro': 'F1-score, macro',
        'test_precision_macro': 'Precision, macro',
        'test_recall_macro': 'Recall, macro',
        'test_var_acc': 'Accuracy',
        'test_mae': 'MAE',
        'test_var_mse': 'MSE'
    }
    out = df.loc[df.index.isin(list(name_changes_idx))].copy()
    out.index = [name_changes_idx[name] for name in out.index]

    dummy = 'Mean' if regression else 'Majority'
    name_changes_cols = {
        'Dummy_majority': dummy + ' predictor',
        'Dummy_strat': dummy + ' predictor,\nstratified',
        'LinearReg': 'Linear Regression',
        'LogisticReg': 'Logistic Regression',
        'RandomForest': 'Random Forest',
        'NaiveBayes': 'Naive Bayes',
        'SVM_linear': 'SVM: linear',
        'SVM_rbf': 'SVM: rbf',
        'KNN': 'K-NN',
        'MLP': 'MLP'
    }
    # Fall back to the raw name so that an additional model does not raise a KeyError.
    out.columns = [name_changes_cols.get(name, name) for name in out.columns]

    return out

In [9]:
# Load the results obtained from running this as a .py script with n_jobs=-1.
# NOTE: pickle.load can execute arbitrary code - only load result files you trust.
# NOTE(review): a pickle that holds np.float128 arrays cannot be loaded on platforms
# without that dtype ('data type "f16" not understood') - presumably what happened here.
with open(RESULTS_PATH + 'A4_results_new' + ".p", "rb") as results_file:
    clfs = pickle.load(results_file)

TypeError: data type "f16" not understood

### Without 'red' 

##### Classification 

In [None]:
# Classification scores for the feature set without 'red'.
# (Pass show_std=True to create_df for a rough idea of the spread across folds.)
df_x_clf = prettify_table(df=create_df(clfs=clfs, name='x_score_clf'))
df_x_clf

##### Regression
Note that sklearn reports MSE and MAE as negated scores; the table below shows their absolute values.

In [None]:
# Regression scores for the feature set without 'red'.
# (Pass show_std=True to create_df for a rough idea of the spread across folds.)
# regression=True labels the dummy benchmarks as mean predictors; without it
# they would be shown as majority predictors, which is wrong for regression.
df_x_reg = prettify_table(create_df(clfs, 'x_score_reg', show_std=False), regression=True)
df_x_reg

### Including Red

##### Classification 

In [None]:
# Classification scores for the feature set including 'red'.
# (Pass show_std=True to create_df for a rough idea of the spread across folds.)
df_x_inclRed_clf = prettify_table(df=create_df(clfs=clfs, name='x_inclRed_score_clf'))
df_x_inclRed_clf

##### Regression 

In [None]:
# Regression scores for the feature set including 'red'.
# (Pass show_std=True to create_df for a rough idea of the spread across folds.)
# regression=True labels the dummy benchmarks as mean predictors; without it
# they would be shown as majority predictors, which is wrong for regression.
df_x_inclRed_reg = prettify_table(create_df(clfs, 'x_inclRed_score_reg', show_std=False), regression=True)
df_x_inclRed_reg