In [1]:
import numpy as np
import pandas as pd

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
wine = pd.read_csv('data/winequality.csv')
print(wine.shape)

(6497, 13)


In [3]:
wine.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [4]:
wine.dropna(inplace = True)
print(wine.shape)

(6463, 13)


In [5]:
wine['quality'] = (wine['quality'] >= 6).astype(int)
wine.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,1
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,1
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,1
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,1
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,1


In [6]:
X = wine.drop(['quality'], axis = 1)
y = wine['quality']

In [7]:
X.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [8]:
def findBest(model, params, X_train, y_train):
    opt_model = GridSearchCV(model, params, scoring = ['accuracy', 'f1'], refit = 'accuracy', verbose = 1, n_jobs = -1)
    opt_model.fit(X_train, y_train)
    
    print(opt_model.best_score_)
    print(opt_model.best_params_)
    return opt_model

In [9]:
def prepData(X, y, partition, categorical):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = partition)
    
    for feature in categorical:
        label_encoder = LabelEncoder()
        label_encoder.fit(X[feature])
        X_train[feature] = label_encoder.transform(X_train[feature])
        X_test[feature] = label_encoder.transform(X_test[feature])

    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)
    
    return X_train, X_test, y_train, y_test

In [10]:
def evalModels(mp_dict, trials, partitions, categorical, X, y):
    results = {}
    trainEval = {}
    testEval = {}
    detailRes = {}
    for p in partitions:
        results[str(p)] = {}
        trainEval[str(p)] = {}
        testEval[str(p)] = {}
        detailRes[str(p)] = {}
        print("Evaluating partition for: test_size = " + str(p))
        X_train, X_test, y_train, y_test = prepData(X, y, p, categorical)

        for model in mp_dict:
            print("Starting new Model: " + str(model))
            
            acc_list = []
            acc_dict = {}
            for iteration in range(0, trials):
                print("Starting iteration " + str(iteration + 1))
                opt_model = findBest(model, mp_dict[model], X_train, y_train)
                detailRes[str(p)][type(opt_model.best_estimator_).__name__] = {}
                detailRes[str(p)][type(opt_model.best_estimator_).__name__][str(iteration)] = opt_model
                acc_dict[str(opt_model.best_score_)] = opt_model
                acc_list.append(opt_model.best_score_)
                    
            med_acc = np.median(acc_list)
            best_model = acc_dict[str(med_acc)]
            
            train_acc, train_f1 = testModel(X_train, y_train, best_model)
            trainEval[str(p)][type(best_model.best_estimator_).__name__] = {"accuracy": train_acc, "f1": train_f1}
            
            test_acc, test_f1 = testModel(X_test, y_test, best_model)
            testEval[str(p)][type(best_model.best_estimator_).__name__] = {"accuracy": test_acc, "f1": test_f1}
            
            results[str(p)][type(best_model.best_estimator_).__name__] = best_model
            print("Best acc score for " + str(model))
            print(best_model.best_score_)
            
    return results, trainEval, testEval

In [11]:
def testModel(X, y, classifier):
    Y_pred = classifier.predict(X)
    acc = accuracy_score(y, Y_pred)
    f1 = f1_score(y, Y_pred)
    return acc, f1

In [12]:
def resultSummary(result):
    sl = ["params", "mean_test_accuracy", "std_test_accuracy", "mean_test_f1", "std_test_f1"]
    resultSum = {}
    for p in result:
        resultSum[p] = {}
        for modelName in result[p]:
            resultSum[p][modelName] = {key: result[p][modelName].cv_results_[key][result[p][modelName].best_index_] for key in sl}
    return resultSum

In [13]:
mp_dict = {
    LogisticRegression(): {
        'C': [10 ** i for i in range(-8, 4)]
    },
    LinearSVC(): {
        'C': [10 ** i for i in range(-5, 3)]
    },
#     SVC(kernel = 'poly'): {
#         'C': [10 ** i for i in range(-5, 3)],
#         'degree': [i for i in range(1, 4)]
#     },
    KNeighborsClassifier(): {
        'n_neighbors': [1, 2, 3, 5, 7, 10, 15, 25, 50, 100, 500]
    },
    RandomForestClassifier(): {
        'max_features': [1, 2, 4, 6, 8, 12 ,16, 20],
        'n_estimators': [1024]
    },
    GradientBoostingClassifier(): {
        'n_estimators': [2, 4, 8, 16, 32, 64, 128, 512, 1024]
    }
}
categorical = ['type']
trials = 3
partitions = [0.2, 0.5, 0.8]


result, trainEval, testEval = evalModels(mp_dict, trials, partitions, categorical, X, y)

Evaluating partition for: test_size = 0.2
Starting new Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 12 candidates, totalling 60 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7429400386847196
{'C': 0.1}
Starting iteration 2
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.5s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7429400386847196
{'C': 0.1}
Starting iteration 3
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7429400386847196
{'C': 0.1}
Best acc score for LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.7429400386847196
Starting new Model: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    1.5s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7437137330754352
{'C': 1}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    1.2s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7446808510638296
{'C': 10}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7435203094777563
{'C': 0.1}
Best acc score for LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.7437137330754352
Starting new Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Starting iteration 1
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  40 out of  55 | elapsed:    1.2s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7725338491295939
{'n_neighbors': 1}
Starting iteration 2
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  40 out of  55 | elapsed:    1.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    1.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7725338491295939
{'n_neighbors': 1}
Starting iteration 3
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  40 out of  55 | elapsed:    1.4s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    1.9s finished


0.7725338491295939
{'n_neighbors': 1}
Best acc score for KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
0.7725338491295939
Starting new Model: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.4min finished


0.8263056092843326
{'max_features': 2, 'n_estimators': 1024}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.0min finished


0.8241779497098646
{'max_features': 4, 'n_estimators': 1024}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.0min finished


0.8243713733075435
{'max_features': 2, 'n_estimators': 1024}
Best acc score for RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.8243713733075435
Starting new Model: GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_sa

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    3.2s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   25.6s finished


0.7938104448742747
{'n_estimators': 1024}
Starting iteration 2
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    2.8s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   26.0s finished


0.7938104448742747
{'n_estimators': 1024}
Starting iteration 3
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    3.2s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   26.1s finished


0.7936170212765957
{'n_estimators': 1024}
Best acc score for GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
0.7938104448742747
Evaluating partition for: test_size = 0.5
Starting new Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7465185830290793
{'C': 1}
Starting iteration 2
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7465185830290793
{'C': 1}
Starting iteration 3
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7465185830290793
{'C': 1}
Best acc score for LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.7465185830290793
Starting new Model: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    0.7s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7480670491575789
{'C': 10}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    0.7s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7471372995631181
{'C': 0.01}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    0.7s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7471372995631181
{'C': 0.01}
Best acc score for LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.7471372995631181
Starting new Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Starting iteration 1
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  40 out of  55 | elapsed:    1.0s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7530143888678875
{'n_neighbors': 1}
Starting iteration 2
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  40 out of  55 | elapsed:    1.0s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7530143888678875
{'n_neighbors': 1}
Starting iteration 3
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  40 out of  55 | elapsed:    1.0s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    1.4s finished


0.7530143888678875
{'n_neighbors': 1}
Best acc score for KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
0.7530143888678875
Starting new Model: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


0.8139878744957676
{'max_features': 2, 'n_estimators': 1024}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


0.8139897885453701
{'max_features': 2, 'n_estimators': 1024}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


0.8136792339973491
{'max_features': 2, 'n_estimators': 1024}
Best acc score for RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.8139878744957676
Starting new Model: GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_sa

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    2.0s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   17.4s finished


0.7966614189806729
{'n_estimators': 1024}
Starting iteration 2
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    2.0s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   18.6s finished


0.7966614189806729
{'n_estimators': 1024}
Starting iteration 3
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   18.0s finished


0.7963513429450525
{'n_estimators': 1024}
Best acc score for GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
0.7966614189806729
Evaluating partition for: test_size = 0.8
Starting new Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7569453174104337
{'C': 0.1}
Starting iteration 2
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7569453174104337
{'C': 0.1}
Starting iteration 3
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.3s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7569453174104337
{'C': 0.1}
Best acc score for LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.7569453174104337
Starting new Model: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.756939331357936
{'C': 10}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7538385561641376
{'C': 1}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    0.3s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7530663553919368
{'C': 0.1}
Best acc score for LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.7538385561641376
Starting new Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Starting iteration 1
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  40 out of  55 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7569453174104337
{'n_neighbors': 25}
Starting iteration 2
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  40 out of  55 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7569453174104337
{'n_neighbors': 25}
Starting iteration 3
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  40 out of  55 | elapsed:    0.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    0.6s finished


0.7569453174104337
{'n_neighbors': 25}
Best acc score for KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
0.7569453174104337
Starting new Model: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   43.0s finished


0.7701176259315794
{'max_features': 2, 'n_estimators': 1024}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   41.7s finished


0.7747687887222771
{'max_features': 2, 'n_estimators': 1024}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   40.5s finished


0.7708898267037803
{'max_features': 4, 'n_estimators': 1024}
Best acc score for RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.7708898267037803
Starting new Model: GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_sa

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    8.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.760031127472988
{'n_estimators': 64}
Starting iteration 2
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    1.2s remaining:    0.6s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    8.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.7608063212714375
{'n_estimators': 64}
Starting iteration 3
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:    8.9s finished


0.7608063212714375
{'n_estimators': 64}
Best acc score for GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
0.7608063212714375


In [14]:
print("Summary of results: ")
resultSum = resultSummary(result)
print(resultSum)
print("Test evals: ")
print(testEval)

Summary of results: 
{'0.2': {'LogisticRegression': {'params': {'C': 0.1}, 'mean_test_accuracy': 0.7429400386847196, 'std_test_accuracy': 0.005879568337938383, 'mean_test_f1': 0.8056038951349385, 'std_test_f1': 0.00433056475183813}, 'LinearSVC': {'params': {'C': 1}, 'mean_test_accuracy': 0.7437137330754352, 'std_test_accuracy': 0.00547084550241039, 'mean_test_f1': 0.8068342900157107, 'std_test_f1': 0.0028196774041596567}, 'KNeighborsClassifier': {'params': {'n_neighbors': 1}, 'mean_test_accuracy': 0.7725338491295939, 'std_test_accuracy': 0.014962537412621827, 'mean_test_f1': 0.8216453973427262, 'std_test_f1': 0.011833220131659569}, 'RandomForestClassifier': {'params': {'max_features': 2, 'n_estimators': 1024}, 'mean_test_accuracy': 0.8243713733075435, 'std_test_accuracy': 0.009467882592278846, 'mean_test_f1': 0.8648945643431997, 'std_test_f1': 0.00770739831934258}, 'GradientBoostingClassifier': {'params': {'n_estimators': 1024}, 'mean_test_accuracy': 0.7938104448742747, 'std_test_accur

In [15]:
import csv
csv_columns = ["Model", "Partition", "Params", "Train_Acc", "Train_F1", "Val_Acc", "Val_F1", "Test_Acc", "Test_F1", "Acc_Std", "F1_Std"]
csv_file = "WINE_EvalResults.csv"
try:
    with open(csv_file, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(csv_columns)
        
        modelList = resultSum['0.2'].keys()
        for modelName in modelList:
            for partition in resultSum:
                params = resultSum[partition][modelName]['params']
                val_acc_std = resultSum[partition][modelName]['std_test_accuracy']
                val_acc = resultSum[partition][modelName]['mean_test_accuracy']
                val_f1_std = resultSum[partition][modelName]['std_test_f1']
                val_f1 = resultSum[partition][modelName]['mean_test_f1']
                
                test_acc = testEval[partition][modelName]['accuracy']
                test_f1 = testEval[partition][modelName]['f1']
                
                train_acc = trainEval[partition][modelName]['accuracy']
                train_f1 = trainEval[partition][modelName]['f1']
                
                writer.writerow([modelName, partition, params, train_acc, train_f1, val_acc, val_f1, test_acc, test_f1, val_acc_std, val_f1_std])

except IOError:
    print("I/O error")

In [16]:
result_df = pd.read_csv("WINE_EvalResults.csv")
result_df

Unnamed: 0,Model,Partition,Params,Train_Acc,Train_F1,Val_Acc,Val_F1,Test_Acc,Test_F1,Acc_Std,F1_Std
0,LogisticRegression,0.2,{'C': 0.1},0.743907,0.806489,0.74294,0.805604,0.736272,0.802319,0.00588,0.004331
1,LogisticRegression,0.5,{'C': 1},0.74559,0.804565,0.746519,0.805157,0.738552,0.8028,0.014689,0.012171
2,LogisticRegression,0.8,{'C': 0.1},0.756966,0.817229,0.756945,0.817218,0.741056,0.806139,0.020355,0.014574
3,LinearSVC,0.2,{'C': 1},0.745261,0.807877,0.743714,0.806834,0.740139,0.80578,0.005471,0.00282
4,LinearSVC,0.5,{'C': 0.01},0.744352,0.8038,0.747137,0.805703,0.733601,0.799067,0.014734,0.012513
5,LinearSVC,0.8,{'C': 1},0.763932,0.821742,0.753839,0.81367,0.740863,0.805119,0.024871,0.016891
6,KNeighborsClassifier,0.2,{'n_neighbors': 1},1.0,1.0,0.772534,0.821645,0.766435,0.818072,0.014963,0.011833
7,KNeighborsClassifier,0.5,{'n_neighbors': 1},1.0,1.0,0.753014,0.80418,0.753713,0.806326,0.015014,0.01212
8,KNeighborsClassifier,0.8,{'n_neighbors': 25},0.773994,0.82924,0.756945,0.818651,0.736801,0.80571,0.016426,0.012691
9,RandomForestClassifier,0.2,"{'max_features': 2, 'n_estimators': 1024}",1.0,1.0,0.824371,0.864895,0.830626,0.87003,0.009468,0.007707
