In [1]:
import numpy as np
import pandas as pd

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
adult = pd.read_csv('data/adult.csv', na_values = '?')
print(adult.shape)

(32561, 15)


In [3]:
adult.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [4]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [5]:
adult['income'] = adult['income'].map({
        '<=50K': 0,
        '>50K': 1,
        '<=50K.': 0,
        '>50K.': 1
})
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0


In [6]:
adult.dropna(inplace = True)
print(adult.shape)
adult.head()

(30162, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,0
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,0


In [7]:
X = adult.drop(['income'], axis = 1)
y = adult['income']

In [8]:
def findBest(model, params, X_train, y_train):
    opt_model = GridSearchCV(model, params, scoring = ['accuracy', 'f1'], refit = 'accuracy', verbose = 1, n_jobs = -1)
    opt_model.fit(X_train, y_train)
    
    print(opt_model.best_score_)
    print(opt_model.best_params_)
    return opt_model

In [9]:
def prepData(X, y, partition, categorical):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = partition)
    
    for feature in categorical:
        label_encoder = LabelEncoder()
        label_encoder.fit(X[feature])
        X_train[feature] = label_encoder.transform(X_train[feature])
        X_test[feature] = label_encoder.transform(X_test[feature])

    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)
    
    return X_train, X_test, y_train, y_test

In [10]:
def evalModels(mp_dict, trials, partitions, categorical, X, y):
    results = {}
    trainEval = {}
    testEval = {}
    detailRes = {}
    for p in partitions:
        results[str(p)] = {}
        trainEval[str(p)] = {}
        testEval[str(p)] = {}
        detailRes[str(p)] = {}
        print("Evaluating partition for: test_size = " + str(p))
        X_train, X_test, y_train, y_test = prepData(X, y, p, categorical)

        for model in mp_dict:
            print("Starting new Model: " + str(model))
            
            acc_list = []
            acc_dict = {}
            for iteration in range(0, trials):
                print("Starting iteration " + str(iteration + 1))
                opt_model = findBest(model, mp_dict[model], X_train, y_train)
                detailRes[str(p)][type(opt_model.best_estimator_).__name__] = {}
                detailRes[str(p)][type(opt_model.best_estimator_).__name__][str(iteration)] = opt_model
                acc_dict[str(opt_model.best_score_)] = opt_model
                acc_list.append(opt_model.best_score_)
                    
            med_acc = np.median(acc_list)
            best_model = acc_dict[str(med_acc)]
            
            train_acc, train_f1 = testModel(X_train, y_train, best_model)
            trainEval[str(p)][type(best_model.best_estimator_).__name__] = {"accuracy": train_acc, "f1": train_f1}
            
            test_acc, test_f1 = testModel(X_test, y_test, best_model)
            testEval[str(p)][type(best_model.best_estimator_).__name__] = {"accuracy": test_acc, "f1": test_f1}
            
            results[str(p)][type(best_model.best_estimator_).__name__] = best_model
            print("Best acc score for " + str(model))
            print(best_model.best_score_)
            
    return results, trainEval, testEval

In [11]:
def testModel(X, y, classifier):
    Y_pred = classifier.predict(X)
    acc = accuracy_score(y, Y_pred)
    f1 = f1_score(y, Y_pred)
    return acc, f1

In [12]:
def resultSummary(result):
    sl = ["params", "mean_test_accuracy", "std_test_accuracy", "mean_test_f1", "std_test_f1"]
    resultSum = {}
    for p in result:
        resultSum[p] = {}
        for modelName in result[p]:
            resultSum[p][modelName] = {key: result[p][modelName].cv_results_[key][result[p][modelName].best_index_] for key in sl}
    return resultSum

In [13]:
mp_dict = {
    LogisticRegression(): {
        'C': [10 ** i for i in range(-8, 4)]
    },
    LinearSVC(): {
        'C': [10 ** i for i in range(-5, 3)]
    },
#     SVC(kernel = 'poly'): {
#         'C': [10 ** i for i in range(-5, 3)],
#         'degree': [i for i in range(1, 4)]
#     },
    KNeighborsClassifier(): {
        'n_neighbors': [1, 2, 3, 5, 7, 10, 15, 25, 50, 100, 500]
    },
    RandomForestClassifier(): {
        'max_features': [1, 2, 4, 6, 8, 12 ,16, 20],
        'n_estimators': [1024]
    },
    GradientBoostingClassifier(): {
        'n_estimators': [2, 4, 8, 16, 32, 64, 128, 512, 1024]
    }
}
categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
trials = 3
partitions = [0.2, 0.5, 0.8]


result, trainEval, testEval = evalModels(mp_dict, trials, partitions, categorical, X, y)

Evaluating partition for: test_size = 0.2
Starting new Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 12 candidates, totalling 60 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    3.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8222055146024664
{'C': 0.1}
Starting iteration 2
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    1.1s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8222055146024664
{'C': 0.1}
Starting iteration 3
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8222055146024664
{'C': 0.1}
Best acc score for LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.8222055146024664
Starting new Model: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:   12.4s remaining:    7.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   25.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8210865325772103
{'C': 1}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:   14.1s remaining:    8.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   25.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8210865325772103
{'C': 1}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:   11.2s remaining:    6.7s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   23.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8210450818000081
{'C': 1}
Best acc score for LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.8210865325772103
Starting new Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Starting iteration 1
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   41.4s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  1.3min finished


0.8322763957750441
{'n_neighbors': 25}
Starting iteration 2
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   49.5s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  1.4min finished


0.8322763957750441
{'n_neighbors': 25}
Starting iteration 3
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   51.1s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  1.4min finished


0.8322763957750441
{'n_neighbors': 25}
Best acc score for KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
0.8322763957750441
Starting new Model: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  5.7min finished


0.8559825899864508
{'max_features': 2, 'n_estimators': 1024}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  5.6min finished


0.8552779611302336
{'max_features': 1, 'n_estimators': 1024}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  5.5min finished


0.8563969689226534
{'max_features': 2, 'n_estimators': 1024}
Best acc score for RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.8559825899864508
Starting new Model: GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_sa

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.3min finished


0.8701148828989776
{'n_estimators': 512}
Starting iteration 2
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    9.8s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.3min finished


0.8701148828989776
{'n_estimators': 512}
Starting iteration 3
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    9.7s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.3min finished


0.8701148828989776
{'n_estimators': 512}
Best acc score for GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
0.8701148828989776
Evaluating partition for: test_size = 0.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Starting new Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.9s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8228892157526447
{'C': 0.1}
Starting iteration 2
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.8s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8228892157526447
{'C': 0.1}
Starting iteration 3
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8228892157526447
{'C': 0.1}
Best acc score for LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.8228892157526447
Starting new Model: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    7.8s remaining:    4.7s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   14.0s finished


0.8201705367198606
{'C': 0.1}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    7.3s remaining:    4.4s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   14.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8211649019833676
{'C': 10}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:    7.5s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   13.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8204359425677131
{'C': 10}
Best acc score for LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.8204359425677131
Starting new Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Starting iteration 1
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.0s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:   35.8s finished


0.8313767518983936
{'n_neighbors': 25}
Starting iteration 2
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:   36.3s finished


0.8313767518983936
{'n_neighbors': 25}
Starting iteration 3
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   22.5s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:   36.5s finished


0.8313767518983936
{'n_neighbors': 25}
Best acc score for KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
0.8313767518983936
Starting new Model: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  3.3min finished


0.8559108245143128
{'max_features': 1, 'n_estimators': 1024}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  3.3min finished


0.8561098514254766
{'max_features': 1, 'n_estimators': 1024}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  3.3min finished


0.8556456164844836
{'max_features': 2, 'n_estimators': 1024}
Best acc score for RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.8559108245143128
Starting new Model: GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_sa

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    6.2s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   49.7s finished


0.8689737596590146
{'n_estimators': 512}
Starting iteration 2
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    5.4s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   48.2s finished


0.869172632711716
{'n_estimators': 512}
Starting iteration 3
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    6.1s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   48.6s finished


0.8687748206669721
{'n_estimators': 512}
Best acc score for GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
0.8689737596590146
Evaluating partition for: test_size = 0.8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Starting new Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.4s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8227804638777941
{'C': 0.1}
Starting iteration 2
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8227804638777941
{'C': 0.1}
Starting iteration 3
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.5s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8227804638777941
{'C': 0.1}
Best acc score for LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.8227804638777941
Starting new Model: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    3.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8224498880906157
{'C': 0.01}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    3.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8224498880906157
{'C': 0.01}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    3.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8224498880906157
{'C': 0.01}
Best acc score for LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
0.8224498880906157
Starting new Model: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
Starting iteration 1
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    5.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8183049128837998
{'n_neighbors': 15}
Starting iteration 2
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    5.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.8183049128837998
{'n_neighbors': 15}
Starting iteration 3
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:    5.4s finished


0.8183049128837998
{'n_neighbors': 15}
Best acc score for KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')
0.8183049128837998
Starting new Model: RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


0.8499676431430256
{'max_features': 4, 'n_estimators': 1024}
Starting iteration 2
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


0.8488066433917132
{'max_features': 2, 'n_estimators': 1024}
Starting iteration 3
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


0.8509618436401258
{'max_features': 4, 'n_estimators': 1024}
Best acc score for RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
0.8499676431430256
Starting new Model: GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_sa

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    2.2s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   20.0s finished


0.8624005078171694
{'n_estimators': 128}
Starting iteration 2
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    2.2s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   20.5s finished


0.8624005078171694
{'n_estimators': 128}
Starting iteration 3
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    2.2s remaining:    1.1s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   20.3s finished


0.8622346703378989
{'n_estimators': 128}
Best acc score for GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
0.8624005078171694


In [14]:
resultBackup = result

In [15]:
print("Summary of results: ")
resultSum = resultSummary(result)
print(resultSum)
print("Test evals: ")
print(testEval)

Summary of results: 
{'0.2': {'LogisticRegression': {'params': {'C': 0.1}, 'mean_test_accuracy': 0.8222055146024664, 'std_test_accuracy': 0.003503586126883561, 'mean_test_f1': 0.5638574281658426, 'std_test_f1': 0.010885518855801942}, 'LinearSVC': {'params': {'C': 1}, 'mean_test_accuracy': 0.8210865325772103, 'std_test_accuracy': 0.0047654371811858155, 'mean_test_f1': 0.5446180319205758, 'std_test_f1': 0.013343583563805474}, 'KNeighborsClassifier': {'params': {'n_neighbors': 25}, 'mean_test_accuracy': 0.8322763957750441, 'std_test_accuracy': 0.0033307027287857362, 'mean_test_f1': 0.6256503165524642, 'std_test_f1': 0.007611951413375969}, 'RandomForestClassifier': {'params': {'max_features': 2, 'n_estimators': 1024}, 'mean_test_accuracy': 0.8559825899864508, 'std_test_accuracy': 0.003360002895327758, 'mean_test_f1': 0.6860632455627886, 'std_test_f1': 0.009041417790283474}, 'GradientBoostingClassifier': {'params': {'n_estimators': 512}, 'mean_test_accuracy': 0.8701148828989776, 'std_test_a

In [16]:
# Summary of results: 
# {'0.2': {'LogisticRegression': {'params': {'C': 0.1}, 'mean_test_accuracy': 0.8235733816610802, 'mean_test_f1': 0.5652042783133927}, 'LinearSVC': {'params': {'C': 1}, 'mean_test_accuracy': 0.8196364596776098, 'mean_test_f1': 0.5349997399093454}, 'KNeighborsClassifier': {'params': {'n_neighbors': 25}, 'mean_test_accuracy': 0.8304528965512799, 'mean_test_f1': 0.6249653545829906}, 'RandomForestClassifier': {'params': {'max_features': 2, 'n_estimators': 1024}, 'mean_test_accuracy': 0.8549876596758921, 'mean_test_f1': 0.6847844110219501}, 'GradientBoostingClassifier': {'params': {'n_estimators': 512}, 'mean_test_accuracy': 0.8686640885187961, 'mean_test_f1': 0.7137364060011511}}, '0.5': {'LogisticRegression': {'params': {'C': 0.1}, 'mean_test_accuracy': 0.8248121168374789, 'mean_test_f1': 0.5712530687456912}, 'LinearSVC': {'params': {'C': 1}, 'mean_test_accuracy': 0.822093635622718, 'mean_test_f1': 0.5447817079406571}, 'KNeighborsClassifier': {'params': {'n_neighbors': 15}, 'mean_test_accuracy': 0.8291896098940663, 'mean_test_f1': 0.6246361951660419}, 'RandomForestClassifier': {'params': {'max_features': 4, 'n_estimators': 1024}, 'mean_test_accuracy': 0.8518003857891049, 'mean_test_f1': 0.6823738123562695}, 'GradientBoostingClassifier': {'params': {'n_estimators': 512}, 'mean_test_accuracy': 0.8695048351120838, 'mean_test_f1': 0.7140236479476625}}, '0.8': {'LogisticRegression': {'params': {'C': 10}, 'mean_test_accuracy': 0.8255959913220421, 'mean_test_f1': 0.5601809101543567}, 'LinearSVC': {'params': {'C': 1}, 'mean_test_accuracy': 0.8265926649547073, 'mean_test_f1': 0.5579410463494652}, 'KNeighborsClassifier': {'params': {'n_neighbors': 15}, 'mean_test_accuracy': 0.8204564034288652, 'mean_test_f1': 0.6145556118624702}, 'RandomForestClassifier': {'params': {'max_features': 1, 'n_estimators': 1024}, 'mean_test_accuracy': 0.8512915950487827, 'mean_test_f1': 0.6601491817387592}, 'GradientBoostingClassifier': {'params': {'n_estimators': 128}, 'mean_test_accuracy': 0.8592511070716563, 'mean_test_f1': 0.6687497604151422}}}
# Test evals: 
# {'0.2': {'LogisticRegression': {'accuracy': 0.8077241836565556, 'f1': 0.5429472025216706}, 'LinearSVC': {'accuracy': 0.8180009945300846, 'f1': 0.5507364975450082}, 'KNeighborsClassifier': {'accuracy': 0.8324216807558429, 'f1': 0.6281721221037148}, 'RandomForestClassifier': {'accuracy': 0.8511519973479198, 'f1': 0.6736918604651162}, 'GradientBoostingClassifier': {'accuracy': 0.863583623404608, 'f1': 0.7076376554174068}}, '0.5': {'LogisticRegression': {'accuracy': 0.8186459783833964, 'f1': 0.5580869284213927}, 'LinearSVC': {'accuracy': 0.8163251773755056, 'f1': 0.5338269942780208}, 'KNeighborsClassifier': {'accuracy': 0.8264040846097739, 'f1': 0.6131796690307328}, 'RandomForestClassifier': {'accuracy': 0.8508719580929647, 'f1': 0.6779321208649578}, 'GradientBoostingClassifier': {'accuracy': 0.8651946157416617, 'f1': 0.7072714182865371}}, '0.8': {'LogisticRegression': {'accuracy': 0.8170327393286365, 'f1': 0.5486146610775995}, 'LinearSVC': {'accuracy': 0.8181102362204724, 'f1': 0.5429553264604812}, 'KNeighborsClassifier': {'accuracy': 0.8231661831744717, 'f1': 0.6191198786039454}, 'RandomForestClassifier': {'accuracy': 0.8459593866556154, 'f1': 0.6580811332904056}, 'GradientBoostingClassifier': {'accuracy': 0.8591794446746788, 'f1': 0.6844353640416047}}}


In [17]:
import csv
csv_columns = ["Model", "Partition", "Params", "Train_Acc", "Train_F1", "Val_Acc", "Val_F1", "Test_Acc", "Test_F1", "Acc_Std", "F1_Std"]
csv_file = "ADULT_EvalResults.csv"
try:
    with open(csv_file, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(csv_columns)
        
        modelList = resultSum['0.2'].keys()
        for modelName in modelList:
            for partition in resultSum:
                params = resultSum[partition][modelName]['params']
                val_acc_std = resultSum[partition][modelName]['std_test_accuracy']
                val_acc = resultSum[partition][modelName]['mean_test_accuracy']
                val_f1_std = resultSum[partition][modelName]['std_test_f1']
                val_f1 = resultSum[partition][modelName]['mean_test_f1']
                
                test_acc = testEval[partition][modelName]['accuracy']
                test_f1 = testEval[partition][modelName]['f1']
                
                train_acc = trainEval[partition][modelName]['accuracy']
                train_f1 = trainEval[partition][modelName]['f1']
                
                writer.writerow([modelName, partition, params, train_acc, train_f1, val_acc, val_f1, test_acc, test_f1, val_acc_std, val_f1_std])

except IOError:
    print("I/O error")

In [18]:
result_df = pd.read_csv("ADULT_EvalResults.csv")
result_df

Unnamed: 0,Model,Partition,Params,Train_Acc,Train_F1,Val_Acc,Val_F1,Test_Acc,Test_F1,Acc_Std,F1_Std
0,LogisticRegression,0.2,{'C': 0.1},0.822289,0.56405,0.822206,0.563857,0.814686,0.556349,0.003504,0.010886
1,LogisticRegression,0.5,{'C': 0.1},0.823884,0.572853,0.822889,0.569575,0.820436,0.561954,0.005786,0.018195
2,LogisticRegression,0.8,{'C': 0.1},0.824105,0.556253,0.82278,0.552121,0.819478,0.560621,0.00633,0.019326
3,LinearSVC,0.2,{'C': 1},0.821252,0.545378,0.821087,0.544618,0.812365,0.533773,0.004765,0.013344
4,LinearSVC,0.5,{'C': 10},0.820171,0.543895,0.820436,0.547712,0.819375,0.536892,0.004456,0.016602
5,LinearSVC,0.8,{'C': 0.01},0.824602,0.543178,0.82245,0.536665,0.817986,0.540682,0.008163,0.022133
6,KNeighborsClassifier,0.2,{'n_neighbors': 25},0.846243,0.657686,0.832276,0.62565,0.823305,0.617647,0.003331,0.007612
7,KNeighborsClassifier,0.5,{'n_neighbors': 25},0.845634,0.660542,0.831377,0.627379,0.825211,0.614958,0.005445,0.014458
8,KNeighborsClassifier,0.8,{'n_neighbors': 15},0.841678,0.636744,0.818305,0.581344,0.824492,0.615175,0.007669,0.013419
9,RandomForestClassifier,0.2,"{'max_features': 2, 'n_estimators': 1024}",0.999959,0.999916,0.855983,0.686063,0.851152,0.682461,0.00336,0.009041
