In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [42]:
adult = pd.read_csv('data/adult.csv', na_values = '?')
print(adult.shape)

(32561, 15)


In [43]:
adult.describe()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [44]:
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [45]:
adult['income'] = adult['income'].map({
        '<=50K': 0,
        '>50K': 1,
        '<=50K.': 0,
        '>50K.': 1
})
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,0
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0


In [46]:
adult.dropna(inplace = True)
print(adult.shape)
adult.head()

(30162, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,0
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,0
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,0
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States,0
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States,0


In [47]:
X = adult.drop(['income'], axis = 1)
y = adult['income']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [49]:
categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
for feature in categorical:
        label_encoder = LabelEncoder()
        X_train[feature] = label_encoder.fit_transform(X_train[feature])
        X_test[feature] = label_encoder.transform(X_test[feature])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [50]:
scaler = StandardScaler()

X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)

X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)

In [51]:
X_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,0.345936,1.881136,-0.426486,0.17362,-0.449895,0.947697,-0.487459,-0.260966,0.383626,0.692911,-0.14769,-0.219893,1.585883,0.262826
1,1.412901,-0.209843,1.127263,1.227258,-0.056536,1.616068,1.251181,-0.260966,0.383626,-1.443188,-0.14769,-0.219893,-0.079128,0.262826
2,-1.102088,-0.209843,-0.473184,-0.3532,1.12354,0.947697,-1.480968,-0.260966,0.383626,0.692911,-0.14769,-0.219893,-0.079128,0.262826
3,0.49836,-0.209843,2.684585,0.17362,-0.449895,-0.389045,-0.984214,-0.88423,0.383626,0.692911,-0.14769,-0.219893,-0.079128,0.262826
4,-0.416182,-0.209843,-0.376954,1.227258,-0.056536,-0.389045,-0.984214,-0.88423,0.383626,0.692911,-0.14769,-0.219893,-0.079128,0.262826


In [9]:
trials = 3
partitions = [0.2, 0.5, 0.8]

In [30]:
def findBest(model, params, X_train, y_train):
    opt_model = GridSearchCV(model, params, scoring = ['accuracy', 'f1'], refit = 'accuracy', verbose = 1, n_jobs = -1)
    opt_model.fit(X_train, y_train)
    
    print(opt_model.best_score_)
    print(opt_model.best_params_)
    return opt_model

In [37]:
def prepData(X, y, partition, categorical):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = partition)
    
    for feature in categorical:
        label_encoder = LabelEncoder()
        label_encoder.fit(X[feature])
        X_train[feature] = label_encoder.transform(X_train[feature])
        X_test[feature] = label_encoder.transform(X_test[feature])

    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)
    
    return X_train, X_test, y_train, y_test

In [32]:
def trainModels(mp_dict, trials, partitions, categorical, X, y):
    results = {}
    for p in partitions:
        results[str(p)] = {}
        print("Evaluating partition for: test_size = " + str(p))
        for model in mp_dict:
            print("Starting new Model: " + str(model))
            max_acc = 0
            for iteration in range(0, trials):
                X_train, X_test, y_train, y_test = prepData(X, y, p, categorical)

                print("Starting iteration " + str(iteration + 1))
                opt_model = findBest(model, mp_dict[model], X_train, y_train)
                if opt_model.best_score_ > max_acc:
                    best_model = opt_model
                    max_acc = opt_model.best_score_
            results[str(p)][best_model] = best_model.cv_results_
            print("Best acc score for " + str(model))
            print(best_model.best_score_)
            
    return results
            #TODO get test error

In [52]:
# best C = 0.01
C_list = [10 ** i for i in range(-5, 3)]
print(C_list)
param_grid = {
    'C': C_list
}
lin_svm = LinearSVC()
opt_lin_svm = findBest(lin_svm, param_grid, X_train, y_train)

[1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   28.1s finished


0.8187244051542916
{'C': 1}




In [33]:
def resultSummary(result):
    sl = ["params", "mean_test_accuracy", "mean_test_f1"]
    resultSum = {}
    for p in result:
        resultSum[p] = {}
        for model in result[p]:
            resultSum[p][model] = {key: result[p][model][key][model.best_index_] for key in sl}
    print(resultSum)
    return resultSum

In [38]:
mp_dict = {
    LogisticRegression(): {'C': [10 ** i for i in range(-8, 4)]},
    LinearSVC(): {'C': [10 ** i for i in range(-5, 3)]}
}
categorical = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']
trials = 3
partitions = [0.2, 0.5, 0.8]
result = trainModels(mp_dict, trials, partitions, categorical, X, y)
resultSummary(result)

Evaluating partition for: test_size = 0.2
Starting new Model: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
Starting iteration 1
Fitting 5 folds for each of 12 candidates, totalling 60 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.1s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ver

0.8181028152773513
{'C': 100}
Starting iteration 2
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    1.0s remaining:    0.3s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.2s finished


0.8198845631070046
{'C': 1}
Starting iteration 3
Fitting 5 folds for each of 12 candidates, totalling 60 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    1.1s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.3s finished
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

0.8200094307818832
{'C': 0.1}
Best acc score for LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.8200094307818832
Starting new Model: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
Starting iteration 1
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  25 out of  40 | elapsed:   12.4s remaining:    7.5s


KeyboardInterrupt: 

In [58]:
print(type(opt_lin_svm.best_estimator_).__name__)
#print(opt_lin_svm.cv_results_)
sl = ["params", "mean_test_accuracy", "mean_test_f1"]
opt_lin_svm_res = {key: opt_lin_svm.cv_results_[key][opt_lin_svm.best_index_] for key in sl}
#print(opt_lin_svm_res)
# for k in sl:
#     print(opt_lin_svm.cv_results_[k][opt_lin_svm.best_index_])

LinearSVC


In [14]:
# best C = 10, degree = 3
C_list = [10 ** i for i in range(-5, 3)]
degree_list = [i for i in range(1, 4)]
print(C_list)
print(degree_list)
param_grid = {
    'C': C_list,
    'degree': degree_list
}
poly_svm = SVC(kernel = 'poly')
opt_poly_svm = findBest(poly_svm, param_grid, X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
[1, 2, 3]
Fitting 5 folds for each of 24 candidates, totalling 120 fits


KeyboardInterrupt: 

In [30]:
# best C = 0.1, 0.82
C_list = [10 ** i for i in range(-8, 4)]
print(C_list)
param_grid = {
    'C': C_list
}

log_reg = LogisticRegression(penalty = 'l2')
opt_log_reg = findBest(log_reg, param_grid, X_train, y_train)

[1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


0.5628065801463193
{'C': 1}


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.1s finished


In [32]:
opt_log_reg.cv_results_

{'mean_fit_time': array([0.13892021, 0.12473407, 0.119169  , 0.10039697, 0.07621431,
        0.0791533 , 0.10070982, 0.12050576, 0.11343312, 0.11949058,
        0.11358047, 0.0824604 ]),
 'std_fit_time': array([0.0105393 , 0.00659311, 0.00801643, 0.00479013, 0.00256065,
        0.00674347, 0.00669852, 0.02128167, 0.00075405, 0.01081562,
        0.00233162, 0.01286708]),
 'mean_score_time': array([0.00917711, 0.01040673, 0.01004252, 0.0092895 , 0.00996008,
        0.00944004, 0.0093574 , 0.01198745, 0.00997949, 0.00987344,
        0.00986795, 0.00512791]),
 'std_score_time': array([0.00058033, 0.00173496, 0.00239092, 0.00041957, 0.00101669,
        0.00062434, 0.00096711, 0.00365081, 0.00060326, 0.00071404,
        0.00084167, 0.00167856]),
 'param_C': masked_array(data=[1e-08, 1e-07, 1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1,
                    1, 10, 100, 1000],
              mask=[False, False, False, False, False, False, False, False,
                    False, False, False, False],
 

In [16]:
# best n = 25
n_list = [1, 2, 3, 5, 7, 10, 15, 25, 50, 100, 500]
print(n_list)
param_grid = {
    'n_neighbors': n_list
}

knn = KNeighborsClassifier()
opt_knn = findBest(knn, param_grid, X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[1, 2, 3, 5, 7, 10, 15, 25, 50, 100, 500]
Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   40.0s
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  1.1min finished


0.8290155206767927
{'n_neighbors': 25}


In [17]:
# best features = 6
max_features = [1, 2, 4, 6, 8, 12 ,16, 20]
tree_size = [1024]
param_grid = {
    'max_features': max_features,
    'n_estimators': tree_size
}
rf = RandomForestClassifier()

opt_rf = findBest(rf, param_grid, X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  4.7min finished


0.8539762346752626
{'max_features': 4, 'n_estimators': 1024}


In [18]:
# best stages = 512
boosting_stages = [2, 4, 8, 16, 32, 64, 128, 512, 1024]
param_grid = {
    'n_estimators': boosting_stages
}
gb = GradientBoostingClassifier()
opt_gb = findBest(gb, param_grid, X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  45 | elapsed:    7.5s remaining:    3.8s
[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:  1.1min finished


0.8677118479895067
{'n_estimators': 512}
