In [3]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

uci_features = ['28',  '48',  '64', '105', '128', '153', '241', '281', '318', '336', 
                '338', '378', '433', '442', '451', '453', '455', '472', '475', '493']

madelon_features = ['feat_257', 'feat_269', 'feat_308', 'feat_315', 'feat_336',
                   'feat_341', 'feat_395', 'feat_504', 'feat_526', 'feat_639',
                   'feat_681', 'feat_701', 'feat_724', 'feat_736', 'feat_769',
                   'feat_808', 'feat_829', 'feat_867', 'feat_920', 'feat_956']

Xuci_1 = Xuci_1[uci_features]
Xuci_2 = Xuci_2[uci_features]
Xuci_3 = Xuci_3[uci_features]

# !conda install -y psycopg2

from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel, RFECV 
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm



In [5]:
Xdb_1 = pd.read_pickle('data/madelon_db_1')
Xdb_2 = pd.read_pickle('data/madelon_db_2')
Xdb_3 = pd.read_pickle('data/madelon_db_3')


ydb_1 = Xdb_1['target']
ydb_2 = Xdb_2['target']
ydb_3 = Xdb_3['target']
Xdb_1 = Xdb_1[madelon_features]
Xdb_2 = Xdb_2[madelon_features]
Xdb_3 = Xdb_3[madelon_features]

In [70]:
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [7]:
def corr_reduction(X, y, corr_thresh = 0.7):
    
    #find order of X features from least important to most important in predicting X
    skb = SelectKBest(k=len(X.columns))
    skb.fit(X, y)
    
    tmp_X = X[[col for p,col in sorted(zip(skb.pvalues_,X.columns))]]

    # iterate through columns
    for col in tmp_X.columns:
        corrs = tmp_X.drop(col, axis=1).corrwith(tmp_X[col]) #store the correlations
        
        # if tested column is too highly correlated, drop it
        if max(corrs) > corr_thresh:
            tmp_X = tmp_X.drop(col, axis=1)
            
    return tmp_X, y
        

In [52]:
dtc_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', DecisionTreeClassifier())])

lr_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                    ('scaler2', StandardScaler()),
                     ('classifier', LogisticRegression())])

knn_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', KNeighborsClassifier())])

rfc_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', RandomForestClassifier())])

svc_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', SVC())])

ada_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', AdaBoostClassifier())])

gbc_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', GradientBoostingClassifier())])

xgb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', XGBClassifier())])

In [None]:
dtc_params = {'pca__n_components': [1, 3, 5],
             'classifier__max_depth': [1, 3, 5, 10, 15, None],
             'classifier__splitter': ['random', 'best']}

lr_params = {'pca__n_components': [1, 3, 5],
             'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3,3,7)}

knn_params = {'pca__n_components': [1, 3, 5],
             'classifier__n_neighbors': np.linspace(1,50).astype(int)}

rfc_params = {'pca__n_components': [1, 3, 5],
             'classifier__n_estimators': [200, 500],
              'classifier__max_features': ['log2', 'sqrt', 'auto'],
              'classifier__oob_score': [True, False],
             'classifier__max_depth': [1, 5, None]}

svc_params = {'pca__n_components': [1, 3, 5],
              'classifier__C': np.logspace(-3,3,14)}

ada_params = {'pca__n_components': [1, 3, 5],
              'classifier__n_estimators': [10, 25, 50, 75, 100, 200, 500],
             'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0]}

gbc_params = {'pca__n_components': [1, 3, 5],
              'classifier__n_estimators': [10, 25, 50, 75, 100, 200, 500],
             'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0],
             'classifier__max_depth': [1, 2, 3, 4, 5],
             'classifier__loss': ['deviance', 'exponential'],
             'classifier__warm_start': [True, False]}

xgb_params = {'pca__n_components': [1, 3, 5],
              'classifier__n_estimators': [10, 50, 75, 100],
             'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0],
             'classifier__max_depth': [1, 2, 3, 4, 5]}

In [None]:
np.linspace(1,50).astype(int)

In [12]:
def test_train_gs(X, y, pipe, param):
    
    gs = GridSearchCV(pipe, param, cv=5, n_jobs=-1)
    gs.fit(X_train, y_train)
    
    print('Best params:', gs.best_params_)
    print('Best fitting score:', gs.best_score_)
    print('Train score:', gs.score(X_train, y_train))
    print('Test score:', gs.score(X_test, y_test))
    
    return gs.best_estimator_
    

In [None]:
# dtc_classifier = est_train_gs(Xdb_1, ydb_1, dtc_pipe, dtc_params)

In [None]:
def boxcox_df(df, c=15):
    bc_df = df
    
    for col in df.columns:
        bc_df[col] = stats.boxcox(df[col]+c)[0]
#         bc_df[col] = stats.boxcox(df[col])[0]
        
    return bc_df

In [None]:
# test_train_gs(boxcox_df(Xdb_1), ydb_1, dtc_pipe, dtc_params)

In [None]:
# test_train_gs(Xdb_2, ydb_2, dtc_pipe, dtc_params)

In [None]:
# test_train_gs(Xdb_3, ydb_3, dtc_pipe, dtc_params)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(Xdb_1, ydb_1, test_size = 0.25, random_state=42)

In [None]:
print("Random Forest")
rfc_classifier = test_train_gs(X_train, y_train, rfc_pipe, rfc_params)
print("\nKNN")
knn_classifier = test_train_gs(X_train, y_train, knn_pipe, knn_params)
print("\nSVC")
svc_classifier = test_train_gs(X_train, y_train, svc_pipe, svc_params)
print("\nAda")
ada_classifier = test_train_gs(X_train, y_train, ada_pipe, ada_params)
print("\nGBC")
gbc_classifier = test_train_gs(X_train, y_train, gbc_pipe, gbc_params)

In [None]:
print("\nXGB")
xgb_classifier = test_train_gs(X_train, y_train, xgb_pipe, xgb_params)

In [None]:
votes = pd.DataFrame({'rfc': rfc_classifier.predict(X_test),
                     'knn': knn_classifier.predict(X_test),
                     'svc': svc_classifier.predict(X_test),
                      'ada': ada_classifier.predict(X_test),
                      'gbc': gbc_classifier.predict(X_test),
                      'xgb': xgb_classifier.predict(X_test),
                     'truth': y_test})

In [None]:
votes['vote'] = round(votes[['gbc','rfc']].max(axis=1)).astype(int)

In [None]:
import itertools

results = []

for i in range(1,7):
    for cols in itertools.combinations(votes.drop(['truth', 'vote'], axis=1), i):
        col_list = list(cols)
        
        results.append({'classifiers': col_list,
                       'max': np.mean(votes[col_list].max(axis=1) == votes['truth']),
                       'majority': np.mean(round(votes[col_list].mean(axis=1)) == votes['truth']),
                       'majority0.6': np.mean((votes[col_list].mean(axis=1) >= 0.6) == votes['truth'])

                       })
        

# print(np.mean(votes['ada'] == votes['truth']))
results_df = pd.DataFrame(results)
results_df['highest'] = results_df[['max', 'majority', 'majority0.6']].max(axis=1)
results_df.sort_values('highest', ascending=False)

In [None]:
votes[votes.truth != votes.vote]

## Trying SKB instead of PCA

In [20]:
dtc_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', DecisionTreeClassifier())])

lr_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                    ('scaler2', StandardScaler()),
                     ('classifier', LogisticRegression())])

knn_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', KNeighborsClassifier())])

rfc_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', RandomForestClassifier())])

svc_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', SVC())])

ada_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', AdaBoostClassifier())])

gbc_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', GradientBoostingClassifier())])

xgb_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', XGBClassifier())])

In [18]:
dtc_skb_params = {'skb__k': [5, 10, 15],
             'classifier__max_depth': [1, 3, 5, 10, 15, None],
             'classifier__splitter': ['random', 'best']}

lr_skb_params = {'skb__k': [5, 10, 15],
             'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3,3,7)}

knn_skb_params = {'skb__k': [5, 10, 15],
             'classifier__n_neighbors': np.linspace(1,50).astype(int)}

rfc_skb_params = {'skb__k': [5, 10, 15],
             'classifier__n_estimators': [200, 500],
              'classifier__max_features': ['log2', 'sqrt', 'auto'],
              'classifier__oob_score': [True, False],
             'classifier__max_depth': [1, 5, None]}

svc_skb_params = {'skb__k': [5, 10, 15],
              'classifier__C': np.logspace(-3,3,14)}

ada_skb_params = {'skb__k': [5, 10, 15],
              'classifier__n_estimators': [10, 25, 50, 75, 100, 200, 500],
             'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0]}

gbc_skb_params = {'skb__k': [5, 10, 15],
              'classifier__n_estimators': [10, 25, 50, 75, 100, 200, 500],
             'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0],
             'classifier__max_depth': [1, 2, 3, 4, 5],
             'classifier__loss': ['deviance', 'exponential'],
             'classifier__warm_start': [True, False]}

xgb_skb_params = {'skb__k': [5, 10, 15],
              'classifier__n_estimators': [10, 50, 75, 100],
             'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0],
             'classifier__max_depth': [1, 2, 3, 4, 5]}

In [21]:
print("Random Forest")
rfc_classifier = test_train_gs(X_train, y_train, rfc_skb_pipe, rfc_skb_params)
print("\nKNN")
knn_classifier = test_train_gs(X_train, y_train, knn_skb_pipe, knn_skb_params)
print("\nSVC")
svc_classifier = test_train_gs(X_train, y_train, svc_skb_pipe, svc_skb_params)
print("\nAda")
ada_classifier = test_train_gs(X_train, y_train, ada_skb_pipe, ada_skb_params)
print("\nGBC")
gbc_classifier = test_train_gs(X_train, y_train, gbc_skb_pipe, gbc_skb_params)


Ada
Best params: {'classifier__learning_rate': 0.75, 'classifier__n_estimators': 500, 'skb__k': 15}
Best fitting score: 0.693929798558243
Train score: 0.73091693054
Test score: 0.703920776071

GBC
Best params: {'classifier__learning_rate': 0.25, 'classifier__loss': 'exponential', 'classifier__max_depth': 5, 'classifier__n_estimators': 200, 'classifier__warm_start': False, 'skb__k': 15}
Best fitting score: 0.7990972175436233
Train score: 0.931954456646
Test score: 0.796685529507


In [19]:
print("\nAda")
ada_classifier = test_train_gs(X_train, y_train, ada_skb_pipe, ada_skb_params)
print("\nGBC")
gbc_classifier = test_train_gs(X_train, y_train, gbc_skb_pipe, gbc_skb_params)

Random Forest
Best params: {'classifier__max_depth': None, 'classifier__max_features': 'auto', 'classifier__n_estimators': 500, 'classifier__oob_score': False, 'skb__k': 15}
Best fitting score: 0.8279323586875968
Train score: 1.0
Test score: 0.829628132579

KNN
Best params: {'classifier__n_neighbors': 7, 'skb__k': 15}
Best fitting score: 0.8282018459880078
Train score: 0.871993532305
Test score: 0.823767178658

SVC
Best params: {'classifier__C': 1000.0, 'skb__k': 15}
Best fitting score: 0.8154685710435896
Train score: 0.850434548272
Test score: 0.820735650768

Ada


NameError: name 'ada_skb_pipe' is not defined

In [27]:
print(rfc_classifier.score(X_test, y_test))
print(knn_classifier.score(X_test, y_test))
print(svc_classifier.score(X_test, y_test))
print(ada_classifier.score(X_test, y_test))
print(gbc_classifier.score(X_test, y_test))

0.829628132579
0.823767178658
0.820735650768
0.703920776071
0.796685529507


In [31]:
knn_classifier

Pipeline(steps=[('scaler1', StandardScaler(copy=True, with_mean=True, with_std=True)), ('skb', SelectKBest(k=15, score_func=<function f_classif at 0x7f72a1938730>)), ('scaler2', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform'))])

In [22]:
import pickle

In [28]:
rfc_pca_classifier = pickle.load(open('classifiers/rfc_classifier', 'rb'))
knn_pca_classifier = pickle.load(open('classifiers/knn_classifier', 'rb'))
svc_pca_classifier = pickle.load(open('classifiers/svc_classifier', 'rb'))
ada_pca_classifier = pickle.load(open('classifiers/ada_classifier', 'rb'))
gbc_pca_classifier = pickle.load(open('classifiers/gbc_classifier', 'rb'))

print(rfc_pca_classifier.score(X_test, y_test))
print(knn_pca_classifier.score(X_test, y_test))
print(svc_pca_classifier.score(X_test, y_test))
print(ada_pca_classifier.score(X_test, y_test))
print(gbc_pca_classifier.score(X_test, y_test))

0.827000808407
0.833265966047
0.816693613581
0.681891673403
0.793451899757


In [30]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

0.82951599782370999

In [49]:
dtc_sfm_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('sfm', SelectFromModel(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))),
                     ('scaler2', StandardScaler()),
                     ('classifier', DecisionTreeClassifier())])

lr_sfm_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('sfm', SelectFromModel(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))),
                        ('scaler2', StandardScaler()),
                     ('classifier', LogisticRegression())])

knn_sfm_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('sfm', SelectFromModel(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))),
                         ('scaler2', StandardScaler()),
                     ('classifier', KNeighborsClassifier())])

rfc_sfm_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('sfm', SelectFromModel(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))),
                      ('scaler2', StandardScaler()),
                     ('classifier', RandomForestClassifier())])

svc_sfm_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('sfm', SelectFromModel(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))),
                         ('scaler2', StandardScaler()),
                     ('classifier', SVC())])


In [50]:
dtc_sfm_params = {'classifier__max_depth': [1, 3, 5, 10, 15, None],
             'classifier__splitter': ['random', 'best']}

lr_sfm_params = {'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3,3,7)}

knn__params = {'classifier__n_neighbors': np.linspace(1,50).astype(int)}

rfc_sfm_params = {'classifier__n_estimators': [200, 500],
              'classifier__max_features': ['log2', 'sqrt', 'auto'],
              'classifier__oob_score': [True, False],
             'classifier__max_depth': [1, 5, None]}

svc_sfm_params = {'classifier__C': np.logspace(-3,3,14)}

In [51]:
print("\nDecision Tree")
dtc_sfm_classifier = test_train_gs(X_train, y_train, rfc_sfm_pipe, rfc_sfm_params)
print("\nLogReg")
lr_sfm_classifier = test_train_gs(X_train, y_train, rfc_sfm_pipe, rfc_sfm_params)
print("\nRandom Forest")
rfc_sfm_classifier = test_train_gs(X_train, y_train, rfc_sfm_pipe, rfc_sfm_params)
print("\nKNN")
knn_sfm_classifier = test_train_gs(X_train, y_train, knn_sfm_pipe, knn_sfm_params)
print("\nSVC")
svc_sfm_classifier = test_train_gs(X_train, y_train, svc_sfm_pipe, svc_sfm_params)


Decision Tree


KeyboardInterrupt: 

In [42]:
rfc_pca_classifier.named_steps['classifier']

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

In [60]:
knn_pca_classifier.named_steps['classifier']

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

In [63]:
svc_pca_classifier.named_steps['classifier']

SVC(C=119.37766417144383, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [53]:
from sklearn.ensemble import VotingClassifier

In [74]:
rfc_vote = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

knn_vote = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

svc_vote = SVC(C=119.37766417144383, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


voting = VotingClassifier(estimators = [('rfc', rfc_vote), 
                                        ('knn', knn_vote), 
                                        ('svc', svc_vote)],
                          voting = 'hard'
                         )

In [75]:
voting.fit(X_train, y_train)

VotingClassifier(estimators=[('rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         n_jobs=1, voting='hard', weights=None)

In [76]:
print('Train', voting.score(X_train, y_train))
print('Test', voting.score(X_test, y_test))

Train 0.931482853871
Test 0.833265966047


In [68]:
print(voting.score(Xdb_2, ydb_2))
print(voting.score(Xdb_3, ydb_3))

0.83644906528
0.842328835582


In [77]:
print(accuracy_score(voting.predict(Xdb_2), ydb_2))
print(accuracy_score(voting.predict(Xdb_3), ydb_3))

0.836698990303
0.840029985007


In [78]:
print(roc_auc_score(voting.predict(Xdb_2), ydb_2))
print(roc_auc_score(voting.predict(Xdb_3), ydb_3))

0.836703632502
0.84016855959


In [84]:
rfc_vote = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

knn_vote = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

svc_vote = SVC(C=119.37766417144383, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


voting = VotingClassifier(estimators = [('rfc', rfc_vote), 
                                        ('knn', knn_vote), 
                                        ('svc', svc_vote)],
                          voting = 'soft'
                         )

In [85]:
voting.fit(X_train, y_train)

VotingClassifier(estimators=[('rfc', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         n_jobs=1, voting='soft', weights=None)

In [86]:
print('Train', voting.score(X_train, y_train))
print('Test', voting.score(X_test, y_test))

Train 0.951627029576
Test 0.835691188359


In [87]:
print(accuracy_score(voting.predict(Xdb_2), ydb_2))
print(accuracy_score(voting.predict(Xdb_3), ydb_3))

0.836499050285
0.841179410295


In [88]:
print(roc_auc_score(voting.predict(Xdb_2), ydb_2))
print(roc_auc_score(voting.predict(Xdb_3), ydb_3))

0.836540418876
0.841314540886


In [None]:
vote_params = {weights: []}

In [107]:
rfc_vote = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=500, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False)

knn_vote = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='uniform')

svc_vote = SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


knn_vote_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA(n_components=5)),
                     ('scaler2', StandardScaler()),
                     ('classifier', knn_vote)])

rfc_vote_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA(n_components=5)),
                     ('scaler2', StandardScaler()),
                     ('classifier', rfc_vote)])

svc_vote_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA(n_components=5)),
                     ('scaler2', StandardScaler()),
                     ('classifier', svc_vote)])

voting = VotingClassifier(estimators = [('rfc', knn_vote_pipe), 
                                        ('knn', knn_vote_pipe), 
                                        ('svc', svc_vote_pipe)],
                          voting = 'hard'
                         )

In [108]:
voting.fit(X_train, y_train)

VotingClassifier(estimators=[('rfc', Pipeline(steps=[('scaler1', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('scaler2', StandardScaler(copy=True, with_mean=True, with_std=Tru...  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]))],
         n_jobs=1, voting='hard', weights=None)

In [109]:
print('Train', voting.score(X_train, y_train))
print('Test', voting.score(X_test, y_test))

Train 0.876035841811
Test 0.833265966047


In [110]:
print(accuracy_score(voting.predict(Xdb_2), ydb_2))
print(accuracy_score(voting.predict(Xdb_3), ydb_3))

0.831150654804
0.833933033483


In [111]:
print(roc_auc_score(voting.predict(Xdb_2), ydb_2))
print(roc_auc_score(voting.predict(Xdb_3), ydb_3))

0.831177970478
0.834152512541


In [102]:
np.linspace(0.5, 2.5, 9)

array([ 0.5 ,  0.75,  1.  ,  1.25,  1.5 ,  1.75,  2.  ,  2.25,  2.5 ])

In [103]:
weight_list = []

for i in np.linspace(0.5, 2.5, 9):
    for j in np.linspace(0.5, 2.5, 9):
        for k in np.linspace(0.5, 2.5, 9):
            weight_list.append([i, j, k])

In [104]:
len(weight_list)

729

In [105]:
voting_params = {'weights': weight_list}

voting_gs = GridSearchCV(voting, voting_params, cv=5, n_jobs=-1)

In [106]:
voting_gs.fit(X_test, y_test)

KeyboardInterrupt: 