In [1]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

uci_features = ['28',  '48',  '64', '105', '128', '153', '241', '281', '318', '336', 
                '338', '378', '433', '442', '451', '453', '455', '472', '475', '493']

madelon_features = ['feat_257', 'feat_269', 'feat_308', 'feat_315', 'feat_336',
                   'feat_341', 'feat_395', 'feat_504', 'feat_526', 'feat_639',
                   'feat_681', 'feat_701', 'feat_724', 'feat_736', 'feat_769',
                   'feat_808', 'feat_829', 'feat_867', 'feat_920', 'feat_956']

Xuci_1 = Xuci_1[uci_features]
Xuci_2 = Xuci_2[uci_features]
Xuci_3 = Xuci_3[uci_features]

# !conda install -y psycopg2

from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel, RFECV 
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

Xdb_1 = pd.read_pickle('data/madelon_db_1')
Xdb_2 = pd.read_pickle('data/madelon_db_2')
Xdb_3 = pd.read_pickle('data/madelon_db_3')


ydb_1 = Xdb_1['target']
ydb_2 = Xdb_2['target']
ydb_3 = Xdb_3['target']
Xdb_1 = Xdb_1[madelon_features]
Xdb_2 = Xdb_2[madelon_features]
Xdb_3 = Xdb_3[madelon_features]

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier



In [4]:
def corr_reduction(X, y, corr_thresh = 0.7):
    
    #find order of X features from least important to most important in predicting X
    skb = SelectKBest(k=len(X.columns))
    skb.fit(X, y)
    
    tmp_X = X[[col for p,col in sorted(zip(skb.pvalues_,X.columns))]]

    # iterate through columns
    for col in tmp_X.columns:
        corrs = tmp_X.drop(col, axis=1).corrwith(tmp_X[col]) #store the correlations
        
        # if tested column is too highly correlated, drop it
        if max(corrs) > corr_thresh:
            tmp_X = tmp_X.drop(col, axis=1)
            
    return tmp_X, y
        

In [62]:
dtc_pca_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', DecisionTreeClassifier())])

lr_pca_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                    ('scaler2', StandardScaler()),
                     ('classifier', LogisticRegression())])

knn_pca_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', KNeighborsClassifier())])

rfc_pca_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', RandomForestClassifier())])

svc_pca_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
#                      ('scaler2', StandardScaler()),
                     ('classifier', SVC(probability=True))])

# ada_pipe = Pipeline([('scaler1', StandardScaler()),
#                      ('pca', PCA()),
#                      ('scaler2', StandardScaler()),
#                      ('classifier', AdaBoostClassifier())])

# gbc_pipe = Pipeline([('scaler1', StandardScaler()),
#                      ('pca', PCA()),
#                      ('scaler2', StandardScaler()),
#                      ('classifier', GradientBoostingClassifier())])

# xgb_pipe = Pipeline([('scaler1', StandardScaler()),
#                      ('pca', PCA()),
#                      ('scaler2', StandardScaler()),
#                      ('classifier', XGBClassifier())])

In [71]:
dtc_pca_params = {'pca__n_components': [1, 3, 5],
             'classifier__max_depth': [1, 3, 5, 10, 15, None],
             'classifier__splitter': ['random', 'best']}

lr_pca_params = {'pca__n_components': [1, 3, 5],
             'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3,3,7)}

knn_pca_params = {'pca__n_components': [1, 3, 5],
             'classifier__n_neighbors': np.linspace(1,50).astype(int)}

rfc_pca_params = {'pca__n_components': [1, 3, 5],
             'classifier__n_estimators': [200, 500],
              'classifier__max_features': ['log2', 'sqrt', 'auto'],
              'classifier__oob_score': [True, False],
             'classifier__max_depth': [1, 5, None]}

svc_pca_params = {'pca__n_components': [1, 3, 5],
              'classifier__C': np.logspace(-3,3,7)}

# ada_params = {'pca__n_components': [1, 3, 5],
#               'classifier__n_estimators': [10, 25, 50, 75, 100, 200, 500],
#              'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0]}

# gbc_params = {'pca__n_components': [1, 3, 5],
#               'classifier__n_estimators': [10, 25, 50, 75, 100, 200, 500],
#              'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0],
#              'classifier__max_depth': [1, 2, 3, 4, 5],
#              'classifier__loss': ['deviance', 'exponential'],
#              'classifier__warm_start': [True, False]}

# xgb_params = {'pca__n_components': [1, 3, 5],
#               'classifier__n_estimators': [10, 50, 75, 100],
#              'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0],
#              'classifier__max_depth': [1, 2, 3, 4, 5]}

In [64]:
def test_train_gs(X, y, pipe, param):
    
    gs = GridSearchCV(pipe, param, cv=5, n_jobs=-1)
    gs.fit(X_train, y_train)
    
    print('Best params:', gs.best_params_)
    print('Best fitting score:', gs.best_score_)
    print('Train score:', gs.score(X_train, y_train))
    print('Test score:', gs.score(X_test, y_test))
    
    return gs.best_estimator_
    

In [65]:
X_train, X_test, y_train, y_test = train_test_split(Xdb_1, ydb_1, test_size = 0.25, random_state=42)

In [69]:
# print("\nDecision Tree")
# dtc_pca_classifier = test_train_gs(X_train, y_train, dtc_pca_pipe, dtc_pca_params)
# print("\nLogReg")
# lr_pca_classifier = test_train_gs(X_train, y_train, lr_pca_pipe, lr_pca_params)
# print("\nRandom Forest")
# rfc_pca_classifier = test_train_gs(X_train, y_train, rfc_pca_pipe, rfc_pca_params)
# print("\nKNN")
# knn_pca_classifier = test_train_gs(X_train, y_train, knn_pca_pipe, knn_pca_params)
print("\nSVC")
svc_pca_classifier = test_train_gs(X_train, y_train, svc_pca_pipe, svc_pca_params)



SVC
Best params: {'classifier__C': 10, 'pca__n_components': 5}
Best fitting score: 0.8270565249612613
Train score: 0.879269689416
Test score: 0.831447049313


## Trying SKB instead of PCA

In [10]:
dtc_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', DecisionTreeClassifier())])

lr_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', LogisticRegression())])

knn_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', KNeighborsClassifier())])

rfc_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', RandomForestClassifier())])

svc_skb_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('skb', SelectKBest()),
                     ('scaler2', StandardScaler()),
                     ('classifier', SVC())])

# ada_skb_pipe = Pipeline([('scaler1', StandardScaler()),
#                      ('skb', SelectKBest()),
#                      ('scaler2', StandardScaler()),
#                      ('classifier', AdaBoostClassifier())])

# gbc_skb_pipe = Pipeline([('scaler1', StandardScaler()),
#                      ('skb', SelectKBest()),
#                      ('scaler2', StandardScaler()),
#                      ('classifier', GradientBoostingClassifier())])

# xgb_skb_pipe = Pipeline([('scaler1', StandardScaler()),
#                      ('skb', SelectKBest()),
#                      ('scaler2', StandardScaler()),
#                      ('classifier', XGBClassifier())])

In [11]:
dtc_skb_params = {'skb__k': [5, 10, 15],
             'classifier__max_depth': [1, 3, 5, 10, 15, None],
             'classifier__splitter': ['random', 'best']}

lr_skb_params = {'skb__k': [5, 10, 15],
             'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3,3,7)}

knn_skb_params = {'skb__k': [5, 10, 15],
             'classifier__n_neighbors': np.linspace(1,50).astype(int)}

rfc_skb_params = {'skb__k': [5, 10, 15],
             'classifier__n_estimators': [200, 500],
              'classifier__max_features': ['log2', 'sqrt', 'auto'],
              'classifier__oob_score': [True, False],
             'classifier__max_depth': [1, 5, None]}

svc_skb_params = {'skb__k': [5, 10, 15],
              'classifier__C': np.logspace(-3,3,7)}

# ada_skb_params = {'skb__k': [5, 10, 15],
#               'classifier__n_estimators': [10, 25, 50, 75, 100, 200, 500],
#              'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0]}

# gbc_skb_params = {'skb__k': [5, 10, 15],
#               'classifier__n_estimators': [10, 25, 50, 75, 100, 200, 500],
#              'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0],
#              'classifier__max_depth': [1, 2, 3, 4, 5],
#              'classifier__loss': ['deviance', 'exponential'],
#              'classifier__warm_start': [True, False]}

# xgb_skb_params = {'skb__k': [5, 10, 15],
#               'classifier__n_estimators': [10, 50, 75, 100],
#              'classifier__learning_rate': [0.1, .25, 0.5, 0.75, 1.0],
#              'classifier__max_depth': [1, 2, 3, 4, 5]}

In [23]:
print("\nDecision Tree")
dtc_skb_classifier = test_train_gs(X_train, y_train, dtc_skb_pipe, dtc_skb_params)
print("\nLogReg")
lr_skb_classifier = test_train_gs(X_train, y_train, lr_skb_pipe, lr_skb_params)
print("\nRandom Forest")
rfc_skb_classifier = test_train_gs(X_train, y_train, rfc_skb_pipe, rfc_skb_params)
print("\nKNN")
knn_skb_classifier = test_train_gs(X_train, y_train, knn_skb_pipe, knn_skb_params)
print("\nSVC")
svc_skb_classifier = test_train_gs(X_train, y_train, svc_skb_pipe, svc_skb_params)


SVC
Best params: {'classifier__C': 1000.0, 'skb__k': 15}
Best fitting score: 0.8154685710435896
Train score: 0.850434548272
Test score: 0.820735650768


In [13]:
import pickle

In [14]:
# rfc_pca_classifier = pickle.load(open('classifiers/rfc_classifier', 'rb'))
# knn_pca_classifier = pickle.load(open('classifiers/knn_classifier', 'rb'))
# svc_pca_classifier = pickle.load(open('classifiers/svc_classifier', 'rb'))
# ada_pca_classifier = pickle.load(open('classifiers/ada_classifier', 'rb'))
# gbc_pca_classifier = pickle.load(open('classifiers/gbc_classifier', 'rb'))

# print(rfc_pca_classifier.score(X_test, y_test))
# print(knn_pca_classifier.score(X_test, y_test))
# print(svc_pca_classifier.score(X_test, y_test))
# print(ada_pca_classifier.score(X_test, y_test))
# print(gbc_pca_classifier.score(X_test, y_test))

In [16]:
rfc_for_skb = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

dtc_sfm_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('sfm', SelectFromModel(rfc_for_skb)),
                     ('scaler2', StandardScaler()),
                     ('classifier', DecisionTreeClassifier())])

lr_sfm_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('sfm', SelectFromModel(rfc_for_skb)),
                        ('scaler2', StandardScaler()),
                     ('classifier', LogisticRegression())])

knn_sfm_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('sfm', SelectFromModel(rfc_for_skb)),
                         ('scaler2', StandardScaler()),
                     ('classifier', KNeighborsClassifier())])

rfc_sfm_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('sfm', SelectFromModel(rfc_for_skb)),
                      ('scaler2', StandardScaler()),
                     ('classifier', RandomForestClassifier())])

svc_sfm_pipe = Pipeline([('scaler1', StandardScaler()),
                     ('sfm', SelectFromModel(rfc_for_skb)),
                         ('scaler2', StandardScaler()),
                     ('classifier', SVC())])


In [17]:
dtc_sfm_params = {'classifier__max_depth': [1, 3, 5, 10, 15, None],
             'classifier__splitter': ['random', 'best']}

lr_sfm_params = {'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3,3,7)}

knn_sfm_params = {'classifier__n_neighbors': np.linspace(1,50).astype(int)}

rfc_sfm_params = {'classifier__n_estimators': [200, 500],
              'classifier__max_features': ['log2', 'sqrt', 'auto'],
              'classifier__oob_score': [True, False],
             'classifier__max_depth': [1, 5, None]}

svc_sfm_params = {'classifier__C': np.logspace(-3,3,7)}

In [24]:
print("\nDecision Tree")
dtc_sfm_classifier = test_train_gs(X_train, y_train, dtc_sfm_pipe, dtc_sfm_params)
print("\nLogReg")
lr_sfm_classifier = test_train_gs(X_train, y_train, lr_sfm_pipe, lr_sfm_params)
print("\nRandom Forest")
rfc_sfm_classifier = test_train_gs(X_train, y_train, rfc_sfm_pipe, rfc_sfm_params)
print("\nKNN")
knn_sfm_classifier = test_train_gs(X_train, y_train, knn_sfm_pipe, knn_sfm_params)
print("\nSVC")
svc_sfm_classifier = test_train_gs(X_train, y_train, svc_sfm_pipe, svc_sfm_params)


SVC
Best params: {'classifier__C': 1000.0}
Best fitting score: 0.8160075456444115
Train score: 0.849760830021
Test score: 0.822352465643


# Let's see all of the test scores again.


In [42]:
print("PCA")
print("Decision Tree:", dtc_pca_classifier.score(X_test, y_test))
print("LogReg:", lr_pca_classifier.score(X_test, y_test))
print("Random Forest:", rfc_pca_classifier.score(X_test, y_test))
print("KNN:", knn_pca_classifier.score(X_test, y_test))
print("SVC:", svc_pca_classifier.score(X_test, y_test))

print("\nSelectKBest")
print("Decision Tree:", dtc_skb_classifier.score(X_test, y_test))
print("LogReg:", lr_skb_classifier.score(X_test, y_test))
print("Random Forest:", rfc_skb_classifier.score(X_test, y_test))
print("KNN:", knn_skb_classifier.score(X_test, y_test))
print("SVC:", svc_skb_classifier.score(X_test, y_test))

print("\nSelectFromModel")
print("Decision Tree:", dtc_sfm_classifier.score(X_test, y_test))
print("LogReg:", lr_sfm_classifier.score(X_test, y_test))
print("Random Forest:", rfc_sfm_classifier.score(X_test, y_test))
print("KNN:", knn_sfm_classifier.score(X_test, y_test))
print("SVC:", svc_sfm_classifier.score(X_test, y_test))

PCA
Decision Tree: 0.751616814875
LogReg: 0.601050929669
Random Forest: 0.829830234438
KNN: 0.833265966047
SVC: 0.831447049313

SelectKBest
Decision Tree: 0.746564268391
LogReg: 0.601455133387
Random Forest: 0.825383993533
KNN: 0.823767178658
SVC: 0.820735650768

SelectFromModel
Decision Tree: 0.749797898141
LogReg: 0.600848827809
Random Forest: 0.828011317704
KNN: 0.829021827001
SVC: 0.822352465643


## Let's try voting

The PCA models consistently have the strongest accuracy scores. Let's try to see if we can increase the accuracy by ensembling all of the pipelines utilizing PCA. In addition to voting, let's use `GridSearchCV` to determine if weighting the votes will help.

In [74]:
svc_pca_classifier.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('scaler1', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('classifier', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])>

In [45]:
from sklearn.ensemble import VotingClassifier

In [91]:

voting = VotingClassifier(estimators = [('dtc', dtc_pca_classifier), 
                                        ('lr', lr_pca_classifier),
                                        ('rfc', rfc_pca_classifier), 
                                        ('knn', knn_pca_classifier), 
                                        ('svc', svc_pca_classifier)],
                          voting = 'soft'
                         )

In [96]:
weight_list = []

for i in np.linspace(0.0001, 3, 7):
    for j in np.linspace(0.0001, 3, 7):
        for k in np.linspace(0.0001, 3, 7):
            for l in np.linspace(0.0001, 3, 7):
                for m in np.linspace(0.0001, 3, 7):
                    weight_list.append([i, j, k, l, m])

In [97]:
len(weight_list)

16807

In [98]:
voting_params = {'weights': weight_list}

voting_gs = GridSearchCV(voting, voting_params, cv=5, n_jobs=-1)

In [None]:
voting_gs.fit(X_test, y_test)











In [None]:
voting_gs.best_score_

In [None]:
voting_gs.best_params_

In [None]:
voting_gs.score(Xdb_2, ydb_2)

In [None]:
voting_gs.score(Xdb_3, ydb_3)