In [1]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

# Columns kept from the Xuci_* frames (string column labels).
uci_features = [
    '28', '48', '64', '105', '128', '153', '241', '281', '318', '336',
    '338', '378', '433', '442', '451', '453', '455', '472', '475', '493',
]

# Columns kept from the Xdb_* frames ("feat_<n>" column labels).
madelon_features = [
    'feat_257', 'feat_269', 'feat_308', 'feat_315', 'feat_336',
    'feat_341', 'feat_395', 'feat_504', 'feat_526', 'feat_639',
    'feat_681', 'feat_701', 'feat_724', 'feat_736', 'feat_769',
    'feat_808', 'feat_829', 'feat_867', 'feat_920', 'feat_956',
]

# Narrow each of the three samples of either data set to its selected columns.
# The right-hand side is fully evaluated before the names are rebound.
Xuci_1, Xuci_2, Xuci_3 = [frame[uci_features] for frame in (Xuci_1, Xuci_2, Xuci_3)]
Xdb_1, Xdb_2, Xdb_3 = [frame[madelon_features] for frame in (Xdb_1, Xdb_2, Xdb_3)]

# !conda install -y psycopg2

from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel, RFECV 
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm



In [246]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [10]:
def corr_reduction(X, y, corr_thresh = 0.7):
    """Drop features that are strongly correlated with a more important feature.

    Features are ranked by the univariate p-values that ``SelectKBest``
    computes for ``X`` against ``y`` (smaller p-value = more important).
    They are then visited from the least important to the most important,
    and a feature is discarded when the absolute value of its correlation
    with any feature that is still kept exceeds ``corr_thresh``.  Visiting
    the weakest features first means that, of a correlated pair, the weaker
    one is the one removed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; one column per feature.
    y : array-like
        Target passed to ``SelectKBest.fit``.
    corr_thresh : float, default 0.7
        Absolute correlation above which a feature counts as redundant.

    Returns
    -------
    tuple
        ``(X_reduced, y)`` where ``X_reduced`` holds the surviving columns of
        ``X`` ordered by ascending p-value, and ``y`` is returned unchanged.
    """
    # k equals the number of columns, so nothing is filtered here; the fit is
    # only used to obtain one p-value per feature.
    skb = SelectKBest(k=len(X.columns))
    skb.fit(X, y)

    # Most important (smallest p-value) first.
    ranked = [col for p, col in sorted(zip(skb.pvalues_, X.columns))]
    kept = list(ranked)

    # Walk from least to most important so the weaker feature of a correlated
    # pair is the one that gets dropped.
    for col in reversed(ranked):
        others = [c for c in kept if c != col]
        if not others:
            # Nothing left to compare against (e.g. single-column input).
            break

        # abs(): a strong negative correlation is just as redundant as a
        # strong positive one.
        corrs = X[others].corrwith(X[col]).abs()

        # Series.max() skips NaN (e.g. from constant columns); an all-NaN
        # result compares False and the column is kept.
        if corrs.max() > corr_thresh:
            kept.remove(col)

    return X[kept], y
        

In [247]:
def _pca_pipeline(classifier):
    """Return the shared scale -> PCA -> rescale -> classify pipeline.

    Every model in this notebook uses the same three preprocessing steps;
    only the final 'classifier' step differs.
    """
    return Pipeline([('scaler1', StandardScaler()),
                     ('pca', PCA()),
                     ('scaler2', StandardScaler()),
                     ('classifier', classifier)])


# One pipeline per candidate model; step names match the keys used in the
# parameter grids ('pca__...', 'classifier__...').
dtc_pipe = _pca_pipeline(DecisionTreeClassifier())
lr_pipe = _pca_pipeline(LogisticRegression())
knn_pipe = _pca_pipeline(KNeighborsClassifier())
rfc_pipe = _pca_pipeline(RandomForestClassifier())
svc_pipe = _pca_pipeline(SVC())
ada_pipe = _pca_pipeline(AdaBoostClassifier())
gbc_pipe = _pca_pipeline(GradientBoostingClassifier())
xgb_pipe = _pca_pipeline(XGBClassifier())

In [248]:
# Value lists that several grids have in common.  They are stored as tuples
# and copied with list() at each use so that no two grids share one object.
_PCA_COMPONENTS = (1, 3, 5)
_BOOST_N_ESTIMATORS = (10, 25, 50, 75, 100, 200, 500)
_LEARNING_RATES = (0.1, .25, 0.5, 0.75, 1.0)
_SHALLOW_DEPTHS = (1, 2, 3, 4, 5)

# Grid keys follow the '<pipeline step>__<parameter>' naming of the pipelines.

# Decision tree
dtc_params = {
    'pca__n_components': list(_PCA_COMPONENTS),
    'classifier__max_depth': [1, 3, 5, 10, 15, None],
    'classifier__splitter': ['random', 'best'],
}

# Logistic regression
lr_params = {
    'pca__n_components': list(_PCA_COMPONENTS),
    'classifier__penalty': ['l1', 'l2'],
    'classifier__max_iter': [100, 500],
    'classifier__C': np.logspace(-3, 3, 7),
}

# K nearest neighbours: the linspace expression evaluates to the integers 1..50
knn_params = {
    'pca__n_components': list(_PCA_COMPONENTS),
    'classifier__n_neighbors': np.linspace(1, 50).astype(int),
}

# Random forest
# NOTE(review): 'auto' may be an alias of 'sqrt' for classifiers, and
# oob_score looks like a reporting switch rather than a model setting --
# confirm against the installed scikit-learn before trimming this grid.
rfc_params = {
    'pca__n_components': list(_PCA_COMPONENTS),
    'classifier__n_estimators': [200, 500],
    'classifier__max_features': ['log2', 'sqrt', 'auto'],
    'classifier__oob_score': [True, False],
    'classifier__max_depth': [1, 5, None],
}

# Support vector classifier
svc_params = {
    'pca__n_components': list(_PCA_COMPONENTS),
    'classifier__C': np.logspace(-3, 3, 14),
}

# AdaBoost
ada_params = {
    'pca__n_components': list(_PCA_COMPONENTS),
    'classifier__n_estimators': list(_BOOST_N_ESTIMATORS),
    'classifier__learning_rate': list(_LEARNING_RATES),
}

# Gradient boosting
# NOTE(review): warm_start presumably has no effect on a freshly cloned
# estimator -- confirm before removing it from the grid.
gbc_params = {
    'pca__n_components': list(_PCA_COMPONENTS),
    'classifier__n_estimators': list(_BOOST_N_ESTIMATORS),
    'classifier__learning_rate': list(_LEARNING_RATES),
    'classifier__max_depth': list(_SHALLOW_DEPTHS),
    'classifier__loss': ['deviance', 'exponential'],
    'classifier__warm_start': [True, False],
}

# XGBoost
xgb_params = {
    'pca__n_components': list(_PCA_COMPONENTS),
    'classifier__n_estimators': [10, 50, 75, 100],
    'classifier__learning_rate': list(_LEARNING_RATES),
    'classifier__max_depth': list(_SHALLOW_DEPTHS),
}

In [176]:
# Sanity check of the expression used for the KNN n_neighbors grid:
# np.linspace defaults to 50 samples, so after the int cast this displays
# the integers 1 through 50 (see the output below).
np.linspace(1,50).astype(int)

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [201]:
def test_train_gs(X, y, pipe, param, X_test=None, y_test=None):
    """Grid-search ``pipe`` over ``param`` on ``(X, y)`` and print its scores.

    The search uses 5-fold cross-validation on all available cores.  The best
    parameters, the best cross-validation score and the score on the training
    data are printed, followed by the score on the hold-out data when one is
    available.

    Parameters
    ----------
    X, y
        Training features and labels the grid search is fitted on.
    pipe
        Estimator (here: a Pipeline) handed to ``GridSearchCV``.
    param : dict
        Parameter grid handed to ``GridSearchCV``.
    X_test, y_test : optional
        Hold-out features and labels.  When omitted, the notebook-level
        ``X_test`` / ``y_test`` variables are used if they exist, which keeps
        existing four-argument calls working.

    Returns
    -------
    The ``best_estimator_`` of the fitted grid search.

    Raises
    ------
    ValueError
        If only one of ``X_test`` / ``y_test`` is supplied.
    """
    if (X_test is None) != (y_test is None):
        raise ValueError('X_test and y_test must be given together')

    gs = GridSearchCV(pipe, param, cv=5, n_jobs=-1)

    # Fit and score on the data that was actually passed in.  The previous
    # version ignored X and y and silently read the globals X_train/y_train.
    gs.fit(X, y)

    print('Best params:', gs.best_params_)
    print('Best fitting score:', gs.best_score_)
    print('Train score:', gs.score(X, y))

    if X_test is None:
        # Backward compatibility: fall back to the hold-out split defined at
        # notebook level, if there is one.
        X_test = globals().get('X_test')
        y_test = globals().get('y_test')

    if X_test is not None and y_test is not None:
        print('Test score:', gs.score(X_test, y_test))

    return gs.best_estimator_
    

In [188]:
# dtc_classifier = est_train_gs(Xdb_1, ydb_1, dtc_pipe, dtc_params)

Best params: {'classifier__max_depth': None, 'classifier__splitter': 'best', 'pca__n_components': 5}
Best fitting score: 0.6915824915824916
Train score: 1.0
Test score: 0.737373737374


(array([0, 0, 0, ..., 1, 1, 1]), 1428    0
 56      1
 351     0
 1681    1
 1730    1
 1793    1
 464     0
 414     1
 1653    1
 69      0
 1116    1
 1073    0
 1379    1
 247     0
 1694    0
 1456    1
 1318    0
 1107    0
 984     0
 582     1
 432     0
 462     0
 1307    1
 1161    1
 1200    1
 1523    0
 809     0
 1517    0
 548     0
 1630    0
        ..
 1713    1
 745     1
 1341    1
 408     1
 1887    1
 1684    1
 609     1
 1840    0
 1467    1
 904     1
 1632    1
 637     0
 182     0
 1061    0
 1554    0
 170     1
 292     0
 1939    0
 900     1
 300     1
 1858    0
 1487    1
 532     1
 1635    0
 1368    0
 1911    0
 272     0
 1625    0
 1726    1
 939     0
 Name: target, dtype: int64)

In [179]:
def boxcox_df(df, c=15):
    """Return a Box-Cox transformed copy of ``df``.

    Each column is shifted by ``c`` and passed through ``stats.boxcox``; only
    the transformed values (element 0 of its return value) are kept.  The
    input frame is left untouched -- the previous version assigned into an
    alias of ``df`` and therefore overwrote the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all transformed.
    c : number, default 15
        Constant added to every value before the transform.
        NOTE(review): presumably chosen to make the data positive, as
        Box-Cox requires -- confirm for the data used here.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    bc_df = df.copy()

    for col in df.columns:
        bc_df[col] = stats.boxcox(df[col] + c)[0]

    return bc_df

In [198]:
# test_train_gs(boxcox_df(Xdb_1), ydb_1, dtc_pipe, dtc_params)

In [199]:
# test_train_gs(Xdb_2, ydb_2, dtc_pipe, dtc_params)

In [200]:
# test_train_gs(Xdb_3, ydb_3, dtc_pipe, dtc_params)

In [204]:
# Hold out 25% of the first database sample; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    Xdb_1, ydb_1, test_size=0.25, random_state=42)

# (heading printed before the search, pipeline, parameter grid), in run order.
searches = [
    ("Random Forest", rfc_pipe, rfc_params),
    ("\nKNN", knn_pipe, knn_params),
    ("\nSVC", svc_pipe, svc_params),
    ("\nAda", ada_pipe, ada_params),
    ("\nGBC", gbc_pipe, gbc_params),
]

# Run the searches one after another, collecting the best estimator of each.
best_estimators = []
for heading, search_pipe, search_grid in searches:
    print(heading)
    best_estimators.append(test_train_gs(X_train, y_train, search_pipe, search_grid))

(rfc_classifier,
 knn_classifier,
 svc_classifier,
 ada_classifier,
 gbc_classifier) = best_estimators

Random Forest
Best params: {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 500, 'classifier__oob_score': True, 'pca__n_components': 5}
Best fitting score: 0.7831649831649832
Train score: 1.0
Test score: 0.791919191919

KNN
Best params: {'classifier__n_neighbors': 3, 'pca__n_components': 5}
Best fitting score: 0.7865319865319865
Train score: 0.889562289562
Test score: 0.79595959596

SVC
Best params: {'classifier__C': 41.246263829013564, 'pca__n_components': 5}
Best fitting score: 0.7757575757575758
Train score: 0.868013468013
Test score: 0.806060606061

Ada
Best params: {'classifier__learning_rate': 0.1, 'classifier__n_estimators': 200, 'pca__n_components': 5}
Best fitting score: 0.6686868686868687
Train score: 0.698316498316
Test score: 0.684848484848

GBC
Best params: {'classifier__learning_rate': 0.25, 'classifier__loss': 'deviance', 'classifier__max_depth': 5, 'classifier__n_estimators': 75, 'classifier__warm_start': True, 'pca__n_comp

In [252]:
# Grid-search the XGBoost pipeline on the same X_train / y_train split that
# the other classifiers were searched on, and keep its best estimator.
print("\nXGB")
xgb_classifier = test_train_gs(X_train, y_train, xgb_pipe, xgb_params)


XGB
Best params: {'classifier__learning_rate': 0.25, 'classifier__max_depth': 4, 'classifier__n_estimators': 100, 'pca__n_components': 5}
Best fitting score: 0.7454545454545455
Train score: 0.954208754209
Test score: 0.781818181818


In [253]:
# Best estimator of every grid search, keyed by the short column label it
# gets in the votes table.
labelled_models = [
    ('rfc', rfc_classifier),
    ('knn', knn_classifier),
    ('svc', svc_classifier),
    ('ada', ada_classifier),
    ('gbc', gbc_classifier),
    ('xgb', xgb_classifier),
]

# One column of hold-out predictions per model, plus the true labels.
vote_columns = {label: model.predict(X_test) for label, model in labelled_models}
vote_columns['truth'] = y_test

votes = pd.DataFrame(vote_columns)

In [254]:
# Combined vote of the gbc and rfc models: the row-wise maximum of their two
# predictions, rounded and stored as an integer column.
gbc_rfc_max = votes[['gbc', 'rfc']].max(axis=1)
votes['vote'] = gbc_rfc_max.round().astype(int)

In [255]:
import itertools

In [256]:
# Columns that hold individual classifier predictions: everything except the
# true labels and the hand-built 'vote' column.  Computed once, outside the
# loops, because it does not change between iterations.
classifier_cols = list(votes.drop(['truth', 'vote'], axis=1).columns)

results = []

# Score every non-empty subset of classifiers.  The upper bound follows the
# number of classifier columns instead of a hard-coded 7, so adding or
# removing a model from `votes` needs no change here.
for i in range(1, len(classifier_cols) + 1):
    for cols in itertools.combinations(classifier_cols, i):
        col_list = list(cols)
        subset_votes = votes[col_list]
        mean_vote = subset_votes.mean(axis=1)

        results.append({
            'classifiers': col_list,
            # 'max': predict 1 when at least one classifier in the subset does
            'max': np.mean(subset_votes.max(axis=1) == votes['truth']),
            # 'majority': rounded mean vote.  NOTE(review): a mean of exactly
            # 0.5 is expected to round to 0 (half-to-even), so ties in
            # even-sized subsets count as class 0 -- confirm this is intended.
            'majority': np.mean(round(mean_vote) == votes['truth']),
            # 'majority0.6': predict 1 only when the mean vote reaches 0.6
            'majority0.6': np.mean((mean_vote >= 0.6) == votes['truth']),
        })

# One row per subset; 'highest' is the best accuracy over the three rules.
results_df = pd.DataFrame(results)
results_df['highest'] = results_df[['max', 'majority', 'majority0.6']].max(axis=1)
results_df.sort_values('highest', ascending=False)

Unnamed: 0,classifiers,majority,majority0.6,max,highest
39,"[knn, svc, xgb]",0.816162,0.816162,0.787879,0.816162
55,"[knn, rfc, svc, xgb]",0.814141,0.814141,0.795960,0.814141
37,"[knn, rfc, svc]",0.800000,0.800000,0.814141,0.814141
53,"[gbc, knn, svc, xgb]",0.814141,0.814141,0.797980,0.814141
16,"[knn, svc]",0.789899,0.789899,0.812121,0.812121
49,"[ada, knn, svc, xgb]",0.810101,0.810101,0.739394,0.810101
15,"[knn, rfc]",0.777778,0.777778,0.810101,0.810101
32,"[gbc, knn, svc]",0.810101,0.810101,0.806061,0.810101
26,"[ada, knn, svc]",0.808081,0.808081,0.745455,0.808081
51,"[gbc, knn, rfc, svc]",0.802020,0.802020,0.806061,0.806061


In [None]:
# Show the hold-out rows where the combined 'vote' column disagrees with the
# true label.
votes[votes.truth != votes.vote]