In [1]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

# Column labels (stored as strings) kept from the UCI frames.
uci_features = [
    '28', '48', '64', '105', '128',
    '153', '241', '281', '318', '336',
    '338', '378', '433', '442', '451',
    '453', '455', '472', '475', '493',
]

# Column labels kept from the database ("madelon_db_*") frames.
madelon_features = [
    'feat_257', 'feat_269', 'feat_308', 'feat_315', 'feat_336',
    'feat_341', 'feat_395', 'feat_504', 'feat_526', 'feat_639',
    'feat_681', 'feat_701', 'feat_724', 'feat_736', 'feat_769',
    'feat_808', 'feat_829', 'feat_867', 'feat_920', 'feat_956',
]

# Restrict every UCI frame to the selected columns.
Xuci_1, Xuci_2, Xuci_3 = (
    frame[uci_features] for frame in (Xuci_1, Xuci_2, Xuci_3)
)

# !conda install -y psycopg2

import itertools

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, RFECV, SelectFromModel, SelectKBest
from sklearn.linear_model import LogisticRegression
# GridSearchCV is imported from sklearn.model_selection (the same module that
# provides train_test_split); the legacy sklearn.grid_search module was
# deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# MinMaxScaler and PolynomialFeatures are needed by the polynomial pipelines
# defined further down; importing them here lets the notebook run top to bottom.
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook

def _split_features_target(frame):
    """Return ``(frame[madelon_features], frame['target'])`` for one sample."""
    return frame[madelon_features], frame['target']


# Load the three pickled database samples and separate predictors from labels.
Xdb_1, ydb_1 = _split_features_target(pd.read_pickle('data/madelon_db_1'))
Xdb_2, ydb_2 = _split_features_target(pd.read_pickle('data/madelon_db_2'))
Xdb_3, ydb_3 = _split_features_target(pd.read_pickle('data/madelon_db_3'))

from sklearn.metrics import roc_auc_score, accuracy_score




## Brute Force of testing Feature Correlations. 

This approach is based on two pieces of information:
1. Previous grid searches of PCA consistently found that 5 PCA components provided the best fit. This suggests that the Database data includes five true predictors.
2. The true predictors are independent of each other.

The following code will attempt to find the set of 5 features with the lowest total correlation. This will be accomplished by:
1. Testing every set of 5 features from the 20 features previously identified.
1. Creating a cross correlation matrix for each set of 5
1. Taking the sum of the absolute values of the correlation matrix.

The assumption is that the set of 5 features with the lowest total correlation will be the most independent set of features.

In [4]:
N_COMBO_FEATURES = 5  # size of every candidate feature set that is scored


def _offdiag_abs_corr_sum(frame, cols):
    """Sum of |correlation| over the off-diagonal cells of ``frame[cols].corr()``.

    Every column correlates perfectly with itself, so the diagonal contributes
    exactly ``len(cols)`` to the grand total; subtracting it leaves only the
    cross-correlations between distinct columns.
    """
    abs_corr = frame[list(cols)].corr().abs()
    return abs_corr.sum().sum() - len(cols)


corr_results = []
combos = list(itertools.combinations(Xdb_1.columns, N_COMBO_FEATURES))

for cols in tqdm_notebook(combos):
    # One score per database sample, all computed the same way.
    per_db_sums = [_offdiag_abs_corr_sum(db, cols) for db in (Xdb_1, Xdb_2, Xdb_3)]

    corr_results.append({'columns': cols,
                         'Xdb_1_corr_sum': per_db_sums[0],
                         'Xdb_2_corr_sum': per_db_sums[1],
                         'Xdb_3_corr_sum': per_db_sums[2],
                         # Average of the three per-sample scores, so it is on
                         # the same (diagonal-free) scale as the columns above.
                         'mean_corr_sum': sum(per_db_sums) / len(per_db_sums)})




In [5]:
corr_results_df = pd.DataFrame(corr_results)
# Only the lowest-scoring (least correlated) combinations matter here, so show
# the top of the ranking rather than the full table.
corr_results_df.sort_values('mean_corr_sum').head(10)

Unnamed: 0,Xdb_1_corr_sum,Xdb_2_corr_sum,Xdb_3_corr_sum,columns,mean_corr_sum
3445,5.875845,5.872196,5.888319,"(feat_257, feat_526, feat_681, feat_736, feat_...",5.878787
3729,7.691497,7.655846,7.592507,"(feat_257, feat_681, feat_736, feat_920, feat_...",7.646617
14895,7.637696,7.655231,7.667335,"(feat_526, feat_681, feat_736, feat_920, feat_...",7.653421
15322,7.832736,7.839899,7.783426,"(feat_681, feat_724, feat_736, feat_920, feat_...",7.818687
3199,7.837366,7.894866,7.896389,"(feat_257, feat_504, feat_526, feat_736, feat_...",7.876207
14888,7.899120,7.893090,7.891945,"(feat_526, feat_681, feat_736, feat_808, feat_...",7.894719
3718,7.948210,7.918848,7.854657,"(feat_257, feat_681, feat_736, feat_769, feat_...",7.907238
3349,7.923472,7.955926,7.884594,"(feat_257, feat_504, feat_736, feat_769, feat_...",7.921331
2308,7.954591,7.909024,7.965330,"(feat_257, feat_336, feat_526, feat_681, feat_...",7.942982
1242,8.039368,7.934679,7.966066,"(feat_257, feat_308, feat_504, feat_701, feat_...",7.980038


In [6]:
# Select the winning combination by its score rather than by a hard-coded row
# label, so the lookup stays correct if the candidate list or its order changes.
corr_results_df.loc[corr_results_df['mean_corr_sum'].idxmin(), 'columns']

('feat_257', 'feat_526', 'feat_681', 'feat_736', 'feat_920')

Naive testing of the 5 identified features. Since we suspect that these 5 features may be the 'true' predictors, we will exclude any feature selection and/or dimensionality reduction.

In [20]:
# Baseline pipelines: standardise the inputs, then fit one classifier with
# (mostly) default hyper-parameters.  The tree-based models draw random numbers
# while fitting, so their seed is fixed to keep the reported scores
# reproducible (42 matches the seed used for the train/test split).
dtc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('classifier', DecisionTreeClassifier(random_state=42))])

lr_pipe = Pipeline([('scaler', StandardScaler()),
                    ('classifier', LogisticRegression())])

knn_pipe = Pipeline([('scaler', StandardScaler()),
                     ('classifier', KNeighborsClassifier(weights='distance'))])

rfc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('classifier', RandomForestClassifier(random_state=42))])

svc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('classifier', SVC(probability=True))])

In [8]:

def test_all_pipes_reduced(X, y):
    """Fit each baseline pipeline on the five selected columns of ``X``.

    ``X`` is narrowed to the five selected feature columns and split 75/25
    (``random_state=42``).  The module-level pipelines ``dtc_pipe``,
    ``lr_pipe``, ``knn_pipe``, ``rfc_pipe`` and ``svc_pipe`` are then fitted
    in that order on the training part.

    Returns a DataFrame with one row per pipeline holding its fitted
    classifier step, its training score and its test score.
    """
    selected_cols = ['feat_257', 'feat_526', 'feat_681', 'feat_736', 'feat_920']
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X[selected_cols], y, test_size = 0.25, random_state=42)

    def _fit_and_score(pipeline):
        # Fit first, then score on both parts of the split.
        pipeline.fit(X_fit, y_fit)
        return {'classifier': pipeline.named_steps['classifier'],
                'train_score': pipeline.score(X_fit, y_fit),
                'test_score': pipeline.score(X_holdout, y_holdout)}

    pipelines = [dtc_pipe, lr_pipe, knn_pipe, rfc_pipe, svc_pipe]
    return pd.DataFrame([_fit_and_score(p) for p in tqdm_notebook(pipelines)])
        

In [9]:
# Run the baseline comparison once per database sample (features paired with
# the matching target vector).
Xdb_1_reduced_naive, Xdb_2_reduced_naive, Xdb_3_reduced_naive = map(
    test_all_pipes_reduced,
    (Xdb_1, Xdb_2, Xdb_3),
    (ydb_1, ydb_2, ydb_3),
)










In [10]:
Xdb_1_reduced_naive

Unnamed: 0,classifier,test_score,train_score
0,"DecisionTreeClassifier(class_weight=None, crit...",0.743937,1.0
1,"LogisticRegression(C=1.0, class_weight=None, d...",0.601051,0.610389
2,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.826395,0.886613
3,"(DecisionTreeClassifier(class_weight=None, cri...",0.796686,0.989692
4,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.787793,0.793438


In [11]:
Xdb_2_reduced_naive

Unnamed: 0,classifier,test_score,train_score
0,"DecisionTreeClassifier(class_weight=None, crit...",0.740304,1.0
1,"LogisticRegression(C=1.0, class_weight=None, d...",0.617553,0.599374
2,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.827469,0.883298
3,"(DecisionTreeClassifier(class_weight=None, cri...",0.789684,0.991469
4,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.778289,0.788256


In [12]:
Xdb_3_reduced_naive

Unnamed: 0,classifier,test_score,train_score
0,"DecisionTreeClassifier(class_weight=None, crit...",0.736158,1.0
1,"LogisticRegression(C=1.0, class_weight=None, d...",0.597242,0.611381
2,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.825305,0.885587
3,"(DecisionTreeClassifier(class_weight=None, cri...",0.787128,0.989871
4,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.769938,0.784367


These naive test scores are looking nearly as strong as my tuned models utilizing PCA, suggesting that I may have identified the true predictors. Let's GridSearch some parameters and see what kind of scores we can achieve 

In [2]:
# The five columns picked out by the correlation search.
true_feature_cols = ['feat_257', 'feat_526', 'feat_681', 'feat_736', 'feat_920']

Xdb_true1, Xdb_true2, Xdb_true3 = (
    frame[true_feature_cols] for frame in (Xdb_1, Xdb_2, Xdb_3)
)

In [31]:
# Pipelines to tune: standardise the inputs, then fit one classifier.
# Tree-based models are seeded so repeated runs select the same parameters.
dtc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('classifier', DecisionTreeClassifier(random_state=42))])

# The grid below includes the 'l1' penalty, which the liblinear solver
# supports; naming the solver explicitly keeps the search valid regardless of
# the library's default solver.
lr_pipe = Pipeline([('scaler', StandardScaler()),
                    ('classifier', LogisticRegression(solver='liblinear'))])

knn_pipe = Pipeline([('scaler', StandardScaler()),
                     ('classifier', KNeighborsClassifier(weights='distance'))])

rfc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('classifier', RandomForestClassifier(random_state=42))])

svc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('classifier', SVC(probability=True))])

# Hyper-parameter grids; keys use the "<step>__<parameter>" pipeline syntax.
dtc_params = {'classifier__max_depth': [1, 3, 5, 10, 15, None],
              'classifier__splitter': ['random', 'best']}

lr_params = {'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3, 3, 7)}

# Every integer from 1 to 10 exactly once.  (np.linspace(1, 10).astype(int)
# produces 50 values that collapse onto these same ten integers, so each
# setting would be cross-validated several times over.)
knn_params = {'classifier__algorithm': ['auto'],
              'classifier__p': [2, 3],
              'classifier__n_neighbors': np.arange(1, 11)}

# 'auto' is omitted: for RandomForestClassifier it is an alias of 'sqrt', so
# it only duplicated work (and newer scikit-learn releases reject it).
rfc_params = {'classifier__n_estimators': [10, 50, 100, 200, 500],
              'classifier__max_features': ['log2', 'sqrt'],
              'classifier__max_depth': [1, 5, None]}

svc_params = {'classifier__C': np.logspace(-3, 3, 15)}

In [3]:
def test_train_gs(X_train, y_train, X_test, y_test, pipe, param):
    """Grid-search ``pipe`` over ``param`` and report its hold-out score.

    Runs a 5-fold ``GridSearchCV`` (``n_jobs=-1``) on the training data, prints
    the best parameter combination and the search's score on
    ``(X_test, y_test)``, and returns the search's ``best_estimator_``.
    """
    search = GridSearchCV(estimator=pipe, param_grid=param, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    print('Best params:', search.best_params_)
    print('Test score:', search.score(X_test, y_test))

    return winner

In [4]:
# Identical 75/25 split (same seed) for each of the three samples.
split_options = {'test_size': 0.25, 'random_state': 42}

X1_train, X1_test, y1_train, y1_test = train_test_split(Xdb_true1, ydb_1, **split_options)
X2_train, X2_test, y2_train, y2_test = train_test_split(Xdb_true2, ydb_2, **split_options)
X3_train, X3_test, y3_train, y3_test = train_test_split(Xdb_true3, ydb_3, **split_options)

In [40]:
# (label, X_train, y_train, X_test, y_test) for every database sample.
gs_samples = [
    ("Xdb_true1", X1_train, y1_train, X1_test, y1_test),
    ("Xdb_true2", X2_train, y2_train, X2_test, y2_test),
    ("Xdb_true3", X3_train, y3_train, X3_test, y3_test),
]

# (label, pipeline, parameter grid) in the order the models are searched.
gs_models = [
    ("Decision Tree", dtc_pipe, dtc_params),
    ("LogReg", lr_pipe, lr_params),
    ("Random Forest", rfc_pipe, rfc_params),
    ("KNN", knn_pipe, knn_params),
    ("SVC", svc_pipe, svc_params),
]

# Best estimator for every (sample label, model label) pair, so the results
# for the first two samples are kept instead of being overwritten.
gs_best_estimators = {}

for sample_label, X_tr, y_tr, X_te, y_te in gs_samples:
    print(sample_label)
    for position, (model_label, model_pipe, model_params) in enumerate(gs_models):
        # The first model of each sample is printed without a leading blank line.
        print(model_label if position == 0 else "\n" + model_label)
        gs_best_estimators[(sample_label, model_label)] = test_train_gs(
            X_tr, y_tr, X_te, y_te, model_pipe, model_params)

# Names used by the original cell; as before they end up referring to the
# estimators tuned on the last sample.
dtc_pca_classifier = gs_best_estimators[("Xdb_true3", "Decision Tree")]
lr_pca_classifier = gs_best_estimators[("Xdb_true3", "LogReg")]
rfc_pca_classifier = gs_best_estimators[("Xdb_true3", "Random Forest")]
knn_pca_classifier = gs_best_estimators[("Xdb_true3", "KNN")]
svc_pca_classifier = gs_best_estimators[("Xdb_true3", "SVC")]

Xdb_true1
Decision Tree
Best params: {'classifier__max_depth': 10, 'classifier__splitter': 'best'}
Test score: 0.743734842361

LogReg
Best params: {'classifier__C': 0.01, 'classifier__max_iter': 100, 'classifier__penalty': 'l1'}
Test score: 0.599029911075

Random Forest
Best params: {'classifier__max_depth': None, 'classifier__max_features': 'sqrt', 'classifier__n_estimators': 500}
Test score: 0.822958771221

KNN
Best params: {'classifier__algorithm': 'auto', 'classifier__n_neighbors': 8, 'classifier__p': 2}
Test score: 0.837712206952

SVC
Best params: {'classifier__C': 1000.0}
Test score: 0.825586095392
Xdb_true2
Decision Tree
Best params: {'classifier__max_depth': 10, 'classifier__splitter': 'best'}
Test score: 0.74450219912

LogReg
Best params: {'classifier__C': 1.0, 'classifier__max_iter': 100, 'classifier__penalty': 'l1'}
Test score: 0.617952818872

Random Forest
Best params: {'classifier__max_depth': None, 'classifier__max_features': 'log2', 'classifier__n_estimators': 500}
Test 

These scores are on par with my best tuned models using PCA, further indicating that I have isolated the 5 true features. Let's see if the models can be further tuned

In [37]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [47]:
# Pipelines with a polynomial feature expansion ahead of scaling and the
# classifier.
knn_pipe = Pipeline([('poly', PolynomialFeatures()),
                     ('scaler', StandardScaler()),
                     ('classifier', KNeighborsClassifier())])

# Seeded so repeated searches pick the same parameters.
rfc_pipe = Pipeline([('poly', PolynomialFeatures()),
                     ('scaler', StandardScaler()),
                     ('classifier', RandomForestClassifier(random_state=42))])

svc_pipe = Pipeline([('poly', PolynomialFeatures()),
                     ('scaler', StandardScaler()),
                     ('classifier', SVC(probability=True))])

# This pipeline rescales into [1, 2] rather than standardising.
mnb_pipe = Pipeline([('poly', PolynomialFeatures()),
                     ('scaler', MinMaxScaler(feature_range=(1, 2))),
                     ('classifier', MultinomialNB())])

gnb_pipe = Pipeline([('poly', PolynomialFeatures()),
                     ('scaler', StandardScaler()),
                     ('classifier', GaussianNB())])

# Polynomial-expansion settings shared by every grid below.
poly_grid = {'poly__degree': [2, 3, 4, 5],
             'poly__interaction_only': [True, False],
             'poly__include_bias': [True, False]}

# Every integer from 5 to 15 exactly once.  (np.linspace(5, 15).astype(int)
# produces 50 values that collapse onto these same eleven integers, so each
# setting would be cross-validated several times over.)
knn_params = dict(poly_grid, **{'classifier__algorithm': ['auto', 'ball_tree'],
                                'classifier__weights': ['distance'],
                                'classifier__n_neighbors': np.arange(5, 16)})

rfc_params = dict(poly_grid, **{'classifier__n_estimators': [500],
                                'classifier__max_depth': [None]})

svc_params = dict(poly_grid, **{'classifier__C': np.logspace(0, 4, 15)})

mnb_params = dict(poly_grid, **{'classifier__alpha': np.linspace(0, 2, 8)})

gnb_params = dict(poly_grid, **{'classifier__priors': [(0.5, 0.5)]})

In [48]:
np.logspace(0,4,15)

array([  1.00000000e+00,   1.93069773e+00,   3.72759372e+00,
         7.19685673e+00,   1.38949549e+01,   2.68269580e+01,
         5.17947468e+01,   1.00000000e+02,   1.93069773e+02,
         3.72759372e+02,   7.19685673e+02,   1.38949549e+03,
         2.68269580e+03,   5.17947468e+03,   1.00000000e+04])

In [None]:
# (label, pipeline, grid) for each polynomial-feature search, in run order.
poly_searches = [
    ("MNB", mnb_pipe, mnb_params),
    ("GNB", gnb_pipe, gnb_params),
    ("Random Forest", rfc_pipe, rfc_params),
    ("KNN", knn_pipe, knn_params),
    ("SVC", svc_pipe, svc_params),
]

print("Xdb_true1")
poly_best = []
for search_label, search_pipe, search_grid in poly_searches:
    print("\n" + search_label)
    poly_best.append(
        test_train_gs(X1_train, y1_train, X1_test, y1_test, search_pipe, search_grid))

mnb_classifier, gnb_classifier, rfc_classifier, knn_classifier, svc_classifier = poly_best


Xdb_true1

MNB
Best params: {'classifier__alpha': 0.0, 'poly__degree': 5, 'poly__include_bias': True, 'poly__interaction_only': False}
Test score: 0.577607113985

GNB
Best params: {'classifier__priors': (0.5, 0.5), 'poly__degree': 2, 'poly__include_bias': True, 'poly__interaction_only': True}
Test score: 0.639854486661

Random Forest
Best params: {'classifier__max_depth': None, 'classifier__n_estimators': 500, 'poly__degree': 2, 'poly__include_bias': True, 'poly__interaction_only': False}
Test score: 0.823565076799

KNN
Best params: {'classifier__algorithm': 'auto', 'classifier__n_neighbors': 8, 'classifier__weights': 'distance', 'poly__degree': 2, 'poly__include_bias': True, 'poly__interaction_only': False}
Test score: 0.829830234438

SVC


In [24]:
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

In [34]:
# Min-max scale first, then expand to degree-5 polynomial terms, then fit
# a multinomial naive Bayes classifier.
mnb_pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('poly', PolynomialFeatures(degree=5)),
    ('classifier', MultinomialNB()),
])


In [35]:
# 5-fold grid search of the multinomial NB pipeline on the first sample.
mnb_gs = GridSearchCV(estimator=mnb_pipe,
                      param_grid=mnb_params,
                      cv=5,
                      n_jobs=-1)
mnb_gs.fit(X1_train, y1_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('poly', PolynomialFeatures(degree=5, include_bias=True, interaction_only=False)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'classifier__alpha': array([ 0.     ,  0.28571,  0.57143,  0.85714,  1.14286,  1.42857,
        1.71429,  2.     ])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [36]:
mnb_gs.score(X1_test, y1_test)

0.60610347615198057

Try Naive Bayes!

In [42]:

# Logistic regression on polynomial features.  The grid below includes the
# 'l1' penalty, which the liblinear solver supports; naming the solver
# explicitly keeps the search valid regardless of the library's default solver.
lr_pipe = Pipeline([('poly', PolynomialFeatures()),
                    ('scaler', StandardScaler()),
                    ('classifier', LogisticRegression(solver='liblinear'))])

lr_params = {'poly__degree': [2, 3, 4, 5],
             'poly__interaction_only': [True, False],
             'poly__include_bias': [True, False],
             'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3, 3, 7)}

In [43]:
# 5-fold grid search of the polynomial logistic-regression pipeline, followed
# by its score on the held-out part of the first sample.
lr_gs = GridSearchCV(estimator=lr_pipe,
                     param_grid=lr_params,
                     cv=5,
                     n_jobs=-1)
lr_gs.fit(X1_train, y1_train)
lr_gs.score(X1_test, y1_test)

0.78213419563459985

In [44]:
lr_gs.best_params_

{'classifier__C': 100.0,
 'classifier__max_iter': 100,
 'classifier__penalty': 'l1',
 'poly__degree': 5,
 'poly__include_bias': False,
 'poly__interaction_only': False}

Maybe test a voting classifier of 5 LogRegs, each individual classifier using a separate feature to predict?

Also, maybe test `from sklearn.neural_network import MLPClassifier`


In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
def _fit_single_feature_lr(column):
    """Fit an L1-penalised logistic regression on one column of ``X1_train``.

    The column is selected with a list (``X1_train[[column]]``) so the
    estimator receives a 2-D, single-column frame rather than a 1-D Series.
    liblinear is requested explicitly because it supports the 'l1' penalty.
    """
    model = LogisticRegression(C=100, max_iter=100, penalty='l1', solver='liblinear')
    return model.fit(X1_train[[column]], y1_train)


# One classifier per selected feature.
lr_257 = _fit_single_feature_lr('feat_257')
lr_526 = _fit_single_feature_lr('feat_526')
lr_681 = _fit_single_feature_lr('feat_681')
lr_736 = _fit_single_feature_lr('feat_736')
lr_920 = _fit_single_feature_lr('feat_920')

# Test-set predictions of every single-feature model next to the true
# test-set labels (the predictions are made on X1_test, so the labels must
# come from y1_test for the rows to line up).
lr_ensemble_df = pd.DataFrame({'lr_257': lr_257.predict(X1_test[['feat_257']]),
                               'lr_526': lr_526.predict(X1_test[['feat_526']]),
                               'lr_681': lr_681.predict(X1_test[['feat_681']]),
                               'lr_736': lr_736.predict(X1_test[['feat_736']]),
                               'lr_920': lr_920.predict(X1_test[['feat_920']]),
                               'class': y1_test},
                              index=y1_test.index)

['feat_257', 'feat_526', 'feat_681', 'feat_736', 'feat_920']