In [24]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

uci_features = ['28',  '48',  '64', '105', '128', '153', '241', '281', '318', '336', 
                '338', '378', '433', '442', '451', '453', '455', '472', '475', '493']

madelon_features = ['feat_257', 'feat_269', 'feat_308', 'feat_315', 'feat_336',
                   'feat_341', 'feat_395', 'feat_504', 'feat_526', 'feat_639',
                   'feat_681', 'feat_701', 'feat_724', 'feat_736', 'feat_769',
                   'feat_808', 'feat_829', 'feat_867', 'feat_920', 'feat_956']

Xuci_1 = Xuci_1[uci_features]
Xuci_2 = Xuci_2[uci_features]
Xuci_3 = Xuci_3[uci_features]

# !conda install -y psycopg2

from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel, RFECV 
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm_notebook

Xdb_1 = pd.read_pickle('data/madelon_db_1')
Xdb_2 = pd.read_pickle('data/madelon_db_2')
Xdb_3 = pd.read_pickle('data/madelon_db_3')


ydb_1 = Xdb_1['target']
ydb_2 = Xdb_2['target']
ydb_3 = Xdb_3['target']
Xdb_1 = Xdb_1[madelon_features]
Xdb_2 = Xdb_2[madelon_features]
Xdb_3 = Xdb_3[madelon_features]

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
def corr_reduction(X, y, corr_thresh = 0.7):
    
    #find order of X features from least important to most important in predicting X
    skb = SelectKBest(k=len(X.columns))
    skb.fit(X, y)
    
    tmp_X = X[[col for p,col in sorted(zip(skb.pvalues_,X.columns))]]

    # iterate through columns
    for col in tmp_X.columns:
        corrs = tmp_X.drop(col, axis=1).corrwith(tmp_X[col]) #store the correlations
        
        # if tested column is too highly correlated, drop it
        if max(corrs) > corr_thresh:
            tmp_X = tmp_X.drop(col, axis=1)
            
    return tmp_X, y
        

In [3]:
X_test, y_test = corr_reduction(Xdb_1, ydb_1)

X_test.shape

(1980, 13)

In [4]:
def uci_group(X):
    grouped_df = pd.DataFrame()
    grouped_df['A'] = X[['28','451','318']].mean(axis=1)
    grouped_df['B'] = X[['105','128']].mean(axis=1)
    grouped_df['C'] = X[['241','475']].mean(axis=1)
    grouped_df['D'] = X[['378','48']].mean(axis=1)
    grouped_df['E'] = X[['153','281','433']].mean(axis=1)
    grouped_df['F'] = X[['64','336']].mean(axis=1)
    grouped_df['G'] = X[['453', '493']].mean(axis=1)
    grouped_df['H'] = X[['472','442']].mean(axis=1)
    grouped_df['I'] = X[['338']].mean(axis=1)
    grouped_df['J'] = X[['455']].mean(axis=1)
    
    return grouped_df

def mad_group(X):
    grouped_df = pd.DataFrame()
    grouped_df['A'] = X[['feat_956','feat_639','feat_829']].mean(axis=1)
    grouped_df['B'] = X[['feat_269','feat_315', 'feat_701']].mean(axis=1) #701 is negative correlated with the other two, so i am not 100% sure what the outcome will be
    grouped_df['C'] = X[['feat_341','feat_395']].mean(axis=1)
    grouped_df['D'] = X[['feat_336', 'feat_867']].mean(axis=1)
    grouped_df['E'] = X[['feat_808', 'feat_257']].mean(axis=1)
    grouped_df['F'] = X[['feat_308', 'feat_736']].mean(axis=1)
    grouped_df['G'] = X[['feat_504', 'feat_681']].mean(axis=1)
    grouped_df['H'] = X[['feat_724', 'feat_769']].mean(axis=1)
    grouped_df['I'] = X[['feat_526']].mean(axis=1)
    grouped_df['J'] = X[['feat_920']].mean(axis=1)
    
    return grouped_df


In [5]:
Xuci_1_grouped = uci_group(Xuci_1)
Xuci_2_grouped = uci_group(Xuci_2)
Xuci_3_grouped = uci_group(Xuci_3)

Xdb_1_grouped = mad_group(Xdb_1)
Xdb_2_grouped = mad_group(Xdb_2)
Xdb_3_grouped = mad_group(Xdb_3)


In [6]:
dtc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('pca', PCA()),
                     ('classifier', DecisionTreeClassifier())])

lr_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('pca', PCA()),
                     ('classifier', LogisticRegression())])

knn_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('pca', PCA()),
                     ('classifier', KNeighborsClassifier())])

rfc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('pca', PCA()),
                     ('classifier', RandomForestClassifier())])

svc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('pca', PCA()),
                     ('classifier', SVC())])

### Some naive testing of the pipes

In [7]:
def pipe_test(X, y, pipeline):
    X_test, X_train, y_test, y_train = train_test_split(X, y, test_size = 0.25, random_state=42)
    
    pipe = pipeline
    pipe.fit(X_train, y_train)
    
    print(pipe.score(X_train, y_train))
    print(pipe.score(X_test, y_test))
    

In [8]:
print("\nTesting UCI 1")
pipe_test(Xuci_1_grouped, yuci_1, dtc_pipe)

print("\nTesting UCI 2")
pipe_test(Xuci_2_grouped, yuci_2, dtc_pipe)

print("\nTesting UCI 3")
pipe_test(Xuci_3_grouped, yuci_3, dtc_pipe)

print("\nTesting DB 1")
pipe_test(Xdb_1_grouped, ydb_1, dtc_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_2_grouped, ydb_2, dtc_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_3_grouped, ydb_3, dtc_pipe)


Testing UCI 1
1.0
0.606060606061

Testing UCI 2
1.0
0.545454545455

Testing UCI 3
1.0
0.590909090909

Testing DB 1
1.0
0.657912457912

Testing DB 2
1.0
0.621182586095

Testing DB 2
1.0
0.655450874832


In [9]:
print("\nTesting UCI 1")
pipe_test(Xuci_1_grouped, yuci_1, lr_pipe)

print("\nTesting UCI 2")
pipe_test(Xuci_2_grouped, yuci_2, lr_pipe)

print("\nTesting UCI 3")
pipe_test(Xuci_3_grouped, yuci_3, lr_pipe)

print("\nTesting DB 1")
pipe_test(Xdb_1_grouped, ydb_1, lr_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_2_grouped, ydb_2, lr_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_3_grouped, ydb_3, lr_pipe)


Testing UCI 1
0.7
0.515151515152

Testing UCI 2
0.7
0.584848484848

Testing UCI 3
0.609090909091
0.590909090909

Testing DB 1
0.593939393939
0.575084175084

Testing DB 2
0.599221789883
0.593242365172

Testing DB 2
0.618951612903
0.601615074024


In [10]:
print("\nTesting UCI 1")
pipe_test(Xuci_1_grouped, yuci_1, knn_pipe)

print("\nTesting UCI 2")
pipe_test(Xuci_2_grouped, yuci_2, knn_pipe)

print("\nTesting UCI 3")
pipe_test(Xuci_3_grouped, yuci_3, knn_pipe)

print("\nTesting DB 1")
pipe_test(Xdb_1_grouped, ydb_1, knn_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_2_grouped, ydb_2, knn_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_3_grouped, ydb_3, knn_pipe)


Testing UCI 1
0.881818181818
0.672727272727

Testing UCI 2
0.836363636364
0.633333333333

Testing UCI 3
0.809090909091
0.675757575758

Testing DB 1
0.822222222222
0.70101010101

Testing DB 2
0.77626459144
0.679662118259

Testing DB 2
0.800403225806
0.693135935397


In [11]:
print("\nTesting UCI 1")
pipe_test(Xuci_1_grouped, yuci_1, rfc_pipe)

print("\nTesting UCI 2")
pipe_test(Xuci_2_grouped, yuci_2, rfc_pipe)

print("\nTesting UCI 3")
pipe_test(Xuci_3_grouped, yuci_3, rfc_pipe)

print("\nTesting DB 1")
pipe_test(Xdb_1_grouped, ydb_1, rfc_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_2_grouped, ydb_2, rfc_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_3_grouped, ydb_3, rfc_pipe)


Testing UCI 1
0.972727272727
0.551515151515

Testing UCI 2
0.981818181818
0.563636363636

Testing UCI 3
0.981818181818
0.578787878788

Testing DB 1
0.987878787879
0.692929292929

Testing DB 2
0.990272373541
0.679662118259

Testing DB 2
0.985887096774
0.679676985195


### Almost all naive testing results in test scores ranging from ~0.55 - 0.68

In [12]:
dtc_params = {'rfe__k': [5, 10],
             'pca__n_components': [1, 2, 3, 4, 5],
             'classifier__max_depth': [1, 3, 5, 10, 15, None],
             'classifier__splitter': ['random', 'best']}

lr_params = {'rfe__k': [5, 10],
             'pca__n_components': [1, 2, 3, 4, 5],
             'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3,3,7)}

knn_params = {'rfe__k': [5, 10],
             'pca__n_components': [1, 2, 3, 4, 5],
             'classifier__n_neighbors': [1, 5, 9, 15, 25]}

rfc_params = {'rfe__k': [5, 10],
             'pca__n_components': [1, 2, 3, 4, 5],
             'classifier__n_estimators': [10, 50, 100, 200, 500],
             'classifier__max_depth': [1, 5, None]}

svc_params = {'rfe__k': [5, 10],
             'pca__n_components': [1, 2, 3, 4, 5],
              'classifier__C': np.logspace(-3,3,7)}




In [31]:
def gridsearch_pipe(X, y, pipeline, params, cv=5):
    X_test, X_train, y_test, y_train = train_test_split(X, y, test_size = 0.25, random_state=42)
    
    gs = GridSearchCV(pipeline, params, cv=cv, n_jobs=-1)
    gs.fit(X_train, y_train)
    
    results = {'best_params': gs.best_params_,
               'cv_score': gs.best_score_,
              'train_score': gs.score(X_train, y_train),
              'test_score': gs.score(X_test, y_test),
              'estimator': gs.best_estimator_.named_steps['classifier']}

    return results

In [32]:
def test_all_pipes(X, y):
    scores = []
    
    for pipe, param in tqdm([(dtc_pipe, dtc_params), (lr_pipe, lr_params), 
                        (knn_pipe, knn_params), (rfc_pipe, rfc_params),
                            (svc_pipe, svc_params)]):
        scores.append(gridsearch_pipe(X, y, pipe, param))
    
    scores_df = pd.DataFrame(scores)
    return scores_df
        

In [15]:
# with n_jobs=-1 in gridsearch_pipe()
uci1_gs_results = test_all_pipes(Xuci_1_grouped, yuci_1)
uci2_gs_results = test_all_pipes(Xuci_2_grouped, yuci_2)
uci3_gs_results = test_all_pipes(Xuci_3_grouped, yuci_3)
db1_gs_results = test_all_pipes(Xdb_1_grouped, ydb_1)
db2_gs_results = test_all_pipes(Xdb_2_grouped, ydb_2)
db3_gs_results = test_all_pipes(Xdb_3_grouped, ydb_3)

100%|██████████| 5/5 [00:35<00:00,  7.04s/it]
100%|██████████| 5/5 [00:35<00:00,  7.08s/it]
100%|██████████| 5/5 [00:34<00:00,  6.93s/it]
100%|██████████| 5/5 [00:39<00:00,  7.98s/it]
100%|██████████| 5/5 [00:40<00:00,  8.03s/it]
100%|██████████| 5/5 [00:38<00:00,  7.78s/it]


In [16]:
uci1_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 15, 'classifier__spl...",0.727273,"DecisionTreeClassifier(class_weight=None, crit...",0.636364,1.0
1,"{'classifier__C': 0.1, 'classifier__max_iter':...",0.7,"LogisticRegression(C=0.10000000000000001, clas...",0.542424,0.7
2,"{'classifier__n_neighbors': 1, 'pca__n_compone...",0.781818,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.69697,1.0
3,"{'classifier__max_depth': 5, 'classifier__n_es...",0.763636,"(DecisionTreeClassifier(class_weight=None, cri...",0.621212,0.963636
4,"{'classifier__C': 1.0, 'pca__n_components': 5,...",0.809091,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.7,0.909091


In [17]:
uci2_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': None, 'classifier__s...",0.718182,"DecisionTreeClassifier(class_weight=None, crit...",0.566667,1.0
1,"{'classifier__C': 1.0, 'classifier__max_iter':...",0.672727,"LogisticRegression(C=1.0, class_weight=None, d...",0.606061,0.681818
2,"{'classifier__n_neighbors': 1, 'pca__n_compone...",0.763636,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.690909,1.0
3,"{'classifier__max_depth': 5, 'classifier__n_es...",0.718182,"(DecisionTreeClassifier(class_weight=None, cri...",0.654545,0.927273
4,"{'classifier__C': 1.0, 'pca__n_components': 5,...",0.754545,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.681818,0.890909


In [18]:
uci3_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 15, 'classifier__spl...",0.663636,"DecisionTreeClassifier(class_weight=None, crit...",0.621212,0.990909
1,"{'classifier__C': 1.0, 'classifier__max_iter':...",0.518182,"LogisticRegression(C=1.0, class_weight=None, d...",0.569697,0.545455
2,"{'classifier__n_neighbors': 1, 'pca__n_compone...",0.709091,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.736364,1.0
3,"{'classifier__max_depth': None, 'classifier__n...",0.618182,"(DecisionTreeClassifier(class_weight=None, cri...",0.709091,1.0
4,"{'classifier__C': 10.0, 'pca__n_components': 5...",0.709091,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.721212,0.990909


In [19]:
db1_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': None, 'classifier__s...",0.692929,"DecisionTreeClassifier(class_weight=None, crit...",0.613468,1.0
1,"{'classifier__C': 1.0, 'classifier__max_iter':...",0.587879,"LogisticRegression(C=1.0, class_weight=None, d...",0.587205,0.585859
2,"{'classifier__n_neighbors': 15, 'pca__n_compon...",0.70303,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.690236,0.755556
3,"{'classifier__max_depth': None, 'classifier__n...",0.757576,"(DecisionTreeClassifier(class_weight=None, cri...",0.726599,1.0
4,"{'classifier__C': 1.0, 'pca__n_components': 5,...",0.721212,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.707744,0.8


In [20]:
db2_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 15, 'classifier__spl...",0.66537,"DecisionTreeClassifier(class_weight=None, crit...",0.615984,0.966926
1,"{'classifier__C': 0.1, 'classifier__max_iter':...",0.597276,"LogisticRegression(C=0.10000000000000001, clas...",0.569851,0.59144
2,"{'classifier__n_neighbors': 9, 'pca__n_compone...",0.653696,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.688109,0.733463
3,"{'classifier__max_depth': None, 'classifier__n...",0.678988,"(DecisionTreeClassifier(class_weight=None, cri...",0.692658,1.0
4,"{'classifier__C': 100.0, 'pca__n_components': ...",0.677043,"SVC(C=100.0, cache_size=200, class_weight=None...",0.666017,0.928016


In [21]:
db3_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 10, 'classifier__spl...",0.671371,"DecisionTreeClassifier(class_weight=None, crit...",0.625168,0.822581
1,"{'classifier__C': 1.0, 'classifier__max_iter':...",0.627016,"LogisticRegression(C=1.0, class_weight=None, d...",0.59354,0.635081
2,"{'classifier__n_neighbors': 9, 'pca__n_compone...",0.697581,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.675639,0.760081
3,"{'classifier__max_depth': 5, 'classifier__n_es...",0.709677,"(DecisionTreeClassifier(class_weight=None, cri...",0.696501,0.852823
4,"{'classifier__C': 1.0, 'pca__n_components': 5,...",0.699597,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.696501,0.784274


Trying without the PCAs

In [22]:
dtc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('classifier', DecisionTreeClassifier())])

lr_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('classifier', LogisticRegression())])

knn_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('classifier', KNeighborsClassifier())])

rfc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('classifier', RandomForestClassifier())])

svc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('classifier', SVC())])

In [23]:
dtc_params = {'rfe__k': [5, 10],
             'classifier__max_depth': [1, 3, 5, 10, 15, None],
             'classifier__splitter': ['random', 'best']}

lr_params = {'rfe__k': [5, 10],
             'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3,3,7)}

knn_params = {'rfe__k': [5, 10],
             'classifier__n_neighbors': [1, 5, 9, 15, 25]}

rfc_params = {'rfe__k': [5, 10],
             'classifier__n_estimators': [10, 50, 100, 200, 500],
             'classifier__max_depth': [1, 5, None]}

svc_params = {'rfe__k': [5, 10],
              'classifier__C': np.logspace(-3,3,7)}

In [24]:
uci1_gs_results = test_all_pipes(Xuci_1_grouped, yuci_1)
uci2_gs_results = test_all_pipes(Xuci_2_grouped, yuci_2)
uci3_gs_results = test_all_pipes(Xuci_3_grouped, yuci_3)
db1_gs_results = test_all_pipes(Xdb_1_grouped, ydb_1)
db2_gs_results = test_all_pipes(Xdb_2_grouped, ydb_2)
db3_gs_results = test_all_pipes(Xdb_3_grouped, ydb_3)

100%|██████████| 5/5 [00:08<00:00,  1.70s/it]
100%|██████████| 5/5 [00:09<00:00,  1.82s/it]
100%|██████████| 5/5 [00:08<00:00,  1.71s/it]
100%|██████████| 5/5 [00:11<00:00,  2.39s/it]
100%|██████████| 5/5 [00:11<00:00,  2.21s/it]
100%|██████████| 5/5 [00:10<00:00,  2.14s/it]


In [25]:
uci1_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': None, 'classifier__s...",0.718182,"DecisionTreeClassifier(class_weight=None, crit...",0.6,1.0
1,"{'classifier__C': 0.001, 'classifier__max_iter...",0.690909,"LogisticRegression(C=0.001, class_weight=None,...",0.557576,0.681818
2,"{'classifier__n_neighbors': 1, 'rfe__k': 10}",0.781818,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.7,1.0
3,"{'classifier__max_depth': 5, 'classifier__n_es...",0.718182,"(DecisionTreeClassifier(class_weight=None, cri...",0.593939,1.0
4,"{'classifier__C': 1.0, 'rfe__k': 10}",0.8,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.672727,0.890909


In [26]:
uci2_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 10, 'classifier__spl...",0.690909,"DecisionTreeClassifier(class_weight=None, crit...",0.584848,1.0
1,"{'classifier__C': 100.0, 'classifier__max_iter...",0.681818,"LogisticRegression(C=100.0, class_weight=None,...",0.578788,0.672727
2,"{'classifier__n_neighbors': 1, 'rfe__k': 10}",0.763636,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.690909,1.0
3,"{'classifier__max_depth': 5, 'classifier__n_es...",0.709091,"(DecisionTreeClassifier(class_weight=None, cri...",0.627273,0.954545
4,"{'classifier__C': 10.0, 'rfe__k': 10}",0.754545,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.709091,0.945455


In [27]:
uci3_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 5, 'classifier__spli...",0.645455,"DecisionTreeClassifier(class_weight=None, crit...",0.512121,0.609091
1,"{'classifier__C': 10.0, 'classifier__max_iter'...",0.518182,"LogisticRegression(C=10.0, class_weight=None, ...",0.590909,0.590909
2,"{'classifier__n_neighbors': 1, 'rfe__k': 10}",0.7,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.727273,1.0
3,"{'classifier__max_depth': None, 'classifier__n...",0.636364,"(DecisionTreeClassifier(class_weight=None, cri...",0.627273,1.0
4,"{'classifier__C': 10.0, 'rfe__k': 10}",0.681818,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.718182,0.963636


In [28]:
db1_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 5, 'classifier__spli...",0.652525,"DecisionTreeClassifier(class_weight=None, crit...",0.609428,0.664646
1,"{'classifier__C': 0.001, 'classifier__max_iter...",0.579798,"LogisticRegression(C=0.001, class_weight=None,...",0.580471,0.583838
2,"{'classifier__n_neighbors': 15, 'rfe__k': 10}",0.70303,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.690236,0.755556
3,"{'classifier__max_depth': None, 'classifier__n...",0.711111,"(DecisionTreeClassifier(class_weight=None, cri...",0.697643,1.0
4,"{'classifier__C': 10.0, 'rfe__k': 10}",0.713131,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.727946,0.810101


In [29]:
db2_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 10, 'classifier__spl...",0.622568,"DecisionTreeClassifier(class_weight=None, crit...",0.608837,0.939689
1,"{'classifier__C': 1.0, 'classifier__max_iter':...",0.599222,"LogisticRegression(C=1.0, class_weight=None, d...",0.597141,0.601167
2,"{'classifier__n_neighbors': 9, 'rfe__k': 10}",0.653696,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.688109,0.733463
3,"{'classifier__max_depth': None, 'classifier__n...",0.657588,"(DecisionTreeClassifier(class_weight=None, cri...",0.688759,1.0
4,"{'classifier__C': 10.0, 'rfe__k': 5}",0.675097,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.684211,0.770428


In [30]:
db3_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 3, 'classifier__spli...",0.679435,"DecisionTreeClassifier(class_weight=None, crit...",0.612382,0.725806
1,"{'classifier__C': 0.01, 'classifier__max_iter'...",0.612903,"LogisticRegression(C=0.01, class_weight=None, ...",0.606326,0.629032
2,"{'classifier__n_neighbors': 9, 'rfe__k': 10}",0.697581,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.675639,0.760081
3,"{'classifier__max_depth': 5, 'classifier__n_es...",0.71371,"(DecisionTreeClassifier(class_weight=None, cri...",0.651413,0.810484
4,"{'classifier__C': 1.0, 'rfe__k': 10}",0.701613,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.681696,0.735887


### Time for some brute force

In [18]:
import itertools

In [32]:
def brute_test(X, y, estimator):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
    
#     X_train = pd.DataFrame(X_train, columns=X.columns)
#     X_test = pd.DataFrame(X_test, columns=X.columns)
    
#     skb = SelectKBest(k=10)
#     skb.fit(X_train, y_train)
    
#     X_k_train = pd.DataFrame(skb.transform(X_train), columns=X.columns[skb.get_support()])
#     X_k_test = pd.DataFrame(skb.transform(X_test), columns=X.columns[skb.get_support()])
    X_k_train = X_train
    X_k_test = X_test


    results = []
    combos = list(itertools.combinations(X_k_train, 5))
    
    for cols in tqdm(combos):
        
        X_tr_tmp = X_k_train[list(cols)]
        X_te_tmp = X_k_test[list(cols)]
        
        model = estimator
        model.fit(X_tr_tmp, y_train)
        
        metrics = {'features': cols,
                  'train_score': model.score(X_tr_tmp, y_train),
                  'test_score': model.score(X_te_tmp, y_test)}
        
        results.append(metrics)
        
    return pd.DataFrame(results)

In [33]:
Xuci_1_df = brute_test(Xuci_1_grouped, yuci_1, RandomForestClassifier(n_estimators=1000))
Xuci_2_df = brute_test(Xuci_2_grouped, yuci_2, RandomForestClassifier(n_estimators=1000))
Xuci_3_df = brute_test(Xuci_3_grouped, yuci_3, RandomForestClassifier(n_estimators=1000))
Xdb_1_df = brute_test(Xdb_1_grouped, ydb_1, RandomForestClassifier(n_estimators=1000))
Xdb_2_df = brute_test(Xdb_2_grouped, ydb_2, RandomForestClassifier(n_estimators=1000))
Xdb_3_df = brute_test(Xdb_3_grouped, ydb_3, RandomForestClassifier(n_estimators=1000))

100%|██████████| 252/252 [04:24<00:00,  1.05s/it]
100%|██████████| 252/252 [04:29<00:00,  1.07s/it]
100%|██████████| 252/252 [04:24<00:00,  1.05s/it]
100%|██████████| 252/252 [09:55<00:00,  2.36s/it]
100%|██████████| 252/252 [10:13<00:00,  2.44s/it]
100%|██████████| 252/252 [09:52<00:00,  2.35s/it]


In [34]:
Xuci_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
54,"(A, B, G, I, J)",0.818182,1.0
36,"(A, B, E, F, G)",0.809091,1.0
37,"(A, B, E, F, H)",0.8,1.0
6,"(A, B, C, E, F)",0.790909,1.0
8,"(A, B, C, E, H)",0.790909,1.0


In [35]:
Xuci_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
98,"(A, D, E, H, I)",0.772727,1.0
100,"(A, D, E, I, J)",0.754545,1.0
205,"(C, D, E, I, J)",0.745455,1.0
59,"(A, C, D, E, I)",0.745455,1.0
92,"(A, D, E, F, H)",0.745455,1.0


In [36]:
Xuci_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
96,"(A, D, E, G, I)",0.772727,1.0
138,"(B, C, D, H, I)",0.754545,1.0
31,"(A, B, D, G, I)",0.754545,1.0
177,"(B, D, G, H, I)",0.745455,1.0
63,"(A, C, D, F, I)",0.745455,1.0


In [37]:
Xdb_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
50,"(A, B, F, H, J)",0.779798,1.0
235,"(D, E, F, H, J)",0.771717,1.0
48,"(A, B, F, G, J)",0.771717,1.0
51,"(A, B, F, I, J)",0.769697,1.0
29,"(A, B, D, F, J)",0.767677,1.0


In [38]:
Xdb_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
232,"(D, E, F, G, I)",0.717899,1.0
202,"(C, D, E, G, J)",0.712062,1.0
233,"(D, E, F, G, J)",0.706226,1.0
199,"(C, D, E, F, J)",0.70428,1.0
74,"(A, C, E, F, J)",0.700389,1.0


In [39]:
Xdb_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
105,"(A, D, F, H, J)",0.72379,1.0
210,"(C, D, F, H, J)",0.715726,1.0
101,"(A, D, F, G, H)",0.711694,1.0
197,"(C, D, E, F, H)",0.707661,1.0
86,"(A, C, F, I, J)",0.707661,1.0


In [40]:
Xuci_lr_1_df = brute_test(Xuci_1_grouped, yuci_1, LogisticRegression())
Xuci_lr_2_df = brute_test(Xuci_2_grouped, yuci_2, LogisticRegression())
Xuci_lr_3_df = brute_test(Xuci_3_grouped, yuci_3, LogisticRegression())
Xdb_lr_1_df = brute_test(Xdb_1_grouped, ydb_1, LogisticRegression())
Xdb_lr_2_df = brute_test(Xdb_2_grouped, ydb_2, LogisticRegression())
Xdb_lr_3_df = brute_test(Xdb_3_grouped, ydb_3, LogisticRegression())

100%|██████████| 252/252 [00:00<00:00, 655.17it/s]
100%|██████████| 252/252 [00:00<00:00, 669.09it/s]
100%|██████████| 252/252 [00:00<00:00, 641.25it/s]
100%|██████████| 252/252 [00:00<00:00, 376.85it/s]
100%|██████████| 252/252 [00:00<00:00, 380.77it/s]
100%|██████████| 252/252 [00:00<00:00, 363.58it/s]


In [41]:
Xuci_lr_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
34,"(A, B, D, H, J)",0.636364,0.6
3,"(A, B, C, D, H)",0.627273,0.581818
19,"(A, B, C, H, J)",0.627273,0.581818
153,"(B, C, F, G, J)",0.618182,0.587879
69,"(A, C, D, H, J)",0.618182,0.6


In [42]:
Xuci_lr_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
48,"(A, B, F, G, J)",0.645455,0.593939
14,"(A, B, C, F, J)",0.645455,0.593939
11,"(A, B, C, F, G)",0.645455,0.590909
10,"(A, B, C, E, J)",0.627273,0.593939
103,"(A, D, F, G, J)",0.627273,0.624242


In [43]:
Xuci_lr_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
130,"(B, C, D, E, J)",0.618182,0.59697
131,"(B, C, D, F, G)",0.618182,0.6
50,"(A, B, F, H, J)",0.618182,0.612121
105,"(A, D, F, H, J)",0.618182,0.609091
73,"(A, C, E, F, I)",0.618182,0.615152


In [44]:
Xdb_lr_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
251,"(F, G, H, I, J)",0.606061,0.583838
171,"(B, D, F, G, H)",0.606061,0.597306
37,"(A, B, E, F, H)",0.606061,0.594613
161,"(B, D, E, F, G)",0.606061,0.603367
8,"(A, B, C, E, H)",0.606061,0.594613


In [45]:
Xdb_lr_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
231,"(D, E, F, G, H)",0.620623,0.593892
162,"(B, D, E, F, H)",0.614786,0.592593
108,"(A, D, G, H, J)",0.61284,0.594542
165,"(B, D, E, G, H)",0.61284,0.585445
251,"(F, G, H, I, J)",0.610895,0.593242


In [46]:
Xdb_lr_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
94,"(A, D, E, F, J)",0.627016,0.613728
243,"(D, F, G, I, J)",0.627016,0.606326
157,"(B, C, G, H, I)",0.625,0.600269
65,"(A, C, D, G, H)",0.622984,0.60498
18,"(A, B, C, H, I)",0.622984,0.602288


In [47]:
Xuci_knn_1_df = brute_test(Xuci_1_grouped, yuci_1, KNeighborsClassifier())
Xuci_knn_2_df = brute_test(Xuci_2_grouped, yuci_2, KNeighborsClassifier())
Xuci_knn_3_df = brute_test(Xuci_3_grouped, yuci_3, KNeighborsClassifier())
Xdb_knn_1_df = brute_test(Xdb_1_grouped, ydb_1, KNeighborsClassifier())
Xdb_knn_2_df = brute_test(Xdb_2_grouped, ydb_2, KNeighborsClassifier())
Xdb_knn_3_df = brute_test(Xdb_3_grouped, ydb_3, KNeighborsClassifier())

100%|██████████| 252/252 [00:00<00:00, 393.01it/s]
100%|██████████| 252/252 [00:00<00:00, 401.19it/s]
100%|██████████| 252/252 [00:00<00:00, 404.48it/s]
100%|██████████| 252/252 [00:01<00:00, 138.15it/s]
100%|██████████| 252/252 [00:01<00:00, 134.34it/s]
100%|██████████| 252/252 [00:01<00:00, 137.79it/s]


In [48]:
Xuci_knn_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
128,"(B, C, D, E, H)",0.836364,0.866667
92,"(A, D, E, F, H)",0.836364,0.815152
135,"(B, C, D, G, H)",0.836364,0.854545
197,"(C, D, E, F, H)",0.818182,0.824242
204,"(C, D, E, H, J)",0.818182,0.881818


In [49]:
Xuci_knn_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
98,"(A, D, E, H, I)",0.8,0.833333
100,"(A, D, E, I, J)",0.8,0.833333
4,"(A, B, C, D, I)",0.790909,0.857576
128,"(B, C, D, E, H)",0.790909,0.875758
102,"(A, D, F, G, I)",0.781818,0.830303


In [50]:
Xuci_knn_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
135,"(B, C, D, G, H)",0.818182,0.890909
30,"(A, B, D, G, H)",0.8,0.872727
138,"(B, C, D, H, I)",0.8,0.860606
136,"(B, C, D, G, I)",0.790909,0.830303
65,"(A, C, D, G, H)",0.790909,0.890909


In [51]:
Xdb_knn_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
56,"(A, C, D, E, F)",0.749495,0.783165
235,"(D, E, F, H, J)",0.749495,0.798653
244,"(D, F, H, I, J)",0.745455,0.801347
73,"(A, C, E, F, I)",0.745455,0.790572
172,"(B, D, F, G, I)",0.743434,0.794613


In [52]:
Xdb_knn_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
51,"(A, B, F, I, J)",0.682879,0.793372
173,"(B, D, F, G, J)",0.678988,0.803119
212,"(C, D, G, H, I)",0.678988,0.795322
67,"(A, C, D, G, J)",0.675097,0.79922
124,"(A, F, H, I, J)",0.673152,0.791423


In [53]:
Xdb_knn_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
139,"(B, C, D, H, J)",0.709677,0.779946
204,"(C, D, E, H, J)",0.707661,0.786676
29,"(A, B, D, F, J)",0.701613,0.814939
60,"(A, C, D, E, J)",0.699597,0.794078
134,"(B, C, D, F, J)",0.699597,0.808883
