In [1]:
%run data_package_loading.py # Code loads data as well as packages that are relevant across most project phases
%matplotlib inline

uci_features = ['28',  '48',  '64', '105', '128', '153', '241', '281', '318', '336', 
                '338', '378', '433', '442', '451', '453', '455', '472', '475', '493']

madelon_features = ['feat_257', 'feat_269', 'feat_308', 'feat_315', 'feat_336',
                   'feat_341', 'feat_395', 'feat_504', 'feat_526', 'feat_639',
                   'feat_681', 'feat_701', 'feat_724', 'feat_736', 'feat_769',
                   'feat_808', 'feat_829', 'feat_867', 'feat_920', 'feat_956']

Xuci_1 = Xuci_1[uci_features]
Xuci_2 = Xuci_2[uci_features]
Xuci_3 = Xuci_3[uci_features]

Xdb_1 = Xdb_1[madelon_features]
Xdb_2 = Xdb_2[madelon_features]
Xdb_3 = Xdb_3[madelon_features]

# !conda install -y psycopg2

from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel, RFECV 
from sklearn.decomposition import PCA

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.grid_search import GridSearchCV

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler



In [2]:
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm

In [3]:
def uci_group(X):
    grouped_df = pd.DataFrame()
    grouped_df['A'] = X[['28','451','318']].mean(axis=1)
    grouped_df['B'] = X[['105','128']].mean(axis=1)
    grouped_df['C'] = X[['241','475']].mean(axis=1)
    grouped_df['D'] = X[['378','48']].mean(axis=1)
    grouped_df['E'] = X[['153','281','433']].mean(axis=1)
    grouped_df['F'] = X[['64','336']].mean(axis=1)
    grouped_df['G'] = X[['453', '493']].mean(axis=1)
    grouped_df['H'] = X[['472','442']].mean(axis=1)
    grouped_df['I'] = X[['338']].mean(axis=1)
    grouped_df['J'] = X[['455']].mean(axis=1)
    
    return grouped_df

def mad_group(X):
    grouped_df = pd.DataFrame()
    grouped_df['A'] = X[['feat_956','feat_639','feat_829']].mean(axis=1)
    grouped_df['B'] = X[['feat_269','feat_315', 'feat_701']].mean(axis=1) #701 is negative correlated with the other two, so i am not 100% sure what the outcome will be
    grouped_df['C'] = X[['feat_341','feat_395']].mean(axis=1)
    grouped_df['D'] = X[['feat_336', 'feat_867']].mean(axis=1)
    grouped_df['E'] = X[['feat_808', 'feat_257']].mean(axis=1)
    grouped_df['F'] = X[['feat_308', 'feat_736']].mean(axis=1)
    grouped_df['G'] = X[['feat_504', 'feat_681']].mean(axis=1)
    grouped_df['H'] = X[['feat_724', 'feat_769']].mean(axis=1)
    grouped_df['I'] = X[['feat_526']].mean(axis=1)
    grouped_df['J'] = X[['feat_920']].mean(axis=1)
    
    return grouped_df


In [4]:
Xuci_1_grouped = uci_group(Xuci_1)
Xuci_2_grouped = uci_group(Xuci_2)
Xuci_3_grouped = uci_group(Xuci_3)

Xdb_1_grouped = mad_group(Xdb_1)
Xdb_2_grouped = mad_group(Xdb_2)
Xdb_3_grouped = mad_group(Xdb_3)


In [5]:
dtc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('pca', PCA()),
                     ('classifier', DecisionTreeClassifier())])

lr_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('pca', PCA()),
                     ('classifier', LogisticRegression())])

knn_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('pca', PCA()),
                     ('classifier', KNeighborsClassifier())])

rfc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('pca', PCA()),
                     ('classifier', RandomForestClassifier())])

svc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('pca', PCA()),
                     ('classifier', SVC())])

### Some naive testing of the pipes

In [6]:
def pipe_test(X, y, pipeline):
    X_test, X_train, y_test, y_train = train_test_split(X, y, test_size = 0.25, random_state=42)
    
    pipe = pipeline
    pipe.fit(X_train, y_train)
    
    print(pipe.score(X_train, y_train))
    print(pipe.score(X_test, y_test))
    

In [7]:
print("\nTesting UCI 1")
pipe_test(Xuci_1_grouped, yuci_1, dtc_pipe)

print("\nTesting UCI 2")
pipe_test(Xuci_2_grouped, yuci_2, dtc_pipe)

print("\nTesting UCI 3")
pipe_test(Xuci_3_grouped, yuci_3, dtc_pipe)

print("\nTesting DB 1")
pipe_test(Xdb_1_grouped, ydb_1, dtc_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_2_grouped, ydb_2, dtc_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_3_grouped, ydb_3, dtc_pipe)


Testing UCI 1
1.0
0.575757575758

Testing UCI 2
1.0
0.551515151515

Testing UCI 3
1.0
0.590909090909

Testing DB 1
1.0
0.649158249158

Testing DB 2
1.0
0.615334632878

Testing DB 2
1.0
0.646702557201


In [8]:
print("\nTesting UCI 1")
pipe_test(Xuci_1_grouped, yuci_1, lr_pipe)

print("\nTesting UCI 2")
pipe_test(Xuci_2_grouped, yuci_2, lr_pipe)

print("\nTesting UCI 3")
pipe_test(Xuci_3_grouped, yuci_3, lr_pipe)

print("\nTesting DB 1")
pipe_test(Xdb_1_grouped, ydb_1, lr_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_2_grouped, ydb_2, lr_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_3_grouped, ydb_3, lr_pipe)


Testing UCI 1
0.7
0.515151515152

Testing UCI 2
0.7
0.584848484848

Testing UCI 3
0.609090909091
0.590909090909

Testing DB 1
0.60202020202
0.594612794613

Testing DB 2
0.651750972763
0.573099415205

Testing DB 2
0.667338709677
0.582099596231


In [9]:
print("\nTesting UCI 1")
pipe_test(Xuci_1_grouped, yuci_1, knn_pipe)

print("\nTesting UCI 2")
pipe_test(Xuci_2_grouped, yuci_2, knn_pipe)

print("\nTesting UCI 3")
pipe_test(Xuci_3_grouped, yuci_3, knn_pipe)

print("\nTesting DB 1")
pipe_test(Xdb_1_grouped, ydb_1, knn_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_2_grouped, ydb_2, knn_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_3_grouped, ydb_3, knn_pipe)


Testing UCI 1
0.881818181818
0.672727272727

Testing UCI 2
0.836363636364
0.633333333333

Testing UCI 3
0.809090909091
0.675757575758

Testing DB 1
0.836363636364
0.699663299663

Testing DB 2
0.787937743191
0.666666666667

Testing DB 2
0.828629032258
0.680349932705


In [10]:
print("\nTesting UCI 1")
pipe_test(Xuci_1_grouped, yuci_1, rfc_pipe)

print("\nTesting UCI 2")
pipe_test(Xuci_2_grouped, yuci_2, rfc_pipe)

print("\nTesting UCI 3")
pipe_test(Xuci_3_grouped, yuci_3, rfc_pipe)

print("\nTesting DB 1")
pipe_test(Xdb_1_grouped, ydb_1, rfc_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_2_grouped, ydb_2, rfc_pipe)

print("\nTesting DB 2")
pipe_test(Xdb_3_grouped, ydb_3, rfc_pipe)


Testing UCI 1
0.990909090909
0.560606060606

Testing UCI 2
0.981818181818
0.593939393939

Testing UCI 3
0.972727272727
0.524242424242

Testing DB 1
0.987878787879
0.686868686869

Testing DB 2
0.988326848249
0.660168940871

Testing DB 2
0.983870967742
0.695827725437


### Almost all naive testing results in test scores ranging from ~0.55 - 0.68

In [11]:
dtc_params = {'rfe__k': [5, 10],
             'pca__n_components': [1, 2, 3, 4, 5],
             'classifier__max_depth': [1, 3, 5, 10, 15, None],
             'classifier__splitter': ['random', 'best']}

lr_params = {'rfe__k': [5, 10],
             'pca__n_components': [1, 2, 3, 4, 5],
             'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3,3,7)}

knn_params = {'rfe__k': [5, 10],
             'pca__n_components': [1, 2, 3, 4, 5],
             'classifier__n_neighbors': [1, 5, 9, 15, 25]}

rfc_params = {'rfe__k': [5, 10],
             'pca__n_components': [1, 2, 3, 4, 5],
             'classifier__n_estimators': [10, 50, 100, 200, 500],
             'classifier__max_depth': [1, 5, None]}

svc_params = {'rfe__k': [5, 10],
             'pca__n_components': [1, 2, 3, 4, 5],
              'classifier__C': np.logspace(-3,3,7)}




In [12]:
def gridsearch_pipe(X, y, pipeline, params, cv=5):
    X_test, X_train, y_test, y_train = train_test_split(X, y, test_size = 0.25, random_state=42)
    
    gs = GridSearchCV(pipeline, params, cv=cv, n_jobs=-1)
    gs.fit(X_train, y_train)
    
    results = {'best_params': gs.best_params_,
               'cv_score': gs.best_score_,
              'train_score': gs.score(X_train, y_train),
              'test_score': gs.score(X_test, y_test),
              'estimator': gs.best_estimator_.named_steps['classifier']}

    return results

In [13]:
def test_all_pipes(X, y):
    scores = []
    
    for pipe, param in tqdm([(dtc_pipe, dtc_params), (lr_pipe, lr_params), 
                        (knn_pipe, knn_params), (rfc_pipe, rfc_params),
                            (svc_pipe, svc_params)]):
        scores.append(gridsearch_pipe(X, y, pipe, param))
    
    scores_df = pd.DataFrame(scores)
    return scores_df
        

In [14]:
uci1_gs_results = test_all_pipes(Xuci_1_grouped, yuci_1)
uci2_gs_results = test_all_pipes(Xuci_2_grouped, yuci_2)
uci3_gs_results = test_all_pipes(Xuci_3_grouped, yuci_3)
db1_gs_results = test_all_pipes(Xdb_1_grouped, ydb_1)
db2_gs_results = test_all_pipes(Xdb_2_grouped, ydb_2)
db3_gs_results = test_all_pipes(Xdb_3_grouped, ydb_3)

100%|██████████| 5/5 [00:32<00:00,  6.43s/it]
100%|██████████| 5/5 [00:32<00:00,  6.55s/it]
100%|██████████| 5/5 [00:32<00:00,  6.42s/it]
100%|██████████| 5/5 [00:37<00:00,  7.48s/it]
100%|██████████| 5/5 [00:38<00:00,  7.63s/it]
100%|██████████| 5/5 [00:38<00:00,  7.66s/it]


In [15]:
uci1_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 3, 'classifier__spli...",0.736364,"DecisionTreeClassifier(class_weight=None, crit...",0.563636,0.854545
1,"{'classifier__C': 0.1, 'classifier__max_iter':...",0.7,"LogisticRegression(C=0.10000000000000001, clas...",0.542424,0.7
2,"{'classifier__n_neighbors': 1, 'pca__n_compone...",0.781818,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.69697,1.0
3,"{'classifier__max_depth': 5, 'classifier__n_es...",0.763636,"(DecisionTreeClassifier(class_weight=None, cri...",0.636364,0.981818
4,"{'classifier__C': 1.0, 'pca__n_components': 5,...",0.809091,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.7,0.909091


In [16]:
uci2_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 3, 'classifier__spli...",0.681818,"DecisionTreeClassifier(class_weight=None, crit...",0.60303,0.827273
1,"{'classifier__C': 1.0, 'classifier__max_iter':...",0.672727,"LogisticRegression(C=1.0, class_weight=None, d...",0.606061,0.681818
2,"{'classifier__n_neighbors': 1, 'pca__n_compone...",0.763636,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.690909,1.0
3,"{'classifier__max_depth': 5, 'classifier__n_es...",0.718182,"(DecisionTreeClassifier(class_weight=None, cri...",0.633333,0.945455
4,"{'classifier__C': 1.0, 'pca__n_components': 5,...",0.754545,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.681818,0.890909


In [17]:
uci3_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 15, 'classifier__spl...",0.627273,"DecisionTreeClassifier(class_weight=None, crit...",0.542424,1.0
1,"{'classifier__C': 1.0, 'classifier__max_iter':...",0.518182,"LogisticRegression(C=1.0, class_weight=None, d...",0.569697,0.545455
2,"{'classifier__n_neighbors': 1, 'pca__n_compone...",0.709091,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.736364,1.0
3,"{'classifier__max_depth': None, 'classifier__n...",0.672727,"(DecisionTreeClassifier(class_weight=None, cri...",0.639394,0.972727
4,"{'classifier__C': 10.0, 'pca__n_components': 5...",0.709091,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.721212,0.990909


In [18]:
db1_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': None, 'classifier__s...",0.676768,"DecisionTreeClassifier(class_weight=None, crit...",0.637037,1.0
1,"{'classifier__C': 0.01, 'classifier__max_iter'...",0.60404,"LogisticRegression(C=0.01, class_weight=None, ...",0.589899,0.6
2,"{'classifier__n_neighbors': 5, 'pca__n_compone...",0.727273,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.690236,0.822222
3,"{'classifier__max_depth': 5, 'classifier__n_es...",0.723232,"(DecisionTreeClassifier(class_weight=None, cri...",0.690236,0.860606
4,"{'classifier__C': 10.0, 'pca__n_components': 5...",0.729293,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.709764,0.824242


In [19]:
db2_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 15, 'classifier__spl...",0.661479,"DecisionTreeClassifier(class_weight=None, crit...",0.636127,0.996109
1,"{'classifier__C': 10.0, 'classifier__max_iter'...",0.638132,"LogisticRegression(C=10.0, class_weight=None, ...",0.573099,0.653696
2,"{'classifier__n_neighbors': 9, 'pca__n_compone...",0.694553,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.673814,0.747082
3,"{'classifier__max_depth': None, 'classifier__n...",0.723735,"(DecisionTreeClassifier(class_weight=None, cri...",0.697206,1.0
4,"{'classifier__C': 10.0, 'pca__n_components': 5...",0.706226,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.699155,0.889105


In [20]:
db3_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': None, 'classifier__s...",0.681452,"DecisionTreeClassifier(class_weight=None, crit...",0.648721,1.0
1,"{'classifier__C': 10.0, 'classifier__max_iter'...",0.657258,"LogisticRegression(C=10.0, class_weight=None, ...",0.584791,0.671371
2,"{'classifier__n_neighbors': 15, 'pca__n_compon...",0.725806,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.670256,0.752016
3,"{'classifier__max_depth': None, 'classifier__n...",0.717742,"(DecisionTreeClassifier(class_weight=None, cri...",0.70323,1.0
4,"{'classifier__C': 100.0, 'pca__n_components': ...",0.709677,"SVC(C=100.0, cache_size=200, class_weight=None...",0.672275,0.939516


Trying without the PCAs

In [21]:
dtc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('classifier', DecisionTreeClassifier())])

lr_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('classifier', LogisticRegression())])

knn_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('classifier', KNeighborsClassifier())])

rfc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('classifier', RandomForestClassifier())])

svc_pipe = Pipeline([('scaler', StandardScaler()),
                     ('rfe', SelectKBest()),
                     ('classifier', SVC())])

In [22]:
dtc_params = {'rfe__k': [5, 10],
             'classifier__max_depth': [1, 3, 5, 10, 15, None],
             'classifier__splitter': ['random', 'best']}

lr_params = {'rfe__k': [5, 10],
             'classifier__penalty': ['l1', 'l2'],
             'classifier__max_iter': [100, 500],
             'classifier__C': np.logspace(-3,3,7)}

knn_params = {'rfe__k': [5, 10],
             'classifier__n_neighbors': [1, 5, 9, 15, 25]}

rfc_params = {'rfe__k': [5, 10],
             'classifier__n_estimators': [10, 50, 100, 200, 500],
             'classifier__max_depth': [1, 5, None]}

svc_params = {'rfe__k': [5, 10],
              'classifier__C': np.logspace(-3,3,7)}

In [23]:
uci1_gs_results = test_all_pipes(Xuci_1_grouped, yuci_1)
uci2_gs_results = test_all_pipes(Xuci_2_grouped, yuci_2)
uci3_gs_results = test_all_pipes(Xuci_3_grouped, yuci_3)
db1_gs_results = test_all_pipes(Xdb_1_grouped, ydb_1)
db2_gs_results = test_all_pipes(Xdb_2_grouped, ydb_2)
db3_gs_results = test_all_pipes(Xdb_3_grouped, ydb_3)

100%|██████████| 5/5 [00:08<00:00,  1.65s/it]
100%|██████████| 5/5 [00:08<00:00,  1.62s/it]
100%|██████████| 5/5 [00:08<00:00,  1.65s/it]
100%|██████████| 5/5 [00:10<00:00,  2.11s/it]
100%|██████████| 5/5 [00:11<00:00,  2.35s/it]
100%|██████████| 5/5 [00:11<00:00,  2.25s/it]


In [24]:
uci1_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': None, 'classifier__s...",0.690909,"DecisionTreeClassifier(class_weight=None, crit...",0.612121,1.0
1,"{'classifier__C': 0.001, 'classifier__max_iter...",0.690909,"LogisticRegression(C=0.001, class_weight=None,...",0.557576,0.681818
2,"{'classifier__n_neighbors': 1, 'rfe__k': 10}",0.781818,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.7,1.0
3,"{'classifier__max_depth': None, 'classifier__n...",0.727273,"(DecisionTreeClassifier(class_weight=None, cri...",0.590909,1.0
4,"{'classifier__C': 1.0, 'rfe__k': 10}",0.8,"SVC(C=1.0, cache_size=200, class_weight=None, ...",0.672727,0.890909


In [25]:
uci2_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 10, 'classifier__spl...",0.681818,"DecisionTreeClassifier(class_weight=None, crit...",0.60303,0.963636
1,"{'classifier__C': 100.0, 'classifier__max_iter...",0.681818,"LogisticRegression(C=100.0, class_weight=None,...",0.578788,0.672727
2,"{'classifier__n_neighbors': 1, 'rfe__k': 10}",0.763636,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.690909,1.0
3,"{'classifier__max_depth': 5, 'classifier__n_es...",0.709091,"(DecisionTreeClassifier(class_weight=None, cri...",0.621212,0.936364
4,"{'classifier__C': 10.0, 'rfe__k': 10}",0.754545,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.709091,0.945455


In [26]:
uci3_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 10, 'classifier__spl...",0.627273,"DecisionTreeClassifier(class_weight=None, crit...",0.633333,0.945455
1,"{'classifier__C': 10.0, 'classifier__max_iter'...",0.518182,"LogisticRegression(C=10.0, class_weight=None, ...",0.590909,0.590909
2,"{'classifier__n_neighbors': 1, 'rfe__k': 10}",0.7,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.727273,1.0
3,"{'classifier__max_depth': None, 'classifier__n...",0.627273,"(DecisionTreeClassifier(class_weight=None, cri...",0.633333,0.981818
4,"{'classifier__C': 10.0, 'rfe__k': 10}",0.681818,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.718182,0.963636


In [27]:
db1_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 10, 'classifier__spl...",0.684848,"DecisionTreeClassifier(class_weight=None, crit...",0.610774,0.830303
1,"{'classifier__C': 0.01, 'classifier__max_iter'...",0.60404,"LogisticRegression(C=0.01, class_weight=None, ...",0.589899,0.6
2,"{'classifier__n_neighbors': 5, 'rfe__k': 5}",0.727273,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.690236,0.822222
3,"{'classifier__max_depth': None, 'classifier__n...",0.721212,"(DecisionTreeClassifier(class_weight=None, cri...",0.682155,1.0
4,"{'classifier__C': 10.0, 'rfe__k': 5}",0.729293,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.709764,0.824242


In [28]:
db2_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': None, 'classifier__s...",0.651751,"DecisionTreeClassifier(class_weight=None, crit...",0.615335,1.0
1,"{'classifier__C': 1.0, 'classifier__max_iter':...",0.640078,"LogisticRegression(C=1.0, class_weight=None, d...",0.57115,0.653696
2,"{'classifier__n_neighbors': 9, 'rfe__k': 10}",0.694553,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.673814,0.747082
3,"{'classifier__max_depth': None, 'classifier__n...",0.688716,"(DecisionTreeClassifier(class_weight=None, cri...",0.691358,1.0
4,"{'classifier__C': 10.0, 'rfe__k': 10}",0.70428,"SVC(C=10.0, cache_size=200, class_weight=None,...",0.707602,0.801556


In [29]:
db3_gs_results

Unnamed: 0,best_params,cv_score,estimator,test_score,train_score
0,"{'classifier__max_depth': 5, 'classifier__spli...",0.673387,"DecisionTreeClassifier(class_weight=None, crit...",0.611709,0.780242
1,"{'classifier__C': 10.0, 'classifier__max_iter'...",0.657258,"LogisticRegression(C=10.0, class_weight=None, ...",0.583445,0.669355
2,"{'classifier__n_neighbors': 15, 'rfe__k': 10}",0.725806,"KNeighborsClassifier(algorithm='auto', leaf_si...",0.679677,0.774194
3,"{'classifier__max_depth': None, 'classifier__n...",0.707661,"(DecisionTreeClassifier(class_weight=None, cri...",0.687079,1.0
4,"{'classifier__C': 1000.0, 'rfe__k': 10}",0.703629,"SVC(C=1000.0, cache_size=200, class_weight=Non...",0.677658,0.925403


### Time for some brute force

In [30]:
import itertools

In [31]:
def brute_test(X, y, estimator):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)
    
#     X_train = pd.DataFrame(X_train, columns=X.columns)
#     X_test = pd.DataFrame(X_test, columns=X.columns)
    
#     skb = SelectKBest(k=10)
#     skb.fit(X_train, y_train)
    
#     X_k_train = pd.DataFrame(skb.transform(X_train), columns=X.columns[skb.get_support()])
#     X_k_test = pd.DataFrame(skb.transform(X_test), columns=X.columns[skb.get_support()])
    X_k_train = X_train
    X_k_test = X_test


    results = []
    combos = list(itertools.combinations(X_k_train, 5))
    
    for cols in tqdm(combos):
        
        X_tr_tmp = X_k_train[list(cols)]
        X_te_tmp = X_k_test[list(cols)]
        
        model = estimator
        model.fit(X_tr_tmp, y_train)
        
        metrics = {'features': cols,
                  'train_score': model.score(X_tr_tmp, y_train),
                  'test_score': model.score(X_te_tmp, y_test)}
        
        results.append(metrics)
        
    return pd.DataFrame(results)

In [53]:
Xuci_1_df = brute_test(Xuci_1_grouped, yuci_1, RandomForestClassifier(n_estimators=1000))
Xuci_2_df = brute_test(Xuci_2_grouped, yuci_2, RandomForestClassifier(n_estimators=1000))
Xuci_3_df = brute_test(Xuci_3_grouped, yuci_3, RandomForestClassifier(n_estimators=1000))
Xdb_1_df = brute_test(Xdb_1_grouped, ydb_1, RandomForestClassifier(n_estimators=1000))
Xdb_2_df = brute_test(Xdb_2_grouped, ydb_2, RandomForestClassifier(n_estimators=1000))
Xdb_3_df = brute_test(Xdb_3_grouped, ydb_3, RandomForestClassifier(n_estimators=1000))

100%|██████████| 252/252 [04:34<00:00,  1.09s/it]
100%|██████████| 252/252 [04:30<00:00,  1.07s/it]
100%|██████████| 252/252 [04:28<00:00,  1.07s/it]
100%|██████████| 252/252 [10:01<00:00,  2.39s/it]
100%|██████████| 252/252 [10:22<00:00,  2.47s/it]
100%|██████████| 252/252 [10:04<00:00,  2.40s/it]


In [54]:
Xuci_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
36,"(A, B, E, F, G)",0.818182,1.0
39,"(A, B, E, F, J)",0.809091,1.0
37,"(A, B, E, F, H)",0.8,1.0
42,"(A, B, E, G, J)",0.790909,1.0
46,"(A, B, F, G, H)",0.790909,1.0


In [55]:
Xuci_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
98,"(A, D, E, H, I)",0.763636,1.0
58,"(A, C, D, E, H)",0.763636,1.0
100,"(A, D, E, I, J)",0.754545,1.0
128,"(B, C, D, E, H)",0.745455,1.0
231,"(D, E, F, G, H)",0.736364,1.0


In [56]:
Xuci_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
96,"(A, D, E, G, I)",0.763636,1.0
31,"(A, B, D, G, I)",0.754545,1.0
138,"(B, C, D, H, I)",0.745455,1.0
95,"(A, D, E, G, H)",0.745455,1.0
47,"(A, B, F, G, I)",0.745455,1.0


In [57]:
Xdb_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
89,"(A, C, G, I, J)",0.765657,1.0
17,"(A, B, C, G, J)",0.759596,1.0
99,"(A, D, E, H, J)",0.757576,1.0
80,"(A, C, E, I, J)",0.755556,1.0
202,"(C, D, E, G, J)",0.755556,1.0


In [58]:
Xdb_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
155,"(B, C, F, H, J)",0.72179,1.0
227,"(C, F, G, H, J)",0.717899,1.0
176,"(B, D, F, I, J)",0.715953,1.0
64,"(A, C, D, F, J)",0.714008,1.0
219,"(C, E, F, H, I)",0.712062,1.0


In [59]:
Xdb_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
64,"(A, C, D, F, J)",0.737903,1.0
74,"(A, C, E, F, J)",0.737903,1.0
86,"(A, C, F, I, J)",0.733871,1.0
115,"(A, E, F, H, J)",0.731855,1.0
82,"(A, C, F, G, I)",0.731855,1.0


In [39]:
Xuci_lr_1_df = brute_test(Xuci_1_grouped, yuci_1, LogisticRegression())
Xuci_lr_2_df = brute_test(Xuci_2_grouped, yuci_2, LogisticRegression())
Xuci_lr_3_df = brute_test(Xuci_3_grouped, yuci_3, LogisticRegression())
Xdb_lr_1_df = brute_test(Xdb_1_grouped, ydb_1, LogisticRegression())
Xdb_lr_2_df = brute_test(Xdb_2_grouped, ydb_2, LogisticRegression())
Xdb_lr_3_df = brute_test(Xdb_3_grouped, ydb_3, LogisticRegression())

100%|██████████| 252/252 [00:00<00:00, 569.68it/s]
100%|██████████| 252/252 [00:00<00:00, 614.59it/s]
100%|██████████| 252/252 [00:00<00:00, 566.80it/s]
100%|██████████| 252/252 [00:00<00:00, 337.76it/s]
100%|██████████| 252/252 [00:00<00:00, 302.71it/s]
100%|██████████| 252/252 [00:00<00:00, 312.07it/s]


In [40]:
Xuci_lr_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
34,"(A, B, D, H, J)",0.636364,0.6
3,"(A, B, C, D, H)",0.627273,0.581818
19,"(A, B, C, H, J)",0.627273,0.581818
153,"(B, C, F, G, J)",0.618182,0.587879
69,"(A, C, D, H, J)",0.618182,0.6


In [41]:
Xuci_lr_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
48,"(A, B, F, G, J)",0.645455,0.593939
14,"(A, B, C, F, J)",0.645455,0.593939
11,"(A, B, C, F, G)",0.645455,0.590909
10,"(A, B, C, E, J)",0.627273,0.593939
103,"(A, D, F, G, J)",0.627273,0.624242


In [42]:
Xuci_lr_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
130,"(B, C, D, E, J)",0.618182,0.59697
131,"(B, C, D, F, G)",0.618182,0.6
50,"(A, B, F, H, J)",0.618182,0.612121
105,"(A, D, F, H, J)",0.618182,0.609091
73,"(A, C, E, F, I)",0.618182,0.615152


In [43]:
Xdb_lr_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
9,"(A, B, C, E, I)",0.614141,0.589899
251,"(F, G, H, I, J)",0.610101,0.584512
59,"(A, C, D, E, I)",0.610101,0.589226
4,"(A, B, C, D, I)",0.610101,0.597306
132,"(B, C, D, F, H)",0.610101,0.583838


In [44]:
Xdb_lr_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
182,"(B, E, F, G, I)",0.616732,0.588694
183,"(B, E, F, G, J)",0.614786,0.594542
193,"(B, F, G, I, J)",0.61284,0.589344
228,"(C, F, G, I, J)",0.61284,0.59974
186,"(B, E, F, I, J)",0.610895,0.589994


In [45]:
Xdb_lr_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
181,"(B, E, F, G, H)",0.655242,0.604307
231,"(D, E, F, G, H)",0.647177,0.601615
72,"(A, C, E, F, H)",0.645161,0.602961
161,"(B, D, E, F, G)",0.643145,0.596904
81,"(A, C, F, G, H)",0.639113,0.599596


In [46]:
Xuci_knn_1_df = brute_test(Xuci_1_grouped, yuci_1, KNeighborsClassifier())
Xuci_knn_2_df = brute_test(Xuci_2_grouped, yuci_2, KNeighborsClassifier())
Xuci_knn_3_df = brute_test(Xuci_3_grouped, yuci_3, KNeighborsClassifier())
Xdb_knn_1_df = brute_test(Xdb_1_grouped, ydb_1, KNeighborsClassifier())
Xdb_knn_2_df = brute_test(Xdb_2_grouped, ydb_2, KNeighborsClassifier())
Xdb_knn_3_df = brute_test(Xdb_3_grouped, ydb_3, KNeighborsClassifier())

100%|██████████| 252/252 [00:00<00:00, 315.26it/s]
100%|██████████| 252/252 [00:00<00:00, 284.57it/s]
100%|██████████| 252/252 [00:00<00:00, 301.11it/s]
100%|██████████| 252/252 [00:02<00:00, 118.25it/s]
100%|██████████| 252/252 [00:02<00:00, 102.74it/s]
100%|██████████| 252/252 [00:02<00:00, 124.00it/s]


In [47]:
Xuci_knn_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
128,"(B, C, D, E, H)",0.836364,0.866667
92,"(A, D, E, F, H)",0.836364,0.815152
135,"(B, C, D, G, H)",0.836364,0.854545
197,"(C, D, E, F, H)",0.818182,0.824242
204,"(C, D, E, H, J)",0.818182,0.881818


In [48]:
Xuci_knn_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
98,"(A, D, E, H, I)",0.8,0.833333
100,"(A, D, E, I, J)",0.8,0.833333
4,"(A, B, C, D, I)",0.790909,0.857576
128,"(B, C, D, E, H)",0.790909,0.875758
102,"(A, D, F, G, I)",0.781818,0.830303


In [49]:
Xuci_knn_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
135,"(B, C, D, G, H)",0.818182,0.890909
30,"(A, B, D, G, H)",0.8,0.872727
138,"(B, C, D, H, I)",0.8,0.860606
136,"(B, C, D, G, I)",0.790909,0.830303
65,"(A, C, D, G, H)",0.790909,0.890909


In [50]:
Xdb_knn_1_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
229,"(C, F, H, I, J)",0.771717,0.813468
86,"(A, C, F, I, J)",0.761616,0.810774
204,"(C, D, E, H, J)",0.755556,0.799327
134,"(B, C, D, F, J)",0.753535,0.810101
160,"(B, C, H, I, J)",0.753535,0.800673


In [51]:
Xdb_knn_2_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
173,"(B, D, F, G, J)",0.706226,0.794022
164,"(B, D, E, F, J)",0.700389,0.790773
235,"(D, E, F, H, J)",0.696498,0.784276
238,"(D, E, G, H, J)",0.696498,0.781027
153,"(B, C, F, G, J)",0.694553,0.773879


In [52]:
Xdb_knn_3_df.sort_values('test_score', ascending=False).head()

Unnamed: 0,features,test_score,train_score
220,"(C, E, F, H, J)",0.72379,0.804172
105,"(A, D, F, H, J)",0.721774,0.796097
69,"(A, C, D, H, J)",0.721774,0.795424
85,"(A, C, F, H, J)",0.719758,0.79677
213,"(C, D, G, H, J)",0.717742,0.794751
