## Setup

In [1]:
import GMHI
import config
import functions
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import balanced_accuracy_score
from GMHI import GMHI

In [2]:
# Load the feature table `X` and the labels `y` through the project helper.
# Both are accessed via `.values` in later cells — presumably pandas objects; TODO confirm.
X, y = functions.load_taxonomy()

In [3]:
# In-sample baseline: GMHI with fixed thresholds is fitted on every sample and
# then scored on those same samples, so the value shown is a training-set
# balanced accuracy, not a held-out estimate.
gmhi = GMHI(theta_f=1.6, theta_d=0)
gmhi.fit(X.values, y.values)
y_hat = gmhi.predict(X.values)
score = balanced_accuracy_score(y, y_hat)
# Bare last expression so the notebook displays the score.
score

False


0.7218442508719711

## Models and CV methods

In [3]:
# Candidate cross-validation splitters; later cells pick one by position:
#   [0] 10-fold stratified split, shuffled with a fixed seed
#   [1] leave-one-group-out
#   [2] leave-one-out
cross_validation_methods = [
    StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
    LeaveOneGroupOut(),
    LeaveOneOut(),
]

In [154]:
# Group labels derived from X by the project helper; they are passed as
# `groups=` to the grid-search fit further down.
groups = functions.get_groups(X)
# Bare last expression so the notebook displays the array.
groups

array([ 8,  8,  8, ...,  5,  5, 41])

In [155]:
# L1-penalised logistic regression fitted with the SAGA solver.
# `l1_ratio` is deliberately not passed (it was disabled in an earlier draft);
# per the scikit-learn docs it only matters for the elastic-net penalty.
logreg = LogisticRegression(
    penalty='l1',
    solver='saga',
    fit_intercept=True,
    max_iter=700,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)

# Fresh GMHI estimator for the grid search; this rebinds the `gmhi` name used
# by the baseline cell above. theta_f / theta_d are not set here.
gmhi = GMHI(use_shannon=True)

In [156]:
# Registry of candidate models: each entry pairs an estimator with the
# hyper-parameter grid to search for it.
models = {
    "gmhi": dict(
        estimator=gmhi,
        param_grid=dict(theta_f=[1.5, 1.6], theta_d=[0]),
    ),
    "logreg": dict(
        estimator=logreg,
        param_grid=dict(C=[0.3]),
    ),
}

In [157]:
# Names of the candidate models; these match the keys of the `models` registry.
model_names = [
    "gmhi",
    "logreg",
]

In [158]:
# Grid search over the GMHI hyper-parameters.
# NOTE(review): index 2 of `cross_validation_methods` is LeaveOneOut, so every
# test fold holds a single sample. The per-fold balanced accuracy can then only
# be 0 or 1, and the averaged score is effectively plain accuracy — confirm
# this is the intended metric/splitter combination.
model = GridSearchCV(
    estimator=models['gmhi']['estimator'],
    param_grid=models['gmhi']['param_grid'],
    cv=cross_validation_methods[2],
    scoring="balanced_accuracy",
    n_jobs=-1,
    verbose=10,
)

In [159]:
%%time
# Run the grid search (the cell magic above must stay on the first line).
# NOTE(review): `groups` is forwarded to the CV splitter, but the recorded
# output shows cv=LeaveOneOut(), which per the scikit-learn docs ignores
# groups — confirm whether LeaveOneGroupOut was intended here.
model.fit(X, y, groups=groups)

Fitting 5026 folds for each of 2 candidates, totalling 10052 fits
CPU times: user 32.7 s, sys: 3.25 s, total: 36 s
Wall time: 1min 4s


GridSearchCV(cv=LeaveOneOut(), estimator=GMHI(use_shannon=True), n_jobs=-1,
             param_grid={'theta_d': [0], 'theta_f': [1.5, 1.6]},
             scoring='balanced_accuracy', verbose=10)

In [161]:
# Show the full parameter set of the refitted best estimator; this includes
# parameters that were not part of the search grid (e.g. use_shannon).
model.best_estimator_.get_params()

{'theta_d': 0, 'theta_f': 1.6, 'use_shannon': True}