## Setup

In [89]:
import GMHI
import config
import functions
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from GMHI import GMHI
import numpy as np

In [34]:
X, y = functions.load_both()

## Models and CV methods

In [35]:
strat = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
logo = LeaveOneGroupOut()
loo = LeaveOneOut()

In [36]:
groups = functions.get_groups(X)
groups

array([ 2,  2,  2, ...,  1,  1, 35])

In [37]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Binarizer
minmax = MinMaxScaler()
standard = StandardScaler()
binary = Binarizer(0.00001)

In [118]:
logreg = LogisticRegression(C = 1, penalty='l1',
fit_intercept=True, max_iter=700, random_state=42, # l1_ratio=0.8,
                    solver='liblinear', n_jobs=-1, verbose=0)
gmhi = GMHI(use_shannon=False, theta_f=1.1, theta_d=0.0)

In [106]:
gmhi_parameters = {
#     'gmhi__theta_f' : [1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
#     'gmhi__theta_d' : [0, 0.05, 0.1, 0.15, 0.2]
    'gmhi__theta_f' : [1.5],
    'gmhi__theta_d' : [0]
}
logreg_parameters = {
    'logreg__C' : [1]
}

In [119]:
from sklearn.feature_selection import VarianceThreshold
X_bool = X > 0.0001
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_bool_new = sel.fit_transform(X_bool)
X_bool_new.shape

(4588, 194)

In [120]:
from sklearn.feature_selection import SelectKBest, chi2
X_new = SelectKBest(chi2, k=214).fit_transform(X, y)
X_new = minmax.fit_transform(X_new)
X_new.max(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [121]:
scores = cross_val_score(logreg, minmax.fit_transform(X), y, groups=groups, 
                         cv=logo, n_jobs=-1, verbose=10, scoring="accuracy",
                        )
scores

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 out of  36 | elapsed:    6.7s remaining:   20.1s
[Parallel(n_jobs=-1)]: Done  13 out of  36 | elapsed:    7.1s remaining:   12.5s
[Parallel(n_jobs=-1)]: Done  17 out of  36 | elapsed:    9.1s remaining:   10.1s
[Parallel(n_jobs=-1)]: Done  21 out of  36 | elapsed:   10.4s remaining:    7.4s
[Parallel(n_jobs=-1)]: Done  25 out of  36 | elapsed:   11.4s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done  29 out of  36 | elapsed:   12.1s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done  33 out of  36 | elapsed:   13.5s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   14.2s finished


array([0.71428571, 0.69512195, 0.5483871 , 0.9       , 0.81313703,
       0.60487805, 0.41409692, 0.48      , 0.38461538, 0.88571429,
       0.72727273, 0.75      , 0.35820896, 0.92307692, 0.69148936,
       0.51327434, 0.53299492, 0.44554455, 0.6741573 , 0.53804348,
       0.46575342, 0.82608696, 0.56      , 0.45714286, 0.5       ,
       0.14285714, 0.58333333, 0.87878788, 0.7130621 , 0.58878505,
       0.7       , 0.57142857, 0.45882353, 0.53691275, 0.51351351,
       0.08163265])

In [122]:
group_sizes = np.bincount(groups)

In [123]:
def get_overall_acc(scores, group_sizes):
    total_num_correct = (scores * group_sizes).sum()
    return total_num_correct / group_sizes.sum()

In [124]:
get_overall_acc(scores, group_sizes)

0.6231473408892764

In [55]:
pipe = Pipeline(steps=[('gmhi', gmhi)])
gridsearch = GridSearchCV(
    estimator=pipe,
    param_grid=gmhi_parameters,
    scoring='balanced_accuracy',
    verbose=10,
    n_jobs=-1,
    cv=loo
)
gridsearch.fit(X, y, groups)

Fitting 4588 folds for each of 1 candidates, totalling 4588 fits


KeyboardInterrupt: 

In [None]:
best_params = gridsearch.best_estimator_.get_params()
print(best_params)
score = gridsearch.best_score_
print(score)