In [1]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

import pandas as pd
import scipy as sp
import numpy as np

from models import AggregateLearner
from utils import refSample

In [2]:
def performanceP(true, f1, acc, n=1000, rs=42):
    """Compare observed F1 / accuracy against scores of random binary guesses.

    For each metric, ``n`` label vectors are drawn uniformly from {0, 1}
    (same length as ``true``), scored against ``true``, and the fraction of
    draws scoring strictly above the observed value is reported as ``p``.
    A summary line per metric is printed.

    Parameters
    ----------
    true : sequence of labels the random guesses are scored against.
    f1, acc : observed F1 and accuracy values to compare with.
    n : number of random draws per metric.
    rs : seed; NumPy's *global* random state is reseeded with it on entry.

    Returns
    -------
    (f1p, accp) : fraction of random draws beating ``f1`` and ``acc``.
    """
    np.random.seed(rs)

    # Order matters: all F1 draws are consumed from the RNG before any
    # accuracy draws, so the two null distributions use different samples.
    checks = (
        (f1_score, f1, "F1: {0}/{1} (p = {2}; chance = {3})"),
        (accuracy_score, acc, "Acc: {0}/{1} (p = {2}; chance = {3})"),
    )

    pvalues = []
    for scorer, observed, template in checks:
        null_scores = np.array([scorer(true, np.random.choice(2, len(true)))
                                for _ in range(n)])
        n_better = np.sum(null_scores > observed)
        pvalue = 1.0 * n_better / n
        print(template.format(n_better, n, pvalue, np.mean(null_scores)))
        pvalues.append(pvalue)

    return pvalues[0], pvalues[1]


def logoffset(x):
    """Return the base-10 logarithm of ``x + 1``."""
    shifted = x + 1
    return np.log10(shifted)
def passthrough(x):
    """Identity transform: hand the input back unchanged."""
    return x
def rank(x):
    """Replace each value by its rank within its row (axis=1).

    Tied values receive the average of the ranks they span.
    """
    return sp.stats.rankdata(x, axis=1, method='average')
def nonzerolocs(x, p=0.8): return np.where(np.percentile(x, (1-p)*100, axis=0) > 0)
def prune(x, p=0.8, locs=None):
    """Keep only selected columns of ``x``.

    Parameters
    ----------
    x : 2-D array; columns are selected with ``x[:, locs]``.
    p : forwarded to ``nonzerolocs`` when ``locs`` is not supplied.
    locs : column selector, e.g. the tuple returned by ``nonzerolocs`` or a
        list / array of column indices. When ``None`` (or an empty list or
        tuple) the columns are recomputed from ``x`` via ``nonzerolocs``.

    Returns
    -------
    The selected columns, passed through ``np.vstack`` so that a selector
    given as a one-element tuple of indices still yields a 2-D result.
    """
    # A plain `if locs` raises ValueError for multi-element index arrays and
    # treats np.array([0]) as "missing", so arrays are accepted by type and
    # only None / empty sequences trigger the recomputation.
    missing = locs is None or (not isinstance(locs, np.ndarray) and not locs)
    if missing:
        locs = nonzerolocs(x, p)
    return np.vstack(x[:, locs])

In [3]:
# Load the dataset
df = pd.read_hdf('../data/aggregation_connect+feature+demo_dset100x1x1x20.h5')

# Settings reused by the cells below
target, observation, refstr = "age", "simulation", "ref"
jack, rs = 100, 41
p = 0.2  # forwarded to nonzerolocs() when locating the columns to keep

# Dummy learner (empty pipeline) built only to get at the prepared data
# from which the prune locations are determined
clf = AggregateLearner(
    df, [],
    target_id=target, observation_id=observation,
    sample_id='subject', data_id='graph', refstr=refstr,
    cvfolds=5, oos=0.2, jack=jack, triu=True,
    random_seed=rs, verbose=False,
)

# NOTE(review): the *_t attributes are presumably the out-of-sample split
# (oos=0.2) -- confirm against AggregateLearner.
X, y, g = clf._prep_data(clf.dat, clf.tar, clf.sam, refSample,
                         index=clf.refloc)
X_t, y_t, g_t = clf._prep_data(clf.dat_t, clf.tar_t, clf.sam_t, refSample,
                               index=clf.refloc)

# Prune locations are computed over both prepared sets stacked row-wise
locs = nonzerolocs(np.vstack((X, X_t)), p=p)

In [4]:
# Create pre-processing components
# Each component is a (name, estimator) pair usable as a Pipeline step.
prn = ('prune', FunctionTransformer(prune, kw_args={'locs': locs}))
log = ('log', FunctionTransformer(logoffset))
rnk = ('rnk', FunctionTransformer(rank))
# FunctionTransformer() with no func is the identity transform. Building the
# step this way keeps the cell re-runnable: it no longer wraps whatever the
# name `passthrough` currently points at (after the first run that is this
# tuple, not the helper function).
pss = ('pass', FunctionTransformer())
passthrough = pss  # legacy name for the step, kept for existing references

# Create decomposition components
pca = ('pca', PCA(n_components=20))
lda = ('lda', LinearDiscriminantAnalysis())

# Create classifier components
# max_iter is given as an int: scikit-learn releases that validate
# constructor parameters reject a float such as 1e5 when fit() is called.
svc = ('svc', SVC(class_weight="balanced", probability=True, max_iter=100000))
lrc = ('lrc', LogisticRegression(class_weight="balanced", solver='liblinear', max_iter=100000, penalty='l2'))
gnb = ('gnb', GaussianNB())
gpc = ('gpc', GaussianProcessClassifier())
knn = ('knn', KNeighborsClassifier(n_neighbors=10))
rfc = ('rfc', RandomForestClassifier(class_weight="balanced"))
ada = ('ada', AdaBoostClassifier())
# NOTE(review): lrc is defined above but not listed here -- confirm that
# leaving it out of the sweep is intentional.
clfs = [svc, gnb, gpc, knn, rfc, ada]

In [5]:
# Age: rank transform -> PCA -> logistic regression
target = "age"
pipe = Pipeline(steps=[rnk, pca, lrc])

clf = AggregateLearner(
    df, pipe,
    target_id=target, observation_id=observation,
    sample_id='subject', data_id='graph', refstr=refstr,
    cvfolds=5, oos=0.2, jack=jack, triu=True,
    random_seed=rs, verbose=False,
)

# Labels the chance-level comparison below is scored against
_, y_t, _ = clf._prep_data(clf.dat_t, clf.tar_t, clf.sam_t, refSample,
                           index=clf.refloc)

# Describe the pipeline, fit with the "ref" aggregation, show the report
print(target, ": ", " -> ".join(map(str, pipe)))
perf = clf.fit(aggregation="ref")
print(clf.performance_report().to_string())

# Compare the fitted scores against random binary guessing
f1p, accp = performanceP(y_t, f1=perf['f1'], acc=perf['acc'], n=1000)

age :  FunctionTransformer(func=<function rank at 0x11949fc80>) -> PCA(n_components=20) -> LogisticRegression(class_weight='balanced', max_iter=100000.0,
                   solver='liblinear')
  aggregation       acc        f1   p_acc    p_f1  test_acc   test_f1  p_test_acc  p_test_f1  test_mean_acc  test_mean_f1  n_models
0         ref  0.704167  0.781031  0.0424  0.0444      0.65  0.758621       0.061      0.045           0.68       0.77892         5
F1: 41/1000 (p = 0.041; chance = 0.5729847496870505)
Acc: 63/1000 (p = 0.063; chance = 0.5021)


In [6]:
# Sex: log offset -> PCA -> Gaussian naive Bayes
target = "sex"
pipe = Pipeline(steps=[log, pca, gnb])

clf = AggregateLearner(
    df, pipe,
    target_id=target, observation_id=observation,
    sample_id='subject', data_id='graph', refstr=refstr,
    cvfolds=5, oos=0.2, jack=jack, triu=True,
    random_seed=rs, verbose=False,
)

# Labels the chance-level comparison below is scored against
_, y_t, _ = clf._prep_data(clf.dat_t, clf.tar_t, clf.sam_t, refSample,
                           index=clf.refloc)

# Describe the pipeline, fit with the "ref" aggregation, show the report
print(target, ": ", " -> ".join(map(str, pipe)))
perf = clf.fit(aggregation="ref")
print(clf.performance_report().to_string())

# Compare the fitted scores against random binary guessing
f1p, accp = performanceP(y_t, f1=perf['f1'], acc=perf['acc'], n=1000)

sex :  FunctionTransformer(func=<function logoffset at 0x11949fa60>) -> PCA(n_components=20) -> GaussianNB()
  aggregation     acc        f1   p_acc    p_f1  test_acc   test_f1  p_test_acc  p_test_f1  test_mean_acc  test_mean_f1  n_models
0         ref  0.6025  0.730364  0.1448  0.0806      0.65  0.774194       0.061      0.015           0.62      0.749677         5
F1: 15/1000 (p = 0.015; chance = 0.5408287689913871)
Acc: 48/1000 (p = 0.048; chance = 0.49679999999999996)


In [7]:
# BMI: log offset -> PCA -> random forest
target = "bmi"
pipe = Pipeline(steps=[log, pca, rfc])

clf = AggregateLearner(
    df, pipe,
    target_id=target, observation_id=observation,
    sample_id='subject', data_id='graph', refstr=refstr,
    cvfolds=5, oos=0.2, jack=jack, triu=True,
    random_seed=rs, verbose=False,
)

# Labels the chance-level comparison below is scored against
_, y_t, _ = clf._prep_data(clf.dat_t, clf.tar_t, clf.sam_t, refSample,
                           index=clf.refloc)

# Describe the pipeline, fit with the "ref" aggregation, show the report
print(target, ": ", " -> ".join(map(str, pipe)))
perf = clf.fit(aggregation="ref")
print(clf.performance_report().to_string())

# Compare the fitted scores against random binary guessing
f1p, accp = performanceP(y_t, f1=perf['f1'], acc=perf['acc'], n=1000)

bmi :  FunctionTransformer(func=<function logoffset at 0x11949fa60>) -> PCA(n_components=20) -> RandomForestClassifier(class_weight='balanced')
  aggregation  acc        f1   p_acc    p_f1  test_acc   test_f1  p_test_acc  p_test_f1  test_mean_acc  test_mean_f1  n_models
0         ref  0.6  0.620363  0.1972  0.2386       0.7  0.727273        0.02      0.017           0.62      0.627273         5
F1: 17/1000 (p = 0.017; chance = 0.490177290194139)
Acc: 23/1000 (p = 0.023; chance = 0.5075)


In [8]:
# Relative VO2max: rank transform -> PCA -> SVC
target = "rel_vo2max"
pipe = Pipeline(steps=[rnk, pca, svc])

clf = AggregateLearner(
    df, pipe,
    target_id=target, observation_id=observation,
    sample_id='subject', data_id='graph', refstr=refstr,
    cvfolds=5, oos=0.2, jack=jack, triu=True,
    random_seed=rs, verbose=False,
)

# Labels the chance-level comparison below is scored against
_, y_t, _ = clf._prep_data(clf.dat_t, clf.tar_t, clf.sam_t, refSample,
                           index=clf.refloc)

# Describe the pipeline, fit with the "ref" aggregation, show the report
print(target, ": ", " -> ".join(map(str, pipe)))
perf = clf.fit(aggregation="ref")
print(clf.performance_report().to_string())

# Compare the fitted scores against random binary guessing
f1p, accp = performanceP(y_t, f1=perf['f1'], acc=perf['acc'], n=1000)

rel_vo2max :  FunctionTransformer(func=<function rank at 0x11949fc80>) -> PCA(n_components=20) -> SVC(class_weight='balanced', max_iter=100000.0, probability=True)
  aggregation       acc        f1  p_acc    p_f1  test_acc  test_f1  p_test_acc  p_test_f1  test_mean_acc  test_mean_f1  n_models
0         ref  0.540833  0.550364  0.313  0.3706      0.45     0.56       0.589      0.308           0.51      0.595222         5
F1: 312/1000 (p = 0.312; chance = 0.4968675929171709)
Acc: 630/1000 (p = 0.63; chance = 0.5071)


In [None]:
# Cholesterol: try every classifier in `clfs` behind the same
# prune -> rank -> PCA front end.
target = "cholesterol"

for c in clfs:
    # Use the classifier of this iteration as the final step. Previously the
    # step was hard-coded to `gpc`, so every pass refit the same pipeline and
    # the loop variable was never used.
    pipe = Pipeline([prn, rnk, pca, c])

    clf = AggregateLearner(df, pipe, target_id=target, observation_id=observation,
                           sample_id='subject', data_id='graph', refstr=refstr,
                           cvfolds=5, oos=0.2, jack=jack, triu=True,
                           random_seed=rs, verbose=False)

    # Labels the chance-level comparison below is scored against
    _, y_t, _ = clf._prep_data(clf.dat_t, clf.tar_t, clf.sam_t, refSample, index=clf.refloc)

    print(target, ": ", " -> ".join(str(p) for p in pipe))
    perf = clf.fit(aggregation="ref")
    print(clf.performance_report().to_string())

    # Compare the fitted scores against random binary guessing
    f1p, accp = performanceP(y_t, perf['f1'], perf['acc'], n=1000)