In [12]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import FeatureAgglomeration
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

import pandas as pd
import scipy as sp
import numpy as np

from models import AggregateLearner
from utils import refSample

In [13]:
def performanceP(true, f1, acc, n=1000, rs=42):
    np.random.seed(rs)

    f1_dist = np.array([f1_score(true, np.random.choice(2, len(true)))
                        for _ in range(n)])
    f1N = np.sum(f1_dist > f1)
    f1p = 1.0 * f1N / n
    print("F1: {0} (p = {2}; chance = {3})".format(f1, f1p, np.mean(f1_dist)))
    
    acc_dist = np.array([accuracy_score(true, np.random.choice(2, len(true)))
                for _ in range(n)])
    accN = np.sum(acc_dist > acc)
    accp = 1.0 * accN / n
    print("Acc: {0} (p = {2}; chance = {3})".format(acc, accp, np.mean(acc_dist)))
    return f1p, accp

In [14]:
# Load Dataset
df = pd.read_hdf('../data/aggregation_connect+feature+demo_dset100x1x1x20.h5')

In [71]:
# Create list of targets
targets = ["age", "sex", "bmi"]

# Create pre-processing components
datavars = ["graph", "rankgraph", "loggraph", "zgraph"]

# Create decomposition components
pca = ('pca', PCA(n_components=20))
fa = ('fa', FeatureAgglomeration(n_clusters=20))
embeds = [pca, fa]

# Create classifier components
svc = ('svc', SVC(class_weight="balanced", probability=True, max_iter=1e5))
lrc = ('lrc', LogisticRegression(class_weight="balanced", solver='liblinear', max_iter=1e5, penalty='l2'))
gnb = ('gnb', GaussianNB())
gpc = ('gpc', GaussianProcessClassifier())
knn = ('knn', KNeighborsClassifier(n_neighbors=10))
rfc = ('rfc', RandomForestClassifier(class_weight="balanced"))
ada = ('ada', AdaBoostClassifier())
clfs = [svc, lrc, gnb, gpc, knn, rfc, ada]

- age (zgraph fa rfc): 0.8750
- sex (rankgraph pca knn): 0.8148
- bmi (graph fa ada): 0.8182

In [73]:
observation = "simulation"
refstr = "ref"
jack = 100
rs = 41

results = []
targets = ['age']
for t in targets:
    for dva in datavars:
        for e in embeds:
            for c in clfs:
                pipe = Pipeline([e, c])
                clf = AggregateLearner(df, pipe, data_id=dva, target_id=t, observation_id=observation,
                                       sample_id='subject', refstr=refstr, cvfolds=5, oos=0.2,
                                       jack=jack, triu=True, random_seed=rs, verbose=False)

                
                print(t, dva, e[0], c[0])
                perf = clf.fit(aggregation="ref")
                perf['f1'], perf['acc'], perf['p_f1'], perf['p_acc']
                results += [{
                    "target": t,
                    "preproc": dva,
                    "embedding": e[0],
                    "classifier": c[0],
                    "f1": perf['f1'],
                    "acc": perf['acc'],
                    "p_f1": perf['p_f1'],
                    "p_acc": perf['p_acc']
                }]

resdf = pd.DataFrame.from_dict(results)

age graph pca svc
age graph pca lrc
age graph pca gnb
age graph pca gpc
age graph pca knn
age graph pca rfc
age graph pca ada
age graph fa svc
age graph fa lrc
age graph fa gnb
age graph fa gpc
age graph fa knn
age graph fa rfc
age graph fa ada
age rankgraph pca svc
age rankgraph pca lrc
age rankgraph pca gnb
age rankgraph pca gpc
age rankgraph pca knn
age rankgraph pca rfc
age rankgraph pca ada
age rankgraph fa svc
age rankgraph fa lrc
age rankgraph fa gnb
age rankgraph fa gpc
age rankgraph fa knn
age rankgraph fa rfc
age rankgraph fa ada
age loggraph pca svc
age loggraph pca lrc
age loggraph pca gnb
age loggraph pca gpc
age loggraph pca knn
age loggraph pca rfc
age loggraph pca ada
age loggraph fa svc
age loggraph fa lrc
age loggraph fa gnb
age loggraph fa gpc
age loggraph fa knn
age loggraph fa rfc
age loggraph fa ada
age zgraph pca svc
age zgraph pca lrc
age zgraph pca gnb
age zgraph pca gpc
age zgraph pca knn
age zgraph pca rfc
age zgraph pca ada
age zgraph fa svc
age zgraph fa lr

In [75]:
resdf.groupby("classifier").mean()

Unnamed: 0_level_0,f1,acc,p_f1,p_acc
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ada,0.795168,0.68125,0.0285,0.05675
gnb,0.702895,0.59375,0.202125,0.204125
gpc,0.477456,0.5,0.460375,0.424625
knn,0.807112,0.7,0.01575,0.03375
lrc,0.794953,0.71875,0.0495,0.05
rfc,0.823467,0.7125,0.0095,0.023
svc,0.826615,0.725,0.008625,0.02075


In [33]:
newdf = resdf.copy(deep=True)
newdf['embedding'] = newdf['embedding'].apply(lambda x: x[0:][0])
newdf['classifier'] = newdf['classifier'].apply(lambda x: x[0:][0])

In [43]:
print(newdf.groupby(['classifier'], as_index=False).mean())

  classifier        f1       acc      p_f1     p_acc
0        ada  0.723823  0.643750  0.083333  0.113208
1        gnb  0.651501  0.597917  0.248292  0.190917
2        gpc  0.422049  0.539583  0.539292  0.355250
3        knn  0.691480  0.631250  0.164833  0.123917
4        rfc  0.752474  0.677083  0.037917  0.051875
5        svc  0.710844  0.639583  0.126375  0.104792


In [48]:
print(newdf.query("classifier == 'svc'").groupby(['target', 'classifier'], as_index=False).max())

  target classifier preproc embedding        f1   acc   p_f1  p_acc
0    age        svc  zgraph       pca  0.866667  0.80  0.021  0.061
1    bmi        svc  zgraph       pca  0.636364  0.65  0.688  0.390
2    sex        svc  zgraph       pca  0.758621  0.65  0.039  0.136


In [45]:
print(newdf.query("classifier == 'svc' and target == 'bmi'"))

    target    preproc embedding classifier        f1   acc   p_f1  p_acc
96     bmi      graph       pca        svc  0.500000  0.60  0.477  0.128
102    bmi      graph        fa        svc  0.636364  0.60  0.110  0.128
108    bmi  rankgraph       pca        svc  0.600000  0.60  0.191  0.128
114    bmi  rankgraph        fa        svc  0.526316  0.55  0.395  0.244
120    bmi   loggraph       pca        svc  0.444444  0.50  0.688  0.390
126    bmi   loggraph        fa        svc  0.500000  0.60  0.477  0.128
132    bmi     zgraph       pca        svc  0.631579  0.65  0.136  0.062
138    bmi     zgraph        fa        svc  0.600000  0.60  0.191  0.128


In [54]:
newdf.query('target == "bmi"').groupby(['classifier']).mean()

Unnamed: 0_level_0,f1,acc,p_f1,p_acc
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ada,0.663949,0.65,0.125875,0.11025
gnb,0.567478,0.6125,0.357375,0.151
gpc,0.376553,0.59375,0.601625,0.22275
knn,0.518138,0.5625,0.435625,0.2325
rfc,0.693341,0.6875,0.052625,0.0305
svc,0.554838,0.5875,0.333125,0.167


In [67]:
newdf.query('target == "sex"').groupby(['classifier']).max()

Unnamed: 0_level_0,target,preproc,embedding,f1,acc,p_f1,p_acc
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ada,sex,zgraph,pca,0.8,0.7,0.188,0.408
gnb,sex,zgraph,pca,0.8,0.7,0.897,0.862
gpc,sex,zgraph,pca,0.8,0.75,1.0,0.743
knn,sex,zgraph,pca,0.814815,0.75,0.12,0.248
rfc,sex,zgraph,pca,0.8,0.7,0.188,0.248
svc,sex,zgraph,pca,0.758621,0.65,0.039,0.136


In [70]:
newdf.query('target == "age"').groupby(['classifier']).min()

Unnamed: 0_level_0,target,preproc,embedding,f1,acc,p_f1,p_acc
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ada,age,graph,fa,0.709677,0.55,0.002,0.0
gnb,age,graph,fa,0.47619,0.45,0.017,0.018
gpc,age,graph,fa,0.0,0.3,0.021,0.061
knn,age,graph,fa,0.774194,0.65,0.003,0.0
rfc,age,graph,fa,0.787879,0.65,0.002,0.0
svc,age,graph,fa,0.787879,0.65,0.002,0.0
