In [12]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import FeatureAgglomeration
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

import pandas as pd
import scipy as sp
import numpy as np

from models import AggregateLearner
from utils import refSample

In [13]:
def performanceP(true, f1, acc, n=1000, rs=42):
    """Estimate empirical p-values for observed F1 and accuracy scores.

    Null distributions are built by scoring ``n`` uniformly random binary
    predictions (labels drawn from {0, 1}) against ``true`` with
    ``f1_score`` and ``accuracy_score``. Each p-value is the fraction of
    random scores that are strictly greater than the observed score.

    Parameters
    ----------
    true : array-like
        Ground-truth labels; only its length and values are used for scoring.
    f1 : float
        Observed F1 score to compare against the null distribution.
    acc : float
        Observed accuracy to compare against the null distribution.
    n : int, optional
        Number of random prediction vectors drawn per metric.
    rs : int, optional
        Seed passed to ``np.random.seed``; note this resets NumPy's global RNG.

    Returns
    -------
    tuple
        ``(f1p, accp)``, the empirical p-values for F1 and accuracy.
    """
    np.random.seed(rs)

    # Null distribution of F1 under random guessing.
    f1_dist = np.array([f1_score(true, np.random.choice(2, len(true)))
                        for _ in range(n)])
    f1N = np.sum(f1_dist > f1)
    f1p = 1.0 * f1N / n
    # Placeholders must index the three arguments 0..2; the previous
    # {2}/{3} pair raised IndexError and never printed the p-value.
    print("F1: {0} (p = {1}; chance = {2})".format(f1, f1p, np.mean(f1_dist)))

    # Null distribution of accuracy, drawn after the F1 draws from the same
    # seeded stream.
    acc_dist = np.array([accuracy_score(true, np.random.choice(2, len(true)))
                         for _ in range(n)])
    accN = np.sum(acc_dist > acc)
    accp = 1.0 * accN / n
    print("Acc: {0} (p = {1}; chance = {2})".format(acc, accp, np.mean(acc_dist)))
    return f1p, accp

In [14]:
# Read the dataset from its HDF5 file into a DataFrame
df = pd.read_hdf(
    path_or_buf='../data/aggregation_connect+feature+demo_dset100x1x1x20.h5'
)

In [17]:
# Create list of targets (one full sweep of the grid below is run per target)
targets = ["age", "sex", "bmi"]

# Create pre-processing components: names of the data variants to evaluate
datavars = ["graph", "rankgraph", "loggraph", "zgraph"]

# Create decomposition components as (name, estimator) pipeline steps
pca = ('pca', PCA(n_components=20))
fa = ('fa', FeatureAgglomeration(n_clusters=20))
embeds = [pca, fa]

# Create classifier components as (name, estimator) pipeline steps.
# max_iter is given as an int: scikit-learn documents it as an integer and
# releases with parameter validation reject a float such as 1e5.
svc = ('svc', SVC(class_weight="balanced", probability=True, max_iter=100000))
lrc = ('lrc', LogisticRegression(class_weight="balanced", solver='liblinear', max_iter=100000, penalty='l2'))
gnb = ('gnb', GaussianNB())
gpc = ('gpc', GaussianProcessClassifier())
knn = ('knn', KNeighborsClassifier(n_neighbors=10))
rfc = ('rfc', RandomForestClassifier(class_weight="balanced"))
ada = ('ada', AdaBoostClassifier())
# NOTE(review): lrc is defined above but not included here — confirm this is intentional.
clfs = [svc, gnb, gpc, knn, rfc, ada]

- age (zgraph fa rfc): 0.8750
- sex (rankgraph pca knn): 0.8148
- bmi (graph fa ada): 0.8182

In [22]:
# Settings forwarded unchanged to every AggregateLearner built below
observation = "simulation"
refstr = "ref"
jack = 100
rs = 41

# Evaluate every target x pre-processing x embedding x classifier combination
# and collect one record of scores per combination.
results = []
for t in targets:
    for dva in datavars:
        for e in embeds:
            for c in clfs:
                # e and c are (name, estimator) tuples, used directly as pipeline steps
                pipe = Pipeline([e, c])
                clf = AggregateLearner(df, pipe, data_id=dva, target_id=t, observation_id=observation,
                                       sample_id='subject', refstr=refstr, cvfolds=5, oos=0.2,
                                       jack=jack, triu=True, random_seed=rs, verbose=False)

                # Progress line: which combination is about to be fitted
                print(t, dva, e[0], c[0])
                perf = clf.fit(aggregation="ref")
                # Store only the step names (not the estimator objects) with the scores
                results.append({
                    "target": t,
                    "preproc": dva,
                    "embedding": e[0],
                    "classifier": c[0],
                    "f1": perf['f1'],
                    "acc": perf['acc'],
                    "p_f1": perf['p_f1'],
                    "p_acc": perf['p_acc']
                })

# One row per evaluated combination
resdf = pd.DataFrame(results)



Age
age graph pca svc
age graph pca gnb
age graph pca gpc
age graph pca knn
age graph pca rfc
age graph pca ada
age graph fa svc
age graph fa gnb
age graph fa gpc
age graph fa knn
age graph fa rfc
age graph fa ada
age rankgraph pca svc
age rankgraph pca gnb
age rankgraph pca gpc
age rankgraph pca knn
age rankgraph pca rfc
age rankgraph pca ada
age rankgraph fa svc
age rankgraph fa gnb
age rankgraph fa gpc
age rankgraph fa knn
age rankgraph fa rfc
age rankgraph fa ada
age loggraph pca svc
age loggraph pca gnb
age loggraph pca gpc
age loggraph pca knn
age loggraph pca rfc
age loggraph pca ada
age loggraph fa svc
age loggraph fa gnb
age loggraph fa gpc
age loggraph fa knn
age loggraph fa rfc
age loggraph fa ada
age zgraph pca svc
age zgraph pca gnb
age zgraph pca gpc
age zgraph pca knn
age zgraph pca rfc
age zgraph pca ada
age zgraph fa svc
age zgraph fa gnb
age zgraph fa gpc
age zgraph fa knn
age zgraph fa rfc
age zgraph fa ada


Sex
sex graph pca svc
sex graph pca gnb
sex graph pca gp

In [33]:
def _step_name(step):
    """Return a pipeline step's name from either the name itself or a (name, estimator) tuple.

    Indexing a plain string with ``[0]`` would keep only its first character
    (e.g. 'gnb' and 'gpc' would both collapse to 'g'), so strings are
    returned unchanged.
    """
    return step if isinstance(step, str) else step[0]


# Work on a deep copy so resdf itself is left untouched
newdf = resdf.copy(deep=True)
newdf['embedding'] = newdf['embedding'].apply(_step_name)
newdf['classifier'] = newdf['classifier'].apply(_step_name)

In [43]:
# Mean scores per classifier. numeric_only=True restricts the mean to the
# numeric score columns; without it, pandas versions that no longer drop
# string columns silently raise a TypeError here.
print(newdf.groupby(['classifier'], as_index=False).mean(numeric_only=True))

  classifier        f1       acc      p_f1     p_acc
0        ada  0.723823  0.643750  0.083333  0.113208
1        gnb  0.651501  0.597917  0.248292  0.190917
2        gpc  0.422049  0.539583  0.539292  0.355250
3        knn  0.691480  0.631250  0.164833  0.123917
4        rfc  0.752474  0.677083  0.037917  0.051875
5        svc  0.710844  0.639583  0.126375  0.104792


In [44]:
# Mean SVC scores per target. numeric_only=True restricts the mean to the
# numeric score columns; without it, pandas versions that no longer drop
# string columns silently raise a TypeError here.
print(newdf.query("classifier == 'svc'").groupby(['target', 'classifier'], as_index=False).mean(numeric_only=True))

  target classifier        f1      acc      p_f1     p_acc
0    age        svc  0.826615  0.72500  0.008625  0.020750
1    bmi        svc  0.554838  0.58750  0.333125  0.167000
2    sex        svc  0.751078  0.60625  0.037375  0.126625


In [45]:
# Show every individual SVC run on the bmi target
print(newdf.loc[(newdf['classifier'] == 'svc') & (newdf['target'] == 'bmi')])

    target    preproc embedding classifier        f1   acc   p_f1  p_acc
96     bmi      graph       pca        svc  0.500000  0.60  0.477  0.128
102    bmi      graph        fa        svc  0.636364  0.60  0.110  0.128
108    bmi  rankgraph       pca        svc  0.600000  0.60  0.191  0.128
114    bmi  rankgraph        fa        svc  0.526316  0.55  0.395  0.244
120    bmi   loggraph       pca        svc  0.444444  0.50  0.688  0.390
126    bmi   loggraph        fa        svc  0.500000  0.60  0.477  0.128
132    bmi     zgraph       pca        svc  0.631579  0.65  0.136  0.062
138    bmi     zgraph        fa        svc  0.600000  0.60  0.191  0.128
