In [1]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC

import os.path as op
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt

from utils import qcAggregate, unstratifiedSample
from models import AggregatedLearner

%matplotlib inline

### No Repeated Measures Experiment

In [2]:
bp = '../data/'
exp = '../figures/'
bpp = op.join(bp, exp)
try:
    os.makedirs(bpp)
except FileExistsError:
    pass

In [3]:
df = pd.read_hdf('../data/aggregation_connect+feature+demo_dset100x1x1x20.h5')
df.drop(["edgecount", "globaleffic", "degree", "modularity",
         "assort", "avplength", "weight", "ccoeff", "betweenness",
         "plength", "age", "bmi", "cholesterol", "vo2max"], axis=1, inplace=True)
df.columns

Index(['subject', 'simulation', 'graph', 'sex'], dtype='object')

In [4]:
subs = list(df['subject'].unique())
for idx, row in df.iterrows():
    # Replace subids with unique numbers
    loc = subs.index(row.subject)
    row['subject'] = loc
    del loc
    
    row['sex'] = int(row['sex'])
    
    # Replace graphs with some ID signal but a lot of common signal
    np.random.seed(row.subject + row.sex+4)
    g = 10*(np.random.random(row.graph.shape) - 0.5) + int(row.sex)*0.73
    np.random.seed(422)
    g += 20 * (np.random.random(row.graph.shape) - 0.2)
    row['graph'] = g
    del g

In [5]:
# clf = StackingClassifier
pca = PCA(n_components=20)
svc = SVC(kernel='linear', probability=True)
pipe = Pipeline(steps=[('pca', pca), ('svc', svc)])

rs = 41  # Determined by: start at 42, decrement until holdout set was balanced across sex
clfagg = AggregatedLearner(df, pipe, target_id='sex', observation_id='simulation',
                           sample_id='subject', data_id='graph', random_seed=rs,
                           verbose=False)

In [8]:
clfagg.fit(aggregation='ref')
clfagg.fit(aggregation='median')
clfagg.fit(aggregation='consensus')
clfagg.fit(aggregation='mean')
clfagg.fit(aggregation='mega')
clfagg.fit(aggregation='none')
clfagg.fit(aggregation='meta')

In [9]:
clfagg.performance_report()

Unnamed: 0,aggregation,acc,f1,test_acc,test_f1,n_models
0,ref,0.931944,0.912698,0.8,0.75,10
1,median,0.931944,0.912698,0.8,0.75,10
2,consensus,0.920833,0.90381,0.9,0.857143,10
3,mean,0.931944,0.912698,0.8,0.75,10
4,mega,0.931885,0.912698,0.8,0.75,10
5,none,0.926581,0.904811,0.810891,0.753654,1010
6,meta,0.919444,0.893651,0.8,0.75,10


### Repeated Measures Experiment

In [13]:
df = pd.read_hdf('../data/aggregation_connect+demo_dset25x2x2x20.h5')
df.drop(["age", "bmi", "cholesterol", "vo2max", "seed"], axis=1, inplace=True)
df.columns

Index(['directions', 'graph', 'session', 'simulation', 'subject', 'sex'], dtype='object')

In [12]:
df

Unnamed: 0,directions,graph,seed,session,simulation,subject,sex
0,odd,"[[384.0, 104.0, 0.0, 95.0, 18.0, 0.0, 176.0, 6...",42,NFB2,sim-8,A00058503,2
1,even,"[[507.0, 118.0, 0.0, 135.0, 12.0, 0.0, 259.0, ...",42,NFB2,sim-8,A00058503,2
2,odd,"[[1341.0, 96.0, 1.0, 268.0, 70.0, 2.0, 172.0, ...",42,ALGA,sim-8,A00058503,2
3,even,"[[1380.0, 192.0, 2.0, 230.0, 60.0, 0.0, 190.0,...",42,ALGA,sim-8,A00058503,2
4,even,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",42,ALGA,sim-8,A00060602,2
...,...,...,...,...,...,...,...
2095,odd,"[[114.0, 0.0, 0.0, 108.0, 36.0, 0.0, 0.0, 2.0,...",42,NFBAR,sim-11,A00038642,1
2096,odd,"[[277.0, 22.0, 27.0, 1.0, 16.0, 10.0, 42.0, 0....",42,DS2,sim-11,A00040628,2
2097,even,"[[608.0, 49.0, 51.0, 0.0, 50.0, 15.0, 144.0, 0...",42,DS2,sim-11,A00040628,2
2098,even,"[[672.0, 0.0, 0.0, 54.0, 105.0, 92.0, 2.0, 1.0...",42,NFBAR,sim-11,A00040628,2
