In [1]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import os.path as op
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt

from utils import qcAggregate, unstratifiedSample
from models import AggregateLearner

%matplotlib inline

### No Repeated Measures Experiment

In [2]:
bp = '../data/'
exp = '../figures/'
bpp = op.join(bp, exp)
try:
    os.makedirs(bpp)
except FileExistsError:
    pass

In [3]:
df_100 = pd.read_hdf('../data/aggregation_connect+feature+demo_dset100x1x1x20.h5')
df_100.drop(["edgecount", "globaleffic", "degree", "modularity",
             "assort", "avplength", "weight", "ccoeff", "betweenness",
             "plength", "age", "bmi", "cholesterol", "vo2max"], axis=1, inplace=True)
df_100.columns

Index(['subject', 'simulation', 'graph', 'sex'], dtype='object')

In [4]:
subs = list(df_100['subject'].unique())
for idx, row in df_100.iterrows():    
    # Replace graphs with some ID signal but a lot of common signal
    np.random.seed(row.subject + row.sex+4)
    g = 10*(np.random.random(row.graph.shape) - 0.5) + int(row.sex)*0.73
    np.random.seed(422)
    g += 20 * (np.random.random(row.graph.shape) - 0.2)
    row['graph'] = g
    del g

In [5]:
pca = PCA(n_components=20)
svc = SVC(kernel='linear', probability=True)
pipe = Pipeline(steps=[('pca', pca), ('svc', svc)])

# RS Determined as: 42, decrement until holdout was balanced by sex
rs = 41
clfmca_100 = AggregateLearner(df_100, pipe, target_id='sex',
                              observation_id='simulation',
                              sample_id='subject', data_id='graph',
                              random_seed=rs, verbose=False, triu=True)

In [6]:
%%time
clfmca_100.fit(aggregation='ref')
clfmca_100.fit(aggregation='median')
clfmca_100.fit(aggregation='consensus')
clfmca_100.fit(aggregation='mean')
clfmca_100.fit(aggregation='mega')
clfmca_100.fit(aggregation='none')
clfmca_100.fit(aggregation='meta')

CPU times: user 1min 31s, sys: 7.33 s, total: 1min 39s
Wall time: 1min 5s


In [7]:
clfmca_100.performance_report()

Unnamed: 0,aggregation,acc,f1,test_acc,test_f1,test_mean_acc,test_mean_f1,n_models
0,ref,0.886667,0.858077,0.736842,0.615385,0.736842,0.624908,5
1,median,0.886667,0.858077,0.736842,0.615385,0.736842,0.624908,5
2,consensus,0.899167,0.886049,0.789474,0.666667,0.705263,0.620696,5
3,mean,0.886667,0.858077,0.736842,0.615385,0.736842,0.624908,5
4,mega,0.911629,0.887326,0.736842,0.615385,0.747368,0.646154,5
5,none,0.905974,0.879676,0.831162,0.74529,0.764461,0.664856,505
6,meta,0.911667,0.882564,0.736842,0.615385,0.757895,0.647619,5


### Repeated Measures Experiment

In [8]:
df_25 = pd.read_hdf('../data/aggregation_connect+demo_dset25x2x2x20.h5')
df_25.drop(["age", "bmi", "cholesterol", "vo2max"], axis=1, inplace=True)
df_25.columns

Index(['subsample', 'graph', 'session', 'simulation', 'subject', 'sex'], dtype='object')

In [9]:
subs = list(df_25['subject'].unique())

np.random.seed(1234567890)
for idx, row in df_25.iterrows():    
    # Replace graphs with some ID signal but a lot of common signal
#     np.random.seed(row.subject*(row.session+1) + row.sex + 4)
    g = 10*(np.random.random(row.graph.shape) - 0.5) + int(row.sex)*0.7
#     np.random.seed(422)
    g += 20 * (np.random.random(row.graph.shape) - 0.2)
    row['graph'] = g
    del g

In [10]:
df_25mca = df_25.query('subsample == 0 and session == 0')
print(len(df_25mca))

df_25sub = df_25.query('simulation == "ref" and session == 0')
print(len(df_25sub))

df_25ses = df_25.query('simulation == "ref" and subsample == 0')
print(len(df_25ses))

525
50
50


In [11]:
pca = PCA(n_components=10)
svc = SVC(kernel='linear', probability=True)
pipe = Pipeline(steps=[('pca', pca), ('svc', svc)])

# RS Determined as: 42, increment until holdout was balanced by sex
rs = 45
clfmca_25 = AggregateLearner(df_25ses, pipe, target_id='sex',
                             observation_id='simulation',
                             sample_id='subject', data_id='graph',
                             random_seed=rs, verbose=False, oos=0.2,
                             cvfolds=5, triu=True)

In [12]:
clfmca_25.fit(aggregation='ref')
# clfmca_25.fit(aggregation='median')
# clfmca_25.fit(aggregation='consensus')
clfmca_25.fit(aggregation='mean')
clfmca_25.fit(aggregation='mega')
# clfmca_25.fit(aggregation='none')
# clfmca_25.fit(aggregation='meta')

In [13]:
clfmca_25.performance_report()

Unnamed: 0,aggregation,acc,f1,test_acc,test_f1,test_mean_acc,test_mean_f1,n_models
0,ref,0.62,0.759524,0.5,0.666667,0.5,0.666667,5
1,mean,0.62,0.759524,0.5,0.666667,0.5,0.666667,5
2,mega,0.62,0.759524,0.5,0.666667,0.5,0.666667,5


In [14]:
pca = PCA(n_components=10)
lrc = LogisticRegression(solver='liblinear')
pipe = Pipeline(steps=[('pca', pca), ('lrc', lrc)])

# RS Determined as: 42, increment until holdout was balanced by sex
rs = 45
clfmca_25 = AggregateLearner(df_25sub, pipe, target_id='sex',
                             observation_id='simulation',
                             sample_id='subject', data_id='graph',
                             random_seed=rs, verbose=False, oos=0.2,
                             cvfolds=5, triu=True)

In [15]:
clfmca_25.fit(aggregation='ref')
# clfmca_25.fit(aggregation='median')
# clfmca_25.fit(aggregation='consensus')
clfmca_25.fit(aggregation='mean')
clfmca_25.fit(aggregation='mega')
# clfmca_25.fit(aggregation='none')
# clfmca_25.fit(aggregation='meta')

In [16]:
clfmca_25.performance_report()

Unnamed: 0,aggregation,acc,f1,test_acc,test_f1,test_mean_acc,test_mean_f1,n_models
0,ref,0.79,0.850216,0.75,0.8,0.65,0.72,5
1,mean,0.92,0.933333,0.75,0.8,0.75,0.78,5
2,mega,0.79,0.850216,0.75,0.8,0.65,0.72,5


In [17]:
pca = PCA(n_components=13)
rfc = RandomForestClassifier()
pipe = Pipeline(steps=[('pca', pca), ('rfc', rfc)])

# RS Determined as: 42, increment until holdout was balanced by sex
rs = 43
clfmca_25 = AggregateLearner(df_25mca, pipe, target_id='sex',
                             observation_id='simulation',
                             sample_id='subject', data_id='graph',
                             random_seed=rs, verbose=False, oos=0.2,
                             cvfolds=5, triu=True)

In [18]:
clfmca_25.fit(aggregation='ref')
# clfmca_25.fit(aggregation='median')
# clfmca_25.fit(aggregation='consensus')
clfmca_25.fit(aggregation='mean')
clfmca_25.fit(aggregation='mega')
# clfmca_25.fit(aggregation='none')
# clfmca_25.fit(aggregation='meta')

In [19]:
clfmca_25.performance_report()

Unnamed: 0,aggregation,acc,f1,test_acc,test_f1,test_mean_acc,test_mean_f1,n_models
0,ref,0.65,0.780952,0.4,0.571429,0.4,0.571429,5
1,mean,0.95,0.96,1.0,1.0,0.92,0.933333,5
2,mega,0.942857,0.954417,0.914286,0.903226,0.902857,0.894149,5
