In [3]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import os.path as op
import pandas as pd
import numpy as np
import os

import matplotlib.pyplot as plt

from utils import qcAggregate, unstratifiedSample
from models import AggregateLearner

%matplotlib inline

### No Repeated Measures Experiment

In [4]:
# Directory used for this notebook's outputs.
# NOTE(review): joining '../data/' with '../figures/' gives '../data/../figures/',
# i.e. a `figures` directory that sits next to `data` -- confirm this is intended.
bp = '../data/'
exp = '../figures/'
bpp = op.join(bp, exp)
# exist_ok=True keeps the cell re-runnable, and (unlike swallowing FileExistsError)
# still raises if the path exists but is not a directory.
os.makedirs(bpp, exist_ok=True)

In [3]:
# Load the 100-subject dataset and discard the graph-statistic and demographic
# columns that are not used below; what remains is listed by the final line.
df_100 = pd.read_hdf(
    '../data/aggregation_connect+feature+demo_dset100x1x1x20.h5'
).drop(columns=[
    "edgecount", "globaleffic", "degree", "modularity",
    "assort", "avplength", "weight", "ccoeff", "betweenness",
    "plength", "age", "bmi", "cholesterol", "rel_vo2max",
])
df_100.columns

Index(['subject', 'simulation', 'graph', 'sex'], dtype='object')

In [4]:
# Overwrite every graph with synthetic data: a component seeded from the row's
# subject and sex, plus a component drawn from one fixed seed.
subs = list(df_100['subject'].unique())
for idx, row in df_100.iterrows():
    # Replace graphs with some ID signal but a lot of common signal
    # Seeding from subject + sex makes the ID component repeatable per subject.
    np.random.seed(int(row.subject + row.sex + 4))
    g = 10 * (np.random.random(row.graph.shape) - 0.5) + int(row.sex) * 0.73
    # Re-seeding with a constant gives rows of equal shape the same draw here.
    np.random.seed(422)
    g += 20 * (np.random.random(row.graph.shape) - 0.2)
    # `idx` from iterrows() is an index *label*, not a position: write by label,
    # and touch only the 'graph' cell instead of overwriting the whole row.
    df_100.at[idx, 'graph'] = g

In [8]:
# Preview the 'graph' column.
# NOTE(review): the saved output shows integer-looking values -- it may predate the
# synthetic replacement above; re-run to confirm.
df_100['graph']

0       [[798.0, 87.0, 1.0, 140.0, 45.0, 0.0, 142.0, 7...
1       [[665.0, 131.0, 0.0, 10.0, 24.0, 161.0, 1.0, 7...
2       [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
3       [[1017.0, 124.0, 0.0, 88.0, 5.0, 2.0, 49.0, 8....
4       [[717.0, 56.0, 2.0, 26.0, 6.0, 5.0, 28.0, 13.0...
                              ...                        
2052    [[1877.0, 391.0, 0.0, 308.0, 249.0, 2.0, 988.0...
2053    [[464.0, 20.0, 37.0, 8.0, 43.0, 24.0, 87.0, 0....
2054    [[387.0, 0.0, 0.0, 127.0, 57.0, 0.0, 47.0, 9.0...
2055    [[248.0, 12.0, 0.0, 12.0, 3.0, 0.0, 0.0, 6.0, ...
2056    [[67.0, 0.0, 0.0, 8.0, 2.0, 0.0, 5.0, 0.0, 0.0...
Name: graph, Length: 2057, dtype: object

In [9]:
# Classifier: 20-component PCA feeding a linear SVM with probability estimates.
pca = PCA(n_components=20)
svc = SVC(kernel='linear', probability=True)
pipe = Pipeline([('pca', pca), ('svc', svc)])

# Seed search started at 42 and was decremented until the holdout was balanced by sex.
rs = 41
clfmca_100 = AggregateLearner(
    df_100,
    pipe,
    target_id='sex',
    observation_id='simulation',
    sample_id='subject',
    data_id='graph',
    random_seed=rs,
    verbose=False,
    triu=True,
)

In [10]:
%%time
clfmca_100.fit(aggregation='ref')
# clfmca_100.fit(aggregation='median')
# clfmca_100.fit(aggregation='consensus')
# clfmca_100.fit(aggregation='mean')
# clfmca_100.fit(aggregation='mega')
# clfmca_100.fit(aggregation='none')
# clfmca_100.fit(aggregation='meta')

CPU times: user 1min 34s, sys: 416 ms, total: 1min 34s
Wall time: 1min 37s


In [28]:
# Back-of-envelope runtime for the whole experiment grid.
# Per-fit cost in minutes: D100 (1min37s) plus D25 (1s).
t_per_ex = 1.7 + 0.02

mca_samples = [20, 15, 10, 5, 2]
n_classif, n_targets = 3, 5
n_jack, n_agg = 100, 7
# presumably one training per aggregation except one, which is trained n_jack times
# -- TODO confirm
n_trains = n_jack + n_agg - 1

total_t = t_per_ex * n_classif * n_targets * n_trains * len(mca_samples)

# Minutes -> hours.
(total_t / 60, 'hours')

(227.9, 'hours')

In [None]:
%%time
# Time a fit with the 'median' aggregation (this cell has no saved output).
clfmca_100.fit(aggregation='median')

In [24]:
# Show the learner's performance table (the saved output has one row per aggregation).
clfmca_100.performance_report()

Unnamed: 0,aggregation,acc,f1,test_acc,test_f1,test_mean_acc,test_mean_f1,n_models
0,ref,0.911667,0.927948,0.947368,0.962963,0.842105,0.882268,5
1,median,0.911667,0.927948,0.947368,0.962963,0.842105,0.882268,5
2,consensus,0.911667,0.927948,0.947368,0.962963,0.810526,0.849659,5
3,mean,0.911667,0.927948,0.947368,0.962963,0.842105,0.882268,5
4,mega,0.911667,0.927133,0.894737,0.923077,0.8,0.848889,5
5,none,0.909042,0.925485,0.922632,0.944328,0.826737,0.868804,500
6,meta,0.899167,0.918475,0.894737,0.923077,0.842105,0.883009,5


### Repeated Measures Experiment

In [5]:
# Load the repeated-measures dataset and list its columns.
df_25 = pd.read_hdf('../data/aggregation_connect+demo_dset25x2x2x20.h5')
# The drop below is commented out, so the demographic columns stay in the frame.
# df_25.drop(["age", "bmi", "cholesterol", "rel_vo2max"], axis=1, inplace=True)
df_25.columns

Index(['subsample', 'graph', 'session', 'simulation', 'subject', 'age', 'sex',
       'bmi', 'cholesterol', 'rel_vo2max'],
      dtype='object')

In [6]:
subs = list(df_25['subject'].unique())

# A single global seed for the whole loop: rows draw consecutively from one random
# stream. Per-row re-seeding (subject/session/sex based, and a constant 422 for the
# second term) is disabled in this cell.
np.random.seed(1234567890)
for idx, row in df_25.iterrows():
    # Replace graphs with some ID signal but a lot of common signal
    g = 10 * (np.random.random(row.graph.shape) - 0.5) + int(row.sex) * 0.7
    g += 20 * (np.random.random(row.graph.shape) - 0.2)
    # `idx` from iterrows() is an index *label*, not a position: write by label,
    # and touch only the 'graph' cell instead of overwriting the whole row.
    df_25.at[idx, 'graph'] = g

In [7]:
# Preview the 'graph' column after the synthetic replacement.
df_25['graph']

0       [[14.793140221040614, 7.972560738515426, 2.514...
1       [[2.3801033199545607, 18.849329436871756, 11.1...
2       [[14.629131659346623, 1.383584067810371, 17.43...
3       [[3.609143108436224, 0.9104843796529809, 6.857...
4       [[8.397275737397655, 12.473016254439427, 4.348...
                              ...                        
2095    [[-0.7025835451650368, 17.175273695207046, 3.0...
2096    [[0.354784558152363, -3.589410212641905, 8.667...
2097    [[12.16770423548934, 6.631674281395401, 13.667...
2098    [[-3.0842302978971303, 10.01293488093115, 7.40...
2099    [[-1.6813894838022103, 13.650586204939676, 9.0...
Name: graph, Length: 2100, dtype: object

In [8]:
# Three slices of the repeated-measures frame:
#   df_25mca -- subsample 0 and session 0, every simulation
#   df_25sub -- reference simulation and session 0, every subsample
#   df_25ses -- reference simulation and subsample 0, every session
df_25mca = df_25[(df_25['subsample'] == 0) & (df_25['session'] == 0)]
df_25sub = df_25[(df_25['simulation'] == "ref") & (df_25['session'] == 0)]
df_25ses = df_25[(df_25['simulation'] == "ref") & (df_25['subsample'] == 0)]

# Row counts, one per line, in the order mca / sub / ses.
print(len(df_25mca), len(df_25sub), len(df_25ses), sep='\n')

525
50
50


In [9]:
# Classifier: 15-component PCA feeding a linear SVM with probability estimates.
pca = PCA(n_components=15)
svc = SVC(kernel='linear', probability=True)
pipe = Pipeline([('pca', pca), ('svc', svc)])

# Seed search started at 42 and was incremented until the holdout was balanced.
# NOTE(review): the original note says "balanced by sex", but target_id here is
# 'age' -- confirm which balance was actually checked.
rs = 45
clfmca_25 = AggregateLearner(
    df_25ses,
    pipe,
    target_id='age',
    observation_id='simulation',
    sample_id='subject',
    data_id='graph',
    random_seed=rs,
    verbose=False,
    oos=0.2,
    cvfolds=5,
    triu=True,
)

In [10]:
# Inspect the learner's `tar` and `tar_t` attributes (the saved output shows 20 and 5
# two-element arrays of 0/1 values, respectively).
clfmca_25.tar, clfmca_25.tar_t

([array([1., 1.]),
  array([0., 0.]),
  array([0., 0.]),
  array([1., 1.]),
  array([1., 1.]),
  array([1., 1.]),
  array([0., 0.]),
  array([0., 0.]),
  array([0., 0.]),
  array([0., 0.]),
  array([1., 1.]),
  array([0., 0.]),
  array([0., 0.]),
  array([0., 0.]),
  array([1., 1.]),
  array([1., 1.]),
  array([0., 0.]),
  array([1., 1.]),
  array([0., 0.]),
  array([0., 0.])],
 [array([0., 0.]),
  array([1., 1.]),
  array([0., 0.]),
  array([1., 1.]),
  array([0., 0.])])

In [11]:
%%time
clfmca_25.fit(aggregation='ref')
clfmca_25.fit(aggregation='median')
clfmca_25.fit(aggregation='consensus')
clfmca_25.fit(aggregation='mean')
clfmca_25.fit(aggregation='mega')
clfmca_25.fit(aggregation='none')
clfmca_25.fit(aggregation='meta')

CPU times: user 26.7 s, sys: 920 ms, total: 27.6 s
Wall time: 22.5 s


In [12]:
# Show the learner's performance table (the saved output has one row per aggregation).
clfmca_25.performance_report()

Unnamed: 0,aggregation,acc,f1,test_acc,test_f1,test_mean_acc,test_mean_f1,n_models
0,ref,0.575,0.0,0.6,0.0,0.62,0.114286,5
1,median,0.6,0.0,0.6,0.5,0.6,0.0,5
2,consensus,0.6,0.0,0.8,0.8,0.6,0.0,5
3,mean,0.6,0.0,0.6,0.5,0.6,0.0,5
4,mega,0.575,0.0,0.6,0.0,0.62,0.114286,5
5,none,0.5945,0.009333,0.57,0.288333,0.5952,0.001333,500
6,meta,0.65,0.673333,0.49,0.225,0.486,0.293667,5


In [13]:
# Look up the 'consensus' entry of `oos_perf` (the saved output is a dict with
# 'true', 'pred', 'acc', 'f1', 'comp_acc' and 'comp_f1').
clfmca_25.oos_perf['consensus']

{'true': array([0., 1., 0., 1., 0.]),
 'pred': array([0., 1., 1., 1., 0.]),
 'acc': 0.8,
 'f1': 0.8,
 'comp_acc': 0.6,
 'comp_f1': 0.0}

In [17]:
# Classifier: PCA feeding a liblinear logistic regression.
# NOTE: `pca` is not created here -- it is whatever PCA object an earlier cell left
# in the kernel, so this cell depends on that cell having run.
lrc = LogisticRegression(solver='liblinear')
pipe = Pipeline([('pca', pca), ('lrc', lrc)])

# Seed search started at 42 and was incremented until the holdout was balanced by sex.
rs = 45
clfmca_25 = AggregateLearner(
    df_25sub,
    pipe,
    target_id='sex',
    observation_id='simulation',
    sample_id='subject',
    data_id='graph',
    random_seed=rs,
    verbose=False,
    oos=0.2,
    cvfolds=5,
    triu=True,
)

In [18]:
%%time
clfmca_25.fit(aggregation='ref')
# clfmca_25.fit(aggregation='median')
# clfmca_25.fit(aggregation='consensus')
# clfmca_25.fit(aggregation='mean')
# clfmca_25.fit(aggregation='mega')
# clfmca_25.fit(aggregation='none')
# clfmca_25.fit(aggregation='meta')

CPU times: user 207 ms, sys: 33.6 ms, total: 241 ms
Wall time: 101 ms


In [39]:
# Show the learner's performance table.
# NOTE(review): the saved output lists 'ref', 'mean' and 'mega', while the fit cell
# directly above only runs 'ref' -- the output may come from a different run.
clfmca_25.performance_report()

Unnamed: 0,aggregation,acc,f1,test_acc,test_f1,test_mean_acc,test_mean_f1,n_models
0,ref,0.79,0.850216,0.75,0.8,0.65,0.72,5
1,mean,0.92,0.933333,0.75,0.8,0.75,0.78,5
2,mega,0.79,0.850216,0.75,0.8,0.65,0.72,5


In [40]:
# Seed search started at 42 and was incremented until the holdout was balanced by sex.
rs = 43

# Classifier: PCA feeding a random forest. The forest is seeded as well, so that
# re-running this cell reproduces the same model instead of a fresh random one.
# NOTE: `pca` is not created here -- it is whatever PCA object an earlier cell left
# in the kernel.
rfc = RandomForestClassifier(random_state=rs)
pipe = Pipeline(steps=[('pca', pca), ('rfc', rfc)])

clfmca_25 = AggregateLearner(df_25mca, pipe, target_id='sex',
                             observation_id='simulation',
                             sample_id='subject', data_id='graph',
                             random_seed=rs, verbose=False, oos=0.2,
                             cvfolds=5, triu=True)

In [41]:
# Fit only the 'ref', 'mean' and 'mega' aggregations, in this order;
# 'median', 'consensus', 'none' and 'meta' are skipped here.
for aggregation in ('ref', 'mean', 'mega'):
    clfmca_25.fit(aggregation=aggregation)

In [35]:
# Inspect the learner's `tar_t` attribute (the saved output shows four two-element
# arrays with values 1.0 / 2.0).
clfmca_25.tar_t

[array([2., 2.]), array([1., 1.]), array([1., 1.]), array([2., 2.])]

In [37]:
# Write the performance table to a scratch CSV without the index column.
# NOTE(review): the destination is a hard-coded file on the user's Desktop; consider
# writing under the project's output directory instead.
clfmca_25.performance_report().to_csv('~/Desktop/tmp.csv', index=False)

In [33]:
# Check the Python type of the first `session` value (saved output: int).
type(df_25mca.session.values[0])

int

In [38]:
# Assemble a report name: the "report_" prefix followed by the pieces joined with "_".
experiment_pieces = ['mca_full', 'meta', 'age', 'RF', '41']
"".join(["report_", "_".join(experiment_pieces)])

'report_mca_full_meta_age_RF_41'

In [51]:
# Draw `n_mca` rows per subject from df_25mca and stack them into one frame.
n_mca = 5
rs = 41
# One seeded generator shared by all subjects: the draw is reproducible, yet each
# subject still gets a different pick because the generator state advances.
sampler = np.random.RandomState(rs)
samples = []
for sub in df_25mca['subject'].unique():
    # Boolean mask instead of a formatted query string, so non-numeric subject ids
    # would work too.
    tdf = df_25mca[df_25mca['subject'] == sub]
    samples.append(tdf.sample(n=n_mca, axis=0, random_state=sampler))
# Concatenate once at the end rather than growing the frame inside the loop.
newdf = pd.concat(samples)
newdf

Unnamed: 0,subsample,graph,session,simulation,subject,sex
603,0,"[[1307.0, 174.0, 4.0, 209.0, 41.0, 0.0, 179.0,...",0,sim-1,0,2.0
1303,0,"[[2090.0, 299.0, 2.0, 371.0, 76.0, 0.0, 290.0,...",0,sim-18,0,2.0
703,0,"[[2003.0, 279.0, 4.0, 342.0, 51.0, 0.0, 282.0,...",0,sim-14,0,2.0
903,0,"[[2034.0, 282.0, 4.0, 331.0, 74.0, 0.0, 275.0,...",0,sim-2,0,2.0
1403,0,"[[1307.0, 174.0, 4.0, 209.0, 41.0, 0.0, 179.0,...",0,sim-7,0,2.0
...,...,...,...,...,...,...
97,0,"[[414.0, 38.0, 35.0, 0.0, 32.0, 12.0, 94.0, 0....",0,sim-8,24,2.0
597,0,"[[279.0, 24.0, 25.0, 0.0, 21.0, 8.0, 62.0, 0.0...",0,sim-17,24,2.0
697,0,"[[608.0, 49.0, 51.0, 0.0, 50.0, 15.0, 144.0, 0...",0,sim-1,24,2.0
1597,0,"[[414.0, 38.0, 35.0, 0.0, 32.0, 12.0, 94.0, 0....",0,sim-6,24,2.0


In [54]:
# Reference-simulation rows with session 0. The column name is spliced in through
# str.format, so the query evaluated is "simulation == 'ref' and session == 0".
df_25mca.query("simulation == 'ref' and {0} == 0".format('session'))

Unnamed: 0,subsample,graph,session,simulation,subject,sex
1703,0,"[[890.0, 117.0, 2.0, 143.0, 35.0, 0.0, 125.0, ...",0,ref,0,2.0
1704,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0,ref,1,2.0
1708,0,"[[679.0, 84.0, 0.0, 48.0, 8.0, 0.0, 32.0, 10.0...",0,ref,2,1.0
1715,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0,ref,3,2.0
1718,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0,ref,4,1.0
1722,0,"[[514.0, 20.0, 0.0, 0.0, 35.0, 64.0, 207.0, 0....",0,ref,5,2.0
1726,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0,ref,6,1.0
1730,0,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0,ref,7,2.0
1735,0,"[[652.0, 94.0, 0.0, 35.0, 7.0, 0.0, 1.0, 11.0,...",0,ref,8,1.0
1738,0,"[[387.0, 75.0, 23.0, 3.0, 8.0, 23.0, 44.0, 8.0...",0,ref,9,1.0


In [61]:
from sklearn.model_selection import train_test_split

In [68]:
# One label per subject: the first 'sex' value found for that subject in the
# learner's frame.
sub = clfmca_25.df['subject'].unique()
tar = [
    clfmca_25.df.loc[clfmca_25.df['subject'] == s, 'sex'].values[0]
    for s in sub
]

# Stratified 80/20 split of the subjects by that label, with a fixed seed.
train_test_split(sub, tar, stratify=tar, test_size=0.2, random_state=42)

[array([18, 17, 21, 11, 20, 0, 3, 14, 9, 15, 13, 19, 24, 12, 1, 10, 22, 4,
        2, 6], dtype=object),
 array([5, 7, 16, 23, 8], dtype=object),
 [2.0,
  2.0,
  2.0,
  1.0,
  1.0,
  2.0,
  2.0,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  2.0,
  1.0,
  2.0,
  1.0,
  2.0,
  1.0,
  1.0,
  1.0],
 [2.0, 2.0, 1.0, 1.0, 1.0]]

In [None]:
# FIXME: unfinished cell -- the assignment has no right-hand side, so running it
# raises a SyntaxError. Complete or delete it before "Run All".
locs = 

In [59]:
# Count of runs to revisit; the plan noted with this cell is to adjust the random
# seed for small experiments in the MCA setting.
n_aggs = 1  # consensus and ... (the original note is cut off here)
n_mcas_L5 = 2  # 5 and 2 giving issues
n_targs, n_classifs = 5, 3

n_targs * n_classifs * n_mcas_L5

30

In [60]:
# Size of the grid: MCA settings x classifiers x aggregations.
n_mcas, n_clf, n_agg = 5, 3, 7

n_agg * n_clf * n_mcas

105

In [None]:
# All for D25 -- configurations with issues (a `""` comment repeats the reason given on the line above)

mca chol ref SVM 5     # Class balance (i.e. not balanced across TTV sets)
mca chol ref SVM 2     # ""
mca chol meta SVM 20   # ""
mca chol meta SVM 5    # ""
mca chol meta SVM 2    # ""
mca chol mega SVM 15   # ""
mca chol mega SVM 10   # ""
mca chol mega SVM 5    # ""
mca chol mega SVM 2    # ""
mca chol mean SVM 20   # ""
mca chol mean SVM 15   # ""
mca chol mean SVM 10   # ""
mca chol mean SVM 5    # ""
mca chol mean SVM 2    # ""
mca chol medi SVM 20   # ""

mca chol mega RF 2
mca chol ref RF 5  # "" + but also the following error:
"""
Traceback (most recent call last):
 93290   File "wrapper.py", line 121, in <module>
 93291     main()
 93292   File "wrapper.py", line 105, in main
 93293     clf.fit(aggregation=ar.aggregation)
 93294   File "/home/gkiar/code/gkiar-aggregate/code/models.py", line 85, in fit
 93295     clf, oos = self._oos_eval(clfs, func=refSample, index=self.refloc)
 93296   File "/home/gkiar/code/gkiar-aggregate/code/models.py", line 204, in _oos_eval
 93297     pred = clf.predict(Xo)
 93298   File "/home/gkiar/code/env/aggregate/lib/python3.6/site-packages/sklearn/ensemble/_voting.py", line 282, in predict
 93299     maj = np.argmax(self.predict_proba(X), axis=1)
 93300   File "/home/gkiar/code/env/aggregate/lib/python3.6/site-packages/sklearn/ensemble/_voting.py", line 302, in _predict_proba
 93301     avg = np.average(self._collect_probas(X), axis=0,
 93302   File "/home/gkiar/code/env/aggregate/lib/python3.6/site-packages/sklearn/ensemble/_voting.py", line 297, in _collect_probas
 93303     return np.asarray([clf.predict_proba(X) for clf in self.estimators_])
 93304   File "/home/gkiar/code/env/aggregate/lib/python3.6/site-packages/numpy/core/_asarray.py", line 83, in asarray
 93305     return array(a, dtype, copy=False, order=order)
 93306 ValueError: could not broadcast input array from shape (5,2) into shape (5)
"""