## study multilabel stratification function 

study https://stackoverflow.com/questions/53378970/how-to-perform-multilabel-stratified-sampling

In [1]:
# example of a multi-label classification task
from sklearn.datasets import make_multilabel_classification
import numpy as np 
# define dataset
X, y = make_multilabel_classification(n_samples=1000, n_features=10, n_classes=3, n_labels=2, random_state=1)
# summarize dataset shape
print(X.shape, y.shape)
# summarize first few examples
for i in range(10):
	print(X[i], y[i])

(1000, 10) (1000, 3)
[ 3.  3.  6.  7.  8.  2. 11. 11.  1.  3.] [1 1 0]
[7. 6. 4. 4. 6. 8. 3. 4. 6. 4.] [0 0 0]
[ 5.  5. 13.  7.  6.  3.  6. 11.  4.  2.] [1 1 0]
[1. 1. 5. 5. 7. 3. 4. 6. 4. 4.] [1 1 1]
[ 4.  2.  3. 13.  7.  2.  4. 12.  1.  7.] [0 1 0]
[ 4.  3.  3.  2.  5.  2.  3.  7.  2. 10.] [0 0 0]
[ 3.  3.  3. 11.  6.  3.  4. 14.  1.  3.] [0 1 0]
[ 2.  1.  7.  8.  4.  5. 10.  4.  6.  6.] [1 1 1]
[ 5.  1.  9.  5.  3.  4. 11.  8.  1.  8.] [1 1 1]
[ 2. 11.  7.  6.  2.  2.  9. 11.  9.  3.] [1 1 1]


In [2]:
from skmultilearn.model_selection import iterative_train_test_split
x_train, y_train, x_test, y_test = iterative_train_test_split(X, y, test_size = 0.2)

In [3]:
#def calc_prop(arr):
#    return arr.sum(axis=0)/arr.shape[0]
#
#def calc_prop_change(arr, base_arr):
#    thing = (calc_prop(arr)-calc_prop(base_arr))/calc_prop(base_arr)
#    #return np.array([round(thing_i,5) for thing_i in thing])
#    return thing


def calc_prop(arr):
    aggregate = []
    #for i in range(np.max(arr)+1):
    if type(arr) == np.ndarray : 
        set_thing = set(arr.flatten())
    else : 
        set_thing = set(arr)
    for i in set_thing:
        aggregate.append(np.count_nonzero(arr == i, axis=0))
    return np.array(aggregate)/arr.shape[0]


def calc_prop_change(arr, base_arr):
    thing = (calc_prop(arr)-calc_prop(base_arr))/calc_prop(base_arr)
    return thing

In [4]:
##base proportions
100*calc_prop(y)

array([[36.3, 26.9, 79.5],
       [63.7, 73.1, 20.5]])

In [5]:
##y-wise difference
100*calc_prop_change(y_train,y), 100*calc_prop_change(y_test,y) #percentage wise

(array([[-0.13774105, -0.0929368 ,  1.41509434],
        [ 0.07849294,  0.03419973, -5.48780488]]),
 array([[ 0.55096419,  0.37174721, -5.66037736],
        [-0.31397174, -0.13679891, 21.95121951]]))

In [6]:
##can see that division was well done, and the proportions was somewhat well preserved for all the three categories
#however, NOT cross validation (only one split)

# Cross validaiton using multilabel 
* http://scikit.ml/concepts.html#The-multi-label-data-representation, http://scikit.ml/stratification.html

look at this! (iterative stratification) : http://scikit.ml/api/skmultilearn.model_selection.iterative_stratification.html#module-skmultilearn.model_selection.iterative_stratification => this seems to be the kfold versio nthingie


* X : (n_samples, n_features) (dense ones like np arrays are possible)
* y : (n_samples, n_labels)
    * **the labels in y shouldn't be mmore than binary class!** (see below)
> y is expected to be a binary integer indicator matrix of shape. In the binary indicator matrix each matrix element A[i,j] should be either 1 if label j is assigned to an object no i, and 0 if not.
    * therefore, stratified 할때 binary 해야함!
        * **sklearn은 binary/multiclass 다되는데.... 이거는 안되나?**

## 2.1. ABCD config에서 가져오기 


In [7]:
import sys 
import pandas as pd 
import numpy as np 
label_csv = '/scratch/connectome/dyhan316/VAE_ADHD/junbeom_finetuning/csv/ABCD_csv/ABCD_phenotype_total_ONLY_MRI.csv'

In [8]:
from sklearn.model_selection import StratifiedKFold, KFold
kf = StratifiedKFold(n_splits=5)

labels = pd.read_csv(label_csv)

label_name = "sex"
labels = labels[labels[label_name].notna()] #이파트를 더 추가해야할듯 (if doing stuff)
labels = labels[labels['married'].notna()] #이파트를 더 추가해야할듯 (if doing stuff)
labels[label_name] = labels[label_name].astype('str') #regressino하려면 이파트를 바꿔야할듯 

task_include = ['1.0','2.0']
data_1 = labels[labels[label_name] == task_include[0]]
data_2 = labels[labels[label_name] == task_include[1]]
data_1 = data_1[:50]
data_2 = data_2[:50]

label_tv = pd.concat([data_1, data_2])

col2view = ['sex','age']

##default thing 


In [9]:

##first, get the things

#print(calc_prop(label_tv[['sex','married']].values))
print(col2view)
print(calc_prop(label_tv[col2view].values))


for FOLD, (train_idx, valid_idx) in enumerate(kf.split(label_tv, label_tv[label_name])): 
    print(f"===FOLD : {FOLD}===")
    train = label_tv.iloc[train_idx]
    valid = label_tv.iloc[valid_idx]
    
    print("with training")
    print(calc_prop_change(train[col2view].values, label_tv[col2view].values))
    
    
    print("with validation")
    if FOLD == 2 : 
        import pdb; pdb.set_trace()
    print(calc_prop_change(valid[col2view].values, label_tv[col2view].values))
    #view_train = train[['sex','married']]
    #print(calc_prop_change(.values, values = ))

['sex', 'age']
[[0.   0.06]
 [0.   0.02]
 [0.   0.06]
 [0.   0.06]
 [0.5  0.  ]
 [0.5  0.  ]
 [0.   0.03]
 [0.   0.1 ]
 [0.   0.03]
 [0.   0.06]
 [0.   0.03]
 [0.   0.01]
 [0.   0.04]
 [0.   0.03]
 [0.   0.03]
 [0.   0.02]
 [0.   0.05]
 [0.   0.07]
 [0.   0.02]
 [0.   0.02]
 [0.   0.06]
 [0.   0.08]
 [0.   0.01]
 [0.   0.02]
 [0.   0.07]
 [0.   0.02]]
===FOLD : 0===
with training


ValueError: operands could not be broadcast together with shapes (25,2) (26,2) 

In [None]:
train[['sex','married']].values


In [None]:
label_tv.iloc[train_idx]

In [None]:
train_idx

In [None]:
label_tv[label_name]

In [None]:
np.max(train[label_name])

In [None]:
set(train[label_name])

In [None]:
aggregate = []
arr = train[label_name]
for i in set(train[label_name]):
    aggregate.append(np.count_nonzero(arr == i, axis=0))
print(np.array(aggregate)/arr.shape[0])

## 2.2. now doing iterative stratification
(http://scikit.ml/api/skmultilearn.model_selection.iterative_stratification.html#module-skmultilearn.model_selection.iterative_stratification) 



In [None]:
from skmultilearn.model_selection import IterativeStratification
k_fold = IterativeStratification(n_splits=5, order=1) #on't increae the order (i think)(if > 1, does with replacement)
print(100*calc_prop(y))

total_val_set = set()

for train,test in k_fold.split(X,y):
    x_train,y_train = X[train], y[train]
    x_test,y_test = X[test], y[test]
    print("\n===y_train, y_test percentage deviations from y (%)===")
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
    
    print(100*calc_prop_change(y_train,y))
    print(100*calc_prop_change(y_test,y))
    #print(calc_prop_change(train,X))
    
    total_val_set = total_val_set.union(train)


In [None]:
len(total_val_set) #as expected => i.e. didn't reuse the things, but CV was actually well done 

## NOW HAVE TO CHECK IF multiclass + multilabel works!(i.e. try multiclass)

In [None]:
#y_multiclass (modifing)
y_multiclass = y.copy()
y_multiclass[::10,0] = 2 #changing the first labels to 2 (multiclass) every 5 things
#y_multiclass[::11,0] = 3 #changing the first labels to 2 (multiclass) every 5 things

In [None]:
y_multiclass

In [None]:
from skmultilearn.model_selection import IterativeStratification
k_fold = IterativeStratification(n_splits=5, order = 1) #on't increae the order (i think)(if > 1, does with replacement)
print("below : the shapes of (class_i, n_labels)")
print(100*calc_prop(y_multiclass))

total_val_set = set()

for train,test in k_fold.split(X,y_multiclass):
    x_train,y_train = X[train], y_multiclass[train]
    x_test,y_test = X[test], y_multiclass[test]
    #print(x_train.shape, x_test.shape)

    print("\n===y_train, y_test percentage deviations from y (%)===")
    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)
    print(100*calc_prop_change(y_train,y_multiclass))
    print(100*calc_prop_change(y_test,y_multiclass))
    #print(calc_prop_change(train,X))
    
    total_val_set = total_val_set.union(train)

# 보듯이, https://github.com/scikit-multilearn/scikit-multilearn/issues/132
> multiclass + multilabel은 잘 안되는 것 같다!

https://github.com/trent-b/iterative-stratification/issues/27#issue-1577817136

일단 위에 두개보고, 되는지 안되는지 확인해보자 => 안된다면.. 흠.. 어떻게 하지? 
(그리고 어차피, age같은 것은 하지도 못할거같은데..?)(unless artificially splitting into age brackets and treating age as a categorical thing)

=> 이것 보고 하기!


In [None]:
aaaa #meant to interrupt it 

In [None]:
small_y = y_multiclass[:5]
print(np.count_nonzero(small_y == 0, axis = 0),
np.count_nonzero(small_y == 1, axis = 0),
np.count_nonzero(small_y == 2, axis = 0))




In [None]:
small_y

In [None]:
100*calc_prop(y)

In [None]:
np.max(small_y)

In [None]:
def calc_prop_multiclass(arr):
    aggregate = []
    for i in range(np.max(arr)+1):
        aggregate.append(np.count_nonzero(arr == i, axis=0))
    return np.array(aggregate)/arr.shape[0]



def calc_prop_change_multiclass(arr, base_arr):
    thing = (calc_prop_multiclass(arr)-calc_prop_multiclass(base_arr))/calc_prop_multiclass(base_arr)
    return thing

In [None]:
calc_prop_multiclass(y_multiclass)

In [None]:
calc_prop_change_multiclass(y_multiclass[:8],y_multiclass)

In [None]:
y_multiclass[:10]

In [None]:
y_multiclass

### past stuff (sparse X, y 쓸대인데... 그냥 일반적인 dense 써도 되어서 아놤)

In [None]:
from skmultilearn.dataset import load_dataset
X, y, _, _ = load_dataset('emotions', 'train')

In [None]:
##looking at data
print(X.shape, y.shape)


for i, y_i in enumerate(y):
    print(f"===the {i}th target is===")
    print(y_i)
    if i == 5:
        break
#here, y is sparse (hence where the sparse thing came from)