study https://stackoverflow.com/questions/53378970/how-to-perform-multilabel-stratified-sampling

In [1]:
# example of a multi-label classification task
from sklearn.datasets import make_multilabel_classification
import numpy as np 
# Build a synthetic multi-label dataset (fixed random_state so re-runs match)
X, y = make_multilabel_classification(
    n_samples=1000,
    n_features=10,
    n_classes=3,
    n_labels=2,
    random_state=1,
)
# Report the shapes of the feature matrix and the label matrix
print(X.shape, y.shape)
# Show the first ten (feature row, label row) pairs
for features_row, labels_row in zip(X[:10], y[:10]):
    print(features_row, labels_row)

(1000, 10) (1000, 3)
[ 3.  3.  6.  7.  8.  2. 11. 11.  1.  3.] [1 1 0]
[7. 6. 4. 4. 6. 8. 3. 4. 6. 4.] [0 0 0]
[ 5.  5. 13.  7.  6.  3.  6. 11.  4.  2.] [1 1 0]
[1. 1. 5. 5. 7. 3. 4. 6. 4. 4.] [1 1 1]
[ 4.  2.  3. 13.  7.  2.  4. 12.  1.  7.] [0 1 0]
[ 4.  3.  3.  2.  5.  2.  3.  7.  2. 10.] [0 0 0]
[ 3.  3.  3. 11.  6.  3.  4. 14.  1.  3.] [0 1 0]
[ 2.  1.  7.  8.  4.  5. 10.  4.  6.  6.] [1 1 1]
[ 5.  1.  9.  5.  3.  4. 11.  8.  1.  8.] [1 1 1]
[ 2. 11.  7.  6.  2.  2.  9. 11.  9.  3.] [1 1 1]


In [2]:
from skmultilearn.model_selection import iterative_train_test_split
# One train/test split with 20% of the samples held out (test_size = 0.2).
# The four results are unpacked as: train features, train labels, test features, test labels.
x_train, y_train, x_test, y_test = iterative_train_test_split(X, y, test_size = 0.2)

In [3]:
def calc_prop(arr):
    """Return each column's share of the grand total of ``arr``.

    Parameters
    ----------
    arr : array-like supporting ``.sum(axis=0)`` and ``.sum()``
        2-D numeric array (e.g. a feature matrix or a label indicator matrix).

    Returns
    -------
    The column sums divided by the sum of all entries, so the returned
    values add up to 1 whenever the grand total is non-zero.
    """
    column_totals = arr.sum(axis=0)
    # Normalise by the sum over every entry, not by the number of rows.
    return column_totals / arr.sum()

def calc_prop_change(arr, base_arr):
    """Return the relative change of column proportions of ``arr`` vs ``base_arr``.

    Both arrays are reduced with ``calc_prop``; the result is
    ``(prop(arr) - prop(base_arr)) / prop(base_arr)`` computed column-wise.
    Multiply by 100 to read it as a percentage deviation.

    Parameters
    ----------
    arr : 2-D numeric array whose column proportions are being compared.
    base_arr : 2-D numeric array providing the reference proportions
        (must have the same number of columns as ``arr``).
    """
    # Compute the reference proportions once; they are used twice below.
    base_prop = calc_prop(base_arr)
    return (calc_prop(arr) - base_prop) / base_prop


In [4]:
##base proportions (in percent) for the feature matrix and the label matrix
tuple(100 * calc_prop(matrix) for matrix in (X, y))

(array([ 5.82767394,  7.00162269, 11.48506521, 12.45467476, 12.1441593 ,
         5.68944448, 13.89706913, 18.290362  ,  4.98828055,  8.22164794]),
 array([40.49586777, 46.47171011, 13.03242212]))

In [5]:
##X-wise difference: relative deviation (in percent) of each split's column proportions from X
tuple(100 * calc_prop_change(split, X) for split in (x_train, x_test))

(array([-2.00965138,  0.15886591, -0.18542026,  0.42435743,  0.12032346,
         1.73819324, -0.19527413,  0.16726535,  0.54679074, -0.84899388]),
 array([ 7.96110382, -0.62933701,  0.73453035, -1.68106447, -0.47665361,
        -6.88573994,  0.77356581, -0.66261087, -2.16607609,  3.36323427]))

In [6]:
##y-wise difference: relative deviation (in percent) of each split's label proportions from y
tuple(100 * calc_prop_change(split, y) for split in (y_train, y_test))

(array([ 0.75102041,  0.70642955, -4.85268293]),
 array([-2.90642573, -2.73386048, 18.77973269]))

In [7]:
##we can see that the split was done reasonably well: the proportions were roughly preserved for all three categories
#however, this is NOT cross-validation (only one split)

# Cross-validation using multilabel stratification
* http://scikit.ml/concepts.html#The-multi-label-data-representation, http://scikit.ml/stratification.html

Look at this (iterative stratification): http://scikit.ml/api/skmultilearn.model_selection.iterative_stratification.html#module-skmultilearn.model_selection.iterative_stratification => this seems to be the k-fold version.


* X : (n_samples, n_features) (dense ones like np arrays are possible)
* y : (n_samples, n_labels)
    * **each label in y must not have more than two classes (binary only)!** (see below)
> y is expected to be a binary integer indicator matrix of shape. In the binary indicator matrix each matrix element A[i,j] should be either 1 if label j is assigned to an object no i, and 0 if not.
    * therefore, the labels must be binary when stratifying!
        * **sklearn handles both binary and multiclass targets... does this one not?**

## now doing iterative stratification
(http://scikit.ml/api/skmultilearn.model_selection.iterative_stratification.html#module-skmultilearn.model_selection.iterative_stratification) 



In [8]:
from skmultilearn.model_selection import IterativeStratification
# NOTE(review): keep order=1 -- per the original author's unverified note, order > 1
# appears to sample with replacement. TODO confirm against the scikit-multilearn docs.
k_fold = IterativeStratification(n_splits=5, order=1)
# Reference proportions (in percent) of the full feature and label matrices
print(100*calc_prop(X), 100*calc_prop(y))

# Union of the held-out (test) indices over all folds; inspected in a later cell.
total_val_set = set()

for train, test in k_fold.split(X, y):
    x_train, y_train = X[train], y[train]
    x_test, y_test = X[test], y[test]
    print("\n===x_train, x_test percentage deviations from X (%)===")
    print(100*calc_prop_change(x_train, X))
    print(100*calc_prop_change(x_test, X))
    print("===y_train, y_test percentage deviations from y (%)===")
    print(100*calc_prop_change(y_train, y))
    print(100*calc_prop_change(y_test, y))

    # Accumulate the validation (test) indices, not the training ones: with 5 folds
    # the union of the training indices covers the data regardless of how the
    # validation folds were drawn, so it cannot confirm validation coverage.
    total_val_set.update(test)


[ 5.82767394  7.00162269 11.48506521 12.45467476 12.1441593   5.68944448
 13.89706913 18.290362    4.98828055  8.22164794] [40.49586777 46.47171011 13.03242212]

===x_train, x_test percentage deviations from X (%)===
[ 0.52060999 -0.21622498 -0.45632995 -0.29297303 -0.96487809  0.36231924
  1.61079072  0.5806744  -1.81867375 -0.84020608]
[-2.06776538  0.85880512  1.81245713  1.16363403  3.83231508 -1.43906417
 -6.39775911 -2.30632999  7.22343157  3.3371412 ]
===y_train, y_test percentage deviations from y (%)===
[-0.40266503 -0.25115932  2.1468053 ]
[ 1.63396548  1.01917384 -8.71147361]

===x_train, x_test percentage deviations from X (%)===
[-1.26569408  0.33819945  0.50446718  0.22522416  0.88441532 -0.23591877
  0.49007362  0.03597596 -0.71958277 -2.05167847]
[ 4.97858851 -1.33030243 -1.98431403 -0.88591582 -3.47883428  0.9279829
 -1.92769717 -0.14151089  2.83046795  8.07024624]
===y_train, y_test percentage deviations from y (%)===
[ 0.75102041  0.53428181 -4.23882927]
[-2.90642573

In [9]:
len(total_val_set) # number of distinct sample indices accumulated over all folds; expected to equal the number of samples

1000

In [10]:
##NOW HAVE TO CHECK IF multiclass + multilabel works!

In [35]:
# Build a modified copy of y whose first label column takes a third value
y_multiclass = y.copy()
# Every fifth row (0, 5, 10, ...) gets the value 2 in column 0, making that column multiclass
every_fifth_row = slice(None, None, 5)
y_multiclass[every_fifth_row, 0] = 2

In [36]:
from skmultilearn.model_selection import IterativeStratification
# NOTE(review): keep order=1 -- per the original author's unverified note, order > 1
# appears to sample with replacement. TODO confirm against the scikit-multilearn docs.
k_fold = IterativeStratification(n_splits=5, order=1)
# Reference proportions (in percent). calc_prop sums values, so an entry equal to 2
# in y_multiclass contributes twice as much as an entry equal to 1.
print(100*calc_prop(X), 100*calc_prop(y_multiclass))

# Union of the held-out (test) indices over all folds.
total_val_set = set()

for train, test in k_fold.split(X, y_multiclass):
    x_train, y_train = X[train], y_multiclass[train]
    x_test, y_test = X[test], y_multiclass[test]
    print("\n===x_train, x_test percentage deviations from X (%)===")
    print(100*calc_prop_change(x_train, X))
    print(100*calc_prop_change(x_test, X))
    print("===y_train, y_test percentage deviations from y (%)===")
    print(100*calc_prop_change(y_train, y_multiclass))
    print(100*calc_prop_change(y_test, y_multiclass))

    # Accumulate the validation (test) indices, not the training ones: with 5 folds
    # the union of the training indices covers the data regardless of how the
    # validation folds were drawn, so it cannot confirm validation coverage.
    total_val_set.update(test)

[ 5.82767394  7.00162269 11.48506521 12.45467476 12.1441593   5.68944448
 13.89706913 18.290362    4.98828055  8.22164794] [49.18566775 39.68512486 11.12920738]

===x_train, x_test percentage deviations from X (%)===
[ 0.06641281 -1.380179    0.44151182 -0.16222617  0.51992163  0.04046949
  0.12775623 -0.06487472  0.99061782 -0.71134793]
[-0.2698412   5.60779088 -1.79390208  0.65913945 -2.11248816 -0.16443116
 -0.51908499  0.26359178 -4.02496893  2.89027036]
===y_train, y_test percentage deviations from y (%)===
[-0.27018792 -0.26360168  2.13406383]
[ 1.09708173  1.07033869 -8.6652372 ]

===x_train, x_test percentage deviations from X (%)===
[ 1.04196225  1.60049951 -0.57978054  0.35134741  0.23495716  0.32588752
 -1.37358151  0.06164023  0.11187198 -0.27970065]
[-4.1550431  -6.38232761  2.31199656 -1.40107152 -0.93694098 -1.29954487
  5.47744448 -0.24580336 -0.44611299  1.11536502]
===y_train, y_test percentage deviations from y (%)===
[ 0.41255759 -0.02825195 -1.72256098]
[-1.6413101

In [12]:
aaaa # deliberately undefined name: raises NameError so that execution is interrupted at this cell

NameError: name 'aaaa' is not defined

### past stuff (from when sparse X, y were used... plain dense arrays work fine, so this was dropped)

In [None]:
from skmultilearn.dataset import load_dataset
# Rebinds X and y to the 'emotions' dataset ('train' variant); the last two return values are discarded.
X, y, _, _ = load_dataset('emotions', 'train')

In [None]:
##looking at data
print(X.shape, y.shape)

# Print the first six targets (indices 0 through 5)
for i, y_i in zip(range(6), y):
    print(f"===the {i}th target is===")
    print(y_i)
#here, y is sparse (hence where the sparse thing came from)