study https://stackoverflow.com/questions/53378970/how-to-perform-multilabel-stratified-sampling

In [1]:
# example of a multi-label classification task
from sklearn.datasets import make_multilabel_classification
import numpy as np 
# Build a synthetic multi-label problem: 1000 samples, 10 features, 3 labels.
# random_state is fixed so the generated data is the same on every run.
X, y = make_multilabel_classification(
    n_samples=1000,
    n_features=10,
    n_classes=3,
    n_labels=2,
    random_state=1,
)
# Shapes of the feature matrix and of the label indicator matrix.
print(X.shape, y.shape)
# Preview the first ten (features, labels) pairs.
for i in range(10):
    print(X[i], y[i])

(1000, 10) (1000, 3)
[ 3.  3.  6.  7.  8.  2. 11. 11.  1.  3.] [1 1 0]
[7. 6. 4. 4. 6. 8. 3. 4. 6. 4.] [0 0 0]
[ 5.  5. 13.  7.  6.  3.  6. 11.  4.  2.] [1 1 0]
[1. 1. 5. 5. 7. 3. 4. 6. 4. 4.] [1 1 1]
[ 4.  2.  3. 13.  7.  2.  4. 12.  1.  7.] [0 1 0]
[ 4.  3.  3.  2.  5.  2.  3.  7.  2. 10.] [0 0 0]
[ 3.  3.  3. 11.  6.  3.  4. 14.  1.  3.] [0 1 0]
[ 2.  1.  7.  8.  4.  5. 10.  4.  6.  6.] [1 1 1]
[ 5.  1.  9.  5.  3.  4. 11.  8.  1.  8.] [1 1 1]
[ 2. 11.  7.  6.  2.  2.  9. 11.  9.  3.] [1 1 1]


In [2]:
from skmultilearn.model_selection import iterative_train_test_split
# Single stratified 80/20 split. Per the scikit-multilearn docs the return order is
# (X_train, y_train, X_test, y_test), which differs from sklearn's train_test_split.
# NOTE(review): no random seed is passed here, so the split may not be reproducible
# across runs -- confirm.
x_train, y_train, x_test, y_test = iterative_train_test_split(
    X,
    y,
    test_size=0.2,
)

In [3]:
def calc_prop(arr):
    """Return each column's share of the grand total of ``arr``.

    Parameters
    ----------
    arr : array-like with a numpy-style ``sum``
        2-D data of shape (n_samples, n_columns), e.g. a feature matrix or a
        binary label indicator matrix.

    Returns
    -------
    Column sums divided by the sum of all entries, so the result adds up to 1
    whenever the grand total is non-zero.

    Notes
    -----
    No guard is applied for a zero grand total; the division is left to the
    array library (numpy yields nan/inf with a RuntimeWarning in that case).
    """
    # The original computed this expression twice and threw the first result away.
    return arr.sum(axis=0) / arr.sum()

def calc_prop_change(arr, base_arr):
    """Return the relative change of column proportions of ``arr`` versus ``base_arr``.

    Parameters
    ----------
    arr : array-like
        Subset (e.g. a train or test fold) whose column proportions are examined.
    base_arr : array-like
        Reference data with the same number of columns as ``arr``.

    Returns
    -------
    ``(calc_prop(arr) - calc_prop(base_arr)) / calc_prop(base_arr)`` per column,
    as a fraction (multiply by 100 for percent). A column whose base proportion
    is zero is not guarded against.
    """
    # Compute the reference proportions once instead of twice.
    base_prop = calc_prop(base_arr)
    return (calc_prop(arr) - base_prop) / base_prop


In [4]:
# Baseline: percentage share of each feature column in X and of each label column in y.
tuple(100 * calc_prop(arr) for arr in (X, y))

(array([ 5.82767394,  7.00162269, 11.48506521, 12.45467476, 12.1441593 ,
         5.68944448, 13.89706913, 18.290362  ,  4.98828055,  8.22164794]),
 array([40.49586777, 46.47171011, 13.03242212]))

In [5]:
# Feature-wise relative deviation (in percent) of the train and test parts from the full X.
tuple(100 * calc_prop_change(part, X) for part in (x_train, x_test))

(array([-1.35342464,  0.4363202 , -0.39379796,  0.58321631,  0.17702815,
         0.9108414 , -0.26016604,  0.25164608,  0.74597004, -1.21008658]),
 array([ 5.33557515, -1.72009519,  1.55246074, -2.29920037, -0.69789404,
        -3.59078929,  1.02564666, -0.9920586 , -2.94082071,  4.77049679]))

In [6]:
# Label-wise relative deviation (in percent) of the train and test parts from the full y.
tuple(100 * calc_prop_change(part, y) for part in (y_train, y_test))

(array([ 0.75102041,  0.70642955, -4.85268293]),
 array([-2.90642573, -2.73386048, 18.77973269]))

In [7]:
## We can see that the split was done well, and the proportions were reasonably well preserved for all three categories.
# However, this is NOT cross-validation (only a single split).

# Cross validation using multilabel
* http://scikit.ml/concepts.html#The-multi-label-data-representation, http://scikit.ml/stratification.html

look at this! (iterative stratification) : http://scikit.ml/api/skmultilearn.model_selection.iterative_stratification.html#module-skmultilearn.model_selection.iterative_stratification => this seems to be the k-fold version


* X : (n_samples, n_features) (dense ones like np arrays are possible)
* y : (n_samples, n_labels)
    * **the labels in y shouldn't be more than binary class!** (see below)
> y is expected to be a binary integer indicator matrix of shape. In the binary indicator matrix each matrix element A[i,j] should be either 1 if label j is assigned to an object no i, and 0 if not.
    * therefore, stratified 할때 binary 해야함!
        * **sklearn은 binary/multiclass 다되는데.... 이거는 안되나?**

## now doing iterative stratification
(http://scikit.ml/api/skmultilearn.model_selection.iterative_stratification.html#module-skmultilearn.model_selection.iterative_stratification) 



In [8]:
from skmultilearn.model_selection import IterativeStratification
# 5-fold iterative stratification. Per the scikit-multilearn docs, `order` is the order of
# label relationships taken into account when balancing folds; order=1 looks at each label
# on its own.
# NOTE(review): the original note guessed that order > 1 "does with replacement" -- this is
# unverified; confirm against the docs before raising the order.
k_fold = IterativeStratification(n_splits=5, order=1)
# Baseline column proportions (in percent) of the full X and y, for reference.
print(100*calc_prop(X), 100*calc_prop(y))

# Sample indices that have been held out (used as the validation/test fold) so far.
total_val_set = set()

for train, test in k_fold.split(X, y):
    x_train, y_train = X[train], y[train]
    x_test, y_test = X[test], y[test]
    print("\n===x_train, x_test percentage deviations from X (%)===")
    print(100*calc_prop_change(x_train, X))
    print(100*calc_prop_change(x_test, X))
    print("===y_train, y_test percentage deviations from y (%)===")
    print(100*calc_prop_change(y_train, y))
    print(100*calc_prop_change(y_test, y))

    # Accumulate the held-out indices, not the training ones: any two training folds
    # already cover every sample, so a union over `train` would make the coverage check
    # trivially true. With `test`, reaching len(X) distinct indices means each sample
    # was held out in some fold.
    total_val_set.update(test)


[ 5.82767394  7.00162269 11.48506521 12.45467476 12.1441593   5.68944448
 13.89706913 18.290362    4.98828055  8.22164794] [40.49586777 46.47171011 13.03242212]

===x_train, x_test percentage deviations from X (%)===
[-0.21102372  0.17546317  0.39886121  0.5418495  -0.73226118  1.24790705
 -0.78404711  0.29887512 -2.09016846  0.76874031]
[ 0.85643384 -0.71211233 -1.61876699 -2.19908095  2.97186136 -5.06459562
  3.18203311 -1.21297624  8.48288981 -3.11991089]
===y_train, y_test percentage deviations from y (%)===
[-0.08598034  0.0660071   0.0317965 ]
[ 0.34337545 -0.26360932 -0.12698413]

===x_train, x_test percentage deviations from X (%)===
[ 0.46962086  0.79335054 -0.44444003 -0.45720579 -0.18519916 -0.08892953
  0.11871639  0.45310728 -1.99384498  0.64109211]
[-1.85805907 -3.13889837  1.75843089  1.80893874  0.7327421   0.35185045
 -0.46970245 -1.79272293  7.88866518 -2.53648656]
===y_train, y_test percentage deviations from y (%)===
[ 0.75102041  0.53428181 -4.23882927]
[-2.9064257

In [9]:
# Number of distinct sample indices collected in total_val_set by the cross-validation loop;
# it should equal the number of samples (1000 here), i.e. every sample was covered.
len(total_val_set)

1000

### past stuff (sparse X, y 쓸대인데... 그냥 일반적인 dense 써도 되어서 아놤)

In [10]:
from skmultilearn.dataset import load_dataset
# Load the 'train' part of the 'emotions' dataset. This rebinds X and y, which previously
# held the synthetic dataset, so re-running earlier cells afterwards would see this data.
# The last two return values are discarded (presumably feature and label names -- confirm
# against the scikit-multilearn docs).
X, y, _, _ = load_dataset('emotions', 'train')

emotions:train - exists, not redownloading


In [11]:
# Inspect the loaded data: print the shapes of X and y first.
print(X.shape, y.shape)


# Print the first six target rows (indices 0 through 5), then stop.
for i, y_i in enumerate(y):
    print(f"===the {i}th target is===")
    print(y_i)
    if i == 5:
        break
# y is sparse here, so each row prints as "(row, col) value" entries instead of a dense 0/1 vector.

(391, 72) (391, 6)
===the 0th target is===
  (0, 1)	1
  (0, 2)	1
===the 1th target is===
  (0, 0)	1
  (0, 5)	1
===the 2th target is===
  (0, 1)	1
  (0, 5)	1
===the 3th target is===
  (0, 2)	1
===the 4th target is===
  (0, 3)	1
===the 5th target is===
  (0, 1)	1
  (0, 2)	1
