### Import y data and packages

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
%matplotlib inline
from PIL import Image
from scipy.misc import imread


In [2]:
train_labels = pd.read_csv("../Data/train.csv")

In [3]:
label_names = {
    0:  "Nucleoplasm",  
    1:  "Nuclear membrane",   
    2:  "Nucleoli",   
    3:  "Nucleoli fibrillar center",   
    4:  "Nuclear speckles",
    5:  "Nuclear bodies",   
    6:  "Endoplasmic reticulum",   
    7:  "Golgi apparatus",   
    8:  "Peroxisomes",   
    9:  "Endosomes",   
    10:  "Lysosomes",   
    11:  "Intermediate filaments",   
    12:  "Actin filaments",   
    13:  "Focal adhesion sites",   
    14:  "Microtubules",   
    15:  "Microtubule ends",   
    16:  "Cytokinetic bridge",   
    17:  "Mitotic spindle",   
    18:  "Microtubule organizing center",   
    19:  "Centrosome",   
    20:  "Lipid droplets",   
    21:  "Plasma membrane",   
    22:  "Cell junctions",   
    23:  "Mitochondria",   
    24:  "Aggresome",   
    25:  "Cytosol",   
    26:  "Cytoplasmic bodies",   
    27:  "Rods & rings"
}

# reverse_train_labels = dict((v,k) for k,v in label_names.items())

One hot encode y

In [4]:
for key in label_names.keys():
    train_labels[label_names[key]] = 0
    
def fill_targets(row):
    row.Target = np.array(row.Target.split(" ")).astype(np.int)
    for num in row.Target:
        name = label_names[int(num)]
        row.loc[name] = 1
    return row


train_labels = train_labels.apply(fill_targets, axis=1)

Count how many samples are in each category and store that in "counts". Then create a new minority label that picks the smallest class for a given sample. We will then stratify based on this lable

In [5]:
y = train_labels
y.columns =['Id', 'Target', 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]
y = y[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]]

In [6]:
counts = y.sum(axis=0).sort_values(ascending=False).values

In [7]:
minority = []
for label in train_labels['Target']:
    if len(label)==1:
        minority.append(label[0])
    else:
        counts_list = []
        for i in label:
            counts_list.append(counts[i])
        min_index = counts_list.index(min(counts_list))
        minority.append(label[min_index])

In [8]:
train_labels['minority_class'] = minority

### Train/Test Split on minority class

In [9]:
y_split = np.array(train_labels['minority_class'])
X_split = np.zeros(len(y_split)) # Using a dummy X for now

In [10]:
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=47)
ind = folds.split(X_split,y_split)

In [11]:
train_indexes = []
test_indexes = []
for train_index, test_index in ind:
    train_indexes.append(train_index)
    test_indexes.append(test_index)

In [12]:
y_train_minority = [y_split[i] for i in train_indexes[1]]
y_test_minority = [y_split[i] for i in test_indexes[1]]

In [13]:
y_train = y.ix[train_indexes[1]]
y_test = y.ix[test_indexes[1]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [14]:
y_train_target = train_labels.ix[train_indexes[1]]
y_test_target = train_labels.ix[test_indexes[1]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Checking that they appear in similar distributions:

### Train/Test Split training data (to use in model selection)

In [15]:
y_sub_split = np.array(y_train_target['minority_class'])
X_sub_split = np.zeros(len(y_sub_split)) # Using a dummy X for now

In [16]:
from sklearn.model_selection import StratifiedKFold

sub_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state = 4)
sub_ind = sub_folds.split(X_sub_split,y_sub_split)

In [17]:
sub_train_indexes = []
sub_test_indexes = []
for train_index, test_index in sub_ind:
    sub_train_indexes.append(train_index)
    sub_test_indexes.append(test_index)

In [18]:
y_train_minority_sub = [y_sub_split[i] for i in sub_train_indexes[1]]
y_test_minority_sub = [y_sub_split[i] for i in sub_test_indexes[1]]

In [19]:
y_train_sub = y.ix[sub_train_indexes[1]]
y_test_sub = y.ix[sub_test_indexes[1]]

In [20]:
y_train_target_sub = train_labels.ix[sub_train_indexes[1]]
y_test_target_sub = train_labels.ix[sub_test_indexes[1]]

Checking that they appear in similar distributions:

### Limiting training data to only single label:

In [21]:
cat = []
for label in y_train_target['Target']:
    if len(label)==1:
        cat.append(label[0])
    else:
        cat.append(28)
# Make a new category (28) for all multi label images


cat_sub = []
for label in y_train_target_sub['Target']:
    if len(label)==1:
        cat_sub.append(label[0])
    else:
        cat_sub.append(28)
# Make a new category (28) for all multi label images


In [22]:
y_train_target['multi_label'] = cat

y_train_target_sub['multi_label'] = cat_sub

In [23]:
y_s_train_target = y_train_target[y_train_target['multi_label']!=28]
y_s_train = y_s_train_target[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]]

y_s_train_target_sub = y_train_target_sub[y_train_target_sub['multi_label']!=28]
y_s_train_sub = y_s_train_target_sub[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]]

In [24]:
cat = []
for label in y_test_target['Target']:
    if len(label)==1:
        cat.append(label[0])
    else:
        cat.append(28)
# Make a new category (28) for all multi label images


cat_sub = []
for label in y_test_target_sub['Target']:
    if len(label)==1:
        cat_sub.append(label[0])
    else:
        cat_sub.append(28)
# Make a new category (28) for all multi label images


In [25]:
y_test_target['multi_label'] = cat

y_test_target_sub['multi_label'] = cat_sub

In [26]:
y_s_test_target = y_test_target[y_test_target['multi_label']!=28]
y_s_test = y_s_test_target[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]]

y_s_test_target_sub = y_test_target_sub[y_test_target_sub['multi_label']!=28]
y_s_test_sub = y_s_test_target_sub[[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]]

In [27]:
train_sl_indexes = y_s_train.index.values
test_sl_indexes = y_s_test.index.values


train_sl_indexes_sub = y_s_train_sub.index.values
test_sl_indexes_sub = y_s_test_sub.index.values

### Limit to only categories 4 and 21, which are balanced

In [28]:
counts_2cat = y_s_train.sum(axis=0).values
for i,count in enumerate(counts_2cat):
    print(i,count)

0 1931
1 221
2 651
3 519
4 864
5 775
6 501
7 924
8 25
9 13
10 0
11 482
12 183
13 126
14 389
15 0
16 21
17 0
18 253
19 411
20 79
21 851
22 167
23 1331
24 96
25 1170
26 107
27 1


In [29]:
selection_2cat = [4,21]

In [30]:
# y_s_train_2cat = y_s_train[[selection[0],selection[1]]]
# y_s_train_2cat = y_s_train_2cat[y_s_train_2cat.sum(axis=1) != 0 ]
# print(len(y_s_train_2cat))
# print(print(y_s_train_2cat.sum(axis=0)))

y_s_test_2cat = y_s_test[[selection_2cat[0],selection_2cat[1]]]
y_s_test_2cat = y_s_test_2cat[y_s_test_2cat.sum(axis=1) != 0 ]
print(print(y_s_test_2cat.sum(axis=0)))


y_s_test_sub_2cat = y_s_test_sub[[selection_2cat[0],selection_2cat[1]]]
y_s_test_sub_2cat = y_s_test_sub_2cat[y_s_test_sub_2cat.sum(axis=1) != 0 ]
print(print(y_s_test_sub_2cat.sum(axis=0)))

y_s_train_sub_2cat = y_s_train_sub[[selection_2cat[0],selection_2cat[1]]]
y_s_train_sub_2cat = y_s_train_sub_2cat[y_s_train_sub_2cat.sum(axis=1) != 0 ]
print(print(y_s_train_sub_2cat.sum(axis=0)))

4     213
21    207
dtype: int64
None
4     165
21    197
dtype: int64
None
4     666
21    650
dtype: int64
None


In [31]:
# train_s_indexes_2cat = y_s_train_2cat.index.values
test_s_indexes_2cat = y_s_test_2cat.index.values

train_s_indexes_sub_2cat = y_s_train_sub_2cat.index.values
test_s_indexes_sub_2cat = y_s_test_sub_2cat.index.values

### Limit to only 3 categories: 4, 5 and 21

In [32]:
counts_3 = y_s_train.sum(axis=0).values
for i,count in enumerate(counts_3):
    print(i,count)

0 1931
1 221
2 651
3 519
4 864
5 775
6 501
7 924
8 25
9 13
10 0
11 482
12 183
13 126
14 389
15 0
16 21
17 0
18 253
19 411
20 79
21 851
22 167
23 1331
24 96
25 1170
26 107
27 1


In [33]:
selection_3 = [4,5,21]

In [34]:
y_s_test_3cat = y_s_test[selection_3]
y_s_test_3cat = y_s_test_3cat[y_s_test_3cat.sum(axis=1) != 0 ]
print(print(y_s_test_3cat.sum(axis=0)))

y_s_test_sub_3cat = y_s_test_sub[selection_3]
y_s_test_sub_3cat = y_s_test_sub_3cat[y_s_test_sub_3cat.sum(axis=1) != 0 ]
print(print(y_s_test_sub_3cat.sum(axis=0)))

y_s_train_sub_3cat = y_s_train_sub[selection_3]
y_s_train_sub_3cat = y_s_train_sub_3cat[y_s_train_sub_3cat.sum(axis=1) != 0 ]
print(print(y_s_train_sub_3cat.sum(axis=0)))

4     213
5     208
21    207
dtype: int64
None
4     165
5     154
21    197
dtype: int64
None
4     666
5     623
21    650
dtype: int64
None


In [35]:
# train_s_indexes_3cat = y_s_train_3cat.index.values
test_s_indexes_3cat = y_s_test_3cat.index.values

train_s_indexes_sub_3cat = y_s_train_sub_3cat.index.values
test_s_indexes_sub_3cat = y_s_test_sub_3cat.index.values

### Limit to any category with more than 200 individually labeled data points and 700 multi-labeled points

In [36]:
counts_3 = y_train.sum(axis=0).values
for i,count in enumerate(counts_3):
    print(i,count)

0 10273
1 1011
2 2892
3 1251
4 1485
5 2032
6 811
7 2243
8 43
9 35
10 22
11 872
12 557
13 432
14 853
15 17
16 419
17 164
18 725
19 1191
20 134
21 3044
22 638
23 2378
24 259
25 6584
26 262
27 9


In [37]:
enough_data = [0,1,2,3,4,5,6,7,11,14,18,19,21,23,25]

In [38]:
y_test_enough_data = y_test[enough_data]
y_test_enough_data = y_test_enough_data[y_test_enough_data.sum(axis=1) != 0 ]
# print(print(y_test_enough_data.sum(axis=0)))

y_test_sub_enough_data = y_test_sub[enough_data]
y_test_sub_enough_data = y_test_sub_enough_data[y_test_sub_enough_data.sum(axis=1) != 0 ]
# print(print(y_test_sub_enough_data.sum(axis=0)))

y_train_sub_enough_data = y_train_sub[enough_data]
y_train_sub_enough_data = y_train_sub_enough_data[y_train_sub_enough_data.sum(axis=1) != 0 ]
# print(print(y_train_sub_enough_data.sum(axis=0)))

In [39]:
# train_s_indexes_3cat = y_s_train_3cat.index.values
test_indexes_enough_data = y_test_enough_data.index.values

train_indexes_sub_enough_data = y_train_sub_enough_data.index.values
test_indexes_sub_enough_data = y_test_sub_enough_data.index.values

### Enough Data (at least 200 examples) single label

In [40]:
counts_3 = y_s_train.sum(axis=0).values
for i,count in enumerate(counts_3):
    print(i,count)

0 1931
1 221
2 651
3 519
4 864
5 775
6 501
7 924
8 25
9 13
10 0
11 482
12 183
13 126
14 389
15 0
16 21
17 0
18 253
19 411
20 79
21 851
22 167
23 1331
24 96
25 1170
26 107
27 1


In [41]:
enough_data = [0,1,2,3,4,5,6,7,11,14,18,19,21,23,25]

In [42]:
y_s_test_enough_data = y_s_test[enough_data]
y_s_test_enough_data = y_s_test_enough_data[y_s_test_enough_data.sum(axis=1) != 0 ]
# print(print(y_test_enough_data.sum(axis=0)))

y_s_test_sub_enough_data = y_s_test_sub[enough_data]
y_s_test_sub_enough_data = y_s_test_sub_enough_data[y_s_test_sub_enough_data.sum(axis=1) != 0 ]
# print(print(y_test_sub_enough_data.sum(axis=0)))

y_s_train_sub_enough_data = y_s_train_sub[enough_data]
y_s_train_sub_enough_data = y_s_train_sub_enough_data[y_s_train_sub_enough_data.sum(axis=1) != 0 ]
# print(print(y_train_sub_enough_data.sum(axis=0)))

In [43]:
# train_s_indexes_3cat = y_s_train_3cat.index.values
test_indexes_enough_data = y_test_enough_data.index.values

train_indexes_sub_enough_data = y_train_sub_enough_data.index.values
test_indexes_sub_enough_data = y_test_sub_enough_data.index.values

# Importing 4D Image Data (X)

Let's look at our file names and define some important variables

In [40]:
################################################################################
input_pixels = 512
cnn_pixels = 512 
colors = 4
################################################################################

Load in each image and stack them in an array

In [41]:
# from skimage.transform import resize
# from keras.applications.imagenet_utils import preprocess_input, decode_predictions
# ##################################################################################################
# fpath = []
# for image_id in np.array(train_labels['Id']):
#     path = "../Data/train/"+ image_id
#     fpath.append(path)

# X = np.empty(shape=(len(train_labels['Id']),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)

# for i, fpath in enumerate(fpath):
#     images = np.zeros(shape=(input_pixels,input_pixels,colors))
#     images[:,:,0] = imread(fpath + "_green" + ".png")
#     images[:,:,1] = imread(fpath + "_red" + ".png")
#     images[:,:,2] = imread(fpath + "_blue" + ".png")
#     images[:,:,3] = imread(fpath + "_yellow" + ".png")
#     images = resize(images, (cnn_pixels, cnn_pixels, colors), preserve_range=True).astype(np.float32)
#     images = preprocess_input(images)
#     X[i, ...] = images

In [42]:
# from os import listdir

# # SAVE
# ##################################################################################################
# import tables
# h5file = tables.open_file('Generated_Files/4d_image_data_512.h5', mode='w', title="3d_image_data")
# root = h5file.root
# h5file.create_array(root, "image_data_rgby", X)
# h5file.close()

# # OPEN
# ##################################################################################################
X = np.empty(shape=(len(train_labels['Id']),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
import tables
hdf5_file = tables.open_file('Generated_Files/4d_image_data_512.h5', mode='r')
X[:,:,:,:] = np.array(hdf5_file.root.image_data_rgby)
hdf5_file.close()

### Single Label, 2 Categories

In [37]:
# X_s_train_2cat = np.array([X[i] for i in train_s_indexes_2cat])
X_s_test_2cat = np.empty(shape=(len(y_s_test_2cat),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_s_test_2cat = np.array([X[i] for i in test_s_indexes_2cat])

X_s_train_sub_2cat = np.empty(shape=(len(y_s_train_sub_2cat),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_s_train_sub_2cat = np.array([X[i] for i in train_s_indexes_sub_2cat])

X_s_test_sub_2cat = np.empty(shape=(len(y_s_test_sub_2cat),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_s_test_sub_2cat = np.array([X[i] for i in test_s_indexes_sub_2cat])

In [39]:
# SAVE
import tables
##################################################################################################
h5file = tables.open_file('Generated_Files/4d_small_train_data_2categories.h5', mode='w', title="Dataset")
root = h5file.root
h5file.create_array(root, "X_s_train_2cat", np.array(X_s_train_sub_2cat))
h5file.create_array(root, "y_s_train_2cat", np.array(y_s_train_sub_2cat))
h5file.create_array(root, "X_s_test_2cat", np.array(X_s_test_sub_2cat))
h5file.create_array(root, "y_s_test_2cat", np.array(y_s_test_sub_2cat))
h5file.close()

h5file = tables.open_file('Generated_Files/4d_small_test_data_2categories.h5', mode='w', title="Dataset")
root = h5file.root
h5file.create_array(root, "X_s_test_2cat", np.array(X_s_test_2cat))
h5file.create_array(root, "y_s_test_2cat", np.array(y_s_test_2cat))
h5file.close()

### Single Label, 3 Categories

In [39]:
X_s_test_3cat = np.array([X[i] for i in test_s_indexes_3cat])

X_s_train_sub_3cat = np.array([X[i] for i in train_s_indexes_sub_3cat])
X_s_test_sub_3cat = np.array([X[i] for i in test_s_indexes_sub_3cat])

In [41]:
import tables
# SAVE
##################################################################################################
h5file = tables.open_file('Generated_Files/4d_small_train_data_3categories.h5', mode='w', title="Dataset")
root = h5file.root
h5file.create_array(root, "X_s_train_3cat", np.array(X_s_train_sub_3cat))
h5file.create_array(root, "y_s_train_3cat", np.array(y_s_train_sub_3cat))
h5file.create_array(root, "X_s_test_3cat", np.array(X_s_test_sub_3cat))
h5file.create_array(root, "y_s_test_3cat", np.array(y_s_test_sub_3cat))
h5file.close()

h5file = tables.open_file('Generated_Files/4d_small_test_data_3categories.h5', mode='w', title="Dataset")
root = h5file.root
h5file.create_array(root, "X_s_test_3cat", np.array(X_s_test_3cat))
h5file.create_array(root, "y_s_test_3cat", np.array(y_s_test_3cat))
h5file.close()

### Single Labels, All Categories

In [None]:
X_s_test = np.empty(shape=(len(y_s_test),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_s_test = np.array([X[i] for i in test_sl_indexes])

X_s_train_sub = np.empty(shape=(len(y_s_train_sub),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_s_train_sub = np.array([X[i] for i in train_sl_indexes_sub])

X_s_test_sub = np.empty(shape=(len(y_s_test_sub),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_s_test_sub = np.array([X[i] for i in test_sl_indexes_sub])

In [None]:
# SAVE
##################################################################################################
h5file = tables.open_file('Generated_Files/4d_small_train_data.h5', mode='w', title="Dataset")
root = h5file.root
h5file.create_array(root, "X_s_train", np.array(X_s_train_sub))
h5file.create_array(root, "y_s_train", np.array(y_s_train_sub))
h5file.create_array(root, "X_s_test", np.array(X_s_test_sub))
h5file.create_array(root, "y_s_test", np.array(y_s_test_sub))
h5file.close()

h5file = tables.open_file('Generated_Files/4d_small_test_data.h5', mode='w', title="Dataset")
root = h5file.root
h5file.create_array(root, "X_s_test", np.array(X_s_test))
h5file.create_array(root, "y_s_test", np.array(y_s_test))
h5file.close()

### Enough data (At least 100 individual and 700 multi labels each)

In [48]:
X_test_enough_data = np.empty(shape=(len(y_test_enough_data),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_test_enough_data = np.array([X[i] for i in test_indexes_enough_data])

X_train_sub_enough_data = np.empty(shape=(len(y_train_sub_enough_data),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_train_sub_enough_data = np.array([X[i] for i in train_indexes_sub_enough_data])

X_test_sub_enough_data = np.empty(shape=(len(y_test_sub_enough_data),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_test_sub_enough_data = np.array([X[i] for i in test_indexes_sub_enough_data])


In [49]:
# SAVE
##################################################################################################
h5file = tables.open_file('Generated_Files/4d_enough_train_data.h5', mode='w', title="Dataset")
root = h5file.root
h5file.create_array(root, "X_train", np.array(X_train_sub_enough_data))
h5file.create_array(root, "y_train", np.array(y_train_sub_enough_data))
h5file.create_array(root, "X_test", np.array(X_test_sub_enough_data))
h5file.create_array(root, "y_test", np.array(y_test_sub_enough_data))
h5file.close()

h5file = tables.open_file('Generated_Files/4d_enough_test_data.h5', mode='w', title="Dataset")
root = h5file.root
h5file.create_array(root, "X_test", np.array(X_test_enough_data))
h5file.create_array(root, "y_test", np.array(y_test_enough_data))
h5file.close()

In [71]:
y_train_sub_enough_data.shape

(19192, 15)

In [64]:
counts_e = y_test_sub_enough_data.sum(axis=0).values
for i,count in enumerate(counts_e):
    print(i,count)

0 2078
1 204
2 571
3 267
4 284
5 370
6 166
7 489
8 155
9 168
10 119
11 236
12 649
13 451
14 1317


In [68]:
counts_e = y_test_sub_enough_data.sum(axis=0).values
class_weights = []
for i,count in enumerate(counts_e):
    print(i,np.round(len(y_test_sub_enough_data)/count,2))
    class_weights.append(round(len(y_test_sub_enough_data)/count,2))

0 2.31
1 23.57
2 8.42
3 18.01
4 16.93
5 12.99
6 28.96
7 9.83
8 31.02
9 28.62
10 40.4
11 20.37
12 7.41
13 10.66
14 3.65


In [69]:
class_weights

[2.31,
 23.57,
 8.42,
 18.01,
 16.93,
 12.99,
 28.96,
 9.83,
 31.02,
 28.62,
 40.4,
 20.37,
 7.41,
 10.66,
 3.65]

In [59]:
(2078+1317+649+571)/len(y_test_sub_enough_data)

0.9598585690515807

### All data

In [50]:
X_test = np.empty(shape=(len(y_test),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_test[:,:,:,:] = np.array([X[i] for i in test_indexes[1]])


X_train_sub = np.empty(shape=(len(y_train_sub),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_train_sub = np.array([X[i] for i in sub_train_indexes[1]])

X_test_sub = np.empty(shape=(len(y_test_sub),cnn_pixels,cnn_pixels,colors), dtype=np.uint8)
X_test_sub = np.array([X[i] for i in sub_test_indexes[1]])

In [51]:
# SAVE
##################################################################################################
h5file = tables.open_file('Generated_Files/4d_train_data.h5', mode='w', title="Dataset")
root = h5file.root
h5file.create_array(root, "X_train", np.array(X_train_sub))
h5file.create_array(root, "y_train", np.array(y_train_sub))
h5file.create_array(root, "X_test", np.array(X_test_sub))
h5file.create_array(root, "y_test", np.array(y_test_sub))
h5file.close()

h5file = tables.open_file('Generated_Files/4d_test_data.h5', mode='w', title="Dataset")
root = h5file.root
h5file.create_array(root, "X_test", np.array(X_test))
h5file.create_array(root, "y_test", np.array(y_test))
h5file.close()

In [None]:
print('Multi Label')
print('------------')
print('test: ',len(test_indexes[1])," ",X_test.shape)
print('train: ',len(train_indexes[1])," ",X_train.shape)
print('   train_test: ',len(sub_test_indexes[1])," ",X_test_sub.shape)
print('   train_train: ',len(sub_train_indexes[1])," ",X_train_sub.shape)

print('==============================')

print('Single Label')
print('------------')
print('test: ',len(test_sl_indexes)," ",X_s_test.shape)
print('train: ',len(train_sl_indexes)," ",X_s_train.shape) # sub components don't exactly add
print('   train_test: ',len(test_sl_indexes_sub)," ",X_s_test_sub.shape)
print('   train_train: ',len(train_sl_indexes_sub)," ",X_s_train_sub.shape)

print('==============================')

print('Single Label, 2 Categories')
print('------------')
print('test: ',len(test_s_indexes_2cat)," ",X_s_test_2cat.shape)
print('train: ',len(train_s_indexes_2cat)," ",X_s_train_2cat.shape) # sub components don't exactly add
print('   train_test: ',len(test_s_indexes_sub_2cat)," ",X_s_test_sub_2cat.shape)
print('   train_train: ',len(train_s_indexes_sub_2cat)," ",X_s_train_sub_2cat.shape)


In [44]:
from skmultilearn.cluster.graphtool import GraphToolLabelGraphClusterer, StochasticBlockModel
model = StochasticBlockModel(nested=False, use_degree_correlation=True, allow_overlap=False, weight_model='real-normal')
clusterer_graphtool = GraphToolLabelGraphClusterer(graph_builder=graph_builder, model=model)
clusterer_graphtool.fit_predict(None, y_train)

ModuleNotFoundError: No module named 'graph_tool'

In [None]:
node_label = clusterer_graphtool.graph_.new_vertex_property("string")

for i, v in enumerate(clusterer_graphtool.graph_.vertices()):
    node_label[v] = label_names[i][0]

clusterer_graphtool.model.model_.draw(vertex_text=node_label)