In [4]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


In [23]:
mice_path = '/home/er647/projects/feature-wise-active-learning/data/Mice_Protein/Data_Cortex_Nuclear.csv'


def load_mice(one_hot = True):
    filling_value = -100000

    X = np.genfromtxt(mice_path, delimiter = ',', skip_header = 1, usecols = range(1, 78), filling_values = filling_value, encoding = 'UTF-8')
    classes = np.genfromtxt(mice_path, delimiter = ',', skip_header = 1, usecols = range(78, 81), dtype = None, encoding = 'UTF-8')

    for i, row in enumerate(X):
        for j, val in enumerate(row):
            if val == filling_value:
                X[i, j] = np.mean([X[k, j] for k in range(classes.shape[0]) if np.all(classes[i] == classes[k])])

    DY = np.zeros((classes.shape[0]), dtype = np.uint8)
    for i, row in enumerate(classes):
        for j, (val, label) in enumerate(zip(row, ['Control', 'Memantine', 'C/S'])):
            DY[i] += (2 ** j) * (val == label)

    Y = np.zeros((DY.shape[0], np.unique(DY).shape[0]))
    for idx, val in enumerate(DY):
        Y[idx, val] = 1

    X = MinMaxScaler(feature_range=(0,1)).fit_transform(X)

    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X = X[indices]
    Y = Y[indices]
    DY = DY[indices]
    classes = classes[indices]
    
    if not one_hot:
        Y = DY
        
    X = X.astype(np.float32)
    Y = Y.astype(np.float32)
    
    print(X.shape, Y.shape)
    
    return (X[: X.shape[0] * 4 // 5], Y[: X.shape[0] * 4 // 5]), (X[X.shape[0] * 4 // 5:], Y[X.shape[0] * 4 // 5: ])


In [24]:
ret = load_mice()

(1080, 77) (1080, 8)


In [25]:
train, test = ret
X_train, y_train = train

X_test, y_test = test


In [26]:
X_train.shape, y_train.shape

((864, 77), (864, 8))

In [27]:
X_test.shape, y_test.shape


((216, 77), (216, 8))

In [28]:
y_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)

In [29]:
X_train

array([[0.9990008 , 0.9990378 , 0.99990886, ..., 0.9999959 , 0.99999535,
        0.29492882],
       [0.9990162 , 0.9990806 , 0.99993986, ..., 0.9999938 , 0.99999434,
        0.4443824 ],
       [0.999029  , 0.99907637, 0.99989676, ..., 0.9999932 , 0.9999932 ,
        0.7109026 ],
       ...,
       [0.998987  , 0.99900556, 0.9999088 , ..., 0.4993335 , 0.9999992 ,
        0.05470163],
       [0.9991248 , 0.9991505 , 0.999909  , ..., 0.9999934 , 0.9999946 ,
        0.6992083 ],
       [0.9992677 , 0.99926716, 0.9999119 , ..., 0.99999285, 0.9999934 ,
        0.62603366]], dtype=float32)