In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
Xy = np.loadtxt('data_processed.dat')

print(Xy.shape)
print(Xy)

(372, 20)
[[48.    80.     1.02  ... -1.    -1.     1.   ]
 [ 7.    50.     1.02  ... -1.    -1.     1.   ]
 [62.    80.     1.01  ... -1.     1.     1.   ]
 ...
 [12.    80.     1.02  ... -1.    -1.     0.   ]
 [17.    60.     1.025 ... -1.    -1.     0.   ]
 [58.    80.     1.025 ... -1.    -1.     0.   ]]


We find number of unique value for each column, to have an idea about which variables are continuous, which variables are binary, category. It depends on data, however it is likely that nu = 2 --> binary; nu = 3 or 4: --> category, n > 4: continuous. Of course, we have to see data in detail as well.

In [4]:
X = Xy[:,:-1]
l,n = X.shape
nu = np.array([len(np.unique(X[:,i])) for i in range(n)])
print('number of uniques of each variable:')
print(nu)

number of uniques of each variable:
[ 74  10   5   6   6   2   2   2 141 111  76 113  42   2   2   2   2   2
   2]


In [5]:
# We then define variable type, 1: continuous, 2: binary, 3: category.
variable_type  = np.ones(n) # continuous
variable_type[5:8] = 2 # binary
variable_type[13:] = 2 # binary

print(variable_type)

[1. 1. 1. 1. 1. 2. 2. 2. 1. 1. 1. 1. 1. 2. 2. 2. 2. 2. 2.]


In [6]:
def convert_continuous_to_binary(x,variable_type):
    """
    convert continuous to binary by subtracting them with mean values.
    """
    
    # create 2 initial columns
    x_new = np.zeros((x.shape[0],2))

    for i,i_type in enumerate(variable_type):
        if i_type == 1: # continuous            
            x1 = np.sign(x[:,i] - x[:,i].mean())            
        x_new = np.hstack((x_new,x1[:,np.newaxis]))
    # drop the 2 initial column
    x_new = x_new[:,2:]
    
    return x_new.astype(float)

In [7]:
X_new = convert_continuous_to_binary(X,variable_type)

In [8]:
nu = np.array([len(np.unique(X_new[:,i])) for i in range(n)])
print('number of uniques of each variable:')
print(nu)

number of uniques of each variable:
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [11]:
print(X_new)

[[-1.  1.  1. ...  1.  1.  1.]
 [-1. -1.  1. ... -1. -1. -1.]
 [ 1.  1. -1. ... -1. -1. -1.]
 ...
 [-1.  1.  1. ...  1.  1.  1.]
 [-1. -1.  1. ...  1.  1.  1.]
 [ 1.  1.  1. ...  1.  1.  1.]]


In [13]:
for i in range(n):
    print(i,np.unique(X_new[:,i],return_counts=True))

0 (array([-1.,  1.]), array([166, 206]))
1 (array([-1.,  1.]), array([178, 194]))
2 (array([-1.,  1.]), array([161, 211]))
3 (array([-1.,  1.]), array([220, 152]))
4 (array([-1.,  1.]), array([313,  59]))
5 (array([-1.,  1.]), array([313,  59]))
6 (array([-1.,  1.]), array([313,  59]))
7 (array([-1.,  1.]), array([313,  59]))
8 (array([-1.,  1.]), array([274,  98]))
9 (array([-1.,  1.]), array([273,  99]))
10 (array([-1.,  1.]), array([279,  93]))
11 (array([-1.,  1.]), array([164, 208]))
12 (array([-1.,  1.]), array([148, 224]))
13 (array([-1.,  1.]), array([148, 224]))
14 (array([-1.,  1.]), array([148, 224]))
15 (array([-1.,  1.]), array([148, 224]))
16 (array([-1.,  1.]), array([148, 224]))
17 (array([-1.,  1.]), array([148, 224]))
18 (array([-1.,  1.]), array([148, 224]))


In [15]:
# disease vs non-disease
# select non-disease
t = Xy[:,-1] == 0
X1 = X_new[t]
X2 = X_new[~t]

In [16]:
for i in range(n):
    print(i,np.unique(X1[:,i],return_counts=True),np.unique(X2[:,i],return_counts=True))

0 (array([-1.,  1.]), array([92, 57])) (array([-1.,  1.]), array([ 74, 149]))
1 (array([-1.,  1.]), array([83, 66])) (array([-1.,  1.]), array([ 95, 128]))
2 (array([1.]), array([149])) (array([-1.,  1.]), array([161,  62]))
3 (array([-1.]), array([149])) (array([-1.,  1.]), array([ 71, 152]))
4 (array([-1.]), array([149])) (array([-1.,  1.]), array([164,  59]))
5 (array([-1.]), array([149])) (array([-1.,  1.]), array([164,  59]))
6 (array([-1.]), array([149])) (array([-1.,  1.]), array([164,  59]))
7 (array([-1.]), array([149])) (array([-1.,  1.]), array([164,  59]))
8 (array([-1.]), array([149])) (array([-1.,  1.]), array([125,  98]))
9 (array([-1.]), array([149])) (array([-1.,  1.]), array([124,  99]))
10 (array([-1.]), array([149])) (array([-1.,  1.]), array([130,  93]))
11 (array([1.]), array([149])) (array([-1.,  1.]), array([164,  59]))
12 (array([1.]), array([149])) (array([-1.,  1.]), array([148,  75]))
13 (array([1.]), array([149])) (array([-1.,  1.]), array([148,  75]))
14 (

In [18]:
np.savetxt('data_binary_nondisease.txt',X1,fmt='%i')
np.savetxt('data_binary_disease.txt',X2,fmt='%i')
np.savetxt('data_binary.txt',X_new,fmt='%i')