## Data processing

This Jupyter notebook converts binary attribute(s) to +/-1 and categorical attribute(s) to one-hot encoding.

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

We load the data that were cleaned in the `data cleaning` step.

In [2]:
# Read the cleaned table; every field is kept as a string at this stage.
Xy = np.loadtxt('peptide_cleaned.dat', dtype='str')

# Show the table dimensions, then a preview of its contents.
print(Xy.shape, Xy, sep='\n')

(699, 56)
[['63.0' '1.0' '1.0' ... '123.48' '150.0' '1.0']
 ['51.0' '0.0' '1.0' ... '238.96' '100.0' '1.0']
 ['83.0' '1.0' '1.0' ... '618.84' '145.0' '1.0']
 ...
 ['47.0' '1.0' '0.0' ... '207.18' '120.0' '0.0']
 ['59.0' '1.0' '0.0' ... '196.0' '130.0' '0.0']
 ['49.0' '1.0' '1.0' ... '257.33' '120.0' '0.0']]


### Attributes

We count the number of unique values in each column to get an idea of which variables are continuous, binary, or categorical. This depends on the data, but as a rule of thumb: nu = 2 --> binary; nu = 3 or 4 --> categorical; nu > 4 --> continuous. Of course, we also have to look at the data in detail.

In [3]:
# Features are all columns except the last one.
X = Xy[:, :-1]
l, n = X.shape

# nu[i] holds the number of distinct values found in column i.
nu = np.array([np.unique(column).size for column in X.T])
print('number of uniques of each variable:', nu, sep='\n')

number of uniques of each variable:
[ 58   2   2   2   2   2   2   2   2   2   2   2   2   4   2   2   2   2
 603   2   2   2   2   2   2   2   2   2   2   2   2   2   2  12 506 508
   2 510   2 508 124  59 132 218 134 389 657 516   2 516 627 504  68 670
  48]


In [None]:
# Split the column indices of X into three groups by their number of
# distinct values: exactly 2 -> binary, fewer than 5 -> categorical,
# everything else -> continuous.
i_binary = []
i_category = []
i_continuous = []
for i in range(X.shape[1]):
    # Use a dedicated local name here so that the notebook-level array `nu`
    # (unique counts per column) is not overwritten by this loop.
    n_unique = len(np.unique(X[:,i]))
    if n_unique == 2: # binary
        i_binary.append(i)
    elif n_unique < 5:
        # NOTE: a column with a single distinct value also lands here.
        i_category.append(i)
    else:
        i_continuous.append(i)

print('i_binary:',i_binary)
print('i_category:',i_category)

We then define the variable type of each column: 1 = continuous, 2 = binary, 3 = categorical.

In [4]:
# Mark every column as continuous (1) first, then overwrite the entries
# of the binary (2) and categorical (3) columns.
variable_type = np.full(n, 1.0)
variable_type[i_binary] = 2
variable_type[i_category] = 3

print(variable_type)

[1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 3. 1. 2. 2. 2. 1. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 1. 1. 1. 2. 1. 2. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 2. 1. 1. 1. 1. 1. 1.]


In [5]:
# Column 13 is the one flagged as categorical (type 3); list its levels and how often each occurs.
np.unique(X[:, 13], return_counts=True)


(array(['0.0', '1.0', '2.0', '3.0'], dtype='<U20'),
 array([ 68, 382, 175,  74]))

We now convert binary to +/-1, category to onehot.

In [6]:
def convert_binary_and_category(x,variable_type):
    """
    Convert binary columns to +-1 and categorical columns to one-hot;
    continuous columns are kept as they are.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Input table. Continuous columns must be convertible to float
        (numbers or numeric strings).
    variable_type : sequence
        Type code of each column of `x`, in column order:
        1 = continuous, 2 = binary, any other value = categorical.

    Returns
    -------
    numpy.ndarray of float, shape (n_samples, n_out)
        Encoded columns in the original column order. A continuous or
        binary column yields one output column; a categorical column yields
        one column per distinct value, ordered by sorted value.

    Notes
    -----
    The one-hot step is done with numpy rather than scikit-learn's
    ``OneHotEncoder(sparse=False)``: the ``sparse`` keyword was renamed to
    ``sparse_output`` and later removed, so the old call fails on recent
    scikit-learn releases. The resulting columns are the same.
    """
    x = np.asarray(x)
    n_samples = x.shape[0]

    # Collect the encoded pieces and stack them once at the end.
    encoded = []

    for i,i_type in enumerate(variable_type):
        column = x[:,i]
        if i_type == 1: # continuous
            encoded.append(column.astype(float)[:,np.newaxis])
        elif i_type == 2: # binary
            # The smallest distinct value maps to -1, everything else to +1.
            signed = np.ones(n_samples)
            unique_value = np.unique(column)
            if unique_value.size:
                signed[column == unique_value[0]] = -1.
            encoded.append(signed[:,np.newaxis])
        else: # category
            # `codes` gives, for every row, the position of its value in the
            # sorted list of distinct values; indexing an identity matrix
            # with it produces the one-hot rows.
            levels, codes = np.unique(column, return_inverse=True)
            encoded.append(np.eye(len(levels))[np.ravel(codes)])

    if not encoded:
        # No column was requested: return an empty (n_samples, 0) table.
        return np.zeros((n_samples,0))

    return np.hstack(encoded).astype(float)

In [7]:
# Encode the feature matrix according to the type of each column.
X_new = convert_binary_and_category(x=X, variable_type=variable_type)

print(X_new.shape, X_new, sep='\n')

(699, 58)
[[ 6.3000e+01  1.0000e+00  1.0000e+00 ...  0.0000e+00  1.2348e+02
   1.5000e+02]
 [ 5.1000e+01 -1.0000e+00  1.0000e+00 ...  0.0000e+00  2.3896e+02
   1.0000e+02]
 [ 8.3000e+01  1.0000e+00  1.0000e+00 ...  2.3000e-02  6.1884e+02
   1.4500e+02]
 ...
 [ 4.7000e+01  1.0000e+00 -1.0000e+00 ...  0.0000e+00  2.0718e+02
   1.2000e+02]
 [ 5.9000e+01  1.0000e+00 -1.0000e+00 ...  5.0000e-03  1.9600e+02
   1.3000e+02]
 [ 4.9000e+01  1.0000e+00  1.0000e+00 ...  3.0000e-03  2.5733e+02
   1.2000e+02]]


### Target

In [8]:
# The target is the last column of the loaded table, converted to float.
y = Xy[:, -1].astype(float)

# Distinct target values and how many samples carry each of them.
print(np.unique(y, return_counts=True))

(array([0., 1.]), array([676,  23]))


In [9]:
# The target is carried over unchanged: y_new refers to the same array as y.
y_new = y

# Distinct values of y_new and their counts.
print(np.unique(y_new, return_counts=True))

(array([0., 1.]), array([676,  23]))


In [10]:
# Append the target as the last column of the encoded feature matrix.
Xy_new = np.column_stack((X_new, y_new))

In [11]:
# Write the processed table to disk as fixed-point text.
np.savetxt(fname='peptide_processed.dat', X=Xy_new, fmt='%f')