## Data processing

This Jupyter notebook converts binary attribute(s) to +/-1 and categorical attribute(s) to one-hot encoding.

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

We load the data that were cleaned in the `data cleaning` step.

In [2]:
# Read the cleaned table as strings; each row is one record.
Xy = np.loadtxt(fname='smoking_cleaned.dat', dtype='str')

# Show the table dimensions first, then the (abbreviated) contents.
print(Xy.shape, Xy, sep='\n')

(1574, 94)
[['35.0' '2.0' '1.0' ... '0.0' '1.0' '0.0']
 ['34.0' '1.0' '4.0' ... '0.0' '1.0' '0.0']
 ['37.0' '2.0' '3.0' ... '0.0' '1.0' '0.0']
 ...
 ['28.0' '1.0' '4.0' ... '1.0' '4.0' '0.0']
 ['29.0' '1.0' '2.0' ... '0.0' '2.0' '0.0']
 ['47.0' '2.0' '1.0' ... '0.0' '1.0' '0.0']]


### Attributes

We find the number of unique values (`nu`) in each column to get an idea of which variables are continuous, which are binary, and which are categorical. It depends on the data, but as a rule of thumb: nu = 2 --> binary; nu = 3 or 4 --> categorical; nu > 4 --> continuous. Of course, we still have to look at the data in detail.

In [3]:
# Every column except the last one is an input variable; parse them as floats.
X = Xy[:, :-1].astype(float)
l, n = X.shape

# Count the distinct values in each column (iterate over columns via X.T).
nu = np.array([np.unique(column).size for column in X.T])
print('number of uniques of each variable:', nu, sep='\n')

number of uniques of each variable:
[  58    2    5    3    5    4    6    3    3    2    2    2    3    4
    3    3    3    7    3    3    3    4    3    3    3    3    3    3
    8    9    3    3    3    3    3    3    3    4    4    4    4    3
 1565   10    4    3    3    3    5    2    3    7    3    2    2    2
    3    3    3    3    3    4    2    2    2    2    2    2    2    2
    9    2    9    2    2    2    2    2    2    2    2    2    2    2
    2    2    2    2    2    2    2    2    4]


We then define the type of each variable: 1 = continuous, 2 = binary, 3 = category.

In [4]:
# Type codes -- 1: continuous, 2: binary, 3: category.
variable_type = np.full(n, 1.0)  # every column starts out as continuous

# Each entry below is a single column index or a half-open slice of indices.
for _columns in np.s_[1, 9:12, 49, 53:56, 62:70, 71, 73:92]:
    variable_type[_columns] = 2  # binary

for _columns in np.s_[3, 7:9, 12, 14:17, 18:21, 22:28, 30:37, 41, 45:48, 50, 52, 56:61]:
    variable_type[_columns] = 3  # category

del _columns  # do not leave the loop helper behind in the notebook namespace
print(variable_type)

[1. 2. 1. 3. 1. 1. 1. 3. 3. 2. 2. 2. 3. 1. 3. 3. 3. 1. 3. 3. 3. 1. 3. 3.
 3. 3. 3. 3. 1. 1. 3. 3. 3. 3. 3. 3. 3. 1. 1. 1. 1. 3. 1. 1. 1. 3. 3. 3.
 1. 2. 3. 1. 3. 2. 2. 2. 3. 3. 3. 3. 3. 1. 2. 2. 2. 2. 2. 2. 2. 2. 1. 2.
 1. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1.]


In [5]:
# Column indices of the binary variables, written as index ranges.
i_binary = [
    1,
    *range(9, 12),
    49,
    *range(53, 56),
    *range(62, 70),
    71,
    *range(73, 92),
]

print(len(i_binary))

36


In [None]:
# Mark every column listed in i_binary with type code 2 (binary).
np.put(variable_type, i_binary, 2.)

In [6]:
# Column indices of the categorical variables, written as index ranges.
i_category = [
    3,
    *range(7, 9),
    12,
    *range(14, 17),
    *range(18, 21),
    *range(22, 28),
    *range(30, 37),
    41,
    *range(45, 48),
    50,
    52,
    *range(56, 61),
]
print(len(i_category))

34


We now convert binary variables to +/-1 and categorical variables to one-hot.

In [7]:
#np.savetxt('smoking_processed.dat',Xy_new,fmt='%f')

In [8]:
# Pull out the binary columns (all rows) as a new array.
x_binary = np.take(X, i_binary, axis=1)

In [9]:
# One type code per selected column; all of them are 2 (binary).
variable_type_binary = np.full(shape=len(i_binary), fill_value=2)
print(variable_type_binary)

[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


In [10]:
def convert_binary_and_category(x,variable_type):
    """
    Convert binary columns to +/-1 and categorical columns to one-hot;
    continuous columns are kept as they are.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_variables)
        Input table, one column per variable.
    variable_type : sequence of length n_variables
        Type code of each column of `x`:
        1 = continuous, 2 = binary, anything else = category.

    Returns
    -------
    numpy.ndarray of float, shape (n_samples, n_columns_out)
        Converted columns in the original column order.  A continuous or
        binary variable yields one column; a categorical variable yields one
        column per distinct value, ordered by sorted value.  An empty
        `variable_type` yields an array with zero columns.

    Notes
    -----
    * Binary: the smallest value of the column (first entry of `np.unique`)
      maps to -1 and every other value maps to +1, so a column tagged as
      binary that holds more than two values is NOT rejected.
    * The one-hot encoding is done with NumPy only.  The previous
      `OneHotEncoder(sparse=False)` call fails on scikit-learn >= 1.4, where
      the `sparse` keyword was removed (renamed `sparse_output`), and it was
      constructed even when no categorical column was present.
    """
    x = np.asarray(x)
    n_samples = x.shape[0]

    # Collect the converted pieces and stack once at the end instead of
    # re-copying the growing matrix on every column.
    parts = []

    for i,i_type in enumerate(variable_type):
        column = x[:,i]

        if i_type == 1: # continuous
            parts.append(column.astype(float)[:,np.newaxis])

        elif i_type == 2: # binary
            unique_value = np.unique(column)
            if unique_value.size:
                x1 = np.where(column == unique_value[0], -1., 1.)
            else:
                # no rows: nothing to map (and no unique_value[0] to compare with)
                x1 = np.zeros(0)
            parts.append(x1[:,np.newaxis])

        else: # category
            # `codes[k]` is the position of column[k] among the sorted
            # distinct values, i.e. the index of its one-hot column.
            categories, codes = np.unique(column, return_inverse=True)
            parts.append(np.eye(len(categories))[codes.reshape(-1)])

    if not parts:
        return np.zeros((n_samples,0))

    return np.hstack(parts).astype(float)

In [11]:
# convert X
x_binary_new = convert_binary_and_category(x_binary,variable_type_binary)

print(x_binary_new.shape)

(1574, 36)


In [12]:
# Write the +/-1 encoded binary columns, formatted as integers.
np.savetxt(fname='smoking_binary.dat', X=x_binary_new, fmt='%i')

In [13]:
# Positions of the columns whose type code is 1 (continuous).
i_continuous = np.nonzero(variable_type == 1)[0]
print(len(i_continuous), i_continuous, sep='\n')

23
[ 0  2  4  5  6 13 17 21 28 29 37 38 39 40 42 43 44 48 51 61 70 72 92]


In [14]:
# Pull out the continuous columns (all rows) as a new array.
x_continuous = np.take(X, i_continuous, axis=1)
print(x_continuous.shape)

(1574, 23)


In [15]:
# Write the continuous columns in fixed-point notation.
np.savetxt(fname='smoking_continuous.dat', X=x_continuous, fmt='%f')