## Data processing

This Jupyter Noterbook helps us to convert binary attribute(s) to +/-1, categorical attributes(s) to onehot.

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

We load the data which were cleaned from the `data cleaning` step.

In [2]:
Xy = np.loadtxt('coimbra_cleaned.dat', dtype = 'str')

print(Xy.shape)
print(Xy)

(116, 10)
[['48.0' '23.5' '70.0' ... '7.99585' '417.11400000000003' '1.0']
 ['83.0' '20.69049454' '92.0' ... '4.06405' '468.786' '1.0']
 ['82.0' '23.12467037' '91.0' ... '9.27715' '554.697' '1.0']
 ...
 ['65.0' '32.05' '97.0' ... '10.33' '314.05' '2.0']
 ['72.0' '25.59' '82.0' ... '3.27' '392.46' '2.0']
 ['86.0' '27.18' '138.0' ... '4.35' '90.09' '2.0']]


### Attributes

We find number of unique value for each column, to have an idea about which variables are continuous, which variables are binary, category. It depends on data, however it is likely that nu = 2 --> binary; nu = 3 or 4: --> category, n > 4: continuous. Of course, we have to see data in detail as well.

In [3]:
X = Xy[:,:-1]
l,n = X.shape
nu = np.array([len(np.unique(X[:,i])) for i in range(n)])
print('number of uniques of each variable:')
print(nu)

number of uniques of each variable:
[ 51 110  50 113 116 116 115 116 113]


In [4]:
i_binary = []
i_category = []
i_continuous = []
for i in range(X.shape[1]):
    nu = np.unique(X[:,i])
    if len(nu) == 2: # binary 
        i_binary.append(i)
    elif len(nu) < 5:
        i_category.append(i)
    else:
        i_continuous.append(i)
        
print('i_binary:',i_binary)
print('i_category:',i_category)

i_binary: []
i_category: []


We then define variable type, 1: continuous, 2: binary, 3: category.

In [5]:
variable_type  = np.ones(n) # continuous
variable_type[i_binary] = 2 # binary
variable_type[i_category] = 3 # categorical

print(variable_type)

[1. 1. 1. 1. 1. 1. 1. 1. 1.]


We now convert binary to +/-1, category to onehot.

In [6]:
def convert_binary_and_category(x,variable_type):
    """
    convert binary to +-1, category to one hot; remain continuous.
    """
    
    onehot_encoder = OneHotEncoder(sparse=False,categories='auto')

    # create 2 initial columns
    x_new = np.zeros((x.shape[0],2))

    for i,i_type in enumerate(variable_type):
        if i_type == 1: # continuous
            x_new = np.hstack((x_new,x[:,i][:,np.newaxis]))
        elif i_type == 2: # binary
            unique_value = np.unique(x[:,i])
            x1 = np.array([-1. if value == unique_value[0] else 1. for value in x[:,i]])        
            x_new = np.hstack((x_new,x1[:,np.newaxis]))
        else: # category      
            x1 = onehot_encoder.fit_transform(x[:,i].reshape(-1,1))
            x_new = np.hstack((x_new,x1))        

    # drop the 2 initial column
    x_new = x_new[:,2:]
    
    return x_new.astype(float)

In [7]:
# convert X
X_new = convert_binary_and_category(X,variable_type)

print(X_new.shape)
print(X_new)

(116, 9)
[[ 48.          23.5         70.         ...   9.7024       7.99585
  417.114     ]
 [ 83.          20.69049454  92.         ...   5.429285     4.06405
  468.786     ]
 [ 82.          23.12467037  91.         ...  22.43204      9.27715
  554.697     ]
 ...
 [ 65.          32.05        97.         ...  22.54        10.33
  314.05      ]
 [ 72.          25.59        82.         ...  33.75         3.27
  392.46      ]
 [ 86.          27.18       138.         ...  14.11         4.35
   90.09      ]]


### Target

In [8]:
## target
y = Xy[:,-1].astype(float)

print(np.unique(y,return_counts=True))

(array([1., 2.]), array([52, 64]))


In [9]:
y_new = np.ones(y.shape[0])
y_new[y =='2'] = 0


print(np.unique(y_new,return_counts=True))

(array([1.]), array([116]))


  


In [10]:
# combine X and y
Xy_new = np.hstack((X_new,y_new[:,np.newaxis]))

In [11]:
np.savetxt('coimbra_processed.dat',Xy_new,fmt='%f')