## Data processing

This Jupyter Noterbook helps us to convert binary attribute(s) to +/-1, categorical attributes(s) to onehot.

In [1]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

We load the data which were cleaned from the `data cleaning` step.

In [2]:
Xy = np.loadtxt('language_cleaned.dat', dtype = 'str')

print(Xy.shape)
print(Xy)

(1163, 60)
[['165.0' '13.75' '287.0' ... '1.0' '12.0' '1.0']
 ['172.0' '14.33333333' '368.0' ... '0.0' '9.0' '1.0']
 ['160.0' '13.33333333' '266.0' ... '0.0' '6.0' '1.0']
 ...
 ['119.0' '9.916666667000001' '337.0' ... '4.0' '9.0' '0.0']
 ['112.0' '9.333333332999999' '511.0' ... '5.0' '15.0' '0.0']
 ['108.0' '9.0' '495.0' ... '4.0' '12.0' '0.0']]


### Attributes

We find number of unique value for each column, to have an idea about which variables are continuous, which variables are binary, category. It depends on data, however it is likely that nu = 2 --> binary; nu = 3 or 4: --> category, n > 4: continuous. Of course, we have to see data in detail as well.

In [3]:
X = Xy[:,:-1]
l,n = X.shape
nu = np.array([len(np.unique(X[:,i])) for i in range(n)])
print('number of uniques of each variable:')
print(nu)

number of uniques of each variable:
[ 129  129  611  133   99  396  643  578   51   13   59   48   52 1160
 1161 1162 1158 1161 1156  970  970   13   13  643  643  133  133  664
 1095  970  978   93  592  628   81   56   16   17   30  103   12   43
  129   44   58   60   51   11   17   13 1161   16   24   25   32    3
    3   22   61]


In [4]:
i_binary = []
i_category = []
i_continuous = []
for i in range(X.shape[1]):
    nu = np.unique(X[:,i])
    if len(nu) == 2: # binary 
        i_binary.append(i)
    elif len(nu) < 5:
        i_category.append(i)
    else:
        i_continuous.append(i)
        
print('i_binary:',i_binary)
print('i_category:',i_category)

i_binary: []
i_category: [55, 56]


We then define variable type, 1: continuous, 2: binary, 3: category.

In [5]:
variable_type  = np.ones(n) # continuous
variable_type[i_binary] = 2 # binary
variable_type[i_category] = 3 # categorical

print(variable_type)

[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 3. 3. 1. 1.]


We then check if categorical variables have enough values in each category.

In [6]:
np.unique(X[:,55],return_counts=True)
np.unique(X[:,56],return_counts=True)

(array(['0.0', '1.0', '2.0'], dtype='<U22'), array([1123,   32,    8]))

We then drop both if it's clear that there is a significant majority of a particular value in the category compared to others.

In [7]:
#np.delete(X, np.s_[55:57], axis=1)
np.delete(X, [55,56], axis=1)

array([['165.0', '13.75', '287.0', ..., '7.0', '1.0', '12.0'],
       ['172.0', '14.33333333', '368.0', ..., '5.0', '0.0', '9.0'],
       ['160.0', '13.33333333', '266.0', ..., '5.0', '0.0', '6.0'],
       ...,
       ['119.0', '9.916666667000001', '337.0', ..., '5.0', '4.0', '9.0'],
       ['112.0', '9.333333332999999', '511.0', ..., '9.0', '5.0', '15.0'],
       ['108.0', '9.0', '495.0', ..., '7.0', '4.0', '12.0']], dtype='<U22')

We now convert category to onehot.

In [8]:
def convert_binary_and_category(x,variable_type):
    """
    convert binary to +-1, category to one hot; remain continuous.
    """
    
    onehot_encoder = OneHotEncoder(sparse=False,categories='auto')

    # create 2 initial columns
    x_new = np.zeros((x.shape[0],2))

    for i,i_type in enumerate(variable_type):
        if i_type == 1: # continuous
            x_new = np.hstack((x_new,x[:,i][:,np.newaxis]))
        elif i_type == 2: # binary
            unique_value = np.unique(x[:,i])
            x1 = np.array([-1. if value == unique_value[0] else 1. for value in x[:,i]])        
            x_new = np.hstack((x_new,x1[:,np.newaxis]))
        else: # category      
            x1 = onehot_encoder.fit_transform(x[:,i].reshape(-1,1))
            x_new = np.hstack((x_new,x1))        

    # drop the 2 initial column
    x_new = x_new[:,2:]
    
    return x_new.astype(float)

In [9]:
# convert X
X_new = convert_binary_and_category(X,variable_type)

print(X_new.shape)
print(X_new)

(1163, 63)
[[165.          13.75       287.         ...   0.           1.
   12.        ]
 [172.          14.33333333 368.         ...   0.           0.
    9.        ]
 [160.          13.33333333 266.         ...   0.           0.
    6.        ]
 ...
 [119.           9.91666667 337.         ...   0.           4.
    9.        ]
 [112.           9.33333333 511.         ...   0.           5.
   15.        ]
 [108.           9.         495.         ...   0.           4.
   12.        ]]


### Target

In [10]:
## target
y = Xy[:,-1].astype(float)

print(np.unique(y,return_counts=True))

(array([0., 1.]), array([896, 267]))


In [11]:
y_new = y


print(np.unique(y_new,return_counts=True))

(array([0., 1.]), array([896, 267]))


In [12]:
# combine X and y
Xy_new = np.hstack((X_new,y_new[:,np.newaxis]))

In [13]:
np.savetxt('language_processed.dat',Xy_new,fmt='%f')