### Embeded encoding
### Edgar Acuna
### April 2021

In [44]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [45]:
#Considering a dataset with 10 instances two numerical predictors  and one categorical atribute with two levels
num_data = np.random.random(size=(10,2))
#One categorical variables with 3 levels
cat_data = np.array(["red","blue","green","red","red","blue","blue","green","red","red"])

In [46]:
num_data

array([[0.33411343, 0.80780972],
       [0.02482294, 0.45455455],
       [0.15375505, 0.77538588],
       [0.04623018, 0.80065508],
       [0.11466097, 0.73561668],
       [0.09044322, 0.68652931],
       [0.30915452, 0.54922504],
       [0.61327427, 0.60722644],
       [0.80411887, 0.85150846],
       [0.12623851, 0.25971377]])

In [47]:
#Replacing the values of the categorical features by numerical values using skelearn's LabelEncoder 
#The coding is in alphabetic order
from sklearn.preprocessing  import LabelEncoder 
lb_make = LabelEncoder()
cat_data= lb_make.fit_transform(cat_data)
cat_data

array([2, 0, 1, 2, 2, 0, 0, 1, 2, 2], dtype=int64)

In [48]:
#Original Dataset
all=np.insert(num_data,num_data.shape[1],cat_data,1)
all

array([[0.33411343, 0.80780972, 2.        ],
       [0.02482294, 0.45455455, 0.        ],
       [0.15375505, 0.77538588, 1.        ],
       [0.04623018, 0.80065508, 2.        ],
       [0.11466097, 0.73561668, 2.        ],
       [0.09044322, 0.68652931, 0.        ],
       [0.30915452, 0.54922504, 0.        ],
       [0.61327427, 0.60722644, 1.        ],
       [0.80411887, 0.85150846, 2.        ],
       [0.12623851, 0.25971377, 2.        ]])

In [49]:
#Let's create one-hot encoded matrix for the categorical feature
#Also it can be done with get_dummies from Pandas, Labelbinarizer and One-hot Enonder from scikit-learn
one_hot_encoded_cat_data = np.eye(cat_data.max()+1)[cat_data]

In [51]:
one_hot_encoded_cat_data 

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [52]:
#Let us consider a target attribute
target =[0,1,1,1,1,0,1,1,1,0]
target=np.array(target)

In [53]:
#This is the suggested  embedding size 
no_of_unique_cat  = len(np.unique(cat_data))
#embedding size = min(50, number of categories/2).
embedding_size = min(np.ceil((no_of_unique_cat)/2), 50 )
embedding_size = int(embedding_size)
embedding_size

2

In [54]:
# Use Input layers, specify input shape (dimensions except first)
inp_cat_data = keras.layers.Input(shape=(no_of_unique_cat,))
inp_num_data = keras.layers.Input(shape=(num_data.shape[1],))
# Bind nulti_hot to embedding layer
emb = keras.layers.Embedding(input_dim=no_of_unique_cat, output_dim=embedding_size)(inp_cat_data)  
# Also you need flatten embedded output-
# otherwise it's not possible to concatenate it with inp_num_data
flatten = keras.layers.Flatten()(emb)
# Concatenate two layers
conc = keras.layers.Concatenate()([flatten, inp_num_data])
dense1 = keras.layers.Dense(3, activation=tf.nn.relu, )(conc)
# Creating output layer
out = keras.layers.Dense(1, activation=None)(dense1)
model = keras.Model(inputs=[inp_cat_data, inp_num_data], outputs=out)

In [55]:
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 3, 2)         6           input_5[0][0]                    
__________________________________________________________________________________________________
flatten_2 (Flatten)             (None, 6)            0           embedding_2[0][0]                
__________________________________________________________________________________________________
input_6 (InputLayer)            [(None, 2)]          0                                            
____________________________________________________________________________________________

In [56]:
model.compile(optimizer=tf.compat.v1.train.AdamOptimizer(0.01),
              loss=keras.losses.mean_squared_error,
              metrics=[keras.metrics.mean_squared_error])

In [57]:
#Finding out the weights for each value of the categorical variable
model.fit([one_hot_encoded_cat_data, num_data], target)
model.layers[1].get_weights()[0]



array([[-0.04003588, -0.0086223 ],
       [-0.04412245,  0.03312165],
       [ 0.00952185, -0.04416777]], dtype=float32)

In [42]:
#Obtaining the transformed representation of the categorial feature
idx0=np.argwhere(cat_data==0)
idx1=np.argwhere(cat_data==1)
idx2=np.argwhere(cat_data==2)
newcode=np.zeros((10,2))
newcode[idx0,]=[-0.022, 0.047]
newcode[idx1,]=[-0.023, 0.038]
newcode[idx2,]=[-0.022,  -0.012]
newcode

array([[-0.022, -0.012],
       [-0.022,  0.047],
       [-0.023,  0.038],
       [-0.022, -0.012],
       [-0.022, -0.012],
       [-0.022,  0.047],
       [-0.022,  0.047],
       [-0.023,  0.038],
       [-0.022, -0.012],
       [-0.022, -0.012]])

In [43]:
#Trnasformed dataset
all1=np.insert(num_data,num_data.shape[1],np.transpose(newcode),1)
all1

array([[ 0.21572427,  0.94465752, -0.022     , -0.012     ],
       [ 0.13901493,  0.70445493, -0.022     ,  0.047     ],
       [ 0.99609492,  0.33012453, -0.023     ,  0.038     ],
       [ 0.17405204,  0.45957256, -0.022     , -0.012     ],
       [ 0.53931295,  0.46135914, -0.022     , -0.012     ],
       [ 0.13809674,  0.44709299, -0.022     ,  0.047     ],
       [ 0.27858511,  0.46395601, -0.022     ,  0.047     ],
       [ 0.17667699,  0.103717  , -0.023     ,  0.038     ],
       [ 0.04629038,  0.64155933, -0.022     , -0.012     ],
       [ 0.06945534,  0.10670777, -0.022     , -0.012     ]])