## Embedded encoding
### Edgar Acuna
### September 2021

In [6]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [7]:
#Considering a dataset with 10 instances two numerical predictors  and one categorical atribute with two levels
num_data = np.random.random(size=(10,2))
#One categorical variables with 3 levels
cat_data = np.array(["red","blue","green","red","red","blue","blue","green","red","red"])

In [8]:
num_data

array([[0.19019922, 0.93877049],
       [0.56805413, 0.17587088],
       [0.06471525, 0.23607366],
       [0.91845431, 0.90567313],
       [0.62306059, 0.17568216],
       [0.40461198, 0.46084551],
       [0.06367899, 0.19493746],
       [0.23756553, 0.79478187],
       [0.96845424, 0.36368831],
       [0.52578705, 0.75047808]])

In [9]:
#Replacing the values of the categorical features by numerical values using skelearn's LabelEncoder 
#The coding is in alphabetic order
from sklearn.preprocessing  import LabelEncoder 
lb_make = LabelEncoder()
cat_data= lb_make.fit_transform(cat_data)
cat_data

array([2, 0, 1, 2, 2, 0, 0, 1, 2, 2], dtype=int64)

In [10]:
#Original Dataset
all=np.insert(num_data,num_data.shape[1],cat_data,1)
all

array([[0.19019922, 0.93877049, 2.        ],
       [0.56805413, 0.17587088, 0.        ],
       [0.06471525, 0.23607366, 1.        ],
       [0.91845431, 0.90567313, 2.        ],
       [0.62306059, 0.17568216, 2.        ],
       [0.40461198, 0.46084551, 0.        ],
       [0.06367899, 0.19493746, 0.        ],
       [0.23756553, 0.79478187, 1.        ],
       [0.96845424, 0.36368831, 2.        ],
       [0.52578705, 0.75047808, 2.        ]])

In [11]:
#Let's create one-hot encoded matrix for the categorical feature
#Also it can be done with get_dummies from Pandas, Labelbinarizer and One-hot Enonder from scikit-learn
one_hot_encoded_cat_data = np.eye(cat_data.max()+1)[cat_data]

In [12]:
one_hot_encoded_cat_data

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.]])

## Embedded Encoding

In [14]:
#Let us consider a simulated target attribute
target =[0,1,1,1,1,0,1,1,1,0]
target=np.array(target)

In [15]:
#This is the suggested  embedding size 
no_of_unique_cat  = len(np.unique(cat_data))
#embedding size = min(50, number of categories/2).
embedding_size = min(np.ceil((no_of_unique_cat)/2), 50 )
embedding_size = int(embedding_size)
embedding_size

2

In [16]:
# Use Input layers, specify input shape (dimensions except first)
inp_cat_data = keras.layers.Input(shape=(no_of_unique_cat,))
inp_num_data = keras.layers.Input(shape=(num_data.shape[1],))
# Bind nulti_hot to embedding layer
emb = keras.layers.Embedding(input_dim=no_of_unique_cat, output_dim=embedding_size)(inp_cat_data)  
# Also you need flatten embedded output-
# otherwise it's not possible to concatenate it with inp_num_data
flatten = keras.layers.Flatten()(emb)
# Concatenate two layers
conc = keras.layers.Concatenate()([flatten, inp_num_data])
dense1 = keras.layers.Dense(3, activation=tf.nn.relu, )(conc)
# Creating output layer
out = keras.layers.Dense(1, activation=None)(dense1)
model = keras.Model(inputs=[inp_cat_data, inp_num_data], outputs=out)

In [17]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 3)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 3, 2)         6           input_1[0][0]                    
__________________________________________________________________________________________________
flatten (Flatten)               (None, 6)            0           embedding[0][0]                  
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 2)]          0                                            
_______________________________________________________________________________________

In [18]:
model.compile(optimizer=tf.compat.v1.train.AdamOptimizer(0.01),
              loss=keras.losses.mean_squared_error,
              metrics=[keras.metrics.mean_squared_error])

In [19]:
#Finding out the weights for each value of the categorical variable
model.fit([one_hot_encoded_cat_data, num_data], target)
model.layers[1].get_weights()[0]



array([[-0.00129469,  0.05073811],
       [ 0.02688374,  0.03952122],
       [-0.00515642,  0.03700156]], dtype=float32)

In [20]:
#Obtaining the transformed representation of the categorial feature
idx0=np.argwhere(cat_data==0)
idx1=np.argwhere(cat_data==1)
idx2=np.argwhere(cat_data==2)
newcode=np.zeros((10,2))
newcode[idx0,]=[-0.022, 0.047]
newcode[idx1,]=[-0.023, 0.038]
newcode[idx2,]=[-0.022,  -0.012]
newcode

array([[-0.022, -0.012],
       [-0.022,  0.047],
       [-0.023,  0.038],
       [-0.022, -0.012],
       [-0.022, -0.012],
       [-0.022,  0.047],
       [-0.022,  0.047],
       [-0.023,  0.038],
       [-0.022, -0.012],
       [-0.022, -0.012]])

In [21]:
#Trnasformed dataset
all1=np.insert(num_data,num_data.shape[1],np.transpose(newcode),1)
all1

array([[ 0.19019922,  0.93877049, -0.022     , -0.012     ],
       [ 0.56805413,  0.17587088, -0.022     ,  0.047     ],
       [ 0.06471525,  0.23607366, -0.023     ,  0.038     ],
       [ 0.91845431,  0.90567313, -0.022     , -0.012     ],
       [ 0.62306059,  0.17568216, -0.022     , -0.012     ],
       [ 0.40461198,  0.46084551, -0.022     ,  0.047     ],
       [ 0.06367899,  0.19493746, -0.022     ,  0.047     ],
       [ 0.23756553,  0.79478187, -0.023     ,  0.038     ],
       [ 0.96845424,  0.36368831, -0.022     , -0.012     ],
       [ 0.52578705,  0.75047808, -0.022     , -0.012     ]])

### Embedded encoding and Decisio Tree Classifier applied to the loan dataset

In [22]:
import pandas as pd
df = pd.read_csv("https://academic.uprm.edu/eacuna/datosarbol.csv")
print(df)

      Sexo  Familia CasPropia  AnosEmpleo  Sueldo StatustMarital Prestamo
0   Hombre        3        No          17    2500        Soltero       No
1    Mujer        5        Si          10    3000         Casado       Si
2    Mujer        4        No          15    2000          Viudo       No
3   Hombre        3        Si          16    2800        Soltero       Si
4   Hombre        6        Si          11    4000          Viudo       Si
5    Mujer        4        Si          26    3200        Soltero       Si
6    Mujer        2        Si          14    1800        Soltero       No
7   Hombre        5        Si          10    3750         Casado       Si
8   Hombre        6        No          18    2970     Divorciado       No
9   Hombre        4        Si          12    3350     Divorciado       No
10  Hombre        1        No          23    1950        Soltero       No
11   Mujer        2        Si          25    2740        Soltero       Si
12   Mujer        3        No         

In [23]:
y=df['Prestamo']
data=df.iloc[:,0:6]
#Codificando 0-1 las Variables Sexo y CasPropia
lb_make = LabelEncoder()
data["Sexo"] = lb_make.fit_transform(data["Sexo"])
data['CasPropia']=lb_make.fit_transform(data["CasPropia"])
data1=data['StatustMarital']
#Aplicando OneHot encoding a la predictora StatustMarital
data2=pd.get_dummies(data1,prefix=['StatustMarital'])
data3=pd.concat([data.iloc[:,0:5],data2],axis=1)
y2=lb_make.fit_transform(y)
num_data1=data.iloc[:,0:5]
num_data1

Unnamed: 0,Sexo,Familia,CasPropia,AnosEmpleo,Sueldo
0,0,3,0,17,2500
1,1,5,1,10,3000
2,1,4,0,15,2000
3,0,3,1,16,2800
4,0,6,1,11,4000
5,1,4,1,26,3200
6,1,2,1,14,1800
7,0,5,1,10,3750
8,0,6,0,18,2970
9,0,4,1,12,3350


In [24]:
cat_data1= lb_make.fit_transform(data1)
cat_data1

array([2, 0, 3, 2, 3, 2, 2, 0, 1, 1, 2, 2, 2, 1, 0, 2, 2, 3, 1, 3, 1, 2,
       1, 0, 0])

In [25]:
one_hot_encoded_cat_data1 = np.eye(cat_data1.max()+1)[cat_data1]

In [26]:
one_hot_encoded_cat_data1

array([[0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.]])

In [27]:
#This is the suggested  embedding size 
no_of_unique_cat1  = len(np.unique(cat_data1))
#embedding size = min(50, number of categories/2).
embedding_size1= min(np.ceil((no_of_unique_cat1)/2), 50 )
embedding_size1 = int(embedding_size1)
embedding_size1

2

In [28]:
# Use Input layers, specify input shape (dimensions except first)
inp_cat_data1 = keras.layers.Input(shape=(no_of_unique_cat1,))
inp_num_data1 = keras.layers.Input(shape=(num_data1.shape[1],))
# Bind nulti_hot to embedding layer
emb = keras.layers.Embedding(input_dim=no_of_unique_cat1, output_dim=embedding_size1)(inp_cat_data1)  
# Also you need flatten embedded output-
# otherwise it's not possible to concatenate it with inp_num_data
flatten = keras.layers.Flatten()(emb)
# Concatenate two layers
conc = keras.layers.Concatenate()([flatten, inp_num_data1])
dense1 = keras.layers.Dense(3, activation=tf.nn.relu, )(conc)
# Creating output layer
out = keras.layers.Dense(1, activation=None)(dense1)
model = keras.Model(inputs=[inp_cat_data1, inp_num_data1], outputs=out)
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 4)]          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 4, 2)         8           input_3[0][0]                    
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 8)            0           embedding_1[0][0]                
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 5)]          0                                            
_______________________________________________________________________________________

In [29]:
model.compile(optimizer=tf.compat.v1.train.AdamOptimizer(0.01),
              loss=keras.losses.mean_squared_error,
              metrics=[keras.metrics.mean_squared_error])

In [30]:
#Finding out the weights for each value of the categorical variable
model.fit([one_hot_encoded_cat_data1, num_data1], y2)
model.layers[1].get_weights()[0]



array([[-0.03236977, -0.05855633],
       [-0.02510054,  0.03333823],
       [ 0.03515378, -0.04568821],
       [-0.016428  ,  0.0346511 ]], dtype=float32)

In [31]:
#Obtaining the transformed representation of the categorial feature
idx0=np.argwhere(cat_data1==0)
idx1=np.argwhere(cat_data1==1)
idx2=np.argwhere(cat_data1==2)
idx3=np.argwhere(cat_data1==3)
newcode1=np.zeros((25,2))
newcode1[idx0,]=[0.0478, -0.0047]
newcode1[idx1,]=[-0.0084, 0.0283]
newcode1[idx2,]=[-0.0497,  -0.011]
newcode1[idx3,]=[-.0450,   0.0063]
newcode1

array([[-0.0497, -0.011 ],
       [ 0.0478, -0.0047],
       [-0.045 ,  0.0063],
       [-0.0497, -0.011 ],
       [-0.045 ,  0.0063],
       [-0.0497, -0.011 ],
       [-0.0497, -0.011 ],
       [ 0.0478, -0.0047],
       [-0.0084,  0.0283],
       [-0.0084,  0.0283],
       [-0.0497, -0.011 ],
       [-0.0497, -0.011 ],
       [-0.0497, -0.011 ],
       [-0.0084,  0.0283],
       [ 0.0478, -0.0047],
       [-0.0497, -0.011 ],
       [-0.0497, -0.011 ],
       [-0.045 ,  0.0063],
       [-0.0084,  0.0283],
       [-0.045 ,  0.0063],
       [-0.0084,  0.0283],
       [-0.0497, -0.011 ],
       [-0.0084,  0.0283],
       [ 0.0478, -0.0047],
       [ 0.0478, -0.0047]])

In [32]:
#Transformed dataset
df = pd.DataFrame(newcode1, columns = ['w1','w2'])
data3=pd.concat([num_data1,df],axis=1)
#num_data1.shape
#all1=np.insert(num_data1,num_data1.shape[1],np.transpose(newcode1),1)
data3

Unnamed: 0,Sexo,Familia,CasPropia,AnosEmpleo,Sueldo,w1,w2
0,0,3,0,17,2500,-0.0497,-0.011
1,1,5,1,10,3000,0.0478,-0.0047
2,1,4,0,15,2000,-0.045,0.0063
3,0,3,1,16,2800,-0.0497,-0.011
4,0,6,1,11,4000,-0.045,0.0063
5,1,4,1,26,3200,-0.0497,-0.011
6,1,2,1,14,1800,-0.0497,-0.011
7,0,5,1,10,3750,0.0478,-0.0047
8,0,6,0,18,2970,-0.0084,0.0283
9,0,4,1,12,3350,-0.0084,0.0283
