### Encoding of Categorical features using Neural Networks embeddings
#### Code adapted from Shivanand Roy's github https://github.com/Shivanandroy

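This approach only works for supervised (labeled) datasets, because the embeddings are learned by training a neural network to predict the target variable.
Datasets: HR_Attrition, heartc1 (heart disease)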


In [232]:
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Dropout, Embedding, Activation, Input, concatenate, Reshape, Flatten
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Adam
from keras_tqdm import TQDMNotebookCallback
from keras import backend as K
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_is_fitted
from sklearn.utils import column_or_1d
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import warnings
from tqdm import tqdm_notebook
warnings.filterwarnings("ignore")

In [251]:
# Reading data
# A raw string keeps the Windows path backslashes literal; the previous
# literal relied on "\H" being an invalid (and deprecated) escape sequence.
df = pd.read_csv(r"c:\esma4016\HR_Attrition_Data.csv")
df.shape

(54808, 14)

In [252]:
# Preview the first rows of the HR attrition data
df.head()

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [281]:
# Define the predictors (X) and the class labels (y):
# the employee identifier and the target column are removed from the predictors
X = df.drop(['employee_id', 'is_promoted'], axis=1)
y = df['is_promoted']
X.shape
print(type(y))

<class 'pandas.core.series.Series'>


In [254]:
def get_embedding_info(data, categorical_variables=None):
    '''
    this function identifies the categorical variables and chooses an embedding size for each of them
    :data: input data [dataframe]
    :categorical_variables: list of categorical_variables [default: None]
    if None, it automatically takes the variables with data type 'object'
    :return: dictionary {column: (number of unique values, embedding size)}
    the embedding size of a categorical variable is half of the number of its
    unique values (rounded up), capped at 50,
    i.e. embedding size of a column = min(50, (# unique values of that column + 1) // 2)
    '''
    if categorical_variables is None:
        categorical_variables = data.select_dtypes(include='object').columns

    embedding_info = {}
    for col in categorical_variables:
        # Count the distinct values once and reuse the count for both entries.
        n_unique = data[col].nunique()
        embedding_info[col] = (n_unique, min(50, (n_unique + 1) // 2))

    return embedding_info


In [255]:
# get_embedding_info identifies the categorical variables and returns a dictionary
# that maps each one to (number of unique values, embedding size), where the
# embedding size is half of the number of unique values (rounded up, at most 50)
embedding_info = get_embedding_info(X)
embedding_info

{'department': (9, 5),
 'region': (34, 17),
 'education': (3, 2),
 'gender': (2, 1),
 'recruitment_channel': (3, 2)}

In [256]:
# Helper functions:

class __LabelEncoder__(LabelEncoder):
    '''
    LabelEncoder that tolerates values not seen during fitting.

    Values that are not among the fitted classes are encoded with one extra
    integer code equal to the number of fitted classes. The first time such a
    value is met, the label 'unseen' is appended to ``classes_`` so that the
    extra code has a name.

    The classes learned during fitting are remembered separately from
    ``classes_``. Lookups therefore always use the original sorted classes,
    and calling ``transform`` repeatedly keeps returning the same codes
    instead of shifting the 'unseen' code and appending 'unseen' again.
    '''

    def fit(self, y):
        '''Fit the encoder on y and remember the fitted classes. Returns self.'''
        super().fit(y)
        self._fitted_classes = self.classes_
        return self

    def fit_transform(self, y):
        '''Fit the encoder on y, remember the fitted classes and return the codes of y.'''
        codes = super().fit_transform(y)
        self._fitted_classes = self.classes_
        return codes

    def transform(self, y):
        '''
        Encode y as integer codes.
        :y: 1d array-like of values to encode
        :return: numpy array with the position of each value among the fitted
        classes, or the number of fitted classes for a value that was not fitted
        '''
        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)

        # Fall back to classes_ when the encoder was not fitted through
        # fit/fit_transform (e.g. classes_ was assigned by hand).
        known = getattr(self, '_fitted_classes', self.classes_)
        unseen = len(known)

        e = np.array([
                     np.searchsorted(known, x)
                     if x in known else unseen
                     for x in y
                     ])

        # Name the extra code only once; later calls leave classes_ untouched.
        if unseen in e and len(self.classes_) == len(known):
            self.classes_ = np.array(known.tolist() + ['unseen'])

        return e



In [257]:
def get_label_encoded_data(data, categorical_variables=None):
    '''
    this function label encodes all the categorical variables with one
    __LabelEncoder__ (a subclass of sklearn.preprocessing.LabelEncoder) per variable
    and returns a label encoded dataframe for training
    :data: input data [dataframe]; it is copied, the input is not modified
    :categorical_variables: list of categorical_variables [Default: None]
    if None, it automatically takes the variables with data type 'object'
    :return: (label encoded dataframe, dictionary {variable: fitted encoder})
    '''
    encoders = {}

    df = data.copy()

    if categorical_variables is None:
        categorical_variables = list(df.select_dtypes(include='object').columns)

    for var in categorical_variables:
        encoders[var] = __LabelEncoder__()
        # Replace the whole column instead of writing through .loc[:, var]:
        # an in-place write can keep the original 'object' dtype, while the
        # encoded column should carry the integer dtype of the codes.
        df[var] = encoders[var].fit_transform(df[var])

    return df, encoders

In [258]:
# get_label_encoded_data integer-encodes the categorical variables so that they can be
# fed to the neural network; it also returns the fitted encoder of each variable
X_encoded,encoders = get_label_encoded_data(X)
X_encoded.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score
0,7,31,2,0,2,1,35,5.0,8,1,0,49
1,4,14,0,1,0,1,30,5.0,4,0,0,60
2,7,10,0,1,2,1,34,3.0,7,0,0,50
3,7,15,0,1,0,2,39,1.0,10,0,0,50
4,8,18,0,1,0,1,45,3.0,2,0,0,73


In [260]:
# Main function:

def get_embeddings(X_train, y_train, categorical_embedding_info, epochs=100, batch_size=256):
    '''
    this function trains a shallow neural network on a binary target and
    returns the learned embeddings of the categorical variables
    :X_train: training data [dataframe]; the categorical columns are fed to
    embedding layers, so they must already hold integer codes
    :y_train: binary target variable [series or array]
    :categorical_embedding_info: output of get_embedding_info function
    [dictionary {categorical variable: (number of unique values, embedding size)}]
    :epochs: num of epochs to train [default:100]
    :batch_size: batch size to train [default:256]
    :return: dictionary {categorical variable: weight matrix of its embedding layer}
    It is a 2 layer neural network architecture with 1000 and 500 neurons with 'ReLU' activation
    and a single sigmoid output unit; loss = 'binary_crossentropy'; metrics = 'accuracy'.
    20% of the training data is held out for validation while fitting.
    '''

    # Every column that has no embedding entry is treated as numerical.
    numerical_variables = [x for x in X_train.columns if x not in categorical_embedding_info]

    inputs = []
    flatten_layers = []
    # Keep a handle on each embedding layer so that its weights can be read
    # back by variable name instead of relying on layer order or layer names.
    embedding_layers = {}

    for var, sz in categorical_embedding_info.items():
        input_c = Input(shape=(1,), dtype='int32')
        embedding_layers[var] = Embedding(*sz, input_length=1)
        embed_c = embedding_layers[var](input_c)
        flatten_c = Flatten()(embed_c)
        inputs.append(input_c)
        flatten_layers.append(flatten_c)

    # All numerical columns enter the network through one dense input.
    input_num = Input(shape=(len(numerical_variables),), dtype='float32')
    flatten_layers.append(input_num)
    inputs.append(input_num)

    flatten = concatenate(flatten_layers, axis=-1)

    fc1 = Dense(1000, kernel_initializer='normal')(flatten)
    fc1 = Activation('relu')(fc1)

    fc2 = Dense(500, kernel_initializer='normal')(fc1)
    fc2 = Activation('relu')(fc2)

    output = Dense(1, activation='sigmoid')(fc2)

    nnet = Model(inputs=inputs, outputs=output)

    # The arrays must follow the order of the model inputs:
    # one array per categorical variable, then the numerical block.
    x_inputs = [X_train[col].values for col in categorical_embedding_info]
    x_inputs.append(X_train[numerical_variables].values)

    nnet.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    nnet.fit(x_inputs, np.asarray(y_train), batch_size=batch_size, epochs=epochs, validation_split=0.2, verbose=0)

    # The first weight array of an embedding layer is its embedding matrix.
    embeddings = {var: layer.get_weights()[0] for var, layer in embedding_layers.items()}
    return embeddings


In [261]:
def get_embeddings_in_dataframe(embeddings, encoders):
    '''
    this function converts the embeddings into pandas dataframes, one per categorical variable
    :embeddings: output of 'get_embeddings' function [dictionary {variable: embedding matrix}]
    :encoders: fitted encoders returned by 'get_label_encoded_data' [dictionary {variable: encoder}]
    :return: dictionary {variable: dataframe}; each dataframe is indexed by the
    encoder classes and has one '<variable>_embedding_<k>' column per embedding dimension
    '''

    assert len(embeddings)==len(encoders), "Categorical variables in embeddings does not match with those of encoders"

    tables = {}
    for name in tqdm_notebook(embeddings.keys()):
        table = pd.DataFrame(embeddings[name])
        # Row i of the embedding matrix belongs to the i-th encoder class.
        table.index = encoders[name].classes_
        table.columns = [name + '_embedding_' + str(position) for position in table.columns]
        tables[name] = table

    return tables


def fit_transform(data, embeddings, encoders, drop_categorical_vars=False):
    '''
    this function includes the trained embeddings into your data
    :data: input data [dataframe]; the embedding tables are indexed by the
    encoder classes, so the categorical columns are matched against those class values
    :embeddings: output of 'get_embeddings' function
    :encoders: fitted encoders returned by 'get_label_encoded_data' [dictionary {variable: encoder}]
    :drop_categorical_vars: False to keep the categorical variables in the data along with the embeddings
    if True - drops the categorical variables and replaces them with trained embeddings
    :return: a new dataframe with one '<variable>_embedding_<k>' column per embedding dimension
    '''

    # get_embeddings_in_dataframe checks that embeddings and encoders match
    # and builds one table per categorical variable, indexed by category.
    embedding_tables = get_embeddings_in_dataframe(embeddings, encoders)

    for cat_var, table in embedding_tables.items():
        # Left merge: every row of data is kept; categories without an
        # embedding row get missing values in the embedding columns.
        data = data.merge(table, how='left', left_on=cat_var, right_index=True)

    if drop_categorical_vars:
        return data.drop(list(embeddings.keys()), axis=1)
    return data

In [262]:
# splitting the data into train and test
# NOTE(review): no random_state is passed, so the split differs between runs
X_train, X_test, y_train, y_test = train_test_split(X_encoded,y)

# get_embeddings trains the NN, extracts the embeddings and returns a dictionary containing them
embeddings = get_embeddings(X_train, y_train, categorical_embedding_info=embedding_info,  epochs=100,batch_size=256)

In [263]:
# if you don't like the dictionary format, convert it to dataframes for easy readability
dfs = get_embeddings_in_dataframe(embeddings=embeddings, encoders=encoders)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [264]:
# Embedding table of the 'region' variable (one row per category)
dfs["region"]

Unnamed: 0,region_embedding_0,region_embedding_1,region_embedding_2,region_embedding_3,region_embedding_4,region_embedding_5,region_embedding_6,region_embedding_7,region_embedding_8,region_embedding_9,region_embedding_10,region_embedding_11,region_embedding_12,region_embedding_13,region_embedding_14,region_embedding_15,region_embedding_16
region_1,0.326519,0.293052,0.121029,-0.179777,0.072135,-0.071767,-0.233651,-0.179902,0.105263,0.170868,0.090231,-0.160602,-0.233412,-0.19867,0.418919,-0.149345,-0.115175
region_10,0.120272,0.151779,0.095948,0.085001,0.210626,-0.250386,-0.158255,-0.131187,0.15945,0.161304,0.224674,-0.169014,-0.053008,-0.147136,0.077855,-0.245066,-0.242285
region_11,0.12038,0.231705,0.217414,-0.293859,0.311623,-0.132549,-0.26326,-0.150221,0.122033,0.17357,0.309538,-0.249481,-0.156371,-0.223606,0.060145,-0.2562,-0.132277
region_12,0.205664,0.21209,0.189947,-0.213719,0.257159,-0.20414,-0.255333,-0.074938,0.016968,0.157903,0.168207,-0.267068,-0.199442,-0.141604,0.206694,-0.169475,-0.113473
region_13,0.157521,0.168113,0.210024,-0.166979,0.179334,-0.215849,-0.154317,-0.190059,0.104052,0.185806,0.165807,-0.240356,-0.213168,-0.160462,0.108603,-0.18558,-0.169779
region_14,0.21522,0.311769,0.208083,-0.04464,0.185808,-0.192552,-0.302009,-0.074651,0.130956,0.146269,0.110694,-0.130813,-0.200802,-0.068413,0.192326,-0.219827,-0.129721
region_15,0.079006,0.149893,0.168628,0.174078,0.100699,-0.231796,-0.0395,-0.216145,0.305998,0.171672,0.068578,-0.079421,-0.182501,-0.192598,0.255556,-0.151426,-0.262622
region_16,0.237774,0.163037,0.22619,-0.071618,0.146131,-0.185012,-0.237939,-0.141864,0.116251,0.118794,0.13635,-0.181505,-0.285432,-0.119487,0.255118,-0.156224,-0.109891
region_17,0.005652,0.062269,0.043288,0.406227,0.133962,-0.314851,-0.037677,-0.191053,0.275578,0.209173,0.232332,-0.143306,-0.112948,-0.153373,0.054404,-0.152121,-0.186045
region_18,0.04307,-0.017537,0.018833,1.66597,-0.086696,-0.199517,0.236457,-0.440262,0.569837,0.075976,-0.030495,0.160429,0.016108,-0.09229,0.654996,-0.064943,-0.361529


In [359]:
# Second example: load the heart data set from the course web site.
# The commented-out lines are alternative data sources kept for reference.
#datos=pd.read_csv("http://academic.uprm.edu/eacuna/heartc.csv",sep=",",na_values=["?"])
datos=pd.read_csv("http://academic.uprm.edu/eacuna/heartc1.csv",sep=",")
#datos = pd.read_csv('c://PW-PR/census.csv', sep=',',na_values=['?'])
datos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        297 non-null    int64  
 1   gender                     297 non-null    object 
 2   chest pain                 297 non-null    object 
 3   rest SBP                   297 non-null    int64  
 4   cholesterol                297 non-null    int64  
 5   fasting blood sugar > 120  297 non-null    int64  
 6   rest ECG                   297 non-null    object 
 7   max HR                     297 non-null    int64  
 8   exerc ind ang              297 non-null    int64  
 9   ST by exercise             297 non-null    float64
 10  slope peak exc ST          297 non-null    object 
 11  major vessels colored      297 non-null    int64  
 12  thal                       297 non-null    object 
 13  diameter narrowing         297 non-null    int64  

In [360]:
# Define the predictors (X) and the class labels (y);
# 'diameter narrowing' is the target column
X = datos.drop(['diameter narrowing'], axis=1)
y = datos['diameter narrowing']
X.head()
print(type(y))

<class 'pandas.core.series.Series'>


In [361]:
# get_embedding_info identifies the categorical variables and returns a dictionary
# that maps each one to (number of unique values, embedding size), where the
# embedding size is half of the number of unique values (rounded up, at most 50)
embedding_info = get_embedding_info(X)
embedding_info

{'gender': (2, 1),
 'chest pain': (4, 2),
 'rest ECG': (3, 2),
 'slope peak exc ST': (3, 2),
 'thal': (3, 2)}

In [362]:
# get_label_encoded_data integer-encodes the categorical variables so that they can be
# fed to the neural network; it also returns the fitted encoder of each variable
X_encoded,encoders = get_label_encoded_data(X)
X_encoded.head()

Unnamed: 0,age,gender,chest pain,rest SBP,cholesterol,fasting blood sugar > 120,rest ECG,max HR,exerc ind ang,ST by exercise,slope peak exc ST,major vessels colored,thal
0,63,1,3,145,233,1,1,150,0,2.3,0,0,0
1,67,1,0,160,286,0,1,108,1,1.5,1,3,1
2,67,1,0,120,229,0,1,129,1,2.6,1,2,2
3,37,1,2,130,250,0,2,187,0,3.5,0,0,1
4,41,0,1,130,204,0,1,172,0,1.4,2,0,1


In [363]:
# splitting the data into train and test
# NOTE(review): no random_state is passed, so the split differs between runs
X_train, X_test, y_train, y_test = train_test_split(X_encoded,y)

In [364]:
# get_embeddings trains the NN, extracts the embeddings and returns a dictionary containing them
# (fewer epochs and a smaller batch size than in the HR example)
embeddings = get_embeddings(X_train, y_train, categorical_embedding_info=embedding_info,  epochs=50,batch_size=16)

In [365]:
# if you don't like the dictionary format, convert it to dataframes for easy readability
dfs = get_embeddings_in_dataframe(embeddings=embeddings, encoders=encoders)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [366]:
# Embedding table of the 'chest pain' variable (one row per category)
dfs['chest pain']

Unnamed: 0,chest pain_embedding_0,chest pain_embedding_1
asymptomatic,0.201247,0.188345
atypical ang,-0.036394,-0.069682
non-anginal,-0.183427,-0.186382
typical ang,-0.186579,-0.265329
