## Objective


In many Kaggle competitions and machine learning examples, data with categorical variables is usually handled with decision-tree-based models. However, deep learning can handle categorical variables just as effectively as decision trees.

![](https://imgur.com/RTV6hAo.png)

### Load Packages

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import get_file

# Print NumPy floats with 3 digits of precision so array outputs stay compact.
np.set_printoptions(precision=3)

### Load Data

Reference: [UCI Machine Learning Repository - Bank Marketing Data Set](https://archive.ics.uci.edu/ml/datasets/Bank%2BMarketing)

In [5]:
# Location of the bank-marketing CSV (hosted copy of "bank-full.csv").
URL = "https://docs.google.com/uc?id=16Z2Jyg9BPB8kLeuGDRNLpXZdF77W32_p"

# get_file downloads the file into the local Keras cache (re-using a cached
# copy when one exists) and returns the path of the local file.
fpath = get_file(fname="bank-full.csv", origin=URL)

# The file uses ';' as its field separator rather than ','.
df = pd.read_csv(
    fpath,
    sep=";",
)

## Data EDA

### Profile Data

* **Categorical inputs**
> job, marital, education, default, housing, loan, contact, month, poutcome
* **Numeric inputs**
> age, balance, day, duration, campaign, pdays, previous

In [6]:
# Print the (rows, columns) shape of the raw frame; the Korean label in the
# message means "size of the data". Then preview the first rows.
print(f"데이터의 크기 : {df.shape}")
df.head()

데이터의 크기 : (45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [8]:
# Categorical input columns. The order of this list matters: later cells
# iterate over it to build per-column encoders and embeddings.
cat_col_names = [
    "marital", "job", "contact",
    "education", "month", "poutcome",
    "housing", "loan", "default",
]

# Show the sorted set of distinct values observed in each categorical column.
for name in cat_col_names:
    distinct_values = np.unique(df[name])
    print(f"{name}: {distinct_values}\n")

marital: ['divorced' 'married' 'single']

job: ['admin.' 'blue-collar' 'entrepreneur' 'housemaid' 'management' 'retired'
 'self-employed' 'services' 'student' 'technician' 'unemployed' 'unknown']

contact: ['cellular' 'telephone' 'unknown']

education: ['primary' 'secondary' 'tertiary' 'unknown']

month: ['apr' 'aug' 'dec' 'feb' 'jan' 'jul' 'jun' 'mar' 'may' 'nov' 'oct' 'sep']

poutcome: ['failure' 'other' 'success' 'unknown']

housing: ['no' 'yes']

loan: ['no' 'yes']

default: ['no' 'yes']



In [10]:
# Numeric input columns, in the order they are later fed to the scaler.
num_col_names = [
    "age", "balance", "day", "duration",
    "campaign", "pdays", "previous",
]

# Show the observed (min,max) value range of each numeric column.
for name in num_col_names:
    column = df[name]
    lowest, highest = column.min(), column.max()
    print(f"{name}: ({lowest},{highest})\n")

age: (18,95)

balance: (-8019,102127)

day: (1,31)

duration: (0,4918)

campaign: (1,63)

pdays: (-1,871)

previous: (0,275)



### Transform(preprocess) Data

#### - Transform categorical data

In [12]:
from sklearn.preprocessing import LabelEncoder

# One fitted LabelEncoder per categorical column, kept in the same order as
# cat_col_names so that encoder i corresponds to column i of category_xs.
category_encoders = [LabelEncoder().fit(df[name]) for name in cat_col_names]

# Integer-encode every column with its own encoder, then place the encoded
# columns side by side: the result has one row per sample and one column per
# categorical variable.
category_xs = np.stack(
    [enc.transform(df[name])
     for enc, name in zip(category_encoders, cat_col_names)],
    axis=1,
)
category_xs

array([[1, 4, 2, ..., 1, 0, 0],
       [2, 9, 2, ..., 1, 0, 0],
       [1, 2, 2, ..., 1, 1, 0],
       ...,
       [1, 5, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 2, 0, ..., 0, 0, 0]])

#### - Transform numeric data

In [13]:
from sklearn.preprocessing import StandardScaler

# Standardize each numeric column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed further down, so test-set statistics influence the scaling.
# Consider fitting on the training rows only.
numeric_frame = df[num_col_names]
numeric_encoder = StandardScaler().fit(numeric_frame)
numeric_xs = numeric_encoder.transform(numeric_frame)
numeric_xs

array([[ 1.607,  0.256, -1.298, ..., -0.569, -0.411, -0.252],
       [ 0.289, -0.438, -1.298, ..., -0.569, -0.411, -0.252],
       [-0.747, -0.447, -1.298, ..., -0.569, -0.411, -0.252],
       ...,
       [ 2.925,  1.43 ,  0.143, ...,  0.722,  1.436,  1.05 ],
       [ 1.513, -0.228,  0.143, ...,  0.399, -0.411, -0.252],
       [-0.371,  0.528,  0.143, ..., -0.247,  1.476,  4.524]])

####  - Transform Label($y$)

In [14]:
# Convert the 'yes'/'no' target column into a boolean NumPy array
# (True for 'yes', False for 'no').
ys = df["y"].map({
    'yes': True,
    'no': False,
}).to_numpy()
ys

array([False, False, False, ...,  True, False, False])

### Split Train & Test Data

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as a test set. Stratifying on the label keeps the
# yes/no ratio the same in both partitions. A fixed random_state makes the
# partition identical on every Restart & Run All; without it the split (and
# therefore every reported score) changes from run to run.
splitted = train_test_split(category_xs, numeric_xs, ys,
                            test_size=0.1, stratify=ys,
                            random_state=42)

# train_test_split returns [a_train, a_test, b_train, b_test, ...] for the
# arrays passed in, so even positions hold the training parts and odd
# positions hold the matching test parts.
train_category_xs, train_numeric_xs, train_ys = splitted[::2]
test_category_xs, test_numeric_xs, test_ys = splitted[1::2]

##  DNN handling categorical variables
---

### Build Model

In [20]:
from tensorflow.keras.layers import Input, Embedding
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from tensorflow.keras.models import Model

embed_size = 4  # Embedding size of each categorical variable

# Input widths are derived from the column lists (instead of the literals 9
# and 7) so the model stays in sync with category_xs / numeric_xs if a feature
# column is added or removed.
category_inputs = Input((len(cat_col_names),), dtype=tf.int32)
numeric_inputs = Input((len(num_col_names),), dtype=tf.float32)

category_embeds = []
for idx, col_name in enumerate(cat_col_names):
    # Number of distinct values the encoder of this column has seen
    category_size = len(category_encoders[idx].classes_)

    # Apply a dedicated Embedding layer to the integer codes of column idx
    embed_value = (
        Embedding(category_size, embed_size,
                  name=col_name+'_embed')(category_inputs[:,idx]))

    category_embeds.append(embed_value)

# Concatenate the categorical embeddings with the numeric variables
inputs_list = category_embeds + [numeric_inputs]
concats = Concatenate(name='embed_concat')(inputs_list)

# Two hidden ReLU layers (each followed by dropout) and a single sigmoid
# output unit; every Dense kernel carries a small L2 penalty.
hidden = Dense(50,activation='relu', name='hidden1',
               kernel_regularizer=l2(1e-5))(concats)
hidden = Dropout(0.3, name='dropout1')(hidden)
hidden = Dense(50,activation='relu', name='hidden2',
               kernel_regularizer=l2(1e-5))(hidden)
hidden = Dropout(0.3, name='dropout2')(hidden)
output = Dense(1, activation='sigmoid', name='output',
               kernel_regularizer=l2(1e-5))(hidden)

# The model takes the two inputs in this order: [categorical codes, numerics]
model = Model([category_inputs, numeric_inputs], output)

#### Compile Model

In [21]:
from tensorflow.keras.optimizers import Adagrad
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.metrics import BinaryAccuracy

# Binary cross-entropy pairs with the single sigmoid output unit of the model;
# Adagrad is used with a learning rate of 1e-2 and binary accuracy is tracked.
model.compile(
    loss=BinaryCrossentropy(),
    optimizer=Adagrad(learning_rate=1e-2),
    metrics=[BinaryAccuracy()],
)

#### Train Model

In [22]:
# Inputs must be passed in the same order the Model was built with:
# categorical codes first, scaled numerics second.
train_xs = [train_category_xs, train_numeric_xs]

# 100 epochs with mini-batches of 64; 10% of the training rows are set aside
# for validation. verbose=0 and the trailing ';' keep the cell output empty.
model.fit(
    train_xs,
    train_ys,
    epochs=100,
    batch_size=64,
    validation_split=0.1,
    verbose=0,
);

#### Evaluate Model


In [25]:
# Score the trained network on the held-out test rows. evaluate() yields the
# loss followed by the single metric configured at compile time (accuracy).
test_inputs = [test_category_xs, test_numeric_xs]
loss, acc = model.evaluate(test_inputs, test_ys, verbose=0)
print(f"Test Accuracy : {acc:.3%}")

Test Accuracy : 90.137%


### cf. Comparison with a Random Forest Classifier


In [23]:
from sklearn.ensemble import RandomForestClassifier

# Build model
# A fixed random_state makes the forest (bootstrap samples and feature
# sub-sampling) reproducible, so the baseline score does not drift between runs.
rf_clf = RandomForestClassifier(random_state=42)

# Train model
# The forest takes one flat feature matrix: the integer-encoded categorical
# columns followed by the scaled numeric columns.
# NOTE: this rebinds train_xs, which the DNN training cell used for a list of
# two arrays; re-run that cell before re-fitting the DNN.
train_xs = np.concatenate([train_category_xs,train_numeric_xs],axis=1)
rf_clf.fit(train_xs, train_ys)

# Evaluate Model
# score() reports mean accuracy on the held-out test rows; as the last
# expression of the cell it is displayed as the cell output.
test_xs = np.concatenate([test_category_xs,test_numeric_xs],axis=1)
rf_clf.score(test_xs, test_ys)

0.901813356921716