In [60]:
# entity_embeddings.py
import os
import gc
import joblib
import copy
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils

In [61]:
# location of the fold-annotated adult census data
ADULT_FOLDS_CSV = "../input/adult_folds.csv"

# load the full dataset into a DataFrame
df = pd.read_csv(ADULT_FOLDS_CSV)

In [62]:
# display the DataFrame as loaded from the csv
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,kfold
0,59,Private,184493,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,>50K,0
1,62,Private,81116,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Male,0,1974,40,United-States,<=50K,0
2,74,Private,101590,Prof-school,15,Widowed,Adm-clerical,Not-in-family,Black,Female,0,0,20,United-States,<=50K,0
3,36,?,187167,HS-grad,9,Separated,?,Not-in-family,White,Female,0,0,30,United-States,<=50K,0
4,51,Federal-gov,27166,HS-grad,9,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,<=50K,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,250314,9th,5,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,Guatemala,<=50K,4
48838,39,Self-emp-not-inc,327120,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,Portugal,<=50K,4
48839,67,Private,219687,Some-college,10,Widowed,Sales,Not-in-family,White,Male,0,0,18,United-States,<=50K,4
48840,50,Private,330543,Preschool,1,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,Mexico,<=50K,4


In [63]:
# every column is a feature, apart from the id, target and fold columns
non_feature_columns = ("id", "income", "kfold")
features = [
    column for column in df.columns if column not in non_feature_columns
]

In [64]:
# display the selected feature column names
features

['age',
 'workclass',
 'fnlwgt',
 'education',
 'educational-num',
 'marital-status',
 'occupation',
 'relationship',
 'race',
 'gender',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'native-country']

In [65]:
# map targets to 0s and 1s
target_mapping = {"<=50K": 0,
                  ">50K": 1}

# assign with df["income"] (not df.loc[:, "income"]) so the column is
# replaced and takes the numeric dtype of the mapped values; on
# pandas >= 2.0 the .loc form writes into the existing object-dtype column
# and the 0/1 targets stay as Python objects
df["income"] = df["income"].map(target_mapping)

In [66]:
# fill all NaN values with NONE
# note that I am converting all columns to "strings"
# it doesnt matter because all are categories
for col in features:
    col_values = df[col]
    # astype(str) turns NaN into the literal string "nan", so a fillna()
    # applied after the cast never finds anything to fill; instead use the
    # missingness of the original column to place "NONE"
    as_text = col_values.astype(str).mask(col_values.isna(), "NONE")
    # plain column assignment replaces the column (and its dtype);
    # df.loc[:, col] = ... tries to write into the existing column in place
    # on pandas >= 2.0
    df[col] = as_text

In [67]:
# display df after the target mapping and the string conversion of features
df

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,kfold
0,59,Private,184493,Some-college,10,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,1,0
1,62,Private,81116,Some-college,10,Divorced,Adm-clerical,Not-in-family,White,Male,0,1974,40,United-States,0,0
2,74,Private,101590,Prof-school,15,Widowed,Adm-clerical,Not-in-family,Black,Female,0,0,20,United-States,0,0
3,36,?,187167,HS-grad,9,Separated,?,Not-in-family,White,Female,0,0,30,United-States,0,0
4,51,Federal-gov,27166,HS-grad,9,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,Private,250314,9th,5,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,Guatemala,0,4
48838,39,Self-emp-not-inc,327120,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,Portugal,0,4
48839,67,Private,219687,Some-college,10,Widowed,Sales,Not-in-family,White,Male,0,0,18,United-States,0,4
48840,50,Private,330543,Preschool,1,Married-civ-spouse,Other-service,Husband,White,Male,0,0,40,Mexico,0,4


In [68]:
# encode all features with label encoder individually
# in a live setting you need to save all label encoders
for feat in features:
    lbl_enc = preprocessing.LabelEncoder()
    # replace the column outright so it takes the integer dtype of the
    # encoded labels; df.loc[:, feat] = ... writes into the existing
    # object-dtype column on pandas >= 2.0, which later yields an object
    # array from df[features].values
    df[feat] = lbl_enc.fit_transform(df[feat].values)

In [69]:
# fold 0 is held out for validation, every other fold is used for training
in_validation_fold = df.kfold == 0

# training rows: everything outside the validation fold
df_train = df[~in_validation_fold].reset_index(drop=True)

# validation rows: the held-out fold
df_valid = df[in_validation_fold].reset_index(drop=True)

In [70]:
# display the training split (rows whose kfold is not 0)
df_train

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,kfold
0,17,2,15013,7,3,2,10,0,4,1,0,0,34,39,1,1
1,37,2,3895,11,15,2,3,0,4,1,0,0,34,39,1,1
2,35,1,13921,9,4,2,11,0,4,1,0,0,39,39,1,1
3,29,4,13119,9,4,2,10,0,4,1,0,0,34,0,1,1
4,24,4,21571,9,4,0,4,1,4,1,0,0,45,39,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39068,10,4,16299,6,11,2,11,0,4,1,0,0,34,13,0,4
39069,22,6,20702,11,15,2,3,0,4,1,0,0,34,32,0,4
39070,50,4,13749,15,1,6,12,1,4,1,0,0,9,39,0,4
39071,33,4,20856,13,0,2,8,0,4,1,0,0,34,26,0,4


In [71]:
# empty containers for the per-feature embedding branches:
# `inputs` collects the input tensors, `outputs` the embedded tensors
inputs, outputs = [], []

In [72]:
# define the list before displaying it, so this cell does not rely on a
# value left in the kernel by an out-of-order run (the original cell only
# read `catcols`, which raises NameError on a fresh top-to-bottom run)
catcols = [features[1]]
catcols

['workclass']

In [73]:
# NOTE: `catcols` is not used by the loop below, which iterates over every
# entry of `features`
catcols = [features[1]]

# start from empty lists inside this cell: the loop appends to them, so
# re-running the cell would otherwise stack duplicate embedding branches
# on top of the ones from the previous run
inputs = []
outputs = []

# loop over all feature columns (each one is embedded as a category)
for c in features:
    # number of distinct values in the column, counted over the whole df
    num_unique_values = int(df[c].nunique())
    print('num_unique_values:',num_unique_values)
    # simple dimension of embedding calculator:
    # half of the number of unique values (rounded up), capped at 50.
    # 50 is quite sufficient most of the times, but if you have millions
    # of unique values, you might need a larger dimension
    embed_dim = int(min(np.ceil((num_unique_values)/2), 50))
    print('embed_dim:',embed_dim)

    # simple keras input layer with size 1
    inp = layers.Input(shape=(1,))

    # add embedding layer to raw input
    # the embedding table gets one more row than there are unique values
    out = layers.Embedding(
        num_unique_values + 1, embed_dim, name=c)(inp)

    # 1-d spatial dropout is the standard for embedding layers
    # you can use it in NLP tasks too
    out = layers.SpatialDropout1D(0.3)(out)

    # drop the length-1 sequence axis so the branch output has shape
    # (embed_dim,); this becomes our output layer for current feature
    out = layers.Reshape(target_shape=(embed_dim, ))(out)

    # add input to input list
    inputs.append(inp)

    # add output to output list
    outputs.append(out)

num_unique_values: 74
embed_dim: 37
num_unique_values: 9
embed_dim: 5
num_unique_values: 28523
embed_dim: 50
num_unique_values: 16
embed_dim: 8
num_unique_values: 16
embed_dim: 8
num_unique_values: 7
embed_dim: 4
num_unique_values: 15
embed_dim: 8
num_unique_values: 6
embed_dim: 3
num_unique_values: 5
embed_dim: 3
num_unique_values: 2
embed_dim: 1
num_unique_values: 123
embed_dim: 50
num_unique_values: 99
embed_dim: 50
num_unique_values: 96
embed_dim: 48
num_unique_values: 42
embed_dim: 21


In [74]:
# scratch cell: create a stand-alone Input of shape (1,) to inspect its repr
layers.Input(shape=(1,))

<tf.Tensor 'input_51:0' shape=(None, 1) dtype=float32>

In [75]:
# scratch cell: apply a fresh Embedding to `inp`, reusing the values that
# num_unique_values, embed_dim, c and inp were left with after the loop
layers.Embedding(
    num_unique_values + 1, embed_dim, name=c)(inp)

<tf.Tensor 'native-country_5/Identity:0' shape=(None, 1, 21) dtype=float32>

In [102]:
# display the last input tensor created by the embedding loop
inp

<tf.Tensor 'input_50:0' shape=(None, 1) dtype=float32>

In [76]:
# display the last reshaped embedding output created by the embedding loop
out

<tf.Tensor 'reshape_46/Identity:0' shape=(None, 21) dtype=float32>

In [77]:
# display the collected input tensors (one per feature)
inputs

[<tf.Tensor 'input_37:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_38:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_39:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_40:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_41:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_42:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_43:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_44:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_45:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_46:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_47:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_48:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_49:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'input_50:0' shape=(None, 1) dtype=float32>]

In [78]:
# display the collected embedding outputs (one per feature)
outputs

[<tf.Tensor 'reshape_33/Identity:0' shape=(None, 37) dtype=float32>,
 <tf.Tensor 'reshape_34/Identity:0' shape=(None, 5) dtype=float32>,
 <tf.Tensor 'reshape_35/Identity:0' shape=(None, 50) dtype=float32>,
 <tf.Tensor 'reshape_36/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'reshape_37/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'reshape_38/Identity:0' shape=(None, 4) dtype=float32>,
 <tf.Tensor 'reshape_39/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'reshape_40/Identity:0' shape=(None, 3) dtype=float32>,
 <tf.Tensor 'reshape_41/Identity:0' shape=(None, 3) dtype=float32>,
 <tf.Tensor 'reshape_42/Identity:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'reshape_43/Identity:0' shape=(None, 50) dtype=float32>,
 <tf.Tensor 'reshape_44/Identity:0' shape=(None, 50) dtype=float32>,
 <tf.Tensor 'reshape_45/Identity:0' shape=(None, 48) dtype=float32>,
 <tf.Tensor 'reshape_46/Identity:0' shape=(None, 21) dtype=float32>]

In [79]:
# join the per-feature embedding outputs into a single tensor
merged_embeddings = layers.Concatenate()(outputs)

# batch-normalise the merged embeddings.
# everything from this point on is a free architectural choice; this is
# the layout the author likes to use. numerical features, if you have
# any, should be added here or in the concatenate layer
x = layers.BatchNormalization()(merged_embeddings)

In [80]:
# display the tensor produced by the batchnorm layer
x

<tf.Tensor 'batch_normalization_8/Identity:0' shape=(None, 296) dtype=float32>

In [81]:
# first fully connected layer: 300 relu units on top of the current x
x = layers.Dense(units=300, activation="relu")(x)

In [82]:
# display the tensor produced by the first dense layer
x

<tf.Tensor 'dense_8/Identity:0' shape=(None, 300) dtype=float32>

In [83]:
# regularise and normalise the output of the preceding dense layer
x = layers.Dropout(0.5)(x)
x = layers.BatchNormalization()(x)

# two further dense stages, each followed by dropout and batchnorm
for hidden_units in (300, 128):
    x = layers.Dense(hidden_units, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    x = layers.BatchNormalization()(x)

# two-unit softmax head: the task is treated as a two class problem.
# a single sigmoid unit would work as well, with only one output class
y = layers.Dense(2, activation="softmax")(x)

# assemble the final model from the embedding inputs to the softmax output
model = Model(inputs=inputs, outputs=y)

In [84]:
# print the layer-by-layer summary of the assembled model
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_37 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_38 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_39 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_40 (InputLayer)           [(None, 1)]          0                                            
____________________________________________________________________________________________

In [85]:
# configure training: adam optimizer, binary cross-entropy loss
model.compile(optimizer="adam", loss="binary_crossentropy")

In [86]:
# the model takes one array per feature, so split each feature matrix
# column-wise: iterating over the transposed matrix yields one 1-d column
# per entry of `features`, in order
xtrain = list(df_train[features].values.T)
xvalid = list(df_valid[features].values.T)


In [89]:
# display the training split again
df_train

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,kfold
0,17,2,15013,7,3,2,10,0,4,1,0,0,34,39,1,1
1,37,2,3895,11,15,2,3,0,4,1,0,0,34,39,1,1
2,35,1,13921,9,4,2,11,0,4,1,0,0,39,39,1,1
3,29,4,13119,9,4,2,10,0,4,1,0,0,34,0,1,1
4,24,4,21571,9,4,0,4,1,4,1,0,0,45,39,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39068,10,4,16299,6,11,2,11,0,4,1,0,0,34,13,0,4
39069,22,6,20702,11,15,2,3,0,4,1,0,0,34,32,0,4
39070,50,4,13749,15,1,6,12,1,4,1,0,0,9,39,0,4
39071,33,4,20856,13,0,2,8,0,4,1,0,0,34,26,0,4


In [87]:
# display the list of per-feature training arrays
xtrain

[array([17, 37, 35, ..., 50, 33, 15]),
 array([2, 2, 1, ..., 4, 4, 4]),
 array([15013,  3895, 13921, ..., 13749, 20856, 24919]),
 array([ 7, 11,  9, ..., 15, 13, 15]),
 array([ 3, 15,  4, ...,  1,  0,  1]),
 array([2, 2, 2, ..., 6, 2, 4]),
 array([10,  3, 11, ..., 12,  8,  8]),
 array([0, 0, 0, ..., 1, 0, 1]),
 array([4, 4, 4, ..., 4, 4, 4]),
 array([1, 1, 1, ..., 1, 1, 0]),
 array([ 0,  0,  0, ...,  0,  0, 75]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([34, 34, 39, ...,  9, 34, 34]),
 array([39, 39, 39, ..., 39, 26, 39])]

In [100]:
# peek at the training targets; read them straight from df_train because
# `ytrain` is only assigned in a later cell, so referring to it here fails
# on a fresh top-to-bottom run
df_train.income.values

array([1, 1, 1, ..., 0, 0, 0])

In [95]:
# pull the target column out of each split as a numpy array
ytrain = df_train["income"].values
yvalid = df_valid["income"].values

In [96]:
# preview the one-hot form of the training targets
utils.to_categorical(ytrain)

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [97]:
# convert target columns to categories
# this is just binarization (one-hot encoding of the targets)
ytrain_cat, yvalid_cat = (
    utils.to_categorical(targets) for targets in (ytrain, yvalid)
)

In [98]:
# train for 5 epochs in batches of 1024 rows, validating on the held-out
# fold after every epoch
model.fit(
    x=xtrain,
    y=ytrain_cat,
    batch_size=1024,
    epochs=5,
    verbose=1,
    validation_data=(xvalid, yvalid_cat),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fb33e2211d0>

In [99]:
# display the model's two-column predictions for the validation inputs
model.predict(xvalid)

array([[0.90105677, 0.09894323],
       [0.94769645, 0.05230359],
       [0.93998176, 0.06001817],
       ...,
       [0.9074634 , 0.09253663],
       [0.96751887, 0.03248118],
       [0.95046455, 0.0495355 ]], dtype=float32)

In [101]:
# predict on the validation inputs and keep the second output column
valid_probabilities = model.predict(xvalid)
valid_preds = valid_probabilities[:, 1]

# score the kept column against the validation targets and report roc auc
validation_auc = metrics.roc_auc_score(yvalid, valid_preds)
print("AUC is:", validation_auc)

AUC is: 0.9150470326432893


In [103]:
# timestamps every 10 hours from 2020-01-06 to 2020-01-10, as a Series
# whose index and values are the same timestamps.
# the step is given as a Timedelta rather than the '10H' string alias:
# the upper-case 'H' alias is deprecated in recent pandas releases, while
# a Timedelta frequency is accepted by old and new versions alike
ten_hour_steps = pd.date_range(
    '2020-01-06', '2020-01-10', freq=pd.Timedelta(hours=10)
).to_series()
ten_hour_steps

2020-01-06 00:00:00   2020-01-06 00:00:00
2020-01-06 10:00:00   2020-01-06 10:00:00
2020-01-06 20:00:00   2020-01-06 20:00:00
2020-01-07 06:00:00   2020-01-07 06:00:00
2020-01-07 16:00:00   2020-01-07 16:00:00
2020-01-08 02:00:00   2020-01-08 02:00:00
2020-01-08 12:00:00   2020-01-08 12:00:00
2020-01-08 22:00:00   2020-01-08 22:00:00
2020-01-09 08:00:00   2020-01-09 08:00:00
2020-01-09 18:00:00   2020-01-09 18:00:00
Freq: 10H, dtype: datetime64[ns]