In [28]:
import sys
from pathlib import Path

# Put the parent of the current working directory on the import path
# (presumably so the later `from src import config` resolves when the
# notebook is launched from a sub-directory -- confirm against repo layout).
PROJECT_DIR = str(Path.cwd().parent)
if sys.path.count(PROJECT_DIR) == 0:
    sys.path.append(PROJECT_DIR)

import os
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics, preprocessing

from tensorflow.keras import layers, optimizers, callbacks, utils, backend
from tensorflow.keras.models import Model, load_model

from src import config

In [54]:
def create_model(data, categorical_cols):
    """Build and compile an entity-embedding classifier with tf.keras.

    Every column in ``categorical_cols`` gets its own single-value input that
    is passed through an embedding layer (named after the column), spatial
    dropout, and a reshape to a flat vector. The flattened embeddings are
    concatenated, batch-normalised, and fed through two dense blocks before a
    2-unit softmax output.

    Args:
        data: Container indexed by column name whose columns provide
            ``nunique()`` (the notebook passes a pandas DataFrame).
        categorical_cols: Iterable of column names to embed.

    Returns:
        A compiled ``Model`` using the ``binary_crossentropy`` loss and the
        ``adam`` optimizer.
    """
    branch_inputs, branch_outputs = [], []

    for col in categorical_cols:
        cardinality = int(data[col].nunique())
        # Embedding width: half the cardinality (rounded up), capped at 50.
        emb_size = int(min(np.ceil(cardinality / 2), 50))

        col_input = layers.Input(shape=(1,))
        branch = layers.Embedding(cardinality + 1, emb_size, name=col)(col_input)
        branch = layers.SpatialDropout1D(0.3)(branch)
        # Drop the length-1 sequence axis so branches can be concatenated.
        branch = layers.Reshape(target_shape=(emb_size,))(branch)

        branch_inputs.append(col_input)
        branch_outputs.append(branch)

    hidden = layers.Concatenate()(branch_outputs)
    hidden = layers.BatchNormalization()(hidden)

    # (units, dropout rate) per dense block: Dense -> Dropout -> BatchNorm.
    for units, drop_rate in ((200, 0.3), (100, 0.4)):
        hidden = layers.Dense(units, activation="relu")(hidden)
        hidden = layers.Dropout(drop_rate)(hidden)
        hidden = layers.BatchNormalization()(hidden)

    probabilities = layers.Dense(2, activation="softmax")(hidden)

    network = Model(inputs=branch_inputs, outputs=probabilities)
    network.compile(loss="binary_crossentropy", optimizer="adam")
    return network

In [55]:
# Fold held out for validation further down the notebook.
fold = 0
df = pd.read_csv(config.TRAIN_FILE)

# Every column except the label and the fold assignment is treated as a
# categorical feature.
features = [col for col in df.columns if col not in ("target", "folds")]

for feature in features:
    column = df[feature]
    # Mark missing values explicitly. The mask is taken from the ORIGINAL
    # column: casting to str first turns NaN into the literal text "nan",
    # after which a fillna("NONE") has nothing left to fill.
    as_text = column.astype(str).where(column.notna(), "NONE")
    # Plain column assignment replaces the column together with its dtype.
    # `df.loc[:, feature] = ...` writes into the existing column on recent
    # pandas and can leave it with the old (object) dtype, which would make
    # `.values` downstream an object array.
    df[feature] = preprocessing.LabelEncoder().fit_transform(as_text)

In [56]:
# Inspect the frame; pandas renders a truncated head/tail preview.
df

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,...,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,folds
0,0,0,1,0,0,2,1,3,3,0,...,0,4,5,0,19,146,5,7,0,3
1,0,0,0,1,0,2,0,3,4,3,...,2,4,5,13,17,34,1,10,0,0
2,0,0,0,0,0,2,1,2,3,3,...,1,0,5,7,15,12,0,6,0,3
3,0,0,1,1,0,0,2,2,4,0,...,2,1,3,13,20,180,2,0,0,0
4,0,0,0,0,0,1,4,2,3,3,...,2,2,5,15,24,140,2,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,0,1,1,0,0,2,6,2,3,3,...,2,4,2,5,16,71,5,10,0,0
149996,0,0,0,0,1,2,5,1,4,3,...,0,1,3,10,2,108,2,5,0,4
149997,1,1,0,1,1,0,4,0,0,3,...,0,0,4,15,15,137,5,9,1,2
149998,0,1,0,0,1,0,0,4,5,3,...,2,2,1,12,0,143,2,3,1,1


In [60]:
# Rows belonging to the selected fold are held out for validation; all other
# rows are used for training.
valid_mask = df["folds"] == fold
train_mask = ~valid_mask

x_train = df.loc[train_mask, features]
x_valid = df.loc[valid_mask, features]
y_train = df.loc[train_mask, "target"].values
y_valid = df.loc[valid_mask, "target"].values

# One-hot encode the labels to match the model's 2-unit softmax output.
y_train_cat = utils.to_categorical(y_train)
y_valid_cat = utils.to_categorical(y_valid)


In [61]:
# The model takes one array per categorical input, so split each feature
# matrix into a list of 1-D column arrays (in `features` order).
# `.values` is materialised once per frame and its transpose is iterated;
# the previous comprehension rebuilt the full matrix for every column.
xtrain = list(x_train[features].values.T)
xvalid = list(x_valid[features].values.T)


In [62]:
# The full frame (training and validation folds alike) is passed in, so the
# per-column cardinalities used for the embeddings cover every row.
model = create_model(df, features)

In [None]:
# Print the layer-by-layer architecture of the model.
model.summary()

In [71]:
# Train for a few epochs, reporting the loss on the held-out fold after each.
model.fit(
    x=xtrain,
    y=y_train_cat,
    validation_data=(xvalid, y_valid_cat),
    batch_size=32,
    epochs=3,
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f82c5742700>

In [72]:
# Column 1 of the softmax output is the predicted probability of class 1.
preds = model.predict(xvalid)[:, 1]

# Score the held-out fold with ROC AUC against the raw (non one-hot) labels.
auc = metrics.roc_auc_score(y_valid, preds)
print(auc)

# Release the Keras global state held by the backend.
backend.clear_session()

0.7343148098975335
