In [6]:
import pandas as pd
import tensorflow as tf
import numpy as np


In [2]:
ALL_COLS = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
             'marital-status', 'occupation', 'relationship', 'race',
             'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'country']
CONT_COLS = ['age', 'fnlwgt', 'education-num',
               'capital-gain', 'capital-loss', 'hours-per-week']
CAT_COLS = list(set(ALL_COLS).difference(CONT_COLS))

NUM_BINS = 10
BATCH_SIZE = 256
EMBEDDING_SIZE = 5

In [11]:
from itertools import repeat
from sklearn.preprocessing import MinMaxScaler

def get_modified_data(X, all_cols, cont_cols, cat_cols):
    
    X_cat = X[CAT_COLS]
    
    scaler = MinMaxScaler()
    X_ret = pd.DataFrame(scaler.fit_transform(X[CONT_COLS]), columns=CONT_COLS)
    cols_idx = list(range(X_ret.shape[1]))
    cat_start_idx = X_ret.shape[1]
    
    for idx, col in enumerate(X_cat):
        cat_X = pd.get_dummies(X_cat[col], prefix=col, prefix_sep='-')
        cols_idx.extend(repeat(idx + cat_start_idx, cat_X.shape[1]))
        X_ret = pd.concat([X_ret, cat_X], axis=1)
        
    print('X shape: {}'.format(X_ret.shape))

    return cols_idx, X_ret

In [15]:
class FM(tf.keras.layers.Layer):
    def __init__(self, num_feature, num_field, embedding_size, field_index):
        super(FM, self).__init__()
        self.embedding_size = embedding_size    
        self.num_feature = num_feature         
        self.num_field = num_field              
        self.field_index = field_index          

        self.w = tf.Variable(tf.random.normal(shape=[num_feature], mean=0.0, stddev=1.0), name='w')
        self.V = tf.Variable(tf.random.normal(shape=(num_field, embedding_size), mean=0.0, stddev=0.01), name='V')

    def call(self, inputs):
        x_batch = tf.reshape(inputs, [-1, self.num_feature, 1])
        embedding = tf.nn.embedding_lookup(params=self.V, ids=self.field_index)
        #(batch size, feature num, embedding size)
        embedded_inputs = tf.math.multiply(x_batch, embedding)

        #(batch size, )
        ord1_terms = tf.reduce_sum(tf.math.multiply(self.w, inputs), axis=1, keepdims=False)
        ord2_terms = 0.5 * tf.subtract(tf.square(tf.reduce_sum(embedded_inputs, [1, 2])), tf.reduce_sum(tf.square(embedded_inputs), [1, 2]))
        
        
        ord1_terms = tf.reshape(ord1_terms, [-1, 1])
        ord2_terms = tf.reshape(ord2_terms, [-1, 1])

        y_fm = tf.concat([ord1_terms, ord2_terms], 1)

        return y_fm, embedded_inputs

In [17]:
tf.keras.backend.set_floatx('float32')

class DeepFM(tf.keras.Model):

    def __init__(self, num_feature, num_field, embedding_size, field_index):
        super(DeepFM, self).__init__()
        self.embedding_size = embedding_size    # k: 임베딩 벡터의 차원(크기)
        self.num_feature = num_feature          # f: 원래 feature 개수
        self.num_field = num_field              # m: grouped field 개수
        self.field_index = field_index          # 인코딩된 X의 칼럼들이 본래 어디 소속이었는지

        self.fm = FM(num_feature, num_field, embedding_size, field_index)

        self.layer1 = tf.keras.layers.Dense(units=64, activation='relu')
        self.dropout1 = tf.keras.layers.Dropout(rate=0.2)
        self.layer2 = tf.keras.layers.Dense(units=16, activation='relu')
        self.dropout2 = tf.keras.layers.Dropout(rate=0.2)
        self.layer3 = tf.keras.layers.Dense(units=2, activation='relu')

        self.predict = tf.keras.layers.Dense(units=1, activation='sigmoid')

    
    def call(self, inputs):
        y_fm, embedded_inputs = self.fm(inputs)

        embedded_inputs = tf.reshape(embedded_inputs, [-1, self.num_feature*self.embedding_size])

        #deep component
        y_deep = self.layer1(embedded_inputs)
        y_deep = self.dropout1(y_deep)
        y_deep = self.layer2(y_deep)
        y_deep = self.dropout2(y_deep)
        y_deep = self.layer3(y_deep)

        # Concatenation
        y_pred = self.predict(tf.concat([y_fm, y_deep], axis=1))
        y_pred = tf.reshape(y_pred, [-1, ])

        return y_pred

In [13]:
from time import perf_counter
from sklearn.model_selection import train_test_split


def get_data():
    data = pd.read_csv('data/adult.data', header=None)
    X = data.loc[:, 0:13]
    Y = data.loc[:, 14].map({' <=50K': 0, ' >50K': 1})

    X.columns = ALL_COLS
    field_index, X_embedded = get_modified_data(X, ALL_COLS, CONT_COLS, CAT_COLS)

    X_train, X_test, Y_train, Y_test = train_test_split(X_embedded, Y, test_size=0.2, stratify=Y)

    train_ds = tf.data.Dataset.from_tensor_slices((tf.cast(X_train.values, tf.float32), tf.cast(Y_train, tf.float32))).shuffle(30000).batch(BATCH_SIZE)

    test_ds = tf.data.Dataset.from_tensor_slices((tf.cast(X_test.values, tf.float32), tf.cast(Y_test, tf.float32))).shuffle(10000).batch(BATCH_SIZE)

    return train_ds, test_ds, field_index, X.shape[1]


def train_on_batch(model, optimizer, acc, auc, inputs, targets):
    with tf.GradientTape() as tape:
        y_pred = model(inputs)
        loss = tf.keras.losses.binary_crossentropy(from_logits=False, y_true=targets, y_pred=y_pred)

    grads = tape.gradient(target=loss, sources=model.trainable_variables)

    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    acc.update_state(targets, y_pred)
    auc.update_state(targets, y_pred)

    return loss


def train(epochs):
    train_ds, test_ds, field_index, num_field = get_data()

    model = DeepFM(embedding_size=EMBEDDING_SIZE, num_feature=len(field_index), num_field=num_field, field_index=field_index)

    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

    print("Start Training")
    start = perf_counter()
    for i in range(epochs):
        acc = tf.keras.metrics.BinaryAccuracy(threshold=0.5)
        auc = tf.keras.metrics.AUC()
        loss_history = []

        for x, y in train_ds:
            loss = train_on_batch(model, optimizer, acc, auc, x, y)
            loss_history.append(loss)

        print("Epoch {:03d}: Loss: {:.4f}, Acc: {:.4f}, AUC: {:.4f}".format(i, loss_history[-1], acc.result().numpy(), auc.result().numpy()))

    test_acc = tf.keras.metrics.BinaryAccuracy(threshold=0.5)
    test_auc = tf.keras.metrics.AUC()
    for x, y in test_ds:
        y_pred = model(x)
        test_acc.update_state(y, y_pred)
        test_auc.update_state(y, y_pred)

    print("test ACC: {:.4f}, AUC: {:.4f}".format(test_acc.result().numpy(), test_auc.result().numpy()))
    print("time: {:.3f}".format(perf_counter() - start))
    #model.save_weights('weights/weights-epoch({})-batch({})-embedding({}).h5'.format(epochs, BATCH_SIZE, EMBEDDING_SIZE))



In [18]:
train(epochs=100)

X shape: (32561, 108)
Start Training
Epoch 000: Loss: 0.8519, Acc: 0.6177, AUC: 0.2836
Epoch 001: Loss: 0.8216, Acc: 0.6349, AUC: 0.3102
Epoch 002: Loss: 0.8126, Acc: 0.6488, AUC: 0.3415
Epoch 003: Loss: 0.7942, Acc: 0.6593, AUC: 0.3758
Epoch 004: Loss: 0.8416, Acc: 0.6693, AUC: 0.4125
Epoch 005: Loss: 0.7107, Acc: 0.6806, AUC: 0.4516
Epoch 006: Loss: 0.6836, Acc: 0.6904, AUC: 0.4935
Epoch 007: Loss: 0.6307, Acc: 0.6990, AUC: 0.5364
Epoch 008: Loss: 0.6119, Acc: 0.7062, AUC: 0.5747
Epoch 009: Loss: 0.5196, Acc: 0.7134, AUC: 0.6077
Epoch 010: Loss: 0.6139, Acc: 0.7187, AUC: 0.6369
Epoch 011: Loss: 0.5175, Acc: 0.7238, AUC: 0.6629
Epoch 012: Loss: 0.5328, Acc: 0.7279, AUC: 0.6865
Epoch 013: Loss: 0.5484, Acc: 0.7324, AUC: 0.7081
Epoch 014: Loss: 0.4864, Acc: 0.7381, AUC: 0.7262
Epoch 015: Loss: 0.5446, Acc: 0.7431, AUC: 0.7415
Epoch 016: Loss: 0.5411, Acc: 0.7503, AUC: 0.7545
Epoch 017: Loss: 0.5358, Acc: 0.7578, AUC: 0.7655
Epoch 018: Loss: 0.5124, Acc: 0.7644, AUC: 0.7752
Epoch 019: Lo