In [1]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.models import *
from deepctr.inputs import  SparseFeat, DenseFeat, get_feature_names

import tensorflow as tf
import tensorflow.keras.backend as K

DeepCTR version 0.8.0 detected. Your version is 0.7.5.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.0


In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
    # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
        print(e)


1 Physical GPUs, 1 Logical GPUs


In [3]:
data = pd.read_csv('../data/frappe_all.csv')

In [4]:
sparse_features = ['user', 'item', 'daytime', 'weekday', 'isweekend', 'homework', 'cost',
       'weather', 'country', 'city']
dense_features = []

data[sparse_features] = data[sparse_features].fillna('-1', )
data[dense_features] = data[dense_features].fillna(0, )
target = ['label']

In [5]:
# 1.Label Encoding for sparse features,and do simple Transformation for dense features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])
mms = MinMaxScaler(feature_range=(0, 1))
# data[dense_features] = mms.fit_transform(data[dense_features])

In [6]:
# 2.count #unique features for each sparse field,and record dense feature field name

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=256)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

In [7]:
# 3.generate input data for model

train, test = train_test_split(data, test_size=0.1)
train, val = train_test_split(data, test_size=0.2)
train_model_input = {name:train[name] for name in feature_names}
val_model_input = {name:val[name] for name in feature_names}
test_model_input = {name:test[name] for name in feature_names}

In [8]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 


In [9]:
# 4.Define Model,train,predict and evaluate
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary',
              dnn_dropout=0.3, l2_reg_embedding=0.2, l2_reg_dnn=0.2)
model.compile(optimizer="adam", loss=root_mean_squared_error,
              metrics=[tf.keras.metrics.RootMeanSquaredError(), tf.keras.metrics.BinaryAccuracy(), tf.keras.metrics.AUC()], )

history = model.fit(train_model_input, train[target].values,
                    batch_size=128, epochs=5, verbose=1, validation_data=(val_model_input, val[target].values))

Epoch 1/5




Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [10]:
pred_ans = model.predict(test_model_input, batch_size=256)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))

test LogLoss 0.0782
test AUC 0.9971


In [11]:
model.evaluate(test_model_input, test[target].values, batch_size=128)




[0.13974155485630035, 0.1359046846628189, 0.981566846370697, 0.997124969959259]