In [6]:
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

if __name__ == "__main__":
    data = pd.read_csv('./criteo_sample.txt')

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
                              for i, feat in enumerate(sparse_features)] + [DenseFeat(feat, 1, )
                                                                            for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    # print("input test data", test_model_input)

    # 4.Define Model,train,predict and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'], )

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))


print(model,"it exist")
# Define the new data (replace with your new data)
new_data = pd.DataFrame({
    'I1': [3.0],
    'I2': [260.0],
    'I3': [0.0],
    'I4': [12.0],
    'I5': [2013.0],
    'I6': [164.0],
    'I7': [6.0],
    'I8': [35.0],
    'I9': [523.0],
    'I10': [0.0],
    'I11': [3.0],
    'I12': [0.0],
    'I13': [18.0],
    'C1': ['05db9164'],
    'C2': ['38a947a1'],
    'C3': ['3f55fb72'],
    'C4': ['5de245c7'],
    'C5': ['30903e74'],
    'C6': ['7e0ccccf'],
    'C7': ['b72ec13d'],
    'C8': ['1f89b562'],
    'C9': ['a73ee510'],
    'C10': ['acce978c'],
    'C11': ['3547565f'],
    'C12': ['a5b0521a'],
    'C13': ['12880350'],
    'C14': ['b28479f6'],
    'C15': ['c12fc269'],
    'C16': ['95a8919c'],
    'C17': ['e5ba7672'],
    'C18': ['675c9258'],
    'C19': ['21ddcdc9'],
    'C20': ['b1252a9d'],
    'C21': ['0e8585d2'],
    'C22': ['32c7478e'],
    'C23': ['0d4a6d1a'],
    'C24': ['001f3601'],
    'C25': ['92c878de']
})



# Preprocess the new data
sparse_features = ['C' + str(i) for i in range(1, 27)]
dense_features = ['I' + str(i) for i in range(1, 14)]

# new_data[sparse_features] = new_data[sparse_features]
# new_data[dense_features] = new_data[dense_features]

for feat in sparse_features:
    lbe = LabelEncoder()
    new_data[feat] = lbe.fit_transform(new_data[feat])

mms = MinMaxScaler(feature_range=(0, 1))
new_data[dense_features] = mms.fit_transform(new_data[dense_features])

# Define feature columns for the model
feature_columns = [SparseFeat(feat, vocabulary_size=new_data[feat].max() + 1, embedding_dim=4)
                   for feat in sparse_features] + [DenseFeat(feat, 1) for feat in dense_features]

feature_names = get_feature_names(feature_columns)

# Prepare the input data for model prediction
new_data_input = {name: new_data[name] for name in feature_names}

# Make predictions for the new data
predictions = model.predict(new_data_input)

# Interpret the output (predictions) for CTR estimation
# The 'predictions' variable now contains the predicted CTR values for the new data points.
print("Predicted CTR:", predictions)


Epoch 1/10
1/1 - 7s - loss: 0.6860 - binary_crossentropy: 0.6860 - val_loss: 0.6547 - val_binary_crossentropy: 0.6547
Epoch 2/10
1/1 - 0s - loss: 0.6664 - binary_crossentropy: 0.6664 - val_loss: 0.6441 - val_binary_crossentropy: 0.6441
Epoch 3/10
1/1 - 0s - loss: 0.6473 - binary_crossentropy: 0.6473 - val_loss: 0.6335 - val_binary_crossentropy: 0.6335
Epoch 4/10
1/1 - 0s - loss: 0.6283 - binary_crossentropy: 0.6282 - val_loss: 0.6230 - val_binary_crossentropy: 0.6230
Epoch 5/10
1/1 - 0s - loss: 0.6091 - binary_crossentropy: 0.6091 - val_loss: 0.6130 - val_binary_crossentropy: 0.6130
Epoch 6/10
1/1 - 0s - loss: 0.5901 - binary_crossentropy: 0.5901 - val_loss: 0.6037 - val_binary_crossentropy: 0.6037
Epoch 7/10
1/1 - 0s - loss: 0.5713 - binary_crossentropy: 0.5712 - val_loss: 0.5953 - val_binary_crossentropy: 0.5953
Epoch 8/10
1/1 - 0s - loss: 0.5526 - binary_crossentropy: 0.5526 - val_loss: 0.5881 - val_binary_crossentropy: 0.5881
Epoch 9/10
1/1 - 0s - loss: 0.5342 - binary_crossentropy

KeyError: 'C26'