In [1]:
import random
import numpy as np
import pickle
import pandas as pd
import tensorflow as tf
import sys
import os
sys.path.append('../')

from deepctr.feature_column import SparseFeat, VarLenSparseFeat, VarUserLenSparseFeat, DenseFeat,get_feature_names
from deepctr.models import MIN

from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import log_loss, roc_auc_score
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

os.environ["CUDA_VISIBLE_DEVICES"] = '0'

In [2]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open a samples.pkl that comes from a trusted source.
# The file is read as three back-to-back pickled objects, so the load order below matters:
# the training samples, the test samples, then a (user_count, item_count, cate_count) triple.
with open('samples.pkl', 'rb') as f: 
    train_data = pickle.load(f)
    test_data = pickle.load(f)
    user_count, item_count, cate_count = pickle.load(f)

In [3]:
# Shuffle in place with a dedicated, seeded generator so the sample order is the same
# after every "Restart Kernel -> Run All". The order matters: Keras' `validation_split`
# (used when fitting below) holds out the *last* fraction of the training samples.
# A local Random instance is used so the global `random` state is left untouched.
SHUFFLE_SEED = 2023
_shuffle_rng = random.Random(SHUFFLE_SEED)
_shuffle_rng.shuffle(train_data)
_shuffle_rng.shuffle(test_data)

In [4]:
# Total number of samples across the train and test splits (displayed as the cell output).
n_train, n_test = len(train_data), len(test_data)
n_train + n_test

2000

In [5]:
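# Positional layout of each sample tuple (index: field):
#   0: uid, 1: mid, 2: cat, 3: histlen, 4: mid_list, 5: cat_list, 6: mid_neg_list, 7: cat_neg_list,
#   8: item_uid_list, 9: item_midlen_list, 10: item_mid_list, 11: item_cat_list, 12: item_histlen, 13: label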

DeepCTR version 0.9.1 detected. Your version is 0.9.0.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.1


In [6]:
def getfixedseq(data, fixedlen, ulen):
    """Turn raw samples into fixed-size inputs, padding/truncating sequences at the END.

    Samples are indexed positionally. The fields read here are (names taken from the
    notebook's sample-layout comment):
        t[0] uid, t[1] mid, t[2] cat, t[3] histlen, t[4] mid_list, t[5] cat_list,
        t[9] item_midlen_list, t[10] item_mid_list, t[11] item_cat_list,
        t[12] item_histlen, t[13] label.

    Args:
        data: iterable of sample tuples laid out as above.
        fixedlen: length every item/category sequence is padded or truncated to.
        ulen: number of per-user rows kept for every sample.

    Returns:
        Tuple ``(u, i, c, sl, hist_i, hist_c, user_sl, user_i, user_c, user_n, y)``:
        ``u, i, c, sl, user_n, y`` are plain lists of the raw per-sample values;
        ``hist_i, hist_c`` are ``(n_samples, fixedlen)`` arrays;
        ``user_sl`` is an ``(n_samples, ulen, 1)`` array;
        ``user_i, user_c`` are lists holding one ``(ulen, fixedlen)`` array per sample.
    """
    u, i, c, sl, y = [], [], [], [], []
    hist_i, hist_c = [], []
    user_sl, user_n = [], []
    user_i, user_c = [], []

    def _fit_user_axis(rows):
        # Force exactly `ulen` rows: drop surplus rows, then append all-zero rows.
        # The real row count is used rather than the stored count t[12], so the
        # result is always (ulen, fixedlen) even if the two ever disagree.
        rows = rows[:ulen]
        missing = ulen - rows.shape[0]
        if missing > 0:
            # np.vstack replaces np.row_stack, a deprecated alias as of NumPy 2.0.
            rows = np.vstack((rows, np.zeros([missing, fixedlen], np.int64)))
        return rows

    for t in data:
        u.append(t[0])
        i.append(t[1])
        c.append(t[2])
        sl.append(t[3])
        hist_i.append(t[4])
        hist_c.append(t[5])
        user_sl.append(t[9])
        # NOTE(review): the stored count is passed through unclipped, so it can exceed
        # `ulen` while user_i/user_c are cut to `ulen` rows - confirm the model expects this.
        user_n.append(t[12])
        y.append(t[13])

        # One row per user: each user's item/category sequence, fixed to `fixedlen`.
        rows_i = pad_sequences(t[10], maxlen=fixedlen, padding='post', truncating='post')
        rows_c = pad_sequences(t[11], maxlen=fixedlen, padding='post', truncating='post')
        user_i.append(_fit_user_axis(rows_i))
        user_c.append(_fit_user_axis(rows_c))

    hist_i = pad_sequences(hist_i, maxlen=fixedlen, padding='post', truncating='post')
    hist_c = pad_sequences(hist_c, maxlen=fixedlen, padding='post', truncating='post')
    user_sl = pad_sequences(user_sl, maxlen=ulen, padding='post', truncating='post')
    # Add a trailing axis of size 1: (n_samples, ulen) -> (n_samples, ulen, 1).
    user_sl = np.expand_dims(user_sl, axis=2)

    return u, i, c, sl, hist_i, hist_c, user_sl, user_i, user_c, user_n, y

In [7]:
def getfixedseq_short(data, fixedlen, ulen):
    """Turn raw samples into fixed-size inputs, keeping the TAIL of each item sequence.

    Item/category sequences are truncated from the front and zero-padded at the front
    (``padding='pre', truncating='pre'``), so the last ``fixedlen`` entries of each
    sequence survive. The per-user axis is still truncated/padded at the end.

    Samples are indexed positionally. The fields read here are (names taken from the
    notebook's sample-layout comment):
        t[0] uid, t[1] mid, t[2] cat, t[3] histlen, t[4] mid_list, t[5] cat_list,
        t[9] item_midlen_list, t[10] item_mid_list, t[11] item_cat_list,
        t[12] item_histlen, t[13] label.

    Args:
        data: iterable of sample tuples laid out as above.
        fixedlen: length every item/category sequence is padded or truncated to.
        ulen: number of per-user rows kept for every sample.

    Returns:
        Tuple ``(u, i, c, sl, hist_i, hist_c, user_sl, user_i, user_c, user_n, y)``:
        ``u, i, c, sl, user_n, y`` are plain lists of the raw per-sample values;
        ``hist_i, hist_c`` are ``(n_samples, fixedlen)`` arrays;
        ``user_sl`` is an ``(n_samples, ulen, 1)`` array;
        ``user_i, user_c`` are lists holding one ``(ulen, fixedlen)`` array per sample.
    """
    u, i, c, sl, y = [], [], [], [], []
    hist_i, hist_c = [], []
    user_sl, user_n = [], []
    user_i, user_c = [], []

    def _fit_user_axis(rows):
        # Force exactly `ulen` rows: drop surplus rows, then append all-zero rows.
        # The real row count is used rather than the stored count t[12], so the
        # result is always (ulen, fixedlen) even if the two ever disagree.
        rows = rows[:ulen]
        missing = ulen - rows.shape[0]
        if missing > 0:
            # np.vstack replaces np.row_stack, a deprecated alias as of NumPy 2.0.
            rows = np.vstack((rows, np.zeros([missing, fixedlen], np.int64)))
        return rows

    for t in data:
        u.append(t[0])
        i.append(t[1])
        c.append(t[2])
        sl.append(t[3])
        hist_i.append(t[4])
        hist_c.append(t[5])
        user_sl.append(t[9])
        # NOTE(review): the stored count is passed through unclipped, so it can exceed
        # `ulen` while user_i/user_c are cut to `ulen` rows - confirm the model expects this.
        user_n.append(t[12])
        y.append(t[13])

        # One row per user; keep the last `fixedlen` entries and zero-pad at the front.
        rows_i = pad_sequences(t[10], maxlen=fixedlen, padding='pre', truncating='pre')
        rows_c = pad_sequences(t[11], maxlen=fixedlen, padding='pre', truncating='pre')
        user_i.append(_fit_user_axis(rows_i))
        user_c.append(_fit_user_axis(rows_c))

    # Keep the tail of each history: truncate from the front, zero-pad at the front.
    hist_i = pad_sequences(hist_i, maxlen=fixedlen, padding='pre', truncating='pre')
    hist_c = pad_sequences(hist_c, maxlen=fixedlen, padding='pre', truncating='pre')
    user_sl = pad_sequences(user_sl, maxlen=ulen, padding='post', truncating='post')
    # Add a trailing axis of size 1: (n_samples, ulen) -> (n_samples, ulen, 1).
    user_sl = np.expand_dims(user_sl, axis=2)

    return u, i, c, sl, hist_i, hist_c, user_sl, user_i, user_c, user_n, y

In [8]:
def get_xy_fd(data, fixedlen, ulen, isshort=False, hash_flag=False):
    """Build the feature columns plus the matching model inputs and labels.

    Reads the notebook-level globals ``user_count``, ``item_count`` and ``cate_count``
    for the vocabulary sizes.

    Args:
        data: samples forwarded to ``getfixedseq`` / ``getfixedseq_short``.
        fixedlen: ``maxlen`` of every sequence feature.
        ulen: ``maxuser`` of the per-user sequence features.
        isshort: when true use ``getfixedseq_short``, otherwise ``getfixedseq``.
        hash_flag: forwarded as ``use_hash`` to the three base sparse features only.

    Returns:
        ``(x, y, feature_columns, behavior_feature_list)`` where ``x`` maps every name
        returned by ``get_feature_names(feature_columns)`` to its array and ``y`` is
        the label array.
    """
    item_vocab = item_count + 1
    cate_vocab = cate_count + 1

    # Base sparse features.
    user_feat = SparseFeat('user', user_count, embedding_dim=12, use_hash=hash_flag)
    item_feat = SparseFeat('item_id', item_vocab, embedding_dim=8, use_hash=hash_flag)
    cate_feat = SparseFeat('cate_id', cate_vocab, embedding_dim=4, use_hash=hash_flag)

    # The target user's own history; embeddings are shared with item_id / cate_id.
    hist_item_feat = VarLenSparseFeat(
        SparseFeat('hist_item_id', vocabulary_size=item_vocab, embedding_dim=8, embedding_name='item_id'),
        maxlen=fixedlen, length_name="seq_length")
    hist_cate_feat = VarLenSparseFeat(
        SparseFeat('hist_cate_id', cate_vocab, embedding_dim=4, embedding_name='cate_id'),
        maxlen=fixedlen, length_name="seq_length")

    # Per-user histories: up to `ulen` users, each with up to `fixedlen` entries.
    users_item_feat = VarUserLenSparseFeat(
        SparseFeat('users_item_id', vocabulary_size=item_vocab, embedding_dim=8, embedding_name='item_id'),
        maxuser=ulen, maxlen=fixedlen, user_length_name="user_length", length_name="user_seq_length")
    users_cate_feat = VarUserLenSparseFeat(
        SparseFeat('users_cate_id', cate_vocab, embedding_dim=4, embedding_name='cate_id'),
        maxuser=ulen, maxlen=fixedlen, user_length_name="user_length", length_name="user_seq_length")

    feature_columns = [user_feat, item_feat, cate_feat,
                       hist_item_feat, hist_cate_feat,
                       users_item_feat, users_cate_feat]

    # Notice: History behavior sequence feature name must start with "hist_".
    behavior_feature_list = ["item_id", "cate_id"]

    if isshort:
        seq_builder, mode_msg = getfixedseq_short, "short time"
    else:
        seq_builder, mode_msg = getfixedseq, "long time"
    (uids, items, cates, seq_lens,
     hist_items, hist_cates,
     users_seq_lens, users_items, users_cates, users_counts,
     labels) = seq_builder(data, fixedlen, ulen)
    print(mode_msg)

    feature_dict = {
        'user': np.array(uids),
        'item_id': np.array(items),  # 0 is mask value
        'cate_id': np.array(cates),  # 0 is mask value
        'hist_item_id': np.array(hist_items),
        'hist_cate_id': np.array(hist_cates),
        "seq_length": np.array(seq_lens),
        'users_item_id': np.array(users_items),
        'users_cate_id': np.array(users_cates),
        "user_seq_length": np.array(users_seq_lens),
        "user_length": np.array(users_counts),
    }

    # Keep only the inputs the feature columns actually declare.
    x = {}
    for name in get_feature_names(feature_columns):
        x[name] = feature_dict[name]
    y = np.array(labels)
    return x, y, feature_columns, behavior_feature_list

In [9]:
if __name__ == "__main__":
    # Compare the numeric major version: comparing version strings is lexicographic,
    # so e.g. '10.0.0' >= '2.0.0' would be False.
    if int(tf.__version__.split('.')[0]) >= 2:
        tf.compat.v1.disable_eager_execution()

    auclist = []
    loglosslist = []
    repeats = 1      # number of train/evaluate runs, each with a freshly built model
    fixedlen = 15    # length every item/category sequence is padded or truncated to
    ulen = 10        # number of per-user rows kept for every sample
    IsShort = True   # True -> getfixedseq_short; False -> getfixedseq (the default, "long time")

    trainx, trainy, feature_columns, behavior_feature_list = get_xy_fd(train_data, fixedlen, ulen, isshort=IsShort)
    testx, testy, feature_columns, behavior_feature_list = get_xy_fd(test_data, fixedlen, ulen, isshort=IsShort)

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs("./result/grocery/", exist_ok=True)

    for i in range(repeats):

        model = MIN(feature_columns, behavior_feature_list, itemshort=IsShort, att_head_num=1,
                    dnn_hidden_units=(128, 64), task='binary')
        model.compile('adam', 'binary_crossentropy',
                      metrics=['binary_crossentropy'])

        # Despite the ".tflite" suffix this checkpoint holds weights only
        # (save_weights_only=True); the name is kept so existing result files still match.
        filepath = "./result/grocery/"+str(i)+"-weights-best.tflite"
        checkpointer = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True,
                                       mode='min', save_weights_only=True)
        callbacks_list = [checkpointer]

        # Train
        history = model.fit(trainx, trainy, batch_size=128, epochs=10, verbose=2, validation_split=0.2,
                            callbacks=callbacks_list, shuffle=True)

        # Test: restore the weights with the lowest validation loss, then score the test set.
        loadfile = filepath
        model.load_weights(loadfile)
        pred_ans = model.predict(testx, batch_size=256)
        # log_loss' `eps` argument was deprecated in scikit-learn 1.3 and removed in 1.5,
        # so clip the predictions to [1e-7, 1 - 1e-7] explicitly instead.
        clipped_pred = np.clip(np.asarray(pred_ans, dtype=np.float64), 1e-7, 1 - 1e-7)
        logloss = round(log_loss(testy, clipped_pred), 5)
        auc = round(roc_auc_score(testy, pred_ans), 5)

        loglosslist.append(logloss)
        auclist.append(auc)

        print("Test results of the "+str(i)+" training:")
        print("test LogLoss", logloss)
        print("test AUC", auc)

    print("loglosslist:", loglosslist)
    print("auclist:", auclist)
    print("Average best LogLoss:", round(np.mean(loglosslist), 4))
    print("Average best AUC:", round(np.mean(auclist), 4))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


short time
short time
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Train on 1280 samples, validate on 320 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.69154, saving model to ./result/grocery/0-weights-best.tflite
1280/1280 - 8s - loss: 0.7866 - binary_crossentropy: 0.7866 - val_loss: 0.6915 - val_binary_crossentropy: 0.6915
Epoch 2/10

Epoch 00002: val_loss did not improve from 0.69154
1280/1280 - 1s - loss: 0.7056 - binary_crossentropy: 0.7056 - val_loss: 0.6944 - val_binary_crossentropy: 0.6944
Epoch 3/10

Epoch 00003: val_loss did not improve from 0.69154
1280/1280 - 1s - loss: 0.6615 - binary_crossentropy: 0.6615 - val_loss: 0.7267 - val_binary_crossentropy: 0.7267
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.69154
1280/1280 - 1s - loss: 0.6084 - binary_crossentropy: 0.6084 - val_loss: 0.7452 - val_binary_crossentropy: 0.7452
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.69154
1280/1280 - 1s - loss: 0.4914 - binary_crossentropy: 0.4914 - val_loss: 0.7646 - val_binary_crossentropy: 0.7646
Epoch 6/10

Epoch