In [1]:
!rm info/*

In [18]:
# deepfm模型训练逻辑
# 基础依赖
import math
import argparse
import pickle
import boto3
import os
import numpy as np
import itertools
import tarfile
import pandas as pd
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook
import deepfm
import random
# 模型相关
from tensorflow.python.keras.models import Model, save_model, load_model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names
from deepmatch.layers import custom_objects

In [5]:
# Enable tqdm's pandas integration; presumably this is what provides the
# `progress_apply` method used further down in this notebook.
tqdm_notebook.pandas()

In [6]:
########################################
# 从s3同步数据
########################################
def sync_s3(file_name_list, s3_folder, local_folder):
    """Download every file in ``file_name_list`` from S3 into a local folder.

    Uses the module-level ``s3client`` and ``bucket``. Each file is fetched
    from ``<s3_folder>/<name>`` and written to ``<local_folder>/<name>``.

    Args:
        file_name_list: iterable of file names (no folder component).
        s3_folder: key prefix inside the bucket that holds the files.
        local_folder: local directory that receives the downloads.
    """
    for file_name in file_name_list:
        src_key = os.path.join(s3_folder, file_name)
        dst_path = os.path.join(local_folder, file_name)
        print("file preparation: download src key {} to dst key {}".format(src_key, dst_path))
        s3client.download_file(bucket, src_key, dst_path)
        
def write_to_s3(filename, bucket, key):
    """Upload a local file to ``bucket`` under ``key`` using the module-level ``s3client``.

    The file is opened in binary mode and streamed with ``upload_fileobj``.

    Returns:
        Whatever ``s3client.upload_fileobj`` returns.
    """
    with open(filename, 'rb') as payload:
        upload_result = s3client.upload_fileobj(payload, bucket, key)
    return upload_result

# Defaults used when no command-line overrides are supplied.
default_bucket = 'sagemaker-us-east-1-002224604296'
default_mk_region = '1'
level_1 = 'recommender-system-film-mk'

parser = argparse.ArgumentParser()
for flag, fallback in (('--bucket', default_bucket), ('--mk-region', default_mk_region)):
    parser.add_argument(flag, type=str, default=fallback)

# parse_known_args tolerates extra, unrecognised arguments on the command line.
args, _ = parser.parse_known_args()
bucket, mk_region = args.bucket, args.mk_region

# Key prefix shared by the S3 locations used below.
prefix = "/".join((level_1, mk_region))

print(f"bucket={bucket}")
print(f"prefix='{prefix}'")

s3client = boto3.client('s3')
local_folder = 'info'
# exist_ok makes this a no-op when the folder is already there and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(local_folder, exist_ok=True)
# Download the user action (behaviour) data
file_name_list = ['action.csv']
s3_folder = '{}/system/user-data/clean/latest'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)
# Download the YouTubeDNN model data (id mappings and user-embedding model)
file_name_list = ['raw_embed_item_mapping.pickle', 'raw_embed_user_mapping.pickle']
s3_folder = '{}/feature/action/'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)
file_name_list = ['user_embeddings.h5']
# Built from `prefix` like every other folder here, so --mk-region is honoured;
# with the default region this is the same key as before.
s3_folder = '{}/model/recall/youtubednn/'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)
# Download the pickle file holding the inverted list (movie id -> movie features)
file_name_list = ['movie_id_movie_feature_dict.pickle']
s3_folder = '{}/feature/content/inverted-list/'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)

# Load the action data for all users
action_data_pddf = pd.read_csv(os.path.join(local_folder, "action.csv"), sep='\t')
print("load {} action data".format(len(action_data_pddf)))
# Load the pickle files. Each handle is opened in a `with` block so it is closed
# as soon as the object has been read.
# NOTE(review): unpickling can execute arbitrary code; these files must only
# come from a trusted bucket.
with open(os.path.join(local_folder, "movie_id_movie_feature_dict.pickle"), "rb") as file_to_load:
    dict_id_feature_pddf = pd.read_pickle(file_to_load)
print("length of movie_id v.s. movie_property {}".format(len(dict_id_feature_pddf)))
with open(os.path.join(local_folder, "raw_embed_item_mapping.pickle"), "rb") as file_to_load:
    raw_embed_item_mapping = pickle.load(file_to_load)
with open(os.path.join(local_folder, "raw_embed_user_mapping.pickle"), "rb") as file_to_load:
    raw_embed_user_mapping = pickle.load(file_to_load)
# Load the user-embedding model
user_embedding_model = load_model(os.path.join(local_folder, 'user_embeddings.h5'), custom_objects)
# Number of user-embedding dimensions expanded into dense features below.
# NOTE(review): presumably must equal the output width of user_embedding_model -- confirm.
embed_dim = 32

bucket=sagemaker-us-east-1-002224604296
prefix='recommender-system-film-mk/1'
file preparation: download src key recommender-system-film-mk/1/system/user-data/clean/latest/action.csv to dst key info/action.csv
file preparation: download src key recommender-system-film-mk/1/feature/action/raw_embed_item_mapping.pickle to dst key info/raw_embed_item_mapping.pickle
file preparation: download src key recommender-system-film-mk/1/feature/action/raw_embed_user_mapping.pickle to dst key info/raw_embed_user_mapping.pickle
file preparation: download src key recommender-system-film-mk/1/model/recall/youtubednn/user_embeddings.h5 to dst key info/user_embeddings.h5
file preparation: download src key recommender-system-film-mk/1/feature/content/inverted-list/movie_id_movie_feature_dict.pickle to dst key info/movie_id_movie_feature_dict.pickle
load 207341 action data
length of movie_id v.s. movie_property 33767
Instructions for updating:
Call initializer instance with the dtype argument instead of p

In [7]:
# Keep only programType == 1 rows and the four columns needed for sampling,
# ordered by timestamp. sort_values returns a new frame here instead of sorting
# a filtered slice in place, which avoids SettingWithCopy issues.
sample_data_pddf = (
    action_data_pddf.loc[action_data_pddf['programType'] == 1,
                         ['label', 'userid', 'programId', 'timeStamp']]
    .sort_values('timeStamp')
)

# Per-user click history: user id (as str) -> {click timestamp -> programIds
# clicked at that position and afterwards}. When a user has several clicks with
# the same timestamp, the entry from the last of them wins.
sample_data_pddf_click = sample_data_pddf[sample_data_pddf['label'] == 1]
user_click_record = {}
for reviewerID, hist in tqdm(sample_data_pddf_click.groupby('userid')):
    pos_list = hist['programId'].tolist()
    time_list = hist['timeStamp'].tolist()
    user_click_record[str(reviewerID)] = {
        t: pos_list[idx:] for idx, t in enumerate(time_list)
    }

sample_data_pddf = sample_data_pddf.drop_duplicates()
sample_data_pddf = sample_data_pddf.reset_index(drop=True)
sample_data_pddf.head()

100%|██████████| 1802/1802 [00:00<00:00, 4025.44it/s]


Unnamed: 0,label,userid,programId,timeStamp
0,0,13396,1690098,1608198651
1,1,13396,1690115,1608198651
2,0,13396,1690113,1608198653
3,0,13396,1690097,1608198653
4,1,13396,1690115,1608198657


In [8]:
def user_id_feat(x, i):
    """Return element ``i`` of ``x`` (one dimension of an embedding vector)."""
    component = x[i]
    return component

def min_max_norm(raw_list):
    """Rescale the values of ``raw_list`` linearly into the range [0, 1].

    Args:
        raw_list: sequence of numbers (list, tuple or 1-D array).

    Returns:
        A list with one value per input element, ``(v - min) / (max - min)``.
        An empty input yields an empty list. When all values are equal the
        range is zero, so a list of ``0.0`` is returned instead of dividing
        by zero.
    """
    if len(raw_list) == 0:
        return []
    # Compute the bounds once instead of once per element.
    low = min(raw_list)
    span = max(raw_list) - low
    if span == 0:
        return [0.0 for _ in raw_list]
    return [(float(i) - low) / span for i in raw_list]

def user_embed_func(x, user_click_record=user_click_record, user_embedding_model=user_embedding_model, dict_item_mapping=raw_embed_item_mapping, dict_user_mapping=raw_embed_user_mapping):
    """Compute a min-max-normalised user embedding for one sample row.

    Args:
        x: a row exposing ``x['userid']`` and ``x['timeStamp']``.
        user_click_record: mapping ``str(userid) -> {timestamp -> item list}``.
        user_embedding_model: model whose ``predict`` takes a dict with the keys
            ``user_id``, ``hist_movie_id`` and ``hist_len``.
        dict_item_mapping: mapping ``str(item id) -> model item index``.
        dict_user_mapping: mapping ``str(user id) -> model user index``.

    Returns:
        The first predicted embedding passed through ``min_max_norm``, or a list
        of ``embed_dim`` zeros when the user has no click record.

    Raises:
        KeyError: if the user or one of the history items is missing from the
            corresponding mapping.
    """
    user_key = str(x['userid'])
    if user_key not in user_click_record:
        # No click history for this user: fall back to an all-zero embedding.
        return [0]*embed_dim

    current_time_stamp = x['timeStamp']
    current_click_record = user_click_record[user_key]
    # Take the item list stored under the first timestamp (in dict iteration
    # order) that is strictly greater than the sample's timestamp.
    # NOTE(review): as user_click_record is built in this notebook, each
    # timestamp maps to the clicks from that point ONWARD, so this selects clicks
    # that happen after the sample -- confirm this is intended and not leakage.
    input_item_list = []
    for ts, items in current_click_record.items():
        if current_time_stamp < ts:
            input_item_list = items
            break

    # Fixed-length, zero-padded history fed to the model.
    # NOTE(review): 50 presumably matches the sequence length the model was
    # trained with -- confirm.
    watch_list_len = 50
    clipped_items = input_item_list[:watch_list_len]
    map_input_item_list = np.array([[0]*watch_list_len])
    map_user_id = dict_user_mapping[user_key]
    for cnt, item in enumerate(clipped_items):
        map_input_item_list[0][cnt] = dict_item_mapping[str(item)]

    model_input = {
        'user_id': np.array([int(map_user_id)]),
        'hist_movie_id': map_input_item_list,
        # Report the number of items actually placed in the padded history, so
        # the length never exceeds watch_list_len.
        'hist_len': np.array([len(clipped_items)]),
    }

    # Refresh the user's embedding from the model
    updated_user_embs = user_embedding_model.predict(
        model_input, batch_size=2 ** 12)
    # Normalise the embedding into [0, 1]
    return min_max_norm(updated_user_embs[0])

In [9]:
# Cast the movie-feature table's programId column to int before it is used as
# the merge key in the next cell.
program_id_as_int = dict_id_feature_pddf['programId'].astype(int)
dict_id_feature_pddf['programId'] = program_id_as_int

In [10]:
# Left-join the movie features onto the samples by programId; fully duplicated
# feature rows are dropped first.
movie_features_unique = dict_id_feature_pddf.drop_duplicates()
sample_data_pddf = pd.merge(
    left=sample_data_pddf,
    right=movie_features_unique,
    how='left',
    left_on='programId',
    right_on='programId',
)

In [11]:
# User id feature: one embedding vector per sample row, computed by user_embed_func.
print("根据user_id的历史记录生成userid_feat（嵌入）")
sample_data_pddf['userid_feat'] = sample_data_pddf.progress_apply(user_embed_func, axis=1)

# Spread the embedding vector over embed_dim scalar columns user_feature_0..N-1.
print("将{}维用户嵌入转化为不同的连续型feature".format(embed_dim))
for dim in tqdm(range(embed_dim)):
    feature_column = 'user_feature_{}'.format(dim)
    sample_data_pddf[feature_column] = sample_data_pddf['userid_feat'].apply(
        lambda embedding: user_id_feat(embedding, dim))

根据user_id的历史记录生成userid_feat（嵌入）


HBox(children=(FloatProgress(value=0.0, max=200884.0), HTML(value='')))

  3%|▎         | 1/32 [00:00<00:03,  7.83it/s]


将32维用户嵌入转化为不同的连续型feature


100%|██████████| 32/32 [00:04<00:00,  7.99it/s]


In [12]:
# Alias only: mk_data and sample_data_pddf refer to the same DataFrame object.
mk_data = sample_data_pddf
dense_feature_size = embed_dim
sparse_feature_size = 6

# Column names: C1..C<sparse_feature_size> and I1..I<dense_feature_size>.
mk_sparse_features = ['C{}'.format(n) for n in range(1, sparse_feature_size + 1)]
mk_dense_features = ['I{}'.format(n) for n in range(1, dense_feature_size + 1)]

# Copy user_feature_<k> into dense column I<k+1>.
for position, dense_name in enumerate(mk_dense_features):
    mk_data[dense_name] = mk_data['user_feature_{}'.format(position)]

# Missing sparse values become the string '-1'; missing dense values become 0.
mk_data[mk_sparse_features] = mk_data[mk_sparse_features].fillna('-1')
mk_data[mk_dense_features] = mk_data[mk_dense_features].fillna(0)

mk_target = ['label']

In [13]:
# Hold out 20% of the samples for testing. The fixed random_state makes the
# split, and everything derived from it, reproducible across kernel restarts.
mk_train, mk_test = train_test_split(mk_data, test_size=0.2, random_state=2021)

In [14]:
# 1-based index ranges, one entry per dense / sparse feature column.
continous_features = range(1, 1 + dense_feature_size)
categorial_features = range(1, 1 + sparse_feature_size)

In [19]:
output_dir = './'


def _format_libsvm_row(item_row):
    """Render one sample row as a text line ``<label> <idx>:<value> ... <cat>:1 ...``.

    Dense features are written as ``<index>:<value>`` with six decimals, using
    the indices in ``continous_features``; each sparse feature is written as
    ``<raw value>:1``.
    """
    feat_vals = [
        str(feat_idx) + ':' + "{0:.6f}".format(item_row[dense_name])
        for feat_idx, dense_name in zip(continous_features, mk_dense_features)
    ]
    # NOTE(review): the raw categorical value is used directly as the feature
    # index, with no per-field offset or dictionary encoding. It may collide with
    # the dense indices, and values filled with '-1' produce index -1. Confirm
    # that deepfm.py expects this encoding.
    feat_vals.extend(
        str(item_row[sparse_name]) + ':1'
        for sparse_name in mk_sparse_features[:len(categorial_features)]
    )
    return "{0} {1}\n".format(item_row['label'], ' '.join(feat_vals))


# Seeded local generator: the train/validation assignment is reproducible and
# the global `random` state is left untouched.
split_rng = random.Random(2021)
with open(os.path.join(output_dir, 'tr.libsvm'), 'w') as out_train, \
        open(os.path.join(output_dir, 'va.libsvm'), 'w') as out_valid:
    for _, item_row in mk_train.iterrows():
        # Roughly one training row in ten is diverted to the validation file.
        goes_to_valid = split_rng.randint(0, 9999) % 10 == 0
        target_file = out_valid if goes_to_valid else out_train
        target_file.write(_format_libsvm_row(item_row))

with open(os.path.join(output_dir, 'te.libsvm'), 'w') as out_test:
    for _, item_row in mk_test.iterrows():
        out_test.write(_format_libsvm_row(item_row))

In [22]:
# Train DeepFM by running the deepfm.py script on the files in ./ for one epoch.
# NOTE(review): --field_size 38 presumably equals the 32 dense + 6 sparse features
# configured earlier in this notebook; re-check it (and --feature_size) if those change.
!python deepfm.py --data_dir ./ --servable_model_dir serve --log_steps 10 --num_epochs 1 --field_size 38 --feature_size 117581 --deep_layers '2,2,2'





['deepfm.py', '--data_dir', './', '--servable_model_dir', 'serve', '--log_steps', '10', '--num_epochs', '1', '--field_size', '38', '--feature_size', '117581', '--deep_layers', '2,2,2']
task_type  train
model_dir  
data_dir  ./
dt_dir  20210324
num_epochs  1
feature_size  117581
field_size  38
embedding_size  32
batch_size  64
deep_layers  2,2,2
dropout  0.5,0.5,0.5
loss_type  log_loss
optimizer  Adam
learning_rate  0.0005
batch_norm_decay  0.9
batch_norm  False
l2_reg  0.0001
.//tr*libsvm
tr_files: ['./tr.libsvm']
va_files: ['./va.libsvm']
te_files: ['./te.libsvm']

W0325 07:35:42.667393 140202414319424 module_wrapper.py:139] From deepfm.py:419: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

input model dir is  
W0325 07:35:42.668226 140202414319424 estimator.py:1821] Using temporary folder as model directory: /tmp/tmpd0ks5ry8
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmpd0ks5ry8', '_tf_random_seed': None, '_save_summary_steps': 100, '


W0325 07:35:43.037142 140202414319424 module_wrapper.py:139] From deepfm.py:232: The name tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY is deprecated. Please use tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY instead.

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0325 07:35:43.038656 140202414319424 deprecation.py:323] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/tensorflow_core/python/ops/nn_impl.py:183: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

W0325 07:35:43.048244 140202414319424 module_wrapper.py:139] From deepfm.py:247: The name tf.metrics.auc is deprecated. Please use tf.compat.v1.metrics.auc instead.

Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
W0325 07:35:43.118018 140202414319424 depre

INFO:tensorflow:global_step/sec: 149.747
I0325 07:35:56.365188 140202414319424 basic_session_run_hooks.py:692] global_step/sec: 149.747
INFO:tensorflow:loss = 0.26339057, step = 1600 (0.668 sec)
I0325 07:35:56.366428 140202414319424 basic_session_run_hooks.py:260] loss = 0.26339057, step = 1600 (0.668 sec)
INFO:tensorflow:global_step/sec: 153.11
I0325 07:35:57.018269 140202414319424 basic_session_run_hooks.py:692] global_step/sec: 153.11
INFO:tensorflow:loss = 0.17454745, step = 1700 (0.653 sec)
I0325 07:35:57.019295 140202414319424 basic_session_run_hooks.py:260] loss = 0.17454745, step = 1700 (0.653 sec)
INFO:tensorflow:global_step/sec: 154.433
I0325 07:35:57.665833 140202414319424 basic_session_run_hooks.py:692] global_step/sec: 154.433
INFO:tensorflow:loss = 0.18979602, step = 1800 (0.648 sec)
I0325 07:35:57.667066 140202414319424 basic_session_run_hooks.py:260] loss = 0.18979602, step = 1800 (0.648 sec)
INFO:tensorflow:global_step/sec: 157.604
I0325 07:35:58.300317 140202414319424

In [None]:
# NOTE(review): `model` and `mk_test_model_input` are not defined in any cell
# visible in this notebook, so this cell fails on a fresh kernel -- confirm
# where they are meant to come from.
pred_ans = model.predict(mk_test_model_input, batch_size=256)
mk_test_labels = mk_test[mk_target].values
test_log_loss = round(log_loss(mk_test_labels, pred_ans), 4)
print("test LogLoss", test_log_loss)
test_auc = round(roc_auc_score(mk_test_labels, pred_ans), 4)
print("test AUC", test_auc)

In [None]:
# NOTE(review): `keras` is imported here but not referenced in this cell.
from tensorflow import keras
# Save the model under the local path "DeepFM".
model.save("DeepFM")

In [None]:
# Store / update the model file
model_name = 'DeepFM'
# The context manager closes the archive even if tar.add raises, so no
# half-open handle is left behind.
with tarfile.open("model.tar.gz", "w:gz") as tar:
    tar.add(model_name)
# Key built from `prefix` so the upload follows --mk-region; with the default
# region this is the same key as before.
write_to_s3("model.tar.gz", bucket, "{}/model/rank/action/deepfm/latest/model.tar.gz".format(prefix))

In [None]:
# Remove everything under info/, then run the model-update-deepfm.py script.
!rm -r info/*
!python model-update-deepfm.py

# 测试DeepFM

In [18]:
# NOTE(review): the recorded output of this cell is "404 Not Found", so this URL
# did not serve the dataset when it was last run. --no-check-certificate also
# turns off TLS certificate verification.
!wget --no-check-certificate https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz

--2021-03-21 10:33:55--  https://s3-eu-west-1.amazonaws.com/kaggle-display-advertising-challenge-dataset/dac.tar.gz
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.62.51
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.62.51|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2021-03-21 10:33:56 ERROR 404: Not Found.



In [2]:
# Print the NumPy version installed in this kernel's environment.
import numpy as np
print(np.__version__)

1.16.4
