In [3]:
!rm -r info/*

rm: cannot remove ‘info/*’: No such file or directory


In [7]:
# portrait batch logic
import argparse
import logging
import pickle
import re
import os

import boto3
import numpy as np
import pandas as pd
from tqdm import tqdm


In [8]:
# Register tqdm with pandas (enables the `progress_apply` family of methods).
# `tqdm` is the only tqdm name imported by this notebook's import cell.
tqdm.pandas()

In [9]:
########################################
# Sync data from S3
########################################
def sync_s3(file_name_list, s3_folder, local_folder):
    """Download each named file from an S3 folder into a local folder.

    Relies on the module-level ``s3client`` and ``bucket`` names, which must
    be defined before this function is called.

    Args:
        file_name_list: File names to fetch; each one is read from under
            ``s3_folder`` and written under ``local_folder`` with the same name.
        s3_folder: Key prefix ("folder") inside the bucket.
        local_folder: Local directory that receives the files; it is created
            when it does not exist yet.
    """
    import posixpath  # local import: S3 keys always use "/" regardless of the host OS

    # Make sure the destination directory exists before downloading into it
    if local_folder:
        os.makedirs(local_folder, exist_ok=True)

    for f in file_name_list:
        src_key = posixpath.join(s3_folder, f)
        dst_path = os.path.join(local_folder, f)
        print("file preparation: download src key {} to dst key {}".format(
            src_key, dst_path))
        s3client.download_file(bucket, src_key, dst_path)


def write_to_s3(filename, bucket, key):
    """Upload the local file ``filename`` to ``bucket`` under ``key``.

    Uses the module-level ``s3client`` and returns whatever its
    ``upload_fileobj`` call returns.
    """
    # The file is streamed as raw bytes, hence binary mode
    with open(filename, mode='rb') as source:
        upload_result = s3client.upload_fileobj(source, bucket, key)
    return upload_result


# Fallback values used when no command-line overrides are supplied
default_bucket = 'sagemaker-us-east-1-002224604296'
default_mk_region = '1'
level_1 = 'recommender-system-film-mk'

# Both options are plain strings that fall back to the defaults above
parser = argparse.ArgumentParser()
for flag, fallback in (('--bucket', default_bucket),
                       ('--mk-region', default_mk_region)):
    parser.add_argument(flag, type=str, default=fallback)

# parse_known_args returns unrecognised arguments separately instead of failing on them
args, _ = parser.parse_known_args()
bucket = args.bucket
mk_region = args.mk_region

# Every S3 key used below lives under "<level_1>/<mk_region>"
prefix = "/".join([level_1, mk_region])

print(f"bucket={bucket}")
print(f"prefix='{prefix}'")

s3client = boto3.client('s3')

# Local directory that receives the downloaded files
local_folder = 'info'
if not os.path.exists(local_folder):
    os.makedirs(local_folder)

# (S3 sub-folder template, file names) pairs, fetched in order:
# the user behaviour data first, then the recall configuration
for folder_template, file_name_list in (
    ('{}/system/user-data/clean/latest', ['action.csv']),
    ('{}/model/recall', ['recall_config.pickle']),
):
    s3_folder = folder_template.format(prefix)
    sync_s3(file_name_list, s3_folder, local_folder)

# Load user data: for every userid, collect the programId values of the rows
# where label == 1 and programType == 1
user_click_records = {}
data_mk = pd.read_csv('info/action.csv', sep='\t')
is_label_hit = data_mk['label'] == 1
is_program_type_1 = data_mk['programType'] == 1
grouped_hits = data_mk[is_label_hit & is_program_type_1].groupby('userid')
for reviewerID, hist in tqdm(grouped_hits):
    user_click_records[reviewerID] = hist['programId'].tolist()
# Load the recall configuration from the downloaded pickle file.
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising,
# so only load this file when the S3 object comes from a trusted source.
# The `with` block closes the file handle as soon as loading finishes.
with open("info/recall_config.pickle", "rb") as file_to_load:
    recall_config = pickle.load(file_to_load)
print("config recall")

bucket=sagemaker-us-east-1-002224604296
prefix='recommender-system-film-mk/1'
file preparation: download src key recommender-system-film-mk/1/system/user-data/clean/latest/action.csv to dst key info/action.csv
file preparation: download src key recommender-system-film-mk/1/model/recall/recall_config.pickle to dst key info/recall_config.pickle


2021-03-31:05:20:08,417 INFO     [utils.py:141] NumExpr defaulting to 4 threads.
100%|██████████| 1802/1802 [00:00<00:00, 7084.04it/s]

config recall





In [10]:
# Display the recall configuration loaded from info/recall_config.pickle
recall_config

{'mt_topn': {'category': 10,
  'director': 10,
  'actor': 10,
  'language': 10,
  'level': 10,
  'year': 10,
  'review': 10,
  'photo': 10,
  'portrait_category': 10,
  'portrait_director': 10,
  'portrait_actor': 10,
  'portrait_language': 10,
  'portrait_ub': 10},
 'pos_weights': {'category': {'w': 0.5, 'b': 0.2},
  'director': {'w': 0.5, 'b': 0.2},
  'actor': {'w': 0.5, 'b': 0.2},
  'language': {'w': 0.5, 'b': 0.2},
  'level': {'w': 0.5, 'b': 0.2},
  'year': {'w': 0.5, 'b': 0.2},
  'portrait_category': {'w': 0.5, 'b': 0.2},
  'portrait_director': {'w': 0.5, 'b': 0.2},
  'portrait_actor': {'w': 0.5, 'b': 0.2},
  'portrait_language': {'w': 0.5, 'b': 0.2}},
 'mt_weights': {'category': 1.0,
  'director': 1.0,
  'actor': 1.0,
  'language': 1.0,
  'level': 1.0,
  'year': 1.0,
  'portrait_category': 1.0,
  'portrait_director': 1.0,
  'portrait_actor': 1.0,
  'portrait_language': 1.0,
  'portrait_ub': 1.0},
 'pop_mt_list': ['category', 'director', 'actor', 'language', 'level', 'year'],
 'po

# update position weights

In [12]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [44]:
# X holds the positions at which one strategy placed its recommendations
# (a recommendation shown at position 0 would be recorded as 0).
# Here we assume strategy "category" recommended at positions 1/3/5/10.
X = np.array([1, 3, 5, 10]).reshape(-1, 1)
# y holds the click-through rate observed at each of those positions
# (clicks at the position / total impressions at the position; when the
# impressions at a position are too few, the strategy's total impressions
# can be used as the denominator instead).
# Here we assume these are the rates for strategy "category" at positions 1/3/5/10.
bias = 0
y = np.array([0.3, 0.2, 0.001, 0]).reshape(-1, 1) + bias

In [45]:
# Fit a linear regression of the click-through rates y on the positions X
reg = LinearRegression().fit(X,y)

In [48]:
# The fitted slope (w) and intercept (b) become this method's updated position
# weights, e.g. 'category': {'w': -0.03295531, 'b': 0.28178771}
print(f"the w is {reg.coef_} and b is {reg.intercept_}")

the w is [[-0.03295531]] and b is [0.28178771]


# update method weights

In [70]:
from sklearn.preprocessing import MinMaxScaler
# Suppose the click-through rate measured for every method (mt) is as follows
y = [0.02, 0.04, 0.03, 0.01, 0.05, 0.008, 0.04, 0.01, 0.03, 0.02, 0]
# Constant added to every rate before normalising
# (presumably so a method with zero clicks keeps a non-zero weight — TODO confirm)
bias = 0.5
y = [e+bias for e in y]
y_sum = np.sum(y)
# Each method's share of the biased total; one single-element row per method
new_mt_weight = [[mt_click / y_sum] for mt_click in y]

# MinMaxScaler returns a NEW array and leaves its input untouched, so the
# min-max scaled variant is kept under its own name instead of being discarded.
# Min-max scaling maps the lowest method to 0 and the highest to 1, and its
# result does not depend on `bias`.
# TODO confirm which variant is meant to be written back as the method weights.
scaler = MinMaxScaler()
new_mt_weight_minmax = scaler.fit_transform(new_mt_weight)
# The sum-normalised weight of each method
new_mt_weight

[[0.09030913511635984],
 [0.09378256339006599],
 [0.09204584925321291],
 [0.08857242097950677],
 [0.09551927752691906],
 [0.08822507815213615],
 [0.09378256339006599],
 [0.08857242097950677],
 [0.09204584925321291],
 [0.09030913511635984],
 [0.08683570684265368]]