In [1]:
# importing libraries
import argparse
import logging
import os
import pickle
from random import sample

import boto3

# tqdm.pandas()
# pandarallel.initialize(progress_bar=True)
# bucket = os.environ.get("BUCKET_NAME", " ")
# raw_data_folder = os.environ.get("RAW_DATA", " ")
# getLogger() without a name returns the root logger; emit INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# S3 client shared by the download/upload helpers defined in this notebook.
s3client = boto3.client('s3')

In [3]:
########################################
# Sync data from S3
########################################


def sync_s3(file_name_list, s3_folder, local_folder):
    """Download each named file from an S3 folder into a local folder.

    The bucket is read from the module-level ``bucket`` variable, so that
    variable must be assigned before this function is called.

    Args:
        file_name_list: file names (relative to ``s3_folder``) to download.
        s3_folder: key prefix inside the bucket that holds the files.
        local_folder: local directory that receives the files.
    """
    # S3 keys are always '/'-separated, so they are joined with posixpath;
    # os.path.join would insert '\\' on Windows and produce a non-existent key.
    # Local destination paths still use os.path.join.
    import posixpath

    for f in file_name_list:
        src_key = posixpath.join(s3_folder, f)
        dst_path = os.path.join(local_folder, f)
        print("file preparation: download src key {} to dst key {}".format(src_key, dst_path))
        s3client.download_file(bucket, src_key, dst_path)


def write_to_s3(filename, bucket, key):
    """Upload a local file to ``s3://bucket/key`` and return the put_object response.

    The object is written with the 'bucket-owner-full-control' canned ACL.
    """
    print("upload s3://{}/{}".format(bucket, key))
    # Binary mode: the open file object itself is passed as the request body.
    with open(filename, 'rb') as payload:
        response = s3client.put_object(
            Body=payload,
            Key=key,
            Bucket=bucket,
            ACL='bucket-owner-full-control',
        )
    return response

def write_str_to_s3(content, bucket, key):
    """Store ``str(content)``, UTF-8 encoded, as the object ``s3://bucket/key``."""
    print("write s3://{}/{}, content={}".format(bucket, key, content))
    encoded_body = str(content).encode("utf8")
    s3client.put_object(
        ACL='bucket-owner-full-control',
        Bucket=bucket,
        Key=key,
        Body=encoded_body,
    )

# Fallback S3 location used when --bucket / --prefix are not supplied.
default_bucket = 'aws-gcr-rs-sol-demo-ap-southeast-1-522244679887'
default_prefix = 'sample-data'
parser = argparse.ArgumentParser()
# The defaults must be wired into argparse; without them bucket/prefix are None
# whenever the flags are omitted (e.g. when run interactively in a notebook).
parser.add_argument('--bucket', type=str, default=default_bucket)
parser.add_argument('--prefix', type=str, default=default_prefix)
# parse_known_args tolerates unrelated argv entries instead of exiting.
args, _ = parser.parse_known_args()
bucket = args.bucket
prefix = args.prefix

print("bucket={}".format(bucket))
print("prefix='{}'".format(prefix))

out_s3_path = "s3://{}/{}/feature/content/inverted-list".format(bucket, prefix)

# Local working directory for everything downloaded from S3.
local_folder = 'info'
if not os.path.exists(local_folder):
    os.makedirs(local_folder)

# Download stages, in order:
#   1. recall & rank batch results
#   2. inverted-list pickle files
# (filter config download is disabled:
#    file_name_list = ['filter_config.pickle']
#    s3_folder = '{}/model/filter/'.format(prefix))
for s3_folder, file_name_list in (
    ('{}/feature/recommend-list/news'.format(prefix),
     ['recall_batch_result.pickle', 'rank_batch_result.pickle']),
    ('{}/feature/content/inverted-list/'.format(prefix),
     ['news_id_news_property_dict.pickle',
      'news_type_news_ids_dict.pickle']),
):
    sync_s3(file_name_list, s3_folder, local_folder)

# Load the pickle files downloaded into the local 'info' folder.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load files that come from a trusted bucket.
# Each file is opened in a `with` block so the handle is closed even if
# unpickling raises.
with open("info/news_id_news_property_dict.pickle", "rb") as file_to_load:
    dict_id_content = pickle.load(file_to_load)
print("length of news_id v.s. news_property {}".format(len(dict_id_content)))

with open("info/news_type_news_ids_dict.pickle", "rb") as file_to_load:
    dict_type_id = pickle.load(file_to_load)
print("length of news_type v.s. news_ids {}".format(len(dict_type_id)))

# Filter configuration (defined inline rather than loaded from S3).
filter_config = {}
filter_config['category'] = ['news_story', 'news_culture', 'news_entertainment', 'news_sports', 'news_finance', 'news_house', 'news_car', 'news_edu', 'news_tech', 'news_military', 'news_travel', 'news_world', 'stock', 'news_agriculture', 'news_game']
# Number of diversity items to add per user.
filter_config['category_diversity_count'] = 10

# Load the recall results.
with open("info/recall_batch_result.pickle", "rb") as file_to_load:
    dict_recall_result = pickle.load(file_to_load)

# Load the rank results.
with open("info/rank_batch_result.pickle", "rb") as file_to_load:
    dict_rank_result = pickle.load(file_to_load)

bucket=aws-gcr-rs-sol-demo-ap-southeast-1-522244679887
prefix='sample-data'
file preparation: download src key sample-data/feature/recommend-list/news/recall_batch_result.pickle to dst key info/recall_batch_result.pickle
file preparation: download src key sample-data/feature/recommend-list/news/rank_batch_result.pickle to dst key info/rank_batch_result.pickle
file preparation: download src key sample-data/feature/content/inverted-list/news_id_news_property_dict.pickle to dst key info/news_id_news_property_dict.pickle
file preparation: download src key sample-data/feature/content/inverted-list/news_type_news_ids_dict.pickle to dst key info/news_type_news_ids_dict.pickle
length of news_id v.s. news_property 2660
length of news_type v.s. news_ids 15


In [117]:
# Result format design:
# item_id | recall_type | recall_score | rank_type | rank_score | filter_type | filter_score

# recall_type: [timing]_[method]_[position]
# [timing]: batch/online
# [method]: category/director/actor/language/level/year/review/photo/ub/portrait_xxx
# [position]: a number in [0-xxx]

# recall_score: recall score, float

# rank_type: [timing]_[method]_[position]
# [timing]: batch/online
# [data source]: action/portrait
# [method]: deepfm/xgboost
# [position]: a number in [0-xxx]

# rank_score: ranking score, float

# filter_type: [timing]_[method]_[position]
# [timing]: batch/online
# [method]: recommend/coldstart/disparity
# [position]: a number in [0-xxx]

# filter_score: filter score, float

In [12]:
def get_dict_pos(key, dict_var):
    """Return the insertion-order position of ``str(key)`` among the dict's keys.

    Raises:
        ValueError: if ``str(key)`` is not a key of ``dict_var``.
    """
    ordered_keys = [*dict_var.keys()]
    wanted = str(key)
    return ordered_keys.index(wanted)


def calc_filter_score(recall_score, rank_score, recall_mt=None, rank_mt=None, recall_pos=None, rank_pos=None):
    """Combine recall and rank scores into a filter score capped at 1.0.

    The score is ``recall_score / 40.0 + rank_score``, limited to at most 1.0
    and rounded to two decimals. The remaining parameters are accepted but
    not used.
    """
    combined = recall_score / 40.0 + rank_score
    capped = combined if combined < 1.0 else 1.0
    return round(capped, 2)


def mt_construct(timing, mt, pos):
    """Build a type tag of the form ``<timing>_<mt>_<pos>``.

    Each part is converted with ``str()`` before joining.
    """
    return '_'.join(str(part) for part in (timing, mt, pos))


def sort_and_fill_pos(filter_result):
    """Rank entries by filter score and write each rank into its trace string.

    Each value of ``filter_result`` is a list whose element 2 is the filter
    score and whose element 3 is a '|'-separated trace. The trace's field at
    index 4 is a filter type of the form ``<timing>_<method>_<position>``;
    its position part is replaced by the entry's 0-based rank (highest score
    first).

    The value lists are updated in place, so the caller's dict sees the new
    traces even when the return value is ignored.

    Args:
        filter_result: dict mapping item id -> [item_id, type, score, trace].

    Returns:
        A new dict keyed by ``str(item id)``, ordered by descending score,
        whose values are the same (now updated) list objects.
    """
    ranked = sorted(filter_result.items(), key=lambda item: item[1][2], reverse=True)
    update_filter_result = {}
    for filter_pos, (filter_id, filter_content) in enumerate(ranked):
        trace_fields = filter_content[3].split('|')
        filter_type_fields = trace_fields[4].split('_')
        filter_type_fields[2] = str(filter_pos)
        # Write back to the same field that was read (index 4), so traces of
        # other lengths cannot have a different field overwritten.
        trace_fields[4] = '_'.join(filter_type_fields)
        filter_content[3] = '|'.join(trace_fields)
        update_filter_result[str(filter_id)] = filter_content
    # The sorted result used to be built and then discarded; return it.
    return update_filter_result


def initial_diversity(stats_result, filter_config):
    """Set a zero counter in ``stats_result`` for every configured category.

    Categories are read from ``filter_config['category']``; ``stats_result``
    is modified in place and nothing is returned.
    """
    stats_result.update((cate, 0) for cate in filter_config['category'])


def category_diversity_logic(filter_result, stats_result, dict_category_id, filter_config):
    """Append randomly sampled "diversity" items from under-represented categories.

    Candidate categories are every category whose count in ``stats_result``
    is 0, plus the single least-seen category with a non-zero count. Items
    are sampled round-robin from those categories until
    ``filter_config['category_diversity_count']`` new items have been added,
    or more than ``5 * category_diversity_count`` samples have been tried.

    Args:
        filter_result: dict mapping item id -> result list; not modified.
        stats_result: dict mapping category -> number of items already seen.
        dict_category_id: dict mapping ``str(category)`` -> collection of item ids.
        filter_config: dict providing 'category_diversity_count'.

    Returns:
        A new dict containing the entries of ``filter_result`` followed by the
        added items, each as
        ``[item_id, 'diversity', 0.0, 'batch_diversity_<n>|<category>']``.

    Raises:
        KeyError: if a candidate category is missing from ``dict_category_id``.
    """
    diversity_count = filter_config['category_diversity_count']

    # Pick candidate categories: all unseen ones, then the least-seen seen one.
    # Tracking the minimum via None (not a 999 sentinel) keeps categories with
    # very large counts eligible.
    min_category = None
    min_category_count = None
    candidate_category_list = []
    for cate, count in stats_result.items():
        if count == 0:
            candidate_category_list.append(cate)
        elif min_category is None or count < min_category_count:
            min_category_count = count
            min_category = cate
    if min_category is not None:
        candidate_category_list.append(min_category)

    filter_result_list = list(filter_result.keys())
    filter_result_content_list = list(filter_result.values())
    # Ids are stored as strings, so de-duplicate on the string form; comparing
    # the raw sampled id would never match when ids are not already strings.
    seen_ids = {str(item_id) for item_id in filter_result_list}

    max_tries = 5 * diversity_count
    sample_try = 0
    catch_count = 0
    # Stop when enough items were found, when there is nothing to sample from
    # (previously an endless loop), or when the try budget is used up. The
    # budget is checked once per full pass over the candidate categories.
    while catch_count < diversity_count and candidate_category_list and sample_try <= max_tries:
        for cate in candidate_category_list:
            sample_try = sample_try + 1
            candidate_pool = dict_category_id[str(cate)]
            # random.sample needs a sequence (sets are rejected on Python 3.11+).
            if not isinstance(candidate_pool, (list, tuple)):
                candidate_pool = list(candidate_pool)
            if not candidate_pool:
                # Nothing to draw from this category; counts as a spent try.
                continue
            candidate_id = str(sample(candidate_pool, 1)[0])
            if candidate_id in seen_ids:
                continue
            seen_ids.add(candidate_id)
            filter_result_list.append(candidate_id)
            filter_result_content_list.append([candidate_id, 'diversity', 0.0,
                                               'batch_diversity_{}|{}'.format(len(filter_result_list), cate)])
            catch_count = catch_count + 1
            if catch_count >= diversity_count:
                break

    # Report a shortfall only when there really is one, with the true count.
    if catch_count < diversity_count:
        logging.error(
            "fail to find enough diversity candidate, need to find {} but only find {}".format(diversity_count,
                                                                                               catch_count))

    update_filter_result = dict(zip(filter_result_list, filter_result_content_list))
    return update_filter_result


# De-duplicate and collect per-category statistics within one batch.
# Run timing used as the first part of every recall/rank/filter type tag.
run_timing = 'batch'
dict_filter_result = {}
for user_id, recall_result in dict_recall_result.items():
    current_user_result = {}
    current_diversity_result = {}
    initial_diversity(current_diversity_result, filter_config)
    for recall_id, recall_property in recall_result.items():
        # Rank scores for this user, looked up once per item.
        user_rank_result = dict_rank_result[str(user_id)]
        # Build recall_type and recall_score.
        recall_type = mt_construct(run_timing, recall_property[1], recall_property[2])
        recall_score = round(recall_property[3], 2)
        # Build rank_type: the position is the item's index among the user's rank keys.
        rank_pos = str(get_dict_pos(int(recall_id), user_rank_result))
        rank_type = mt_construct(run_timing, 'dkn', rank_pos)
        # Build rank_score.
        rank_score = round(float(user_rank_result[str(recall_id)]), 2)
        # Build filter_type; the 'TBD' position is filled in by sort_and_fill_pos.
        filter_type = mt_construct(run_timing, 'recommend', 'TBD')
        # Build filter_score.
        filter_score = calc_filter_score(recall_score, rank_score)
        recommend_trace = "{}|{}|{}|{}|{}|{}".format(recall_type, recall_score, rank_type, rank_score, filter_type,
                                                     filter_score)
        # Entry layout: [item_id, type, filter_score, trace].
        current_user_result[str(recall_id)] = [str(recall_id), 'recommend', filter_score, recommend_trace]
        # Update the diversity statistics for each category of this item.
        current_category = dict_id_content[str(recall_id)]['type']
        for cate in current_category:
            if cate is not None:
                current_diversity_result[cate] = current_diversity_result[cate] + 1
    # Fill the filter position of every trace according to filter score.
    # The entries are updated in place; any return value is not used here.
    sort_and_fill_pos(current_user_result)
    update_user_result = category_diversity_logic(current_user_result, current_diversity_result, dict_type_id,
                                                  filter_config)
    dict_filter_result[str(user_id)] = update_user_result

file_name = 'info/filter_batch_result.pickle'
# `with` closes the file even if pickle.dump raises.
with open(file_name, 'wb') as output_file:
    pickle.dump(dict_filter_result, output_file)

write_to_s3(file_name,
            bucket,
            '{}/feature/recommend-list/news/filter_batch_result.pickle'.format(prefix))

ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 9
ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 9
ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 8
ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 9
ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 9
ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 9
ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 9
ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 9
ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 9
ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 9
ERROR:root:fail to find enough diversity candidate, need to find 10 but only find 9
ERROR:root:fail to find enough diversity candidate, need to find 10 but only

upload s3://aws-gcr-rs-sol-demo-ap-southeast-1-522244679887/sample-data/feature/recommend-list/news/filter_batch_result.pickle


{'ResponseMetadata': {'RequestId': 'Y4KPQXCRKF66XPW6',
  'HostId': 'CNHW/sLNPdbLCbcMr5Pzf1fdf8AtoEEOJcnYgSeUkp+JL8RTwRKyh8CIT7SvUokczw4BBUNrRmg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'CNHW/sLNPdbLCbcMr5Pzf1fdf8AtoEEOJcnYgSeUkp+JL8RTwRKyh8CIT7SvUokczw4BBUNrRmg=',
   'x-amz-request-id': 'Y4KPQXCRKF66XPW6',
   'date': 'Mon, 19 Apr 2021 07:55:32 GMT',
   'etag': '"fc8e023cbab51bef9e490a11d8b59573"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"fc8e023cbab51bef9e490a11d8b59573"'}

In [116]:
# Debug peek: print the leading entries of sort_id_score_dict.
# NOTE(review): sort_id_score_dict is not defined in any cell visible here;
# presumably it comes from an earlier kernel session - confirm this cell still
# runs after Restart & Run All.
n = 0
for k, v in sort_id_score_dict.items():
    print("k {} v {}".format(k, v))
    # The check runs after printing, so at most 12 entries are shown.
    if n >= 11:
        break
    n += 1

k ['6552465493255520771', '6552333627890336263', '6552375889470947847', '6552317622828925454', '6552129817259541000', '6552390995558793735', '6552430823117685252', '6552401718930309639', '6552322115654124046', '6552358557814096388', '6552339837855203843', '6552678274802123268', '6552272755272712717', '6552414390778331661', '6518876067996893447', '6552447020051726862', '6553038669672874504', '6552299905497432584', '6552351206654607880', '6552231406087438852', '6552296494605533703', '6553135697790763523', '6454027220095598862', '6502025540390617358', '6552421326995325444', '6552370042753778183', '6551608303007302151', '6552440072292008451', '6552317622845702669', '6432979375800451330', '6462887973519098126', '6552300151321395725', '6552414856346075662', '6482621460023099662', '6552319161060557319', '6543027763874365966', '6552326434893857284', '6552303886323941891', '6552289068397363716', '6552377718296543757', '6553432442324124163', '6552463126468493838', '6552435868445966855', '6553986