In [2]:
# portrait batch logic
import argparse
import logging
import pickle
import re
import os

import boto3
import numpy as np
import pandas as pd
from tqdm import tqdm

In [6]:
########################################
# Sync data from S3
########################################
# Module-level S3 client; the S3 helper functions in this cell all go through it.
s3client = boto3.client('s3')


def sync_s3(file_name_list, s3_folder, local_folder):
    """Download a list of files from one S3 folder into a local folder.

    The bucket is taken from the module-level ``bucket`` variable, which is
    looked up when the function is called (not when it is defined).

    Args:
        file_name_list: Iterable of file names located directly under
            ``s3_folder``.
        s3_folder: S3 key prefix (folder) the files are read from.
        local_folder: Local directory the files are written to; each file
            keeps its original name.
    """
    # Function-scope import so the helper stays self-contained.
    import posixpath

    for name in file_name_list:
        # S3 keys always use '/' as the separator, regardless of the host OS,
        # so the key is built with posixpath; only the local destination uses
        # the platform-specific os.path.
        src_key = posixpath.join(s3_folder, name)
        dst_path = os.path.join(local_folder, name)
        print("file preparation: download src key {} to dst key {}".format(
            src_key, dst_path))
        s3client.download_file(bucket, src_key, dst_path)


def write_to_s3(filename, bucket, key):
    """Upload a local file to ``s3://<bucket>/<key>``.

    The file is opened in binary mode and passed as the object body; the
    object is stored with the 'bucket-owner-full-control' ACL.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination bucket.
        key: Destination object key.

    Returns:
        The response returned by ``s3client.put_object``.
    """
    print("upload s3://{}/{}".format(bucket, key))
    with open(filename, 'rb') as payload:
        response = s3client.put_object(
            ACL='bucket-owner-full-control',
            Bucket=bucket,
            Key=key,
            Body=payload,
        )
    return response

def write_str_to_s3(content, bucket, key):
    """Store the string form of ``content`` as an S3 object.

    ``content`` is converted with ``str()`` and encoded as UTF-8 before being
    written to ``s3://<bucket>/<key>`` with the 'bucket-owner-full-control'
    ACL. The content is also echoed to stdout.

    Args:
        content: Value to store; anything ``str()`` accepts.
        bucket: Name of the destination bucket.
        key: Destination object key.
    """
    print("write s3://{}/{}, content={}".format(bucket, key, content))
    payload = str(content).encode("utf8")
    s3client.put_object(
        Body=payload,
        Bucket=bucket,
        Key=key,
        ACL='bucket-owner-full-control',
    )

# Default S3 location of the sample data; both values can be overridden on
# the command line with --bucket / --prefix.
default_bucket = 'aws-gcr-rs-sol-workshop-ap-southeast-1-522244679887'
default_prefix = 'sample-data'
parser = argparse.ArgumentParser()
parser.add_argument('--bucket', type=str, default=default_bucket)
parser.add_argument('--prefix', type=str, default=default_prefix)
# parse_known_args() returns unrecognised arguments separately instead of
# failing on them, so extra launcher arguments do not abort the run.
args, _ = parser.parse_known_args()
bucket = args.bucket
prefix = args.prefix

print("bucket={}".format(bucket))
print("prefix='{}'".format(prefix))

out_s3_path = "s3://{}/{}/feature/content/inverted-list".format(bucket, prefix)

# Local working directory for downloaded files. exist_ok=True makes the call
# idempotent and avoids the check-then-create race of a separate exists() test.
local_folder = 'info'
os.makedirs(local_folder, exist_ok=True)
# Sync the action / item data
file_name_list = ['action.csv']
s3_folder = '{}/system/action-data'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)
file_name_list = ['recall_config.pickle']
s3_folder = '{}/model/recall'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load this pickle when the source bucket is trusted.
# The context manager guarantees the handle is closed even if loading fails
# (the previous bare open() was never closed).
with open(os.path.join(local_folder, 'recall_config.pickle'), "rb") as file_to_load:
    recall_config = pickle.load(file_to_load)
print("config recall")

bucket=aws-gcr-rs-sol-workshop-ap-southeast-1-522244679887
prefix='sample-data'
file preparation: download src key sample-data/system/action-data/action.csv to dst key info/action.csv
file preparation: download src key sample-data/model/recall/recall_config.pickle to dst key info/recall_config.pickle
config recall


In [5]:
# Load the user action log. The '_!_' delimiter is longer than one character,
# so pandas interprets it as a regular expression, which only the Python
# parsing engine supports. Selecting that engine explicitly avoids the
# ParserWarning emitted when pandas has to fall back from the C engine.
df_filter_action = pd.read_csv(
    'info/action.csv',
    sep='_!_',
    names=['user_id', 'news_id', 'timestamp', 'action_type', 'action'],
    engine='python',
)

  """Entry point for launching an IPython kernel.


In [36]:
# Preview the first rows of the loaded action log.
# NOTE(review): the saved output of this cell shows an extra 'Unnamed: 0'
# column and a later execution count than the load cell, so it was presumably
# produced by a different load -- re-run the notebook top to bottom to confirm.
df_filter_action.head()

Unnamed: 0,user_id,news_id,timestamp,action_type,action
0,52a23654-9dc3-11eb-a364-acde48001122,6552345461607367172,1618477588,1,0
1,52a23654-9dc3-11eb-a364-acde48001122,6552332581256299016,1618472565,1,0
2,52a23654-9dc3-11eb-a364-acde48001122,6552130363123040771,1618467016,1,0
3,52a23654-9dc3-11eb-a364-acde48001122,6475484594673025293,1618462187,1,1
4,52a238fc-9dc3-11eb-a364-acde48001122,6552277802022863374,1618468013,1,0


In [16]:
# Recall configuration, declared as a single nested literal.
# Note: this rebinds `recall_config`, replacing whatever value the name held
# before this cell ran.
recall_config = {
    # Per match type: top-N setting.
    'mt_topn': {
        'type': 10,
        'keywords': 10,
        'entities': 10,
        'words': 10,
        'portrait_type': 10,
        'portrait_keywords': 10,
    },
    # Per match type: 'w' / 'b' pair (each match type gets its own dict object).
    'pos_weights': {
        'type': {'w': 0.5, 'b': 0.2},
        'keywords': {'w': 0.5, 'b': 0.2},
        'entities': {'w': 0.5, 'b': 0.2},
        'words': {'w': 0.5, 'b': 0.2},
        'portrait_type': {'w': 0.5, 'b': 0.2},
        'portrait_keywords': {'w': 0.5, 'b': 0.2},
    },
    # Per match type: weight.
    'mt_weights': {
        'type': 0.2,
        'keywords': 0.5,
        'entities': 1.0,
        'words': 0.1,
        'portrait_type': 0.2,
        'portrait_keywords': 0.8,
    },
    'pop_mt_list': ['type', 'keywords', 'entities', 'words'],
    'portrait_mt_list': ['type', 'keywords'],
    'merge_cnt': 100,
}

In [17]:
# 存储recall的结果
# Persist the recall configuration to a local pickle file, then upload it.
file_name = "info/recall_config.pickle"
# The context manager closes the file even if pickle.dump raises, and
# guarantees the data is flushed to disk before the upload reads it back.
with open(file_name, 'wb') as out_file:
    pickle.dump(recall_config, out_file)

# NOTE(review): the file is uploaded under feature/content/inverted-list,
# while the download cell reads recall_config.pickle from model/recall --
# confirm which location consumers are expected to read.
write_to_s3(file_name,
            bucket,
            '{}/feature/content/inverted-list/recall_config.pickle'.format(prefix))

upload s3://aws-gcr-rs-sol-workshop-ap-southeast-1-522244679887/sample-data/feature/content/inverted-list/recall_config.pickle


{'ResponseMetadata': {'RequestId': 'FXBJVP0E57BQ5WFD',
  'HostId': 'fy94KiYATpiGyRS4BLelZljgiu/5W5Bw71EHZxvjTjtpPi5VKBxHXjB9PhFeG1/NGZkyn18RDvY=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'fy94KiYATpiGyRS4BLelZljgiu/5W5Bw71EHZxvjTjtpPi5VKBxHXjB9PhFeG1/NGZkyn18RDvY=',
   'x-amz-request-id': 'FXBJVP0E57BQ5WFD',
   'date': 'Mon, 19 Apr 2021 03:02:10 GMT',
   'etag': '"d4da78b65bd3fc273de5ff7ef5687b47"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"d4da78b65bd3fc273de5ff7ef5687b47"'}

In [10]:
# Print the entries of recall_config for inspection. `n` counts the entries
# that have been let through; once it exceeds 10 the loop stops right after
# printing the current entry.
n = 0
for k, v in recall_config.items():
    print("k {} v {}".format(k,v))
    if n <= 10:
        n += 1
    else:
        break

k mt_topn v {'type': 10, 'keywords': 10, 'entities': 10, 'words': 10}
k pos_weights v {'type': {'w': 0.5, 'b': 0.2}, 'keywords': {'w': 0.5, 'b': 0.2}, 'entities': {'w': 0.5, 'b': 0.2}, 'words': {'w': 0.5, 'b': 0.2}, 'portrait_type': {'w': 0.5, 'b': 0.2}, 'portrait_keywords': {'w': 0.5, 'b': 0.2}}
k mt_weights v {'type': 1.0, 'keywords': 1.0, 'entities': 1.0, 'words': 1.0, 'portrait_type': 1.0, 'portrait_keywords': 1.0}
k pop_mt_list v ['type', 'keywords', 'entities', 'words']
k portrait_mt_list v ['type', 'keywords']
k merge_cnt v 100


In [None]:
# initial config
