In [4]:
# importing libraries
import argparse
import os
import pickle
import posixpath

import boto3
import faiss
import pandas as pd
from tqdm import tqdm

import service_impl

########################################
# Sync data from S3
########################################
# Module-level S3 client shared by sync_s3, write_to_s3 and write_str_to_s3.
s3client = boto3.client('s3')


def sync_s3(file_name_list, s3_folder, local_folder):
    """Download a list of files from one S3 folder into a local folder.

    Relies on the module-level ``s3client`` and ``bucket`` names, so ``bucket``
    must be assigned before the first call.

    Args:
        file_name_list: Iterable of file names located under ``s3_folder``.
        s3_folder: Key prefix (folder) inside the bucket.
        local_folder: Local directory the files are written to. This function
            does not create it.
    """
    for f in file_name_list:
        # S3 keys always use '/' as the separator, independent of the host OS,
        # so build them with posixpath instead of os.path.
        src_key = posixpath.join(s3_folder, f)
        dst_path = os.path.join(local_folder, f)
        print("file preparation: download src key {} to dst key {}".format(
            src_key, dst_path))
        s3client.download_file(bucket, src_key, dst_path)


def write_to_s3(filename, bucket, key):
    """Upload a local file to ``s3://<bucket>/<key>``.

    Args:
        filename: Path of the local file to upload.
        bucket: Destination bucket name.
        key: Destination object key.

    Returns:
        Whatever ``s3client.put_object`` returns for the upload.
    """
    print(f"upload s3://{bucket}/{key}")
    # Open in binary mode; the file object itself is sent as the request body.
    with open(filename, 'rb') as body:
        response = s3client.put_object(
            Body=body,
            Bucket=bucket,
            Key=key,
            ACL='bucket-owner-full-control',
        )
    return response


def write_str_to_s3(content, bucket, key):
    """Store ``str(content)``, UTF-8 encoded, at ``s3://<bucket>/<key>``.

    Args:
        content: Value to store; it is converted with ``str()`` before encoding.
        bucket: Destination bucket name.
        key: Destination object key.
    """
    print(f"write s3://{bucket}/{key}, content={content}")
    payload = str(content).encode("utf8")
    s3client.put_object(
        Bucket=bucket,
        Key=key,
        Body=payload,
        ACL='bucket-owner-full-control',
    )


# Fallback values used when the matching command-line flag is not supplied.
default_bucket = 'sagemaker-us-east-1-002224604296'
default_mk_region = '1'
default_level_1 = 'recommender-system-film-mk'

parser = argparse.ArgumentParser()
parser.add_argument('--bucket', default=default_bucket, type=str)
parser.add_argument('--mk-region', default=default_mk_region, type=str)
parser.add_argument('--level-one', default=default_level_1, type=str)

# parse_known_args returns unrecognised arguments separately; they are ignored here.
args, _ = parser.parse_known_args()
bucket, mk_region, level_1 = args.bucket, args.mk_region, args.level_one

# Every S3 key used below is rooted at "<level-one>/<mk-region>".
prefix = "{}/{}".format(level_1, mk_region)

print(f"bucket={bucket}")
print(f"prefix='{prefix}'")

# Local working directory that receives every downloaded artifact.
local_folder = 'info'
# exist_ok=True replaces the check-then-create pattern: no race between the
# existence test and the creation, and an immediate error if 'info' exists
# but is not a directory.
os.makedirs(local_folder, exist_ok=True)

# Load user behaviour (action) data
file_name_list = ['action.csv']
s3_folder = '{}/system/user-data/clean/latest'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)

# Load "ub" data: the item vector index and the index -> raw item id mapping
file_name_list = ['ub_item_vector.index', 'embed_raw_item_mapping.pickle']
s3_folder = '{}/feature/action/'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)

# Load user portrait data
file_name_list = ['portrait.pickle']
s3_folder = '{}/feature/recommend-list/portrait'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)

# Pickle files holding the inverted lists
file_name_list = ['movie_id_movie_property_dict.pickle',
                  'movie_category_movie_ids_dict.pickle',
                  'movie_director_movie_ids_dict.pickle',
                  'movie_actor_movie_ids_dict.pickle',
                  'movie_language_movie_ids_dict.pickle',
                  'movie_level_movie_ids_dict.pickle',
                  'movie_year_movie_ids_dict.pickle']
s3_folder = '{}/feature/content/inverted-list/'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)

# Recall configuration
file_name_list = ['recall_config.pickle']
s3_folder = '{}/model/recall'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)

# Load the downloaded pickle files
def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    loading finishes instead of staying open for the life of the kernel.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # files that come from a trusted bucket.
    with open(path, "rb") as fh:
        return pickle.load(fh)


dict_id_content = _load_pickle("info/movie_id_movie_property_dict.pickle")
print("length of movie_id v.s. movie_property {}".format(len(dict_id_content)))

dict_category_id = _load_pickle("info/movie_category_movie_ids_dict.pickle")
print("length of movie_category v.s. movie_ids {}".format(len(dict_category_id)))

dict_director_id = _load_pickle("info/movie_director_movie_ids_dict.pickle")
print("length of movie_dicrector v.s. movie_ids {}".format(len(dict_director_id)))

dict_actor_id = _load_pickle("info/movie_actor_movie_ids_dict.pickle")
print("length of movie_actor v.s. movie_ids {}".format(len(dict_actor_id)))

dict_language_id = _load_pickle("info/movie_language_movie_ids_dict.pickle")
print("length of movie_lanugage v.s. movie_ids {}".format(len(dict_language_id)))

dict_level_id = _load_pickle("info/movie_level_movie_ids_dict.pickle")
print("length of movie_level v.s. movie_ids {}".format(len(dict_level_id)))

dict_year_id = _load_pickle("info/movie_year_movie_ids_dict.pickle")
print("length of movie_year v.s. movie_ids {}".format(len(dict_year_id)))

recall_config = _load_pickle("info/recall_config.pickle")
print("config recall")

# The faiss index is read by faiss itself, not through pickle.
ub_faiss_index = faiss.read_index('info/ub_item_vector.index')

ub_idx_mapping = _load_pickle("info/embed_raw_item_mapping.pickle")
print("length of item mapping {}".format(len(ub_idx_mapping)))

user_portrait = _load_pickle("info/portrait.pickle")
print("length of user_portrait {}".format(len(user_portrait)))

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


bucket=sagemaker-us-east-1-002224604296
prefix='recommender-system-film-mk/1'
file preparation: download src key recommender-system-film-mk/1/system/user-data/clean/latest/action.csv to dst key info/action.csv
file preparation: download src key recommender-system-film-mk/1/feature/action/ub_item_vector.index to dst key info/ub_item_vector.index
file preparation: download src key recommender-system-film-mk/1/feature/action/embed_raw_item_mapping.pickle to dst key info/embed_raw_item_mapping.pickle
file preparation: download src key recommender-system-film-mk/1/feature/recommend-list/portrait/portrait.pickle to dst key info/portrait.pickle
file preparation: download src key recommender-system-film-mk/1/feature/content/inverted-list/movie_id_movie_property_dict.pickle to dst key info/movie_id_movie_property_dict.pickle
file preparation: download src key recommender-system-film-mk/1/feature/content/inverted-list/movie_category_movie_ids_dict.pickle to dst key info/movie_category_movie_ids_

In [7]:
# Preview the first entries of the mapping. The check runs after the print,
# so up to 12 pairs are shown before the loop stops.
n = 0
for k, v in ub_idx_mapping.items():
    print(f"key {k}, value {v}")
    if n >= 11:
        break
    n += 1

key 29477, value 1689713
key 29478, value 1689714
key 29479, value 1689715
key 29480, value 1689716
key 29481, value 1689717
key 29482, value 1689718
key 29483, value 1689719
key 29484, value 1689720
key 29485, value 1689721
key 29486, value 1689722
key 29487, value 1689723
key 29488, value 1689724


In [9]:
# Spot-check a single entry of the mapping by looking up key 2.
ub_idx_mapping[2]

'10035'

In [1]:
# Remove everything under info/ and then run recall-batch.py with a clean folder.
!rm -r info/*
!python recall-batch.py

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials
bucket=sagemaker-us-east-1-002224604296
prefix='recommender-system-film-mk/1'
file preparation: download src key recommender-system-film-mk/1/system/user-data/clean/latest/action.csv to dst key info/action.csv
file preparation: download src key recommender-system-film-mk/1/feature/action/ub_item_vector.index to dst key info/ub_item_vector.index
file preparation: download src key recommender-system-film-mk/1/feature/action/embed_raw_item_mapping.pickle to dst key info/embed_raw_item_mapping.pickle
file preparation: download src key recommender-system-film-mk/1/feature/recommend-list/portrait/portrait.pickle to dst key info/portrait.pickle
file preparation: download src key recommender-system-film-mk/1/feature/content/inverted-list/movie_id_movie_property_dict.pickle to dst key info/movie_id_movie_property_dict.pickle
file preparation: download src key recommender-system-film-mk/1/feature/content/i

In [2]:
# Copy the local recall_batch_result.pickle into the movie recommend-list folder on S3.
# NOTE(review): presumably this file is produced by the recall-batch.py run — confirm.
!aws s3 cp info/recall_batch_result.pickle s3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1/feature/recommend-list/movie/

upload: info/recall_batch_result.pickle to s3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1/feature/recommend-list/movie/recall_batch_result.pickle
