In [1]:
# Shell out to delete everything matched by info/* so the run starts without stale local artifacts.
!rm -r info/*

In [2]:
# Standard library
import argparse
import logging
import pickle
import re
import os

# Progress bars. `tqdm._tqdm_notebook` is a deprecated alias module (importing it
# emits "Please use `tqdm.notebook.*` instead"); `tqdm.notebook` exports the same
# `tqdm_notebook` class, so the name bound here is unchanged.
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

# Third-party
import boto3
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# NOTE(review): the cells visible in this notebook use a separate `s3client`
# object for all transfers; `s3` is kept in case something else refers to it.
s3 = boto3.client('s3')

# Timestamped log lines that include the emitting file name and line number.
logging.basicConfig(format='%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
                    datefmt='%Y-%m-%d:%H:%M:%S',
                    level=logging.INFO)

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  import sys


In [3]:
# Register tqdm's pandas integration; this is what makes `.progress_apply`
# (used on `sample_data_pddf['programId']` further down) available.
tqdm_notebook.pandas()

In [4]:
# Defaults applied when no command-line override is supplied.
level_1 = 'recommender-system-film-mk'
default_bucket = 'sagemaker-us-east-1-002224604296'
default_mk_region = '1'

parser = argparse.ArgumentParser()
parser.add_argument('--bucket', type=str, default=default_bucket)
parser.add_argument('--mk-region', type=str, default=default_mk_region)

# parse_known_args() ignores argv entries this parser does not declare.
args, _ = parser.parse_known_args()
logging.info('Received arguments {}'.format(args))
bucket, mk_region = args.bucket, args.mk_region

# Locations tied to the selected region.
input_s3 = f's3://{bucket}/{level_1}/{mk_region}/system/item-data'
out_s3_path = f's3://{bucket}/{level_1}/{mk_region}/feature/content/inverted-list'

# Item data is collected from every region listed here, not only `mk_region`:
# one basic CSV and one expand workbook per region.
all_mk_regions = "1,1271,1347,1686,1770,1940,2434,47,86"
basic_s3_files, expend_s3_files = [], []
for mk_rg in all_mk_regions.split(","):
    basic_s3_files += [f's3://{bucket}/{level_1}/{mk_rg}/system/item-data/basic/{mk_rg}.csv']
    expend_s3_files += [f's3://{bucket}/{level_1}/{mk_rg}/system/item-data/expand/{mk_rg}.xlsx']

logging.info(f"basic_s3_files={basic_s3_files}")
logging.info(f"expend_s3_files={expend_s3_files}")
logging.info(f"out_s3_path={out_s3_path}")

2021-03-18:05:37:18,156 INFO     [<ipython-input-4-157836270fd3>:10] Received arguments Namespace(bucket='sagemaker-us-east-1-002224604296', mk_region='1')
2021-03-18:05:37:18,158 INFO     [<ipython-input-4-157836270fd3>:27] basic_s3_files=['s3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1/system/item-data/basic/1.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1271/system/item-data/basic/1271.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1347/system/item-data/basic/1347.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1686/system/item-data/basic/1686.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1770/system/item-data/basic/1770.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1940/system/item-data/basic/1940.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/2434/system/item-data/basic/2434.csv', 's3://sagemaker-us-east-1-0022

In [5]:
########################################
# Sync data from S3
########################################
def sync_s3(file_name_list, s3_folder, local_folder):
    """Download every file in ``file_name_list`` from S3 to a local folder.

    Each name is joined onto ``s3_folder`` to form the source key and onto
    ``local_folder`` to form the destination path. The transfer goes through
    the module-level ``s3client`` and reads from the module-level ``bucket``.
    """
    for name in file_name_list:
        src_key = os.path.join(s3_folder, name)
        dst_path = os.path.join(local_folder, name)
        print("file preparation: download src key {} to dst key {}".format(src_key, dst_path))
        s3client.download_file(bucket, src_key, dst_path)
        
def write_to_s3(filename, bucket, key):
    """Upload the local file ``filename`` to ``bucket`` under ``key``.

    The file is opened in binary mode and streamed through the module-level
    ``s3client``; whatever ``upload_fileobj`` returns is passed back.
    """
    with open(filename, 'rb') as payload:
        upload_result = s3client.upload_fileobj(payload, bucket, key)
    return upload_result

# NOTE(review): these defaults and the argument parsing below repeat what the
# earlier configuration cell already does (same names, same values); they
# rebind `default_bucket`, `default_mk_region`, `level_1`, `parser`, `args`,
# `bucket` and `mk_region`.
default_bucket = 'sagemaker-us-east-1-002224604296'
default_mk_region = '1'
level_1 = 'recommender-system-film-mk'

parser = argparse.ArgumentParser()
parser.add_argument('--bucket', type=str, default=default_bucket)
parser.add_argument('--mk-region', type=str, default=default_mk_region)

# parse_known_args() ignores argv entries this parser does not declare.
args, _ = parser.parse_known_args()
bucket = args.bucket
mk_region = args.mk_region

# Key prefix (no bucket, no leading slash) shared by the S3 keys built below.
prefix = f"{level_1}/{mk_region}"

print("bucket={}".format(bucket))
print("prefix='{}'".format(prefix))

# `s3client` and `bucket` are the globals that sync_s3() / write_to_s3() rely on.
s3client = boto3.client('s3')
# Local working directory for downloaded and generated artifacts.
local_folder = 'info'
if not os.path.exists(local_folder):
    os.makedirs(local_folder)

# Download the YouTubeDNN model artifacts (id mappings and item embeddings).
file_name_list = ['raw_embed_item_mapping.pickle', 'raw_embed_user_mapping.pickle', 'ub_item_embeddings.npy']
s3_folder = '{}/feature/action/'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)
# Download the inverted-list pickle (movie id -> movie properties).
file_name_list = ['movie_id_movie_property_dict.pickle']
s3_folder = '{}/feature/content/inverted-list/'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)

# Load the pickle files. Each file is opened in a `with` block so the handle is
# closed as soon as it has been read (previously the handles were left open).
# NOTE(review): pickle.load() can execute arbitrary code; these files come from
# S3, so this is only safe while the bucket contents are trusted.
with open("info/movie_id_movie_property_dict.pickle", "rb") as file_to_load:
    dict_id_content = pickle.load(file_to_load)
print("length of movie_id v.s. movie_property {}".format(len(dict_id_content)))
with open("info/raw_embed_item_mapping.pickle", "rb") as file_to_load:
    raw_embed_item_mapping = pickle.load(file_to_load)
with open("info/raw_embed_user_mapping.pickle", "rb") as file_to_load:
    raw_embed_user_mapping = pickle.load(file_to_load)
# Load the model
# user_embedding_model = load_model('info/user_embeddings.h5', custom_objects)
ub_item_embeddings = np.load("info/ub_item_embeddings.npy")
# Length of the zero vector item_embed() falls back to, and the number of
# per-dimension feature columns created later.
# NOTE(review): presumably equals the width of `ub_item_embeddings` - confirm.
embed_dim = 32

bucket=sagemaker-us-east-1-002224604296
prefix='recommender-system-film-mk/1'
file preparation: download src key recommender-system-film-mk/1/feature/action/raw_embed_item_mapping.pickle to dst key info/raw_embed_item_mapping.pickle
file preparation: download src key recommender-system-film-mk/1/feature/action/raw_embed_user_mapping.pickle to dst key info/raw_embed_user_mapping.pickle
file preparation: download src key recommender-system-film-mk/1/feature/action/ub_item_embeddings.npy to dst key info/ub_item_embeddings.npy
file preparation: download src key recommender-system-film-mk/1/feature/content/inverted-list/movie_id_movie_property_dict.pickle to dst key info/movie_id_movie_property_dict.pickle
length of movie_id v.s. movie_property 33767


In [6]:
def prepare_df(basic_s3_files, expend_s3_files):
    """Build the merged item table from the basic and expand source files.

    Every expand file is read with ``process_expend_file`` and every basic file
    with ``process_basic_file``. Each group is concatenated and de-duplicated on
    ``program_id``, then the expand columns are left-joined onto the basic rows.

    Returns:
        The merged DataFrame (one row per distinct basic ``program_id``).
    """
    expand_frames = [process_expend_file(expend_s3_path) for expend_s3_path in expend_s3_files]
    expand_data_df = pd.concat(expand_frames, axis=0, ignore_index=True)
    expand_data_df = expand_data_df.drop_duplicates('program_id')

    basic_frames = [process_basic_file(basic_s3_path) for basic_s3_path in basic_s3_files]
    basic_df = pd.concat(basic_frames, axis=0, ignore_index=True)
    basic_df = basic_df.drop_duplicates('program_id')

    # Left join: basic rows without expand data are kept, with NaN expand columns.
    df_merged = basic_df.merge(expand_data_df, on='program_id', how='left')
    logging.info("df_merged len: {}".format(len(df_merged)))
    return df_merged

def process_expend_file(expend_s3_path):
    """Read one expand workbook and return its rows under English column names.

    All sheets are read; from each sheet only the columns listed in the mapping
    below are kept. The sheets are stacked into one frame, which is then
    renamed according to the mapping.
    """
    logging.info("process_expend_file() enter, expend_s3_path:{}".format(expend_s3_path))
    # Source column header -> output column name (key order = column order).
    column_mapping = {'节目id': 'program_id',
                      '导演': 'director',
                      '演员': 'actor',
                      '受欢迎度': 'popularity',
                      '票数': 'ticket_num',
                      '评分': 'score',
                      '等级': 'level'}
    source_columns = list(column_mapping)

    # sheet_name=None returns every sheet as {sheet name: DataFrame}.
    sheets = pd.read_excel(expend_s3_path, sheet_name=None)

    per_sheet_frames = []
    for sheet_name, sheet_df in sheets.items():
        logging.info("now process sheet: {}".format(sheet_name))
        per_sheet_frames.append(sheet_df[source_columns])

    expand_data_df = pd.concat(per_sheet_frames, axis=0, ignore_index=True)
    logging.info("process_expend_file() return dataFrame, len: {}".format(len(expand_data_df)))
    expand_data_renamed_df = expand_data_df.rename(columns=column_mapping)
    logging.info("expand df columns={}".format(expand_data_renamed_df.columns))
    return expand_data_renamed_df

def process_basic_file(basic_s3_path):
    """Read one basic item CSV and return it as a DataFrame.

    Args:
        basic_s3_path: Path or URL accepted by ``pandas.read_csv``.

    Returns:
        The DataFrame exactly as parsed by ``pandas.read_csv`` (no renaming).
    """
    # Log the path being read. The previous message formatted the function
    # object itself, so the log showed "<function process_basic_file at 0x...>".
    logging.info("process_basic_file() enter, basic_s3_path:{}".format(basic_s3_path))
    df = pd.read_csv(basic_s3_path)
    logging.info("basic df columns={}".format(df.columns))
    return df

def get_actor(actor_str):
    """Parse a raw actor cell into a list of lower-cased actor names.

    The cell is treated as a comma-separated list that may be wrapped in
    brackets and quotes (e.g. ``"['Tom Hanks', 'Meg Ryan']"``).

    Args:
        actor_str: Raw cell value; non-string values are converted with ``str``.

    Returns:
        A list of stripped, lower-cased names, or ``[None]`` when the value is
        falsy, is one of the placeholders ``nan`` / ``nr`` / empty (ignoring
        case and surrounding whitespace), or contains no non-empty name.
    """
    if not actor_str:
        return [None]
    # Coerce first: re.sub() raises TypeError on non-string cells (e.g. numbers).
    text = str(actor_str).strip()
    if text.lower() in ('nan', 'nr', ''):
        return [None]
    # Drop list/quote punctuation so only the comma-separated names remain.
    text = re.sub(r"['\"\[\]]", '', text)
    names = [name.strip().lower() for name in text.split(',')]
    # Discard blanks left by inputs such as "[]" or a trailing comma.
    names = [name for name in names if name]
    return names or [None]


def get_category(category_property):
    """Parse a comma-separated category cell into lower-cased category names.

    Args:
        category_property: Raw cell value; non-string values are converted
            with ``str``.

    Returns:
        A list of stripped, lower-cased categories, or ``[None]`` when the
        value is falsy, is one of the placeholders ``nan`` / ``nr`` / empty
        (ignoring case and surrounding whitespace), or has no non-empty piece.
    """
    if not category_property:
        return [None]
    # Coerce first: a non-string cell has no .split() method.
    text = str(category_property).strip()
    if text.lower() in ('nan', 'nr', ''):
        return [None]
    categories = [piece.strip().lower() for piece in text.split(',')]
    # Discard blanks produced by doubled or trailing commas.
    categories = [piece for piece in categories if piece]
    return categories or [None]


def get_single_item(item):
    """Wrap a scalar cell in a one-element list, normalised to lower case.

    Returns ``[None]`` when ``item`` is falsy or when its stripped, lower-cased
    string form is ``'nan'``, ``'nr'`` or empty; otherwise returns a list
    holding that normalised string.
    """
    if item:
        normalized = str(item).lower().strip()
        if normalized not in ['nan', 'nr', '']:
            return [normalized]
    return [None]


def get_expend_file_from_basic(basic_s3_path):
    """Derive the expand workbook path from a basic CSV path.

    Rewrites ``/basic/<digits>.csv`` to ``/expand/<digits>.xlsx``. A path that
    does not contain that pattern is returned unchanged.
    """
    # The dot is escaped so only a literal ".csv" matches (an unescaped "."
    # would match any character, e.g. "/basic/12xcsv").
    return re.sub(r"/basic/(\d+)\.csv", r"/expand/\1.xlsx", basic_s3_path)


def get_segment(basic_s3_path):
    """Extract the numeric file stem from a ``/basic/<digits>.csv`` path.

    Returns:
        The digits as a string when the pattern is found, otherwise the
        integer ``0``. The mixed return type is kept as-is for existing callers.
    """
    # The dot is escaped so only a literal ".csv" extension matches.
    match = re.search(r"/basic/(\d+)\.csv", basic_s3_path)
    return match.group(1) if match else 0

def gen_movie_id_movie_property_dict(df):
    """Build the ``program_id -> properties`` lookup from the merged item table.

    For every row of ``df`` the director, level, release year, actors,
    categories and language are normalised into lists by the ``get_*`` helpers
    and stored under the row's ``program_id`` (as a string).

    Returns:
        ``{'movie_id_movie_property_dict': {program_id: {...properties...}}}``
    """
    properties_by_id = {}
    for _, record in df.iterrows():
        key = str(record['program_id'])
        properties_by_id[key] = {
            'director': get_single_item(record['director']),
            'level': get_single_item(record['level']),
            'year': get_single_item(record['release_year']),
            'actor': get_actor(record['actor']),
            'category': get_category(record['category_property']),
            'language': get_single_item(record['language']),
        }
    return {'movie_id_movie_property_dict': properties_by_id}

def item_embed(x, raw_embed_item_mapping, ub_item_embeddings):
    """Look up the embedding vector for item ``x``.

    Args:
        x: Raw item id; it is converted to ``str`` before the lookup.
        raw_embed_item_mapping: Mapping from raw item id (string) to the row
            index of that item in ``ub_item_embeddings``.
        ub_item_embeddings: Indexable collection of embedding vectors.

    Returns:
        ``ub_item_embeddings[index]`` when the item is known and its index is
        below ``len(ub_item_embeddings)``; otherwise a list of ``embed_dim``
        zeros (``embed_dim`` is a module-level constant).
    """
    try:
        embed_item_idx = int(raw_embed_item_mapping[str(x)])
    except KeyError:
        # Items missing from the mapping get the same zero vector as items
        # whose index is out of range, instead of aborting the whole apply().
        return [0] * embed_dim
    if embed_item_idx < len(ub_item_embeddings):
        return ub_item_embeddings[embed_item_idx]
    return [0] * embed_dim
    
def item_id_feat(x, i):
    """Return the element of ``x`` at position (or key) ``i``."""
    component = x[i]
    return component

def sparse_item_id_feat(x, mt, dict_id_content=dict_id_content):
    """Return property ``mt`` of item ``x`` as one ``'|'``-joined string.

    The value list is read from ``dict_id_content[str(x)][mt]``. When its first
    entry is ``None`` the result is ``None``. The default ``dict_id_content``
    is the module-level object bound when this function was defined.
    """
    values = dict_id_content[str(x)][mt]
    return None if values[0] is None else '|'.join(values)

In [7]:
# Despite the singular names, both variables hold the full lists of per-region
# file paths built in the configuration cell.
basic_s3_path = basic_s3_files
expend_s3_path = expend_s3_files
# Record the inputs and the output location before the load starts.
logging.info(f"gen_pick_files(), "
                 f"\nbasic_s3_path={basic_s3_path}, "
                 f"\nexpend_s3_path={expend_s3_path}, "
                 f"\nout_s3_path={out_s3_path}")

# Read every expand workbook and basic CSV and merge them on program_id.
df = prepare_df(basic_s3_path, expend_s3_path)

2021-03-18:05:37:19,363 INFO     [<ipython-input-7-5cbf4cfdaf50>:3] gen_pick_files(), 
basic_s3_path=['s3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1/system/item-data/basic/1.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1271/system/item-data/basic/1271.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1347/system/item-data/basic/1347.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1686/system/item-data/basic/1686.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1770/system/item-data/basic/1770.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1940/system/item-data/basic/1940.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/2434/system/item-data/basic/2434.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/47/system/item-data/basic/47.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/86

2021-03-18:05:37:23,932 INFO     [<ipython-input-6-1762c88f3cd9>:19] now process sheet: 3D Movie
2021-03-18:05:37:23,941 INFO     [<ipython-input-6-1762c88f3cd9>:23] process_expend_file() return dataFrame, len: 1243
2021-03-18:05:37:23,943 INFO     [<ipython-input-6-1762c88f3cd9>:31] expand df columns=Index(['program_id', 'director', 'actor', 'popularity', 'ticket_num', 'score',
       'level'],
      dtype='object')
2021-03-18:05:37:23,945 INFO     [<ipython-input-6-1762c88f3cd9>:13] process_expend_file() enter, expend_s3_path:s3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/47/system/item-data/expand/47.xlsx
2021-03-18:05:37:25,125 INFO     [<ipython-input-6-1762c88f3cd9>:19] now process sheet: Movie
2021-03-18:05:37:25,129 INFO     [<ipython-input-6-1762c88f3cd9>:23] process_expend_file() return dataFrame, len: 4004
2021-03-18:05:37:25,131 INFO     [<ipython-input-6-1762c88f3cd9>:31] expand df columns=Index(['program_id', 'director', 'actor', 'popularity', 'ticket_nu

In [8]:
# Flatten the merged item table into one record per programme: the id (as a
# string) followed by its normalised properties, each a list produced by the
# get_* helpers.
# NOTE(review): the per-row extraction repeats gen_movie_id_movie_property_dict().
movie_id_movie_property_data = {}
row_cnt = 0
for _, item_row in df.iterrows():
    movie_id_movie_property_data['row_{}'.format(row_cnt)] = [
        str(item_row['program_id']),
        get_single_item(item_row['director']),
        get_single_item(item_row['level']),
        get_single_item(item_row['release_year']),
        get_actor(item_row['actor']),
        get_category(item_row['category_property']),
        get_single_item(item_row['language']),
    ]
    row_cnt = row_cnt + 1

# The sixth column holds the get_category() output; its label was previously
# misspelled 'actegory'.
raw_data_pddf = pd.DataFrame.from_dict(
    movie_id_movie_property_data,
    orient='index',
    columns=['programId', 'director', 'level', 'year', 'actor', 'category', 'language'])
# Replace the 'row_<n>' labels with a plain 0..n-1 index.
raw_data_pddf = raw_data_pddf.reset_index(drop=True)

In [9]:
# Plain alias, not a copy: columns added to `sample_data_pddf` in the following
# cells are added to `raw_data_pddf` as well.
sample_data_pddf = raw_data_pddf

In [10]:
# Dense features: look up each programme's embedding vector, then spread its
# first `embed_dim` components over the columns item_feature_0 .. item_feature_{embed_dim-1}.
print("根据item_id索引itemid_feat（嵌入）")
sample_data_pddf['itemid_feat'] = sample_data_pddf['programId'].progress_apply(lambda x: item_embed(x, raw_embed_item_mapping, ub_item_embeddings))
print("将{}维物品嵌入转化为不同的连续型feature".format(embed_dim))
for i in tqdm(range(embed_dim)):
    # The lambda reads `i` when apply() runs, i.e. within the same iteration.
    sample_data_pddf['item_feature_{}'.format(i)] = sample_data_pddf['itemid_feat'].apply(lambda x: item_id_feat(x, i))
# Sparse features: one '|'-joined string column per content property.
print("根据item_id对应的content生成离散feature")
# The position in this list fixes the column number: sparse_feature_0 is
# 'category', sparse_feature_1 is 'director', and so on.
popularity_method_list = ['category', 'director',
                          'actor', 'language', 'level', 'year']
# tqdm wraps the enumerate() iterator, which has no len(), so the bar can only
# show a running count ("6it") rather than a percentage.
for i, mt in tqdm(enumerate(popularity_method_list)):
    sample_data_pddf['sparse_feature_{}'.format(i)] = sample_data_pddf['programId'].apply(lambda x: sparse_item_id_feat(x, mt))

根据item_id索引itemid_feat（嵌入）


HBox(children=(FloatProgress(value=0.0, max=33767.0), HTML(value='')))

  0%|          | 0/32 [00:00<?, ?it/s]


将32维物品嵌入转化为不同的连续型feature


100%|██████████| 32/32 [00:00<00:00, 59.87it/s]
3it [00:00, 27.18it/s]

根据item_id对应的content生成离散feature


6it [00:00, 28.81it/s]


In [11]:
# `mk_data` is another alias of the same frame; the new columns are added in place.
mk_data = sample_data_pddf
dense_feature_size = embed_dim
sparse_feature_size = 6

# Column names expected downstream: C1..C{sparse_feature_size} for the sparse
# features and I{embed_dim}..I{embed_dim + dense_feature_size - 1} for the dense ones.
# NOTE(review): the dense numbering starts at embed_dim, presumably to leave the
# lower I-numbers to another feature group - confirm.
mk_dense_features = ['I' + str(embed_dim + offset) for offset in range(dense_feature_size)]
mk_sparse_features = ['C' + str(position) for position in range(1, sparse_feature_size + 1)]

# Copy item_feature_<k> -> I<embed_dim + k> and sparse_feature_<k> -> C<k + 1>.
for offset, dense_name in enumerate(mk_dense_features):
    mk_data[dense_name] = mk_data['item_feature_{}'.format(offset)]
for position, sparse_name in enumerate(mk_sparse_features):
    mk_data[sparse_name] = mk_data['sparse_feature_{}'.format(position)]

# Missing sparse values become the string '-1'; missing dense values become 0.
mk_data[mk_sparse_features] = mk_data[mk_sparse_features].fillna('-1')
mk_data[mk_dense_features] = mk_data[mk_dense_features].fillna(0)

In [12]:
# Replace every sparse (C*) column by integer labels. A fresh LabelEncoder is
# fitted per column and `lbe` is overwritten on each pass, so only the last
# column's encoder remains available after the loop.
for feat in mk_sparse_features:
    lbe = LabelEncoder()
    mk_data[feat] = lbe.fit_transform(mk_data[feat])
# Rescale every dense (I*) column to the range [0, 1]; the columns are
# overwritten in place with the scaled values.
nms = MinMaxScaler(feature_range=(0,1))
mk_data[mk_dense_features] = nms.fit_transform(mk_data[mk_dense_features])

In [13]:
# Assemble the final per-item feature table: programme id, the encoded sparse
# features, the scaled dense features, and the mean of the dense features.
# Rows are numbered with enumerate() so this cell no longer depends on a
# `row_cnt` value left behind by an earlier cell; the labels are discarded by
# reset_index() below, so the resulting frame is unchanged.
movie_id_movie_feature_data = {}
for row_idx, (_, item_row) in enumerate(mk_data.iterrows()):
    dense_score = [item_row[feat] for feat in mk_dense_features]
    row_content = [str(item_row['programId'])]
    row_content.extend(item_row[feat] for feat in mk_sparse_features)
    row_content.extend(dense_score)
    row_content.append(np.mean(dense_score))
    movie_id_movie_feature_data['row_{}'.format(row_idx)] = row_content

# Column order must match the order in which each row was filled above.
col_names = ['programId'] + mk_sparse_features + mk_dense_features + ['item_feat_mean']
mk_item_feature_pddf = pd.DataFrame.from_dict(movie_id_movie_feature_data, orient='index', columns=col_names)
mk_item_feature_pddf = mk_item_feature_pddf.reset_index(drop=True)

In [14]:
# Persist the feature table locally, then upload it next to the inverted-list
# artifacts. Despite the "_dict" in the file name, the pickled object is the
# `mk_item_feature_pddf` DataFrame (written with DataFrame.to_pickle).
file_name = 'info/movie_id_movie_feature_dict.pickle'
mk_item_feature_pddf.to_pickle(file_name)
# The S3 key reuses only the base name of the local path.
write_to_s3(file_name, bucket, "{}/feature/content/inverted-list/{}".format(prefix,file_name.split('/')[-1]))

In [16]:
# Delete everything matched by info/* and then run the standalone batch script.
# NOTE(review): item-feature-update-batch.py looks like the script form of the
# cells above (its log output has the same messages) - confirm they are kept in sync.
!rm -r info/*
!python item-feature-update-batch.py

2021-03-18:05:43:07,280 INFO     [item-feature-update-batch.py:29] Received arguments Namespace(bucket='sagemaker-us-east-1-002224604296', mk_region='1')
2021-03-18:05:43:07,280 INFO     [item-feature-update-batch.py:46] basic_s3_files=['s3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1/system/item-data/basic/1.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1271/system/item-data/basic/1271.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1347/system/item-data/basic/1347.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1686/system/item-data/basic/1686.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1770/system/item-data/basic/1770.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1940/system/item-data/basic/1940.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/2434/system/item-data/basic/2434.csv', 's3://sagemaker-us-east-1-00222460

2021-03-18:05:43:12,43 INFO     [item-feature-update-batch.py:126] now process sheet: New Movie
2021-03-18:05:43:12,45 INFO     [item-feature-update-batch.py:126] now process sheet: Movie
2021-03-18:05:43:12,46 INFO     [item-feature-update-batch.py:126] now process sheet: 3D Movie
2021-03-18:05:43:12,50 INFO     [item-feature-update-batch.py:130] process_expend_file() return dataFrame, len: 692
2021-03-18:05:43:12,51 INFO     [item-feature-update-batch.py:138] expand df columns=Index(['program_id', 'director', 'actor', 'popularity', 'ticket_num', 'score',
       'level'],
      dtype='object')
2021-03-18:05:43:12,52 INFO     [item-feature-update-batch.py:120] process_expend_file() enter, expend_s3_path:s3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1770/system/item-data/expand/1770.xlsx
2021-03-18:05:43:12,209 INFO     [item-feature-update-batch.py:126] now process sheet: Movie
2021-03-18:05:43:12,212 INFO     [item-feature-update-batch.py:130] process_expend_file() 

2021-03-18:05:43:16,9 INFO     [item-feature-update-batch.py:144] basic df columns=Index(['program_id', 'program_type', 'program_name', 'sum_series',
       'new_series', 'createtime', 'series_type', 'star_level', 'release_year',
       'hdtv', 'play_level', 'category_property', 'language',
       'original_country', 'description', 'picture_url', 'length',
       'category_name'],
      dtype='object')
2021-03-18:05:43:16,9 INFO     [item-feature-update-batch.py:142] process_basic_file() enter, process_basic_file:<function process_basic_file at 0x7f71980a1950>
2021-03-18:05:43:16,90 INFO     [item-feature-update-batch.py:144] basic df columns=Index(['program_id', 'program_type', 'program_name', 'sum_series',
       'new_series', 'createtime', 'series_type', 'star_level', 'release_year',
       'hdtv', 'play_level', 'category_property', 'language',
       'original_country', 'description', 'picture_url', 'length',
       'category_name'],
      dtype='object')
2021-03-18:05:43:16,90 INF