In [6]:
!pip install -U Pandas
!pip install -U fsspec

Requirement already up-to-date: Pandas in /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages (1.1.5)
Collecting fsspec
  Downloading fsspec-0.8.7-py3-none-any.whl (103 kB)
[K     |████████████████████████████████| 103 kB 15.6 MB/s eta 0:00:01
Installing collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 0.7.1
    Uninstalling fsspec-0.7.1:
      Successfully uninstalled fsspec-0.7.1
Successfully installed fsspec-0.8.7


In [18]:
import argparse
import logging
import pickle
import re

import boto3
import numpy as np
import pandas as pd

s3 = boto3.client('s3')

logging.basicConfig(format='%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
                    datefmt='%Y-%m-%d:%H:%M:%S',
                    level=logging.INFO)

In [19]:
def write_to_s3(filename, bucket, key):
    with open(filename, 'rb') as f:  # Read in binary mode
        s3.upload_fileobj(f, bucket, key)

    return "s3://{}/{}".format(bucket, key)


def download_from_s3(filename, bucket, key):
    with open(filename, 'wb') as f:
        return s3.download_fileobj(bucket, key, f)


def list_s3_by_prefix_v2(s3_path, filter_func=None):
    bucket, key_prefix = get_bucket_key_from_s3_path(s3_path)
    return list_s3_by_prefix(bucket, key_prefix, filter_func)


def list_s3_by_prefix(bucket, key_prefix, filter_func=None):
    next_token = ''
    all_keys = []
    while True:
        if next_token:
            res = s3.list_objects_v2(
                Bucket=bucket,
                ContinuationToken=next_token,
                Prefix=key_prefix)
        else:
            res = s3.list_objects_v2(
                Bucket=bucket,
                Prefix=key_prefix)

        if 'Contents' not in res:
            break

        if res['IsTruncated']:
            next_token = res['NextContinuationToken']
        else:
            next_token = ''

        if filter_func:
            keys = ["s3://{}/{}".format(bucket, item['Key']) for item in res['Contents'] if filter_func(item['Key'])]
        else:
            keys = ["s3://{}/{}".format(bucket, item['Key']) for item in res['Contents']]

        all_keys.extend(keys)

        if not next_token:
            break
    print("find {} files in s3://{}/{}".format(len(all_keys), bucket, key_prefix))
    return all_keys


def get_bucket_key_from_s3_path(s3_path):
    m = re.match(r"s3://(.*?)/(.*)", s3_path)
    return m.group(1), m.group(2)


def process_expend_file(expend_s3_path):
    logging.info("process_expend_file() enter, expend_s3_path:{}".format(expend_s3_path))
    expand_dict = pd.read_excel(expend_s3_path, sheet_name=None)  # set sheet_name=None, read all sheets
    selected_columns = ['节目id', '导演', '演员', '受欢迎度', '票数', '评分', '等级']

    expand_data_df_arr = []
    for sheet_name in expand_dict.keys():
        logging.info("now process sheet: {}".format(sheet_name))
        df = expand_dict[sheet_name][selected_columns]
        expand_data_df_arr.append(df)
    expand_data_df = pd.concat(expand_data_df_arr, axis=0, ignore_index=True)
    logging.info("process_expend_file() return dataFrame, len: {}".format(len(expand_data_df)))
    expand_data_renamed_df = expand_data_df.rename(columns={'节目id': 'program_id',
                                                            '导演': 'director',
                                                            '演员': 'actor',
                                                            '受欢迎度': 'popularity',
                                                            '票数': 'ticket_num',
                                                            '评分': 'score',
                                                            '等级': 'level'})
    logging.info("expand df columns={}".format(expand_data_renamed_df.columns))
    return expand_data_renamed_df


def process_basic_file(basic_s3_path):
    logging.info("process_basic_file() enter, process_basic_file:{}".format(process_basic_file))
    df = pd.read_csv(basic_s3_path)
    logging.info("basic df columns={}".format(df.columns))
    return df


def prepare_df(basic_s3_files, expend_s3_files):
    expand_data_df = pd.concat([process_expend_file(expend_s3_path) for expend_s3_path in expend_s3_files],
                               axis=0, ignore_index=True).drop_duplicates('program_id')

    basic_df = pd.concat([process_basic_file(basic_s3_path) for basic_s3_path in basic_s3_files],
                         axis=0, ignore_index=True).drop_duplicates('program_id')

    df_merged = pd.merge(left=basic_df, right=expand_data_df, on='program_id', how='left')
    logging.info("df_merged len: {}".format(len(df_merged)))
    return df_merged


def gen_pickle_files(basic_s3_path, expend_s3_path, out_s3_path):
    logging.info(f"gen_pick_files(), "
                 f"\nbasic_s3_path={basic_s3_path}, "
                 f"\nexpend_s3_path={expend_s3_path}, "
                 f"\nout_s3_path={out_s3_path}")

    df = prepare_df(basic_s3_path, expend_s3_path)
    dicts_1 = gen_movie_id_movie_property_dict(df)
    dicts_2 = gen_movie_properties_to_movie_ids_dict(df)
    dicts_all = dicts_1
    dicts_all.update(dicts_2)
    bucket, out_prefix = get_bucket_key_from_s3_path(out_s3_path)
    for dict_name, dict_val in dicts_all.items():
        file_name = f'{dict_name}.pickle'
        # print("pickle =>", file_name)
        out_file = open(file_name, 'wb')
        pickle.dump(dict_val, out_file)
        out_file.close()
        # s3_url = S3Uploader.upload(file_name, out_s3_path)
        s3_url = write_to_s3(file_name, bucket, f'{out_prefix}/{file_name}')
        logging.info("write {}".format(s3_url))
    logging.info(f"generated {len(dicts_all)} pickle files")


def gen_movie_id_movie_property_dict(df):
    movie_id_movie_property_dict = {}
    for row in df.iterrows():
        item_row = row[1]
        program_id = str(item_row['program_id'])
        program_dict = {
            'director': get_single_item(item_row['director']),
            'level': get_single_item(item_row['level']),
            'year': get_single_item(item_row['release_year']),
            'actor': get_actor(item_row['actor']),
            'category': get_category(item_row['category_property']),
            'language': get_single_item(item_row['language'])
        }
        movie_id_movie_property_dict[program_id] = program_dict

    result_dict = {
        'movie_id_movie_property_dict': movie_id_movie_property_dict
    }
    return result_dict


def sort_by_score(df):
    df['popularity'].fillna(0, inplace=True)
    df['ticket_num'].fillna(0, inplace=True)
    df['score'].fillna(0, inplace=True)

    df['popularity_log'] = np.log1p(df['popularity'])
    df['ticket_num_log'] = np.log1p(df['ticket_num'])
    popularity_log_max = df['popularity_log'].max()
    popularity_log_min = df['popularity_log'].min()
    ticket_num_log_max = df['ticket_num_log'].max()
    ticket_num_log_min = df['ticket_num_log'].min()

    df = df.drop(['popularity', 'ticket_num'], axis=1)

    score_max = df['score'].max()
    score_min = df['score'].min()
    df['popularity_scaled'] = ((df['popularity_log'] - popularity_log_min) / (
                popularity_log_max - popularity_log_min)) * 10
    df['ticket_num_scaled'] = (df['ticket_num_log'] - ticket_num_log_min) / (
                ticket_num_log_max - ticket_num_log_min) * 10
    df['score_scaled'] = ((df['score'] - score_min) / (score_max - score_min)) * 10

    df['cal_score'] = df['popularity_scaled'] + df['ticket_num_scaled'] + df['score_scaled']

    df_with_score = df.drop(
        ['popularity', 'ticket_num', 'score', 'popularity_scaled', 'ticket_num_scaled', 'score_scaled'], axis=1)
    df_sorted = df_with_score.sort_values(by='cal_score', ascending=False)
    return df_sorted


def gen_movie_properties_to_movie_ids_dict(df):
    df_sorted = sort_by_score(df)

    movie_director_movie_ids_dict = {}
    movie_language_movie_ids_dict = {}
    movie_level_movie_ids_dict = {}
    movie_year_movie_ids_dict = {}

    movie_category_movie_ids_dict = {}
    movie_actor_movie_ids_dict = {}

    for row in df_sorted.iterrows():
        item_row = row[1]
        # program_id = {"id": item_row['program_id'], "score": item_row['cal_score'] }
        program_id = item_row['program_id']
        for key in [item for item in get_single_item(item_row['director']) if item is not None]:
            movie_director_movie_ids_dict.setdefault(key, []).append(program_id)

        for key in [item for item in get_single_item(item_row['level']) if item is not None]:
            movie_level_movie_ids_dict.setdefault(key, []).append(program_id)

        for key in [item for item in get_single_item(item_row['release_year']) if item is not None]:
            movie_year_movie_ids_dict.setdefault(key, []).append(program_id)

        for key in [item for item in get_single_item(item_row['language']) if item is not None]:
            movie_language_movie_ids_dict.setdefault(key, []).append(program_id)

        for key in [item for item in get_category(item_row['category_property']) if item is not None]:
            movie_category_movie_ids_dict.setdefault(key, []).append(program_id)

        for key in [item for item in get_actor(item_row['actor']) if item is not None]:
            movie_actor_movie_ids_dict.setdefault(key, []).append(program_id)

    result_dict = {
        'movie_director_movie_ids_dict': movie_director_movie_ids_dict,
        'movie_language_movie_ids_dict': movie_language_movie_ids_dict,
        'movie_level_movie_ids_dict': movie_level_movie_ids_dict,
        'movie_year_movie_ids_dict': movie_year_movie_ids_dict,
        'movie_category_movie_ids_dict': movie_category_movie_ids_dict,
        'movie_actor_movie_ids_dict': movie_actor_movie_ids_dict
    }

    return result_dict


def get_actor(actor_str):
    if not actor_str or str(actor_str).lower() in ['nan', 'nr', '']:
        return [None]
    actor_str = re.sub(r"['\"\[\]]", '', actor_str)
    actor_arr = actor_str.split(',')
    return [item.strip().lower() for item in actor_arr]


def get_category(category_property):
    if not category_property or str(category_property).lower() in ['nan', 'nr', '']:
        return [None]
    if not category_property:
        return [None]
    return [item.strip().lower() for item in category_property.split(',')]


def get_single_item(item):
    if not item or str(item).lower().strip() in ['nan', 'nr', '']:
        return [None]
    return [str(item).lower().strip()]


def get_expend_file_from_basic(basic_s3_path):
    return re.sub(r"/basic/(\d+).csv", r"/expand/\1.xlsx", basic_s3_path)


def get_segment(basic_s3_path):
    m = re.search(r"/basic/(\d+).csv", basic_s3_path)
    s = 0
    if m:
        s = m.group(1)
    return s

In [20]:
parser = argparse.ArgumentParser()
default_bucket = 'sagemaker-us-east-1-002224604296'
default_mk_region = '1'
level_1 = 'recommender-system-film-mk'

parser.add_argument('--bucket', type=str, default=default_bucket)
parser.add_argument('--mk-region', type=str, default=default_mk_region)

args, _ = parser.parse_known_args()
logging.info('Received arguments {}'.format(args))
bucket = args.bucket
mk_region = args.mk_region

out_s3_path = f's3://{bucket}/{level_1}/{mk_region}/feature/content/inverted-list'
input_s3 = f's3://{bucket}/{level_1}/{mk_region}/system/item-data'

# basic_s3_files = [f'{input_s3}/basic/{mk_region}.csv']
# expend_s3_files = [f'{input_s3}/expand/{mk_region}.xlsx']
#
basic_s3_files = []
expend_s3_files = []
all_mk_regions = "1,1271,1347,1686,1770,1940,2434,47,86"
for mk_rg in all_mk_regions.split(","):
    basic_s3_files.append(f's3://{bucket}/{level_1}/{mk_rg}/system/item-data/basic/{mk_rg}.csv')
    expend_s3_files.append(f's3://{bucket}/{level_1}/{mk_rg}/system/item-data/expand/{mk_rg}.xlsx')

logging.info(f"basic_s3_files={basic_s3_files}")
logging.info(f"expend_s3_files={expend_s3_files}")
logging.info(f"out_s3_path={out_s3_path}")

# gen_pickle_files(basic_s3_files, expend_s3_files, out_s3_path)

2021-03-17:09:01:46,798 INFO     [<ipython-input-20-1356d01b8830>:10] Received arguments Namespace(bucket='sagemaker-us-east-1-002224604296', mk_region='1')
2021-03-17:09:01:46,800 INFO     [<ipython-input-20-1356d01b8830>:27] basic_s3_files=['s3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1/system/item-data/basic/1.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1271/system/item-data/basic/1271.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1347/system/item-data/basic/1347.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1686/system/item-data/basic/1686.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1770/system/item-data/basic/1770.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1940/system/item-data/basic/1940.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/2434/system/item-data/basic/2434.csv', 's3://sagemaker-us-east-1-00

In [21]:
basic_s3_path = basic_s3_files
expend_s3_path = expend_s3_files
logging.info(f"gen_pick_files(), "
                 f"\nbasic_s3_path={basic_s3_path}, "
                 f"\nexpend_s3_path={expend_s3_path}, "
                 f"\nout_s3_path={out_s3_path}")

df = prepare_df(basic_s3_path, expend_s3_path)
# dicts_1 = gen_movie_id_movie_property_dict(df)
# dicts_2 = gen_movie_properties_to_movie_ids_dict(df)
# dicts_all = dicts_1
# dicts_all.update(dicts_2)
# bucket, out_prefix = get_bucket_key_from_s3_path(out_s3_path)
# for dict_name, dict_val in dicts_all.items():
#     file_name = f'{dict_name}.pickle'
#     # print("pickle =>", file_name)
#     out_file = open(file_name, 'wb')
#     pickle.dump(dict_val, out_file)
#     out_file.close()
#     # s3_url = S3Uploader.upload(file_name, out_s3_path)
#     s3_url = write_to_s3(file_name, bucket, f'{out_prefix}/{file_name}')
#     logging.info("write {}".format(s3_url))
# logging.info(f"generated {len(dicts_all)} pickle files")


2021-03-17:09:01:48,242 INFO     [<ipython-input-21-78015f09aa6f>:3] gen_pick_files(), 
basic_s3_path=['s3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1/system/item-data/basic/1.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1271/system/item-data/basic/1271.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1347/system/item-data/basic/1347.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1686/system/item-data/basic/1686.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1770/system/item-data/basic/1770.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/1940/system/item-data/basic/1940.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/2434/system/item-data/basic/2434.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/47/system/item-data/basic/47.csv', 's3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/8

2021-03-17:09:01:52,498 INFO     [<ipython-input-19-74f10773020b>:69] process_expend_file() return dataFrame, len: 1243
2021-03-17:09:01:52,500 INFO     [<ipython-input-19-74f10773020b>:77] expand df columns=Index(['program_id', 'director', 'actor', 'popularity', 'ticket_num', 'score',
       'level'],
      dtype='object')
2021-03-17:09:01:52,501 INFO     [<ipython-input-19-74f10773020b>:59] process_expend_file() enter, expend_s3_path:s3://sagemaker-us-east-1-002224604296/recommender-system-film-mk/47/system/item-data/expand/47.xlsx
2021-03-17:09:01:53,676 INFO     [<ipython-input-19-74f10773020b>:65] now process sheet: Movie
2021-03-17:09:01:53,681 INFO     [<ipython-input-19-74f10773020b>:69] process_expend_file() return dataFrame, len: 4004
2021-03-17:09:01:53,683 INFO     [<ipython-input-19-74f10773020b>:77] expand df columns=Index(['program_id', 'director', 'actor', 'popularity', 'ticket_num', 'score',
       'level'],
      dtype='object')
2021-03-17:09:01:53,684 INFO     [<ipyt

In [22]:
df.head()

Unnamed: 0,program_id,program_type,program_name,sum_series,new_series,createtime,series_type,star_level,release_year,hdtv,...,description,picture_url,length,category_name,director,actor,popularity,ticket_num,score,level
0,1689713,2,12 Hour Shift,1,1,2020-11-10 06:19:37,0,3,2020,0,...,Bodies start to pile up when a drug-user nurse...,photo/new_movie/12_hour_shift.jpg,86.0,New Movie,Brea Grant,"['Angela Bettis', 'David Arquette', 'Chloe Far...",84.793,83.0,5.1,
1,1689714,2,12 Days Of Christmas,1,1,2020-11-10 06:19:37,0,3,2020,0,...,Childhood friends Amy and Steve come home from...,photo/new_movie/12_days_of_christmas.jpg,86.0,New Movie,Michael Boyle,"['Annie Newton', 'Drew Petriello', 'Katee Shea...",4.33,1.0,6.0,NR
2,1689715,2,American Utopia,1,1,2020-11-10 06:19:37,0,3,2020,0,...,Spike Lee documents the former Talking Heads f...,photo/new_movie/american_utopia.jpg,135.0,New Movie,Spike Lee,"['David Byrne', 'Chris Giarmo', 'Angie Swan', ...",8.748,15.0,7.8,
3,1689716,2,Clouds,1,1,2020-11-10 06:19:37,0,3,2020,0,...,Young musician Zach Sobiech discovers his canc...,photo/new_movie/clouds.jpg,121.0,New Movie,Kara Holden,"['Steffan Argus', 'Sabrina Carpenter', 'Madiso...",85.442,480.0,8.5,PG-13
4,1689717,2,Cut Throat City,1,1,2020-11-10 06:19:37,0,3,2020,0,...,"Set after Hurricane Katrina, four boyhood frie...",photo/new_movie/cut_throat_city.jpg,132.0,New Movie,RZA,"['Terrence Howard', 'Wesley Snipes', 'T.I.', '...",261.266,28.0,6.4,15


In [37]:
def gen_movie_id_movie_property_dict(df):
    movie_id_movie_property_dict = {}
    for row in df.iterrows():
        item_row = row[1]
        program_id = str(item_row['program_id'])
        program_dict = {
            'director': get_single_item(item_row['director']),
            'level': get_single_item(item_row['level']),
            'year': get_single_item(item_row['release_year']),
            'actor': get_actor(item_row['actor']),
            'category': get_category(item_row['category_property']),
            'language': get_single_item(item_row['language'])
        }
        movie_id_movie_property_dict[program_id] = program_dict

    result_dict = {
        'movie_id_movie_property_dict': movie_id_movie_property_dict
    }
    return result_dict

def item_embed(x, raw_embed_item_mapping, ub_item_embeddings):
    embed_item_idx = raw_embed_item_mapping[str(x)]
    if  int(embed_item_idx) < len(ub_item_embeddings):
#         print(user_portrait[x])
        return ub_item_embeddings[int(embed_item_idx)]
    else:
        return [0]*embed_dim
    
def item_id_feat(x, i):
    return x[i]
#     return pd.Series(f_dict)

def sparse_item_id_feat(x, mt, dict_id_content=dict_id_content):
    result = dict_id_content[str(x)][mt]
    if result[0] is None:
        return None
    else:
        return '|'.join(result)

In [24]:
movie_id_movie_property_dict = {}
movie_id_movie_property_data = {}
row_cnt = 0
for row in df.iterrows():
    item_row = row[1]
    program_id = str(item_row['program_id'])
    program_dict = {
        'director': get_single_item(item_row['director']),
        'level': get_single_item(item_row['level']),
        'year': get_single_item(item_row['release_year']),
        'actor': get_actor(item_row['actor']),
        'category': get_category(item_row['category_property']),
        'language': get_single_item(item_row['language'])
    }
    movie_id_movie_property_dict[program_id] = program_dict
    row_content = []
    row_content.append(str(item_row['program_id']))
    row_content.append(program_dict['director'])
    row_content.append(program_dict['level'])
    row_content.append(program_dict['year'])
    row_content.append(program_dict['actor'])
    row_content.append(program_dict['category'])
    row_content.append(program_dict['language'])
    movie_id_movie_property_data['row_{}'.format(row_cnt)] = row_content 
    row_cnt = row_cnt + 1

result_dict = {
    'movie_id_movie_property_dict': movie_id_movie_property_dict
}

In [25]:
for k, v in movie_id_movie_property_data.items():
    if v[0] ==  '1592446':
        print(v)

['1592446', ['julie prendiville roux'], ['pg-13'], ['2018'], ['briana evigan', 'garret dillahunt', 'charlie tahan', 'peggy sheffield', 'mary rachel dudley', 'tom nowicki'], ['drama', 'mystery'], ['korean']]


In [26]:
raw_data_pddf = pd.DataFrame.from_dict(movie_id_movie_property_data, orient='index', columns=['programId', 'director','level','year','actor','actegory','language'])
raw_data_pddf = raw_data_pddf.reset_index(drop=True)
raw_data_pddf.head()

Unnamed: 0,programId,director,level,year,actor,actegory,language
0,1689713,[brea grant],[None],[2020],"[angela bettis, david arquette, chloe farnwort...","[comedy, horror, thriller]",[english]
1,1689714,[michael boyle],[None],[2020],"[annie newton, drew petriello, katee shean, sp...","[comedy, drama]",[english]
2,1689715,[spike lee],[None],[2020],"[david byrne, chris giarmo, angie swan, jacque...","[documentary, music, musical]",[english]
3,1689716,[kara holden],[pg-13],[2020],"[steffan argus, sabrina carpenter, madison ise...","[drama, music]",[english]
4,1689717,[rza],[15],[2020],"[terrence howard, wesley snipes, t.i., eiza go...","[action, crime, drama]",[english]


In [28]:
# deepfm模型训练逻辑
# 基础依赖
import argparse
import pickle
import boto3
import os
import numpy as np
import itertools
import tarfile
import pandas as pd
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook
# 模型相关
from tensorflow.python.keras.models import Model, save_model, load_model
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, DenseFeat,get_feature_names
from deepmatch.layers import custom_objects
tqdm_notebook.pandas()

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  if sys.path[0] == '':



In [29]:
########################################
# 从s3同步数据
########################################
def sync_s3(file_name_list, s3_folder, local_folder):
    for f in file_name_list:
        print("file preparation: download src key {} to dst key {}".format(os.path.join(
            s3_folder, f), os.path.join(local_folder, f)))
        s3client.download_file(bucket, os.path.join(
            s3_folder, f), os.path.join(local_folder, f))
        
def write_to_s3(filename, bucket, key):
    with open(filename, 'rb') as f:  # Read in binary mode
        return s3client.upload_fileobj(f, bucket, key)

default_bucket = 'sagemaker-us-east-1-002224604296'
default_mk_region = '1'
level_1 = 'recommender-system-film-mk'

parser = argparse.ArgumentParser()
parser.add_argument('--bucket', type=str, default=default_bucket)
parser.add_argument('--mk-region', type=str, default=default_mk_region)

args, _ = parser.parse_known_args()
bucket = args.bucket
mk_region = args.mk_region

prefix = f"{level_1}/{mk_region}"

print("bucket={}".format(bucket))
print("prefix='{}'".format(prefix))

s3client = boto3.client('s3')
local_folder = 'info'
if not os.path.exists(local_folder):
    os.makedirs(local_folder)
# 行为数据加载
file_name_list = ['action.csv']
s3_folder = '{}/system/user-data/clean/latest'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)
# youtubednn模型数据加载
file_name_list = ['raw_embed_item_mapping.pickle', 'raw_embed_user_mapping.pickle', 'ub_item_embeddings.npy']
s3_folder = '{}/feature/action/'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)
file_name_list = ['user_embeddings.h5']
s3_folder = 'recommender-system-film-mk/1/model/recall/youtubednn/'
sync_s3(file_name_list, s3_folder, local_folder)
# 倒排列表的pickle文件
file_name_list = ['movie_id_movie_property_dict.pickle']
s3_folder = '{}/feature/content/inverted-list/'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)

# 加载所有人的数据
action_data_pddf = pd.read_csv("info/action.csv", sep='\t')
print("load {} action data".format(len(action_data_pddf)))
# 加载pickle文件
file_to_load = open("info/movie_id_movie_property_dict.pickle", "rb")
dict_id_content = pickle.load(file_to_load)
print("length of movie_id v.s. movie_property {}".format(len(dict_id_content)))
file_to_load = open("info/raw_embed_item_mapping.pickle", "rb")
raw_embed_item_mapping = pickle.load(file_to_load)
file_to_load = open("info/raw_embed_user_mapping.pickle", "rb")
raw_embed_user_mapping = pickle.load(file_to_load)
# 加载模型
# user_embedding_model = load_model('info/user_embeddings.h5', custom_objects)
ub_item_embeddings = np.load("info/ub_item_embeddings.npy")
embed_dim = 32

bucket=sagemaker-us-east-1-002224604296
prefix='recommender-system-film-mk/1'
file preparation: download src key recommender-system-film-mk/1/system/user-data/clean/latest/action.csv to dst key info/action.csv
file preparation: download src key recommender-system-film-mk/1/feature/action/raw_embed_item_mapping.pickle to dst key info/raw_embed_item_mapping.pickle
file preparation: download src key recommender-system-film-mk/1/feature/action/raw_embed_user_mapping.pickle to dst key info/raw_embed_user_mapping.pickle
file preparation: download src key recommender-system-film-mk/1/feature/action/ub_item_embeddings.npy to dst key info/ub_item_embeddings.npy
file preparation: download src key recommender-system-film-mk/1/model/recall/youtubednn/user_embeddings.h5 to dst key info/user_embeddings.h5
file preparation: download src key recommender-system-film-mk/1/feature/content/inverted-list/movie_id_movie_property_dict.pickle to dst key info/movie_id_movie_property_dict.pickle
load 207341 act

In [30]:
sample_data_pddf = raw_data_pddf

# 生成encoding的逻辑

In [31]:
# !!!应该用用户注册数据来生成encoding map
data_mk = pd.read_csv('info/action.csv',sep='\t')
data_mk.head()

Unnamed: 0,label,userid,programId,programType,action,timeStamp,title,genres
0,1,13396,1690115,1,2,1608198657,jungleland,drama
1,1,13396,1690115,1,2,1608198696,jungleland,drama
2,1,170811011119,1690115,1,2,1610225454,jungleland,drama
3,1,170811012694,1690115,1,2,1612996593,jungleland,drama
4,1,24384,1690115,1,2,1614031172,jungleland,drama


In [32]:
# generate lable encoding/ sparse feature
lbe = LabelEncoder()
sample_data_pddf['encode_id'] = lbe.fit_transform(sample_data_pddf['programId']) + 1
data_mk['encode_id'] = lbe.fit_transform(data_mk['userid']) + 1

In [33]:
# constructu mapping dictionary
raw_user_id_list = list(map(str,data_mk['userid'].values))
code_user_id_list = list(map(int, data_mk['encode_id'].values))
raw_embed_user_id_dict = dict(zip(raw_user_id_list, code_user_id_list))
embed_raw_user_id_dict = dict(zip(code_user_id_list, raw_user_id_list))

raw_item_id_list = list(map(str,sample_data_pddf['programId'].values))
code_item_id_list = list(map(int, sample_data_pddf['encode_id'].values))
raw_embed_item_id_dict = dict(zip(raw_item_id_list, code_item_id_list))
embed_raw_item_id_dict = dict(zip(code_item_id_list, raw_item_id_list))

In [34]:
file_name = 'info/raw_embed_user_mapping.pickle'
output_file = open(file_name, 'wb')
pickle.dump(raw_embed_user_id_dict, output_file)
output_file.close()
write_to_s3(file_name, bucket, "{}/feature/action/{}".format(prefix,file_name.split('/')[-1]))

file_name = 'info/embed_raw_user_mapping.pickle'
output_file = open(file_name, 'wb')
pickle.dump(embed_raw_user_id_dict, output_file)
output_file.close()
write_to_s3(file_name, bucket, "{}/feature/action/{}".format(prefix,file_name.split('/')[-1]))

file_name = 'info/raw_embed_item_mapping.pickle'
output_file = open(file_name, 'wb')
pickle.dump(raw_embed_item_id_dict, output_file)
output_file.close()
write_to_s3(file_name, bucket, "{}/feature/action/{}".format(prefix,file_name.split('/')[-1]))

file_name = 'info/embed_raw_item_mapping.pickle'
output_file = open(file_name, 'wb')
pickle.dump(embed_raw_item_id_dict, output_file)
output_file.close()
write_to_s3(file_name, bucket, "{}/feature/action/{}".format(prefix,file_name.split('/')[-1]))

# 准备item feature的逻辑

In [38]:
sample_data_pddf = raw_data_pddf

In [39]:
# item id feature - item embedding
print("根据item_id索引itemid_feat（嵌入）")
sample_data_pddf['itemid_feat'] = sample_data_pddf['programId'].progress_apply(lambda x: item_embed(x, raw_embed_item_mapping, ub_item_embeddings))
print("将{}维物品嵌入转化为不同的连续型feature".format(embed_dim))
for i in tqdm(range(embed_dim)):
    sample_data_pddf['item_feature_{}'.format(i)] = sample_data_pddf['itemid_feat'].apply(lambda x: item_id_feat(x, i))
# sparse feature
print("根据item_id对应的content生成离散feature")
popularity_method_list = ['category', 'director',
                          'actor', 'language', 'level', 'year']
for i, mt in tqdm(enumerate(popularity_method_list)):
    sample_data_pddf['sparse_feature_{}'.format(i)] = sample_data_pddf['programId'].apply(lambda x: sparse_item_id_feat(x, mt))

根据item_id索引itemid_feat（嵌入）


HBox(children=(FloatProgress(value=0.0, max=33767.0), HTML(value='')))

  0%|          | 0/32 [00:00<?, ?it/s]


将32维物品嵌入转化为不同的连续型feature


100%|██████████| 32/32 [00:00<00:00, 53.62it/s]
3it [00:00, 25.72it/s]

根据item_id对应的content生成离散feature


6it [00:00, 25.44it/s]


In [40]:
mk_data = sample_data_pddf
dense_feature_size = embed_dim
sparse_feature_size = 6
for i in range(dense_feature_size):
    mk_data['I{}'.format(i+embed_dim)] = mk_data['item_feature_{}'.format(i)]
for i in range(sparse_feature_size):
    mk_data['C{}'.format(i+1)] = mk_data['sparse_feature_{}'.format(i)]
    
mk_sparse_features = ['C' + str(i)for i in range(1, sparse_feature_size+1)]
mk_dense_features = ['I'+str(i+embed_dim-1) for i in range(1, dense_feature_size+1)]
mk_data[mk_sparse_features] = mk_data[mk_sparse_features].fillna('-1', )
mk_data[mk_dense_features] = mk_data[mk_dense_features].fillna(0,)

In [41]:
for feat in mk_sparse_features:
    lbe = LabelEncoder()
    mk_data[feat] = lbe.fit_transform(mk_data[feat])
nms = MinMaxScaler(feature_range=(0,1))
mk_data[mk_dense_features] = nms.fit_transform(mk_data[mk_dense_features])

In [42]:
movie_id_movie_feature_data = {}
for row in mk_data.iterrows():
    item_row = row[1]
#     print(item_row)
#     break
    program_dict = str(item_row['programId'])
    row_content = []
    row_content.append(str(item_row['programId']))
    dense_score = []
    for feat in mk_sparse_features:
        row_content.append(item_row[feat])
    for feat in mk_dense_features:
        row_content.append(item_row[feat])
        dense_score.append(item_row[feat])
    row_content.append(np.mean(dense_score))
    movie_id_movie_feature_data['row_{}'.format(row_cnt)] = row_content 
    row_cnt = row_cnt + 1

col_names = ['programId'] + mk_sparse_features + mk_dense_features + ['item_feat_mean']
mk_item_feature_pddf = pd.DataFrame.from_dict(movie_id_movie_feature_data, orient='index', columns=col_names)
mk_item_feature_pddf = mk_item_feature_pddf.reset_index(drop=True)
mk_item_feature_pddf.head()

Unnamed: 0,programId,C1,C2,C3,C4,C5,C6,I32,I33,I34,...,I55,I56,I57,I58,I59,I60,I61,I62,I63,item_feat_mean
0,1689713,488,506,303,5,0,146,0.606309,0.369546,0.318244,...,0.585189,0.663026,0.563234,0.389655,0.576787,0.592338,0.63899,0.66946,0.638478,0.519118
1,1689714,444,2909,355,5,0,146,0.517393,0.199965,0.423809,...,0.426267,0.552058,0.475349,0.473056,0.446144,0.71861,0.466241,0.774546,0.464693,0.470722
2,1689715,591,3894,1193,5,0,146,0.389094,0.185065,0.515736,...,0.308511,0.556993,0.429952,0.367128,0.524741,0.857228,0.365313,0.498664,0.303065,0.536152
3,1689716,652,2336,4582,5,65,146,0.606364,0.369539,0.315198,...,0.586647,0.665346,0.561612,0.390187,0.57756,0.591738,0.638981,0.672033,0.638829,0.519078
4,1689717,111,3688,4761,5,17,146,0.415875,0.259687,0.31887,...,0.736896,0.632151,0.449616,0.490006,0.493057,0.712044,0.706084,0.618382,0.540678,0.520524


In [47]:
file_name = 'info/movie_id_movie_feature_dict.pickle'
mk_item_feature_pddf.to_pickle(file_name)
write_to_s3(file_name, bucket, "{}/feature/content/inverted-list/{}".format(prefix,file_name.split('/')[-1]))

In [97]:
test_pdf = pd.read_pickle(file_name)

In [1]:
test_pdf.head()

NameError: name 'test_pdf' is not defined