In [1]:
import argparse
import logging
import os
import pickle
import re
import argparse
import boto3
import pandas as pd
from sklearn.preprocessing import LabelEncoder

s3client = boto3.client('s3')

logging.basicConfig(format='%(asctime)s,%(msecs)d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
                    datefmt='%Y-%m-%d:%H:%M:%S',
                    level=logging.INFO)

In [4]:
def sync_s3(file_name_list, s3_folder, local_folder):
    for f in file_name_list:
        print("file preparation: download src key {} to dst key {}".format(os.path.join(
            s3_folder, f), os.path.join(local_folder, f)))
        s3client.download_file(bucket, os.path.join(
            s3_folder, f), os.path.join(local_folder, f))


def write_to_s3(filename, bucket, key):
    print("upload s3://{}/{}".format(bucket, key))
    with open(filename, 'rb') as f:  # Read in binary mode
        # return s3client.upload_fileobj(f, bucket, key)
        return s3client.put_object(
            ACL='bucket-owner-full-control',
            Bucket=bucket,
            Key=key,
            Body=f
        )

def write_str_to_s3(content, bucket, key):
    print("write s3://{}/{}, content={}".format(bucket, key, content))
    s3client.put_object(Body=str(content).encode("utf8"), Bucket=bucket, Key=key, ACL='bucket-owner-full-control')

default_bucket = 'aws-gcr-rs-sol-demo-ap-southeast-1-522244679887'
default_prefix = 'sample-data'
parser = argparse.ArgumentParser()
parser.add_argument('--bucket', type=str)
parser.add_argument('--prefix', type=str)
args, _ = parser.parse_known_args()
bucket = args.bucket
prefix = args.prefix

print("bucket={}".format(bucket))
print("prefix='{}'".format(prefix))

out_s3_path = "s3://{}/{}/feature/content/inverted-list".format(bucket, prefix)

local_folder = 'info'
if not os.path.exists(local_folder):
    os.makedirs(local_folder)
local_folder = 'info'
if not os.path.exists(local_folder):
    os.makedirs(local_folder)
# 行为/物品数据同步
file_name_list = ['user.csv']
s3_folder = '{}/system/user-data'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)
file_name_list = ['item.csv']
s3_folder = '{}/system/item-data'.format(prefix)
sync_s3(file_name_list, s3_folder, local_folder)

df_filter_item = pd.read_csv('info/item.csv',sep='_!_',names=['news_id','type_code','type','title','keywords','popularity','new'])

df_filter_user = pd.read_csv('info/user.csv',sep='_!_',names=['user_id','gender','age','timestamp','user_name'])

df_filter_user.head()

bucket=aws-gcr-rs-sol-demo-ap-southeast-1-522244679887
prefix='sample-data'
file preparation: download src key sample-data/system/user-data/user.csv to dst key info/user.csv
file preparation: download src key sample-data/system/item-data/item.csv to dst key info/item.csv




Unnamed: 0,user_id,gender,age,timestamp,user_name
0,52a23fd2-9dc3-11eb-a364-acde48001122,M,35,1616192903,obsessedLocust2
1,52a2495a-9dc3-11eb-a364-acde48001122,F,60,1616311490,adoringOatmeal0
2,52a26552-9dc3-11eb-a364-acde48001122,F,38,1616462642,chicFish5
3,52a284ce-9dc3-11eb-a364-acde48001122,F,47,1616986620,superiorLeopard9
4,52a28d02-9dc3-11eb-a364-acde48001122,M,57,1616686014,annoyedGelding8


In [5]:
# generate lable encoding/ sparse feature
lbe = LabelEncoder()
df_filter_item['encode_id'] = lbe.fit_transform(df_filter_item['news_id'])
df_filter_user['encode_id'] = lbe.fit_transform(df_filter_user['user_id'])

# constructu mapping dictionary
raw_user_id_list = list(map(str, df_filter_user['user_id'].values))
code_user_id_list = list(map(int, df_filter_user['encode_id'].values))
raw_embed_user_id_dict = dict(zip(raw_user_id_list, code_user_id_list))
embed_raw_user_id_dict = dict(zip(code_user_id_list, raw_user_id_list))

raw_item_id_list = list(map(str, df_filter_item['news_id'].values))
code_item_id_list = list(map(int, df_filter_item['encode_id'].values))
raw_embed_item_id_dict = dict(zip(raw_item_id_list, code_item_id_list))
embed_raw_item_id_dict = dict(zip(code_item_id_list, raw_item_id_list))

file_name = 'info/raw_embed_user_mapping.pickle'
output_file = open(file_name, 'wb')
pickle.dump(raw_embed_user_id_dict, output_file)
output_file.close()
write_to_s3(file_name, bucket, "{}/feature/action/{}".format(prefix, file_name.split('/')[-1]))

file_name = 'info/embed_raw_user_mapping.pickle'
output_file = open(file_name, 'wb')
pickle.dump(embed_raw_user_id_dict, output_file)
output_file.close()
write_to_s3(file_name, bucket, "{}/feature/action/{}".format(prefix, file_name.split('/')[-1]))

file_name = 'info/raw_embed_item_mapping.pickle'
output_file = open(file_name, 'wb')
pickle.dump(raw_embed_item_id_dict, output_file)
output_file.close()
write_to_s3(file_name, bucket, "{}/feature/action/{}".format(prefix, file_name.split('/')[-1]))

file_name = 'info/embed_raw_item_mapping.pickle'
output_file = open(file_name, 'wb')
pickle.dump(embed_raw_item_id_dict, output_file)
output_file.close()
write_to_s3(file_name, bucket, "{}/feature/action/{}".format(prefix, file_name.split('/')[-1]))

upload s3://aws-gcr-rs-sol-demo-ap-southeast-1-522244679887/sample-data/feature/action/raw_embed_user_mapping.pickle
upload s3://aws-gcr-rs-sol-demo-ap-southeast-1-522244679887/sample-data/feature/action/embed_raw_user_mapping.pickle
upload s3://aws-gcr-rs-sol-demo-ap-southeast-1-522244679887/sample-data/feature/action/raw_embed_item_mapping.pickle
upload s3://aws-gcr-rs-sol-demo-ap-southeast-1-522244679887/sample-data/feature/action/embed_raw_item_mapping.pickle


{'ResponseMetadata': {'RequestId': '1196EK6G1JSN1N11',
  'HostId': 'T3/277oB1jf4MUCsneG0jtHQGQMt+d2mQMkA4ZL5m0ACGpjoaGCS+BNM1EDuzRNw8EJnfp4gFmc=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'T3/277oB1jf4MUCsneG0jtHQGQMt+d2mQMkA4ZL5m0ACGpjoaGCS+BNM1EDuzRNw8EJnfp4gFmc=',
   'x-amz-request-id': '1196EK6G1JSN1N11',
   'date': 'Wed, 21 Apr 2021 06:06:52 GMT',
   'etag': '"36fe985b0d0d5a76d07a546c527a7733"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"36fe985b0d0d5a76d07a546c527a7733"'}

In [11]:
n = 0
for k, v in rd['news_id_news_property_dict'].items():
    print("k {} v {}".format(k,v))
    if n > 10:
        break
    n = n + 1

k 6552418723179790856 v {'title': ['谢娜三喜临门何炅送祝福吴昕送祝福只有沈梦辰不一样'], 'type': ['news_entertainment'], 'keywords': ['杜海涛', '谢娜', '何炅', '沈梦辰', '吴昕', '快本'], 'tfidf': {'杜海涛': 1.1915032285682567, '谢娜': 0.8605173146234215, '何炅': 0.9546056150797302, '沈梦辰': 1.3327195386327908, '吴昕': 1.1915032285682567, '快本': 1.1161723746110805}, 'entities': [40191, 0, 46990, 1871, 5802, 162743, 1871, 5802, 315, 390701, 28, 302, 0, 0, 0, 0], 'words': [559632, 0, 613175, 0, 0, 754092, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
k 6552390851157295629 v {'title': ['杨幂景甜徐冬冬唐嫣不好好穿衣却美的有趣又撩人'], 'type': ['news_entertainment'], 'keywords': ['杨幂', '徐冬冬', '背带裙', '大唐荣耀', '唐嫣', '景甜'], 'tfidf': {'杨幂': 1.1161723746110805, '徐冬冬': 1.3327195386327908, '背带裙': 1.5158215867441425, '大唐荣耀': 1.5158215867441425, '唐嫣': 1.3327195386327908, '景甜': 1.3327195386327908}, 'entities': [20585, 130577, 193876, 71718, 28, 2798, 20784, 382, 727, 2, 5876, 121, 67692, 0, 0, 0], 'words': [0, 0, 359872, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
k 6552309039697494532 v {'ti