# 首先，我们先生成物品对应的特征，我们构建一个物品到特征对应的字典，方便后面构建用户画像的时候使用

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from pandas import  DataFrame
# 为每个物品生成对应的特征，这里我们只用到了product_code、product_type_no、graphical_appearance_no、
# colour_group_code、perceived_colour_value_id、perceived_colour_master_id这6个特征。
art = pd.read_csv("./data/rawdata/articles.csv")
article_dict = dict()  # {12:{id1,id2,...,id_k}, 34:{id1,id2,...,id_k}}, 这里面每个物品对应的特征权重都一样
for _, row in art.iterrows():
    article_id = row['article_id']
    product_code = row['product_code']
    product_type_no = row['product_type_no']
    graphical_appearance_no = row['graphical_appearance_no']
    colour_group_code = row['colour_group_code']
    perceived_colour_value_id = row['perceived_colour_value_id']
    perceived_colour_master_id = row['perceived_colour_master_id']
    feature_dict = dict()
    feature_dict['product_code'] = product_code
    feature_dict['product_type_no'] = product_type_no
    feature_dict['graphical_appearance_no'] = graphical_appearance_no
    feature_dict['colour_group_code'] = colour_group_code
    feature_dict['perceived_colour_value_id'] = perceived_colour_value_id
    feature_dict['perceived_colour_master_id'] = perceived_colour_master_id
    article_dict[article_id] = feature_dict
# print(article_dict)
np.save("./output/article_dict.npy", article_dict)

# 第二步是基于物品相关信息，为每个特征生成对应的倒排索引字典（key是对应的特征，value是具备该特征的所有物品集合）

In [None]:
# 基于物品的特征，为每个特征生成对应的倒排索引
# 需要生成倒排索引的特征包括如下几个：
# product_code, 产品code，7位数字字符，如 0108775，是 article_id 的前 7 位。
# prod_name, 产品名，如 Strap top（系带上衣）
# product_type_no, 产品类型no，2位或者3位数字，有 -1 值。
# product_type_name, 产品类型名。如 Vest top（背心）
# graphical_appearance_no, 图案外观no，如 1010016。
# graphical_appearance_name, 图案外观名，如 Solid（固体;立体图形）
# colour_group_code, 颜色组code，如 09，2位数字
# colour_group_name, 颜色组名称， 如 Black。
# perceived_colour_value_id, 感知颜色值id。-1，1，2，3，4，5，6，7，一共这几个值。
# perceived_colour_value_name, 感知颜色值名称。如 Dark（黑暗的），Dusty Light等
# perceived_colour_master_id, 感知颜色主id。1位或者2位数字。
# perceived_colour_master_name, 感知颜色主名称。如 Beige（浅褐色的）

art = pd.read_csv("./data/rawdata/articles.csv")
product_code_unique = np.unique(art[["product_code"]])  # 取某一列的所有唯一值，array([108775, 111565, ..., 959461])
product_type_no_unique = np.unique(art[["product_type_no"]])
graphical_appearance_no_unique = np.unique(art[["graphical_appearance_no"]])
colour_group_code_unique = np.unique(art[["colour_group_code"]])
perceived_colour_value_id_unique = np.unique(art[["perceived_colour_value_id"]])
perceived_colour_master_id_unique = np.unique(art[["perceived_colour_master_id"]])
product_code_portrait_dict = dict()  # {12:{id1,id2,...,id_k}, 34:{id1,id2,...,id_k}}, 这里面每个物品对应的特征权重都一样
product_type_no_portrait_dict = dict()
graphical_appearance_no_portrait_dict = dict()
colour_group_code_portrait_dict = dict()
perceived_colour_value_id_portrait_dict = dict()
perceived_colour_master_id_portrait_dict = dict()
for _, row in art.iterrows():
    article_id = row['article_id']
    product_code = row['product_code']
    product_type_no = row['product_type_no']
    graphical_appearance_no = row['graphical_appearance_no']
    colour_group_code = row['colour_group_code']
    perceived_colour_value_id = row['perceived_colour_value_id']
    perceived_colour_master_id = row['perceived_colour_master_id']
    if product_code in product_code_portrait_dict:
        product_code_portrait_dict[product_code].add(article_id)
    else:
        product_code_portrait_dict[product_code] = set([article_id])
    if product_type_no in product_type_no_portrait_dict:
        product_type_no_portrait_dict[product_type_no].add(article_id)
    else:
        product_type_no_portrait_dict[product_type_no] = set([article_id])
    if graphical_appearance_no in graphical_appearance_no_portrait_dict:
        graphical_appearance_no_portrait_dict[graphical_appearance_no].add(article_id)
    else:
        graphical_appearance_no_portrait_dict[graphical_appearance_no] = set([article_id])
    if colour_group_code in colour_group_code_portrait_dict:
        colour_group_code_portrait_dict[colour_group_code].add(article_id)
    else:
        colour_group_code_portrait_dict[colour_group_code] = set([article_id])
    if perceived_colour_value_id in perceived_colour_value_id_portrait_dict:
        perceived_colour_value_id_portrait_dict[perceived_colour_value_id].add(article_id)
    else:
        perceived_colour_value_id_portrait_dict[perceived_colour_value_id] = set([article_id])
    if perceived_colour_master_id in perceived_colour_master_id_portrait_dict:
        perceived_colour_master_id_portrait_dict[perceived_colour_master_id].add(article_id)
    else:
        perceived_colour_master_id_portrait_dict[perceived_colour_master_id] = set([article_id])

np.save("./output/product_code_portrait_dict.npy", product_code_portrait_dict)
np.save("./output/product_type_no_portrait_dict.npy", product_type_no_portrait_dict)
np.save("./output/graphical_appearance_no_portrait_dict.npy", graphical_appearance_no_portrait_dict)
np.save("./output/colour_group_code_portrait_dict.npy", colour_group_code_portrait_dict)
np.save("./output/perceived_colour_value_id_portrait_dict.npy", perceived_colour_value_id_portrait_dict)
np.save("./output/perceived_colour_master_id_portrait_dict.npy", perceived_colour_master_id_portrait_dict)

# 基于用户的行为数据（transactions_train.csv）构建用户的兴趣画像

In [3]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from pandas import  DataFrame
# 基于用户行为数据，为每个用户生成用户画像。
trans = pd.read_csv("./data/rawdata/transactions_train.csv")
user_portrait = dict()
article_dict = np.load("./output/article_dict.npy", allow_pickle=True).item()
for _, row in trans.iterrows():
    customer_id = row['customer_id']
    article_id = row['article_id']
    feature_dict = article_dict[article_id]
    # article_dict[957375001]
    # {'product_code': 957375, 'product_type_no': 72,
    # 'graphical_appearance_no': 1010016, 'colour_group_code': 9,
    # 'perceived_colour_value_id': 4, 'perceived_colour_master_id': 5}
    product_code = feature_dict['product_code']
    product_type_no = feature_dict['product_type_no']
    graphical_appearance_no = feature_dict['graphical_appearance_no']
    colour_group_code = feature_dict['colour_group_code']
    perceived_colour_value_id = feature_dict['perceived_colour_value_id']
    perceived_colour_master_id = feature_dict['perceived_colour_master_id']
    if customer_id in user_portrait:
        portrait_dict = user_portrait[customer_id]
        # { 'product_code': set([108775, 116379])
        #   'product_type_no': set([253, 302, 304, 306])
        #   'graphical_appearance_no': set([1010016, 1010017])
        #   'colour_group_code': set([9, 11, 13])
        #   'perceived_colour_value_id': set([1, 3, 4, 2])
        #   'perceived_colour_master_id': set([11, 5 ,9])
        #   }
        if 'product_code' in portrait_dict:
            portrait_dict['product_code'].add(product_code)
        else:
            portrait_dict['product_code'] = set([product_code])
        if 'product_type_no' in portrait_dict:
            portrait_dict['product_type_no'].add(product_type_no)
        else:
            portrait_dict['product_type_no'] = set([product_type_no])
        if 'graphical_appearance_no' in portrait_dict:
            portrait_dict['graphical_appearance_no'].add(graphical_appearance_no)
        else:
            portrait_dict['graphical_appearance_no'] = set([graphical_appearance_no])
        if 'colour_group_code' in portrait_dict:
            portrait_dict['colour_group_code'].add(colour_group_code)
        else:
            portrait_dict['colour_group_code'] = set([colour_group_code])
        if 'perceived_colour_value_id' in portrait_dict:
            portrait_dict['perceived_colour_value_id'].add(perceived_colour_value_id)
        else:
            portrait_dict['perceived_colour_value_id'] = set([perceived_colour_value_id])
        if 'perceived_colour_master_id' in portrait_dict:
            portrait_dict['perceived_colour_master_id'].add(perceived_colour_master_id)
        else:
            portrait_dict['perceived_colour_master_id'] = set([perceived_colour_master_id])
        user_portrait[customer_id] = portrait_dict
    else:
        portrait_dict = dict()
        portrait_dict['product_code'] = set([product_code])
        portrait_dict['product_type_no'] = set([product_type_no])
        portrait_dict['graphical_appearance_no'] = set([graphical_appearance_no])
        portrait_dict['colour_group_code'] = set([colour_group_code])
        portrait_dict['perceived_colour_value_id'] = set([perceived_colour_value_id])
        portrait_dict['perceived_colour_master_id'] = set([perceived_colour_master_id])
        user_portrait[customer_id] = portrait_dict
np.save("./output/user_portrait.npy", user_portrait)

# 基于用户兴趣的种子物品召回



In [1]:
def seeds_recall(seeds, rec_num):
    """
    基于用户喜欢的种子物品，为用户召回关联物品。
    :param seeds: list，用户种子物品 ~ [item1,item2, ..., item_i]
    :param rec_num: 最终召回的物品数量
    :return: list ~ [(item1,score1),(item2,score2), ..., (item_k,score_k)]
    """
    jaccard_sim_rec_path = "./data/output/netflix_prize_jaccard_sim_rec.npy"
    sim = np.load(jaccard_sim_rec_path, allow_pickle=True).item()
    recalls = []
    for seed in seeds:
        recalls.extend(sim[seed])
    # 可能不同召回的物品有重叠，那么针对重叠的，可以将score累加，然后根据score降序排列。
    tmp_dict = dict()
    for (i, s) in recalls:
        if i in tmp_dict:
            tmp_dict[i] = tmp_dict[i] + s
        else:
            tmp_dict[i] = s
    rec = sorted(tmp_dict.items(), key=lambda item: item[1], reverse=True)
    return rec[0:rec_num]

# 基于标签的用户画像召回

In [2]:
import pandas as pd
import numpy as np
import random

rec_num = 30
user_portrait = np.load("./output/user_portrait.npy", allow_pickle=True).item()
product_code_portrait_dict = np.load("./output/product_code_portrait_dict.npy", allow_pickle=True).item()
product_type_no_portrait_dict = np.load("./output/product_type_no_portrait_dict.npy", 
                                        allow_pickle=True).item()
graphical_appearance_no_portrait_dict = np.load("./output/graphical_appearance_no_portrait_dict.npy", 
                                                allow_pickle=True).item()
colour_group_code_portrait_dict = np.load("./output/colour_group_code_portrait_dict.npy", 
                                          allow_pickle=True).item()
perceived_colour_value_id_portrait_dict = np.load("./output/perceived_colour_value_id_portrait_dict.npy", 
                                                  allow_pickle=True).item()
perceived_colour_master_id_portrait_dict = np.load("./output/perceived_colour_master_id_portrait_dict.npy", 
                                                   allow_pickle=True).item()
# {12:{id1,id2,...,id_k}, 34:{id1,id2,...,id_k}}, 这里面每个物品对应的特征权重都一样
customer_rec = dict()
k =0
###取前一千个用户
for customer in user_portrait.keys():
    if k>1000:
        break
    portrait_dict = user_portrait[customer]
    # { 'product_code': set([108775, 116379])
    #   'product_type_no': set([253, 302, 304, 306])
    #   'graphical_appearance_no': set([1010016, 1010017])
    #   'colour_group_code': set([9, 11, 13])
    #   'perceived_colour_value_id': set([1, 3, 4, 2])
    #   'perceived_colour_master_id': set([11, 5 ,9])
    #   }
    product_code_rec = set()
    product_type_no_rec = set()
    graphical_appearance_no_rec = set()
    colour_group_code_rec = set()
    perceived_colour_value_id_rec = set()
    perceived_colour_master_id_rec = set()
    rec = []
    # 针对6类特征画像类型，用户在某个类型中都可能有兴趣点，针对每个兴趣点获得对应的物品id，将同一个画像类型
    # 中所有的兴趣点的物品推荐聚合到一起，最后对该兴趣画像类型，只取 rec_num 个推荐。
    # 最后，对6个兴趣画像类型的推荐，最终合并在一起，只取 rec_num 个作为最终的推荐。
    if 'product_code' in portrait_dict:
        product_code_set = portrait_dict['product_code']
        for product_code in product_code_set:
            product_code_rec = product_code_rec | product_code_portrait_dict[product_code]
        if len(product_code_rec) > rec_num:
            s = random.sample(product_code_rec, rec_num)
        else:
            s = product_code_rec
        rec.append(s)
    if 'product_type_no' in portrait_dict:
        product_type_no_set = portrait_dict['product_type_no']
        for product_type_no in product_type_no_set:
            product_type_no_rec = product_type_no_rec | product_type_no_portrait_dict[product_type_no]
        if len(product_code_rec) > rec_num:
            s = random.sample(product_code_rec, rec_num)
        else:
            s = product_code_rec
        rec.append(s)
    if 'graphical_appearance_no' in portrait_dict:
        graphical_appearance_no_set = portrait_dict['graphical_appearance_no']
        for graphical_appearance_no in graphical_appearance_no_set:
            graphical_appearance_no_rec = graphical_appearance_no_rec | graphical_appearance_no_portrait_dict[graphical_appearance_no]
        if len(product_code_rec) > rec_num:
            s = random.sample(product_code_rec, rec_num)
        else:
            s = product_code_rec
        rec.append(s)
    if 'colour_group_code' in portrait_dict:
        colour_group_code_set = portrait_dict['colour_group_code']
        for colour_group_code in colour_group_code_set:
            colour_group_code_rec = colour_group_code_rec | colour_group_code_portrait_dict[colour_group_code]
        if len(product_code_rec) > rec_num:
            s = random.sample(product_code_rec, rec_num)
        else:
            s = product_code_rec
        rec.append(s)
    if 'perceived_colour_value_id' in portrait_dict:
        perceived_colour_value_id_set = portrait_dict['perceived_colour_value_id']
        for perceived_colour_value_id in perceived_colour_value_id_set:
            perceived_colour_value_id_rec = perceived_colour_value_id_rec | perceived_colour_value_id_portrait_dict[perceived_colour_value_id]
        if len(product_code_rec) > rec_num:
            s = random.sample(product_code_rec, rec_num)
        else:
            s = product_code_rec
        rec.append(s)
    if 'perceived_colour_master_id' in portrait_dict:
        perceived_colour_master_id_set = portrait_dict['perceived_colour_master_id']
        for perceived_colour_master_id in perceived_colour_master_id_set:
            perceived_colour_master_id_rec = perceived_colour_master_id_rec | perceived_colour_master_id_portrait_dict[perceived_colour_master_id]
        if len(product_code_rec) > rec_num:
            s = random.sample(product_code_rec, rec_num)
        else:
            s = product_code_rec
        rec.append(s)
    if len(rec) > rec_num:
        rec = random.sample(rec, rec_num)
        
    customer_rec[customer] = rec
    k+=1
np.save("./output/customer_rec.npy", customer_rec)

In [11]:
customer_rec = np.load("./output/customer_rec.npy", allow_pickle=True).item()

sub =  pd.DataFrame(list(customer_rec.items()), columns=['customer_id', 'pre'])
sub.head()

Unnamed: 0,customer_id,pre
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[[750424007, 852643003, 852643006, 578020007, ..."
1,00007d2de826758b65a93dd24ce629ed66842531df6699...,"[[633150001, 633150004, 657291003, 693387002, ..."
2,00083cda041544b2fbb0e0d2905ad17da7cf1007526fb4...,"[[599580044, 509210025, 688873007, 687034007, ..."
3,0008968c0d451dbc5a9968da03196fe20051965edde741...,"[[736530006, 817491004, 516000072, 796210002, ..."
4,000aa7f0dc06cd7174389e76c9e132a67860c5f65f9706...,"[[692454002, 717490010, 685687004, 685687002, ..."


In [13]:
sub.to_csv('./output/用户画像sub_前一千个用户.csv',index=None)