# 基于物品信息构建物品特征矩阵

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from pandas import  DataFrame

# Build a one-hot item feature matrix from the article catalogue and save it
# to ./output/kmeans_train.csv (first column: article_id, then one 0/1 column
# per retained category value).
art = pd.read_csv("./data/rawdata/articles.csv")
# Keep the id plus the six categorical features we care about.
# Distinct-value counts noted by the original author:
#   product_code: 47224            product_type_no: 132
#   graphical_appearance_no: 30    colour_group_code: 50
#   perceived_colour_value_id: 8   perceived_colour_master_id: 20
# .copy() detaches the selection from the raw frame so the column assignments
# below act on an independent frame (no SettingWithCopyWarning).
art = art[['article_id', 'product_code', 'product_type_no', 'graphical_appearance_no',
           'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id']].copy()

# product_code: keep the 10 most frequent codes and merge every other code
# into the placeholder value -1 to keep the one-hot dimensionality small.
most_freq_top10_prod_code = np.array(Counter(art.product_code).most_common(10))[:, 0]
art['product_code'] = art['product_code'].where(
    art['product_code'].isin(most_freq_top10_prod_code), -1)

# product_type_no: same treatment, the 10 most frequent types are kept and
# the rest collapse into -1.
most_frequent_top10_product_type_no = np.array(Counter(art.product_type_no).most_common(10))[:, 0]
art['product_type_no'] = art['product_type_no'].where(
    art['product_type_no'].isin(most_frequent_top10_product_type_no), -1)

one_hot = OneHotEncoder(handle_unknown='ignore')
one_hot_data = art[['product_code', 'product_type_no', 'graphical_appearance_no',
                    'colour_group_code', 'perceived_colour_value_id', 'perceived_colour_master_id']]
# Fit and transform on the same plain ndarray: mixing a DataFrame for fit with
# an ndarray for transform makes newer scikit-learn warn about feature names,
# and fitting on an ndarray keeps the generated column names in "x<i>_<value>" form.
one_hot_values = one_hot_data.values
one_hot.fit(one_hot_values)
feature_array = one_hot.transform(one_hot_values).toarray()

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(one_hot, 'get_feature_names_out'):
    feature_names = one_hot.get_feature_names_out()
else:
    feature_names = one_hot.get_feature_names()

# Put article_id in front of the feature columns so the result can be joined
# with other per-article tables later. Inserting the id column directly keeps
# it as an integer instead of passing it through a float matrix.
df_train = DataFrame(feature_array, columns=list(feature_names))
df_train.insert(0, 'article_id', art['article_id'].values.astype(int))

df_train.to_csv('./output/kmeans_train.csv', index=False)

# 基于物品关联的召回

In [3]:
import pandas as pd
# Reload the item feature matrix from ./output/kmeans_train.csv so this part
# of the notebook can be run without rebuilding it.
df_train = pd.read_csv('./output/kmeans_train.csv') 

In [41]:
from sklearn.cluster import KMeans
import random
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
from pandas import  DataFrame
# Cluster the articles with k-means on their feature vectors, persist the
# article -> cluster assignment, and build the two lookup tables used by the
# recall step: id_cluster_dict and cluster_ids_dict.
n_clusters = 1000
# KMeans parameters:
#   n_clusters: number of clusters to form (library default is 8)
#   init:       centroid initialisation method (default 'k-means++')
#   n_init:     number of runs with different centroid seeds; the best run is kept
#   max_iter:   maximum number of iterations per run (default 300)
# NOTE: no random_state is passed, so cluster labels can differ between runs.
k_means = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10,
                 max_iter=300).fit(df_train.drop(columns=['article_id']).values)

# Pair every article id with its cluster label as an (n_articles, 2) array.
result_array = np.column_stack((df_train['article_id'].values, k_means.labels_))
cluster_result = DataFrame(result_array, columns=['article_id', 'cluster'])
cluster_result.to_csv('./output/kmeans.csv', index=False)
df_cluster = pd.read_csv('./output/kmeans.csv')

# article_id -> cluster label.
id_cluster_dict = dict(df_cluster.values)

# cluster label -> list of article ids in that cluster. Appending in place
# avoids rebuilding the whole list for every row, which the previous
# `lst = lst + [id_]` form did.
cluster_ids_dict = {}
for id_, cluster_ in df_cluster.values:
    cluster_ids_dict.setdefault(cluster_, []).append(id_)



In [42]:
# 基于聚类，为每个物品关联k个最相似的物品。
def article_similar_recall(art_id, k):
    """Return up to ``k`` articles from the same cluster as ``art_id``.

    The candidates are the members of the cluster that ``art_id`` was
    assigned to (looked up through the module-level ``id_cluster_dict`` and
    ``cluster_ids_dict``). ``art_id`` itself is a member of its own cluster
    and is therefore a possible candidate.

    Args:
        art_id: article id to find neighbours for.
        k: maximum number of articles to return.

    Returns:
        A new list of article ids. If the cluster holds more than ``k``
        articles, ``k`` of them are drawn at random without replacement;
        otherwise all members are returned. An ``art_id`` with no known
        cluster yields an empty list.
    """
    rec = cluster_ids_dict.get(id_cluster_dict.get(art_id))
    if not rec:
        # Unknown article or empty cluster: nothing to recommend. Without this
        # guard the len() call below raised TypeError on None.
        return []
    if len(rec) > k:
        return random.sample(rec, k)
    # Hand back a copy so callers cannot mutate the shared cluster membership.
    return list(rec)

In [43]:
# Re-read the raw article table and keep its article_id column; these are the
# ids that recommendations are generated for below.
art = pd.read_csv("./data/rawdata/articles.csv")
art_id = art['article_id']

In [48]:
# For every distinct article id, recall up to rec_num articles from the same
# cluster. The result maps article_id -> list of recommended article ids.
rec_num = 20
sub = {
    # `article` rather than `id`, so the builtin id() is not shadowed.
    article: article_similar_recall(article, rec_num)
    for article in set(art_id.values)
}



In [None]:
# Notebook display of the article_id -> recommendations mapping.
sub

In [50]:
# Split the {article_id: recommendations} mapping into two parallel columns:
# 'art_id' holds the keys and 'pre' holds the matching recommendation lists.
d = {
    'art_id': list(sub),
    'pre': list(sub.values()),
}

In [54]:
# Turn the column dict into a two-column DataFrame (art_id, pre).
df = pd.DataFrame(d,columns=['art_id','pre'])

In [55]:
# Preview the first rows of the recommendation table.
df.head()

Unnamed: 0,art_id,pre
0,522453001,"[554529002, 730683019, 620621002, 570489001, 6..."
1,522453002,"[645422003, 857762001, 845095001, 838901001, 7..."
2,651690001,"[727180003, 834059001, 698437004, 888229006, 4..."
3,749994001,"[734319002, 664405015, 685920001, 561758008, 5..."
4,651690003,"[828631002, 559971001, 895421001, 841473004, 6..."


In [57]:
# Persist the per-article recommendations; the row index is not written.
df.to_csv('./output/相似物品推荐.csv', index=False)

In [1]:
import pandas as pd

# Reload the per-article recommendations written earlier.
# NOTE(review): after the CSV round-trip the 'pre' column presumably holds the
# textual form of each list rather than a Python list — confirm before parsing.
items = pd.read_csv('./output/相似物品推荐.csv')
items.head()

Unnamed: 0,art_id,pre
0,522453001,"[554529002, 730683019, 620621002, 570489001, 6..."
1,522453002,"[645422003, 857762001, 845095001, 838901001, 7..."
2,651690001,"[727180003, 834059001, 698437004, 888229006, 4..."
3,749994001,"[734319002, 664405015, 685920001, 561758008, 5..."
4,651690003,"[828631002, 559971001, 895421001, 841473004, 6..."


In [13]:
# Align the key column name with the transaction table ('article_id') so the
# two frames can be merged on it.
items = items.rename(columns={'art_id': 'article_id'})

In [11]:
# Preview of items. The captured output below still shows 'art_id': judging by
# the execution counts, this cell (In [11]) ran before the rename cell (In [13]).
items.head()

Unnamed: 0,art_id,pre
0,522453001,"[554529002, 730683019, 620621002, 570489001, 6..."
1,522453002,"[645422003, 857762001, 845095001, 838901001, 7..."
2,651690001,"[727180003, 834059001, 698437004, 888229006, 4..."
3,749994001,"[734319002, 664405015, 685920001, 561758008, 5..."
4,651690003,"[828631002, 559971001, 895421001, 841473004, 6..."


In [4]:
# Load the raw transaction log; per the preview below, each row carries
# t_dat, customer_id, article_id, price and sales_channel_id.
t_t = pd.read_csv('./data/rawdata/transactions_train.csv')
t_t.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [14]:
# Attach each purchased article's recommendation list to the purchase row.
# This is pandas' default inner join on article_id; `d` is rebound from the
# earlier column dict to this merged frame.
purchases = t_t[['customer_id', 'article_id']]
d = purchases.merge(items, on='article_id')

In [20]:
# Preview the merged (customer_id, article_id, pre) rows.
d.head()

Unnamed: 0,customer_id,article_id,pre
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,"[662677002, 904423001, 649900001, 673552002, 8..."
1,3681748607f3287d2c3a65e00bb5fb153de30e9becf158...,663713001,"[662677002, 904423001, 649900001, 673552002, 8..."
2,4ef5967ff17bf474bffebe5b16bd54878e1d4105f7b4ed...,663713001,"[662677002, 904423001, 649900001, 673552002, 8..."
3,6b7b10d2d47516c82a6f97332478dab748070f09693f09...,663713001,"[662677002, 904423001, 649900001, 673552002, 8..."
4,8ac137752bbe914aa4ae6ad007a9a0c5b67a1ab2b2d474...,663713001,"[662677002, 904423001, 649900001, 673552002, 8..."


In [23]:
# Keep only the customer and the recommendation list; `sub` is rebound from
# the earlier article -> recommendations dict to this DataFrame.
sub = d.loc[:,['customer_id','pre']]

# NOTE(review): unlike the other to_csv calls in this file, no index argument
# is passed here, so the row index is written as an extra first column.
# Confirm that is intended.
sub.to_csv('./output/sub_物品聚类.csv')