In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [2]:
# order_products__prior.csv：订单与商品信息
# 字段：order_id, product_id, add_to_cart_order, reordered
# products.csv：商品信息
# 字段：product_id, product_name, aisle_id, department_id
# orders.csv：用户的订单信息
# 字段：order_id,user_id,eval_set,order_number,….
# aisles.csv：商品所属具体物品类别
# 字段： aisle_id, aisle

In [3]:
# 1.获取数据
order_products = pd.read_csv('../../data/instacart/order_products__prior.csv')
products = pd.read_csv('../../data/instacart/products.csv')
orders = pd.read_csv('../../data/instacart/orders.csv')
aisles = pd.read_csv('../../data/instacart/aisles.csv')

In [4]:
order_products.head()  # 订单与商品信息

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [5]:
order_products.shape

(32434489, 4)

In [6]:
products.head()  # 商品信息

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [7]:
products.shape

(49688, 4)

In [8]:
orders.head()  # 用户订单信息

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [9]:
orders.shape

(3421083, 7)

In [10]:
aisles.head()  # 商品所属具体物品类别

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [11]:
aisles.shape

(134, 2)

In [12]:
# 合并数据集 将商品信息和 订单与商品信息  和 订单信息 和商品所属类别信息 进行合并
index_count = order_products.shape[0]//2
data = pd.merge(left=order_products.iloc[:index_count, :], right=products, on=['product_id'], how='inner')

In [13]:
data = pd.merge(left=data, right=orders, on=['order_id'], how='inner')

In [14]:
index_count

16217244

In [15]:
data = pd.merge(left=data, right=aisles, on=['aisle_id'], how='inner')

In [16]:
data.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle
0,2,33120,1,1,Organic Egg Whites,86,16,202279,prior,3,5,9,8.0,eggs
1,26,33120,5,0,Organic Egg Whites,86,16,153404,prior,2,0,16,7.0,eggs
2,120,33120,13,0,Organic Egg Whites,86,16,23750,prior,11,6,8,10.0,eggs
3,327,33120,5,1,Organic Egg Whites,86,16,58707,prior,21,6,9,8.0,eggs
4,390,33120,28,1,Organic Egg Whites,86,16,166654,prior,48,0,12,9.0,eggs


In [17]:
data.shape

(16217244, 14)

In [18]:
# 建立user_id和aisle的交叉表
data_ct = pd.crosstab(data['user_id'], data['aisle'])
data = data_ct

In [19]:
data_ct.head()

aisle,air fresheners candles,asian foods,baby accessories,baby bath body care,baby food formula,bakery desserts,baking ingredients,baking supplies decor,beauty,beers coolers,...,spreads,tea,tofu meat alternatives,tortillas flat bread,trail mix snack mix,trash bags liners,vitamins supplements,water seltzer sparkling water,white wines,yogurt
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,3,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,26
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,2,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [20]:
data_ct.shape

(201001, 134)

In [30]:
# 取一部分数据做建模演示
data = data_ct.iloc[:10000,:]

In [31]:
data.shape

(10000, 134)

In [32]:
# 降维PCA  转换器
transfer = PCA(n_components=0.95)
res = transfer.fit_transform(data)

In [33]:
res.shape

(10000, 45)

In [34]:
# 聚类
estimator = KMeans(n_clusters=8)
# 训练模型
estimator.fit(data)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=8, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [35]:
# 模型评估 SC系数（轮廓系数）
# 获取分类结果
y_predict = estimator.predict(data)

In [36]:
y_predict.shape

(10000,)

In [37]:
data.shape

(10000, 134)

In [38]:
# 计算轮廓系数
silhouette_score(data, y_predict)  # 平均轮廓系数

0.3219823064057587