In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

try:
    # scikit-learn >= 0.20 spells the metric "harabasz"; the misspelled
    # "harabaz" alias was deprecated in 0.20 and removed in 0.23.
    from sklearn.metrics.cluster import calinski_harabasz_score, silhouette_score
    calinski_harabaz_score = calinski_harabasz_score  # legacy name stays usable
except ImportError:
    # Older scikit-learn releases only ship the legacy spelling.
    from sklearn.metrics.cluster import calinski_harabaz_score, silhouette_score
    calinski_harabasz_score = calinski_harabaz_score

# 1. 加载数据
# 2. 数据基本处理
# 2.1 数据合并
# 2.2 交叉表统计
# 3. 特征工程(特征降维-主成分分析)
# 4. 机器学习(模型训练-KMeans)
# 5. 模型评估

In [2]:
# 1. Load data
# Read the four Instacart CSV tables; all paths are relative to the
# notebook's working directory. The reads are independent of each other.
aisles = pd.read_csv('./data/instacart/aisles.csv')
products = pd.read_csv('./data/instacart/products.csv')
order_products__prior = pd.read_csv('./data/instacart/order_products__prior.csv')
orders = pd.read_csv('data/instacart/orders.csv')

In [3]:
# 2. Basic data processing
# 2.1 Merge the tables step by step: orders -> order lines -> products -> aisles.
# Each step is a default (inner) merge on the id column the two frames share.
table1 = orders.merge(order_products__prior, on='order_id')
table2 = table1.merge(products, on='product_id')
table3 = table2.merge(aisles, on='aisle_id')

In [5]:
# 2.2 Cross-tabulation
# One row per user_id and one column per aisle; each cell counts how many
# rows of the merged table fall into that (user, aisle) pair.
table = pd.crosstab(table3['user_id'], table3['aisle'])
table.shape

(206209, 134)

In [7]:
# 3. Feature engineering (dimensionality reduction - PCA)
# A float n_components between 0 and 1 asks PCA to keep as many components
# as are needed to explain that fraction of the variance.
explained_variance_target = 0.9
transfer = PCA(n_components=explained_variance_target)
datas = transfer.fit_transform(table)
datas.shape

(206209, 27)

In [8]:
# 4. Machine learning (model training - KMeans)
# KMeans starts from randomly chosen centroids, so the seed is pinned to make
# the cluster labels (and every score derived from them) reproducible after a
# kernel restart. n_init is spelled out because its default differs between
# scikit-learn releases (10 restarts historically, 'auto' in newer ones).
# NOTE(review): scores recorded from an earlier unseeded run may differ
# slightly from what this seeded fit produces.
estimator = KMeans(n_clusters=8, n_init=10, random_state=42)
y_pred = estimator.fit_predict(datas)

In [9]:
# 5. Model evaluation
# Calinski-Harabasz index of the clustering (higher means better-separated
# clusters). Uses the correctly spelled `calinski_harabasz_score`; the legacy
# `calinski_harabaz_score` alias no longer exists from scikit-learn 0.23 on.
print('CH系数', calinski_harabasz_score(datas, y_pred))

CH系数 37475.744666570485


In [10]:
# Silhouette coefficient of the KMeans labelling.
# NOTE(review): this scores every row of `datas`; presumably slow on the full
# matrix - consider silhouette_score's `sample_size` option if runtime matters
# (that would change the reported value, so confirm before switching).
silhouette = silhouette_score(datas, y_pred)
print('轮廓系数', silhouette)

# Pipeline recap:
# 1. load data -> 2. basic processing (2.1 merge, 2.2 crosstab)
# -> 3. feature engineering (PCA) -> 4. model training (KMeans)
# -> 5. model evaluation

轮廓系数 0.33496077349842696
