In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [8]:
# Read the per-member feature table and key it by the member-number column
# ('会员编号') in a single chained expression.
rfmMoreCut = pd.read_csv('morefeaCut/rfmMoreCUt.csv').set_index('会员编号')

In [9]:
# Preview the first rows of the member table after it was indexed by member number.
rfmMoreCut.head()

Unnamed: 0_level_0,money,recent,frequency,sex,age
会员编号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1027478498,439.5,4,20,2,21
1030794986,1.0,57,1,1,37
1030797014,248.0,38,15,1,31
1030904479,4.0,50,2,2,36
1033252014,1309.0,39,29,2,36


In [10]:
# Natural-log transform of the five feature columns, rounded to two decimals.
# NOTE(review): 'sex' is log-transformed together with the numeric features;
# it looks like a 1/2 coded category — confirm that this is intended.
rfm_log = (
    rfmMoreCut.loc[:, ['money', 'recent', 'frequency', 'sex', 'age']]
    .apply(np.log, axis=1)
    .round(2)
)
rfm_log.describe()

Unnamed: 0,money,recent,frequency,sex,age
count,5314.0,5314.0,5314.0,5314.0,5314.0
mean,6.82719,2.69369,3.086415,0.382006,3.464037
std,0.763662,0.943848,0.30396,0.343042,0.237059
min,0.0,0.0,0.0,0.0,2.4
25%,6.39,2.2,2.94,0.0,3.33
50%,6.88,3.0,3.09,0.69,3.47
75%,7.35,3.43,3.26,0.69,3.61
max,8.52,4.09,4.5,0.69,4.81


In [11]:
# Z-score every log feature: subtract the column mean, then divide by the
# column standard deviation (pandas' default, i.e. the sample std).
rfm_log_zs = rfm_log.sub(rfm_log.mean()).div(rfm_log.std())
rfm_log_zs.describe()

Unnamed: 0,money,recent,frequency,sex,age
count,5314.0,5314.0,5314.0,5314.0,5314.0
mean,1.141361e-14,-6.422429e-14,-4.422106e-14,-6.499714e-14,-8.011991e-14
std,1.0,1.0,1.0,1.0,1.0
min,-8.940064,-2.853945,-10.15402,-1.113584,-4.488488
25%,-0.5724918,-0.5230611,-0.481692,-1.113584,-0.5654141
50%,0.06915302,0.3245329,0.01179389,0.8978321,0.02515615
75%,0.6846083,0.7801146,0.5710779,0.8978321,0.6157264
max,2.216699,1.47938,4.650561,0.8978321,5.677757


In [12]:
from sklearn.metrics import pairwise
class KernelKMeans(object):
    """Kernel k-means clustering driven by a full Gram (kernel) matrix.

    Every sample is repeatedly re-assigned to the cluster that minimises

        K[i, i] - 2 * mean_j(K[i, j]) + mean_{j, l}(K[j, l]),   j, l in cluster c

    which is the squared distance between sample ``i`` and the centroid of
    cluster ``c`` in the feature space induced by the kernel.

    Parameters
    ----------
    n_clusters : int, default 8
        Number of clusters requested. Some clusters may end up empty.
    max_iter : int, default 300
        Upper bound on the number of re-assignment passes.
    kernel : callable or None, default None
        ``kernel(X)`` must return the ``(N, N)`` Gram matrix of ``X``.
        ``None`` selects ``pairwise.linear_kernel`` (the historical default);
        it is looked up at construction time, so ``pairwise`` from
        ``sklearn.metrics`` must be importable in the defining scope only
        when the default is actually used.
    random_state : int or None, default None
        Seed for the random initial labelling. ``None`` draws from NumPy's
        global random state, so ``np.random.seed`` still controls the result.

    Attributes
    ----------
    N : int
        Number of samples seen by the last ``fit_predict`` call.
    K : array
        Gram matrix computed by ``kernel`` for the last input.
    y : numpy.ndarray
        Current cluster label of every sample.
    """

    def __init__(self,
                 n_clusters=8,
                 max_iter=300,
                 kernel=None,
                 random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        # Resolve the default here rather than in the signature so the class
        # can be defined and used with a custom kernel without scikit-learn.
        self.kernel = pairwise.linear_kernel if kernel is None else kernel
        self.random_state = random_state

    def _initialize_cluster(self, X):
        """Draw a random initial labelling and compute the Gram matrix of X."""
        self.N = np.shape(X)[0]
        if self.random_state is None:
            # Same source of randomness as before: the global NumPy RNG.
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        self.y = rng.randint(low=0, high=self.n_clusters, size=self.N)
        self.K = self.kernel(X)

    def fit_predict(self, X):
        """Cluster ``X`` and return one integer label per sample.

        Iterates until the labels stop changing or ``max_iter`` passes have
        been made. Empty clusters are never chosen by any sample.
        """
        self._initialize_cluster(X)
        K = self.K
        # K[i, i] does not change between passes; compute it once, as float so
        # the in-place updates below also work for integer-valued kernels.
        self_similarity = np.diag(K).reshape((-1, 1)).astype(float)
        for _ in range(self.max_iter):
            obj = np.tile(self_similarity, self.n_clusters)
            # minlength guarantees one count per requested cluster even when
            # the highest-numbered clusters currently have no members.
            N_c = np.bincount(self.y, minlength=self.n_clusters)
            for c in range(self.n_clusters):
                if N_c[c] == 0:
                    # An empty cluster has no centroid. Without this guard the
                    # 0/0 below becomes NaN and argmin would send every sample
                    # to the empty cluster.
                    obj[:, c] = np.inf
                    continue
                members = self.y == c
                obj[:, c] -= 2 * np.sum(K[:, members], axis=1) / N_c[c]
                obj[:, c] += np.sum(K[members][:, members]) / (N_c[c] ** 2)
            new_y = np.argmin(obj, axis=1)
            if np.array_equal(new_y, self.y):
                # Fixed point reached: further passes would reproduce the
                # same labels, so stopping early does not change the result.
                break
            self.y = new_y
        return self.y

In [None]:
# Disabled scratch cell: everything below is wrapped in a string literal, so
# none of it executes.
# NOTE(review): as written it could not run anyway — `k` is never defined and
# nothing loops over `ks`; `KMeans` and `inertias` are unused. It looks like
# an unfinished sweep over cluster counts — confirm before reviving or deleting.
'''
from sklearn.cluster import KMeans
ks = range(1, 17)
inertias = []
kc = KernelKMeans(n_clusters = k,max_iter=100,kernel=lambda X: pairwise.rbf_kernel(X, gamma=0.1))
kc.fit_predict(rfm_log_zs)
'''

In [26]:
# Kernel k-means (RBF kernel, gamma=0.1) with four clusters on the
# standardised log features, followed by a per-cluster summary.
clusterNum = 4
# KernelKMeans draws its initial labels from NumPy's global random state, so
# without a fixed seed the cluster numbering and every table/plot derived from
# it change on each run. The seed value itself is arbitrary.
np.random.seed(42)
model = KernelKMeans(n_clusters=clusterNum, max_iter=100,
                     kernel=lambda X: pairwise.rbf_kernel(X, gamma=0.1))
cluster_labels = model.fit_predict(rfm_log_zs)
# Attach the labels to the untransformed table so the summary is in original units.
rfm_kn = rfmMoreCut.assign(cluster=cluster_labels)
rfm_kn_describe = (
    rfm_kn.groupby('cluster')
    .agg({'money': ['mean', 'count'], 'recent': 'mean', 'frequency': 'mean'})
    .round(2)
)

In [27]:
# Pie chart: share of members that fall into each of the four clusters.
labels = ['cluster0', 'cluster1', 'cluster2', 'cluster3']
# Member count per cluster, taken from the ('money', 'count') summary column.
sizes = rfm_kn_describe[('money', 'count')].to_numpy()
explode = (0, 0, 0, 0)
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=False,
    startangle=90
)
plt.title('user proportion ')

<IPython.core.display.Javascript object>

Text(0.5,1,'user proportion ')

In [29]:
# Pie chart: share of total 'money' contributed by each cluster.
# Reuses `labels` defined in the user-proportion cell.
customer_sales = rfm_kn.groupby('cluster')[['money']].sum()
valueSizes = customer_sales['money'].to_numpy()
valueExplode = (0, 0, 0, 0)
plt.pie(
    valueSizes,
    explode=valueExplode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=False,
    startangle=90
)
plt.title('value proportion ')

<IPython.core.display.Javascript object>

Text(0.5,1,'value proportion ')

In [30]:
# Show the per-cluster summary: mean and count of 'money', mean 'recent', mean 'frequency'.
rfm_kn_describe

Unnamed: 0_level_0,money,money,recent,frequency
Unnamed: 0_level_1,mean,count,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1809.56,1227,21.15,27.32
1,1082.39,1984,22.96,22.06
2,1411.84,899,3.89,25.04
3,568.43,1204,26.46,18.14


In [17]:
%matplotlib notebook
from mpl_toolkits.mplot3d import proj3d
ax = plt.subplot(111, projection = '3d')
cmap = plt.cm.Spectral
norm = plt.Normalize(vmin=0, vmax=3)
for i in range(clusterNum):
    clusterIndex=rfm_kn.loc[rfm_kn['cluster']==i]
    ax.scatter(clusterIndex.iloc[:, 0], clusterIndex.iloc[:, 1], clusterIndex.iloc[:, 2], c = cmap(norm(clusterIndex.iloc[0, 5])),label='cluster{}'.format(i))
ax.set_xlabel('M')
ax.set_xlim([0,3000])
ax.set_ylabel('R')
ax.set_ylim([0,40])
ax.set_zlabel('F')
ax.set_zlim([0, 50])
ax.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x11db00f0>

In [23]:
# Kernel k-means (RBF kernel, gamma=0.1) with three clusters on the
# standardised log features, followed by a per-cluster summary.
clusterNum = 3
# KernelKMeans draws its initial labels from NumPy's global random state, so
# without a fixed seed the cluster numbering and every table/plot derived from
# it change on each run. The seed value itself is arbitrary.
np.random.seed(42)
model = KernelKMeans(n_clusters=clusterNum, max_iter=100,
                     kernel=lambda X: pairwise.rbf_kernel(X, gamma=0.1))
cluster_labels = model.fit_predict(rfm_log_zs)
# Attach the labels to the untransformed table so the summary is in original units.
rfm_kn = rfmMoreCut.assign(cluster=cluster_labels)
rfm_kn_describe = (
    rfm_kn.groupby('cluster')
    .agg({'money': ['mean', 'count'], 'recent': 'mean', 'frequency': 'mean'})
    .round(2)
)

In [22]:

# NOTE(review): proj3d itself is unused; the import is presumably kept for its
# side effect of making the '3d' projection available on older matplotlib —
# confirm before removing.
from mpl_toolkits.mplot3d import proj3d
# 3-D scatter of money / recent / frequency, one colour per cluster.
# Start from a fresh figure so the axes are not added to whichever figure
# happens to be current.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
cmap = plt.cm.Spectral
norm = plt.Normalize(vmin=0, vmax=3)
for i in range(clusterNum):
    clusterIndex = rfm_kn.loc[rfm_kn['cluster'] == i]
    # `color=` takes one RGBA tuple unambiguously; `c=` would read a 4-tuple
    # as four scalar values when the cluster has exactly four points.
    # Using `i` directly (it equals the label of every selected row) also
    # avoids an IndexError on an empty cluster.
    ax.scatter(clusterIndex['money'], clusterIndex['recent'], clusterIndex['frequency'],
               color=cmap(norm(i)), label='cluster{}'.format(i))
ax.set_xlabel('M')
ax.set_xlim([0,3000])
ax.set_ylabel('R')
ax.set_ylim([0,40])
ax.set_zlabel('F')
ax.set_zlim([0, 50])
ax.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x169dab50>

In [24]:
# Pie chart: share of members that fall into each of the three clusters.
labels = ['cluster0', 'cluster1', 'cluster2']
# Member count per cluster, taken from the ('money', 'count') summary column.
sizes = rfm_kn_describe[('money', 'count')].to_numpy()
# Pull the second and third wedges out of the pie for emphasis.
explode = (0, 0.2, 0.1)
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=False,
    startangle=90
)
plt.title('user proportion ')

<IPython.core.display.Javascript object>

Text(0.5,1,'user proportion ')

In [25]:
# Pie chart: share of total 'money' contributed by each cluster.
# Reuses `labels` defined in the user-proportion cell.
customer_sales = rfm_kn.groupby('cluster')[['money']].sum()
valueSizes = customer_sales['money'].to_numpy()
# Pull the second and third wedges out of the pie for emphasis.
valueExplode = (0, 0.2, 0.1)
plt.pie(
    valueSizes,
    explode=valueExplode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=False,
    startangle=90
)
plt.title('value proportion ')

<IPython.core.display.Javascript object>

Text(0.5,1,'value proportion ')

In [31]:
# Show the per-cluster summary: mean and count of 'money', mean 'recent', mean 'frequency'.
# NOTE(review): the saved output for this cell lists four clusters although the
# preceding cells in document order use clusterNum = 3; the execution counts
# suggest it was run right after the four-cluster cell. Re-run the notebook
# top to bottom to refresh it.
rfm_kn_describe

Unnamed: 0_level_0,money,money,recent,frequency
Unnamed: 0_level_1,mean,count,mean,mean
cluster,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1809.56,1227,21.15,27.32
1,1082.39,1984,22.96,22.06
2,1411.84,899,3.89,25.04
3,568.43,1204,26.46,18.14
