In [1]:
import pandas as pd
import numpy as np
from numpy.linalg import *
import scipy as sp
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.decomposition import *
from sklearn.metrics import *
from sklearn.neighbors import *
from collections import *
from pyclust import *
from sklearn.manifold import *
from sklearn.cluster import KMeans
from module.EKmeans import *
from module.CTree import *
import datetime
from collections import *

# Location of the raw transaction export, relative to the notebook.
DATA_PATH = './trade_new.csv'

# Logical field name -> column header used in the CSV file.
_col = dict(
    uid='uid',
    tran_time='sldatime',
    gender_age='cmrid',
    vipno='vipno',
    itemno='pluno',
    amount='amt',
    quantity='qty',
    brandno='bndno',
)

# Category-level columns; the logical name and the header are the same.
_col_class = {
    level: level
    for level in ('class1', 'class2', 'class3', 'class4', 'class5')
}

# Combined lookup: raw transaction columns first, then the class columns.
_col2 = dict(_col)
_col2.update(_col_class)

print('columns', _col2)

CTree.py init
columns {'uid': 'uid', 'tran_time': 'sldatime', 'gender_age': 'cmrid', 'vipno': 'vipno', 'itemno': 'pluno', 'amount': 'amt', 'quantity': 'qty', 'brandno': 'bndno', 'class1': 'class1', 'class2': 'class2', 'class3': 'class3', 'class4': 'class4', 'class5': 'class5'}


In [2]:
# Load the raw transactions and keep only the columns named in _col.
# A list is passed explicitly rather than a dict view.
df = pd.read_csv(DATA_PATH)
df = df[list(_col.values())]

# Derive the category hierarchy from the item number string:
# class1..class4 are its 2-, 3-, 4- and 5-character prefixes and class5 is
# whatever follows the fifth character.
item_codes = df[_col2['itemno']].astype('str')
category = pd.DataFrame(
    {
        _col_class['class1']: item_codes.str[:2],
        _col_class['class2']: item_codes.str[:3],
        _col_class['class3']: item_codes.str[:4],
        _col_class['class4']: item_codes.str[:5],
        _col_class['class5']: item_codes.str[5:],
    },
    index=df.index,
)

# category shares df's index, so the join always lines up row for row.
# There is no fallback path that would leave new_df empty.
new_df = df.join(category)

# Total amount per customer at each depth of the class hierarchy.
# df_amount[k] is grouped by vipno plus class1..class(k+1).
class_levels = [_col2['class1'], _col2['class2'], _col2['class3'], _col2['class4']]
df_amount = [
    new_df
    .groupby([_col2['vipno'], *class_levels[:depth]], as_index=False)
    .agg({_col2['amount']: 'sum'})
    for depth in range(1, len(class_levels) + 1)
]
# Keep the individual names that other cells refer to.
df_amount_sum1, df_amount_sum2, df_amount_sum3, df_amount_sum = df_amount
del df
# df_amount_sum.to_csv('./result/a1-amount_sum.csv')

In [4]:
# Distinct customer ids present in the class4-level spending table.
set_vipno = set(df_amount_sum[_col2['vipno']])
# Customer id -> row position; positions follow the set's iteration order.
mapping_index_v = {vipno: row for row, vipno in enumerate(set_vipno)}


def aggByUser(category='class4'):
    """Collect each customer's spending per category as nested dicts.

    Reads the module-level ``set_vipno``, ``df_amount`` and ``_col2``.

    Parameters
    ----------
    category : str
        Class level name such as ``'class4'``. Its last character selects
        the table ``df_amount[int(category[-1]) - 1]`` and the name itself
        selects the category column through ``_col2``.

    Returns
    -------
    dict
        ``{vipno: {category value: amount}}`` with an (initially empty)
        entry for every id in ``set_vipno``.
    """
    level_table = df_amount[int(category[-1]) - 1]
    vip_col = _col2['vipno']
    class_col = _col2[category]
    amount_col = _col2['amount']

    map_single = {v: {} for v in set_vipno}
    # Walk the table once instead of re-filtering the whole frame for every
    # customer; rows keep their relative order, so later rows still win.
    for _, row in level_table.iterrows():
        per_user = map_single.get(row[vip_col])
        if per_user is not None:  # ignore ids outside set_vipno
            per_user[row[class_col]] = row[amount_col]
    return map_single


def computeUserFeatureOfClass(category='class4'):
    """Build the customer-by-category spending matrix for one class level.

    Reads the module-level ``new_df``, ``df_amount``, ``set_vipno``,
    ``mapping_index_v`` and ``_col2``. Rows follow ``mapping_index_v``;
    columns follow the iteration order of the set of distinct values found
    in the chosen class column of ``new_df``.

    Parameters
    ----------
    category : str
        Class level name such as ``'class4'``; its last character picks the
        aggregated table ``df_amount[int(category[-1]) - 1]``.

    Returns
    -------
    numpy.ndarray
        Float matrix holding the summed amount for each (customer, category)
        pair that appears in the aggregated table, and 0.0 elsewhere.
    """
    class_col = _col2[category]
    class_values = set(new_df[class_col])
    column_of = {value: pos for pos, value in enumerate(class_values)}

    features = np.array(
        [[0.0] * len(class_values) for _ in range(len(set_vipno))]
    )

    level_table = df_amount[int(category[-1]) - 1]
    for _, rec in level_table.iterrows():
        row_pos = mapping_index_v[rec[_col2['vipno']]]
        col_pos = column_of[rec[class_col]]
        features[row_pos, col_pos] = rec[_col2['amount']]
    return features


def computeJaccardDistance(a,b):
    """Weighted (min/max) Jaccard distance between two equal-length vectors.

    Computes ``1 - sum(min(a, b)) / sum(max(a, b))`` element-wise.

    When both sums are zero (for example two all-zero or two empty vectors)
    the ratio is 0/0; the vectors are then treated as identical and 0.0 is
    returned instead of NaN.

    Parameters
    ----------
    a, b : array-like
        Vectors of the same length.

    Returns
    -------
    float
        The distance.
    """
    intersection = np.sum(np.minimum(a, b))
    union = np.sum(np.maximum(a, b))
    if union == 0 and intersection == 0:
        # 0/0: nothing to compare, so the two vectors coincide.
        return 0.0
    return 1 - intersection / union


def computeJaccardMatrix(mat_userFeature):
    """Pairwise customer distance matrix over the stacked feature rows.

    Reads the module-level ``set_vipno``, ``mapping_index_v`` and ``D`` and
    calls ``compute_distance(..., D=D, metric='cos')`` for each pair of rows.

    Parameters
    ----------
    mat_userFeature : numpy.ndarray
        Matrix whose row ``mapping_index_v[v]`` holds the features of
        customer ``v``.

    Returns
    -------
    numpy.ndarray
        Square float matrix of side ``len(set_vipno)``; entry ``[i, j]`` is
        the distance between rows ``i`` and ``j``.
    """
    row_ids = [mapping_index_v[v] for v in set_vipno]
    mat_jaccard = np.zeros((len(row_ids), len(row_ids)), dtype=float)
    # Each unordered pair (diagonal included) is evaluated once and mirrored,
    # instead of being computed for both (i, j) and (j, i).
    for pos, i in enumerate(row_ids):
        for j in row_ids[pos:]:
            pairwise_dist = compute_distance(a=mat_userFeature[i],b=mat_userFeature[j],D=D,metric='cos')
            mat_jaccard[i, j] = mat_jaccard[j, i] = pairwise_dist
    return mat_jaccard

def compute_distance(a,b,D,metric='jaccard'):
    """Distance between two feature vectors under the chosen metric.

    Parameters
    ----------
    a, b : numpy.ndarray
        Vectors of equal length.
    D : sequence of int or None
        End offsets of consecutive column segments of ``a``/``b``; only
        used by ``metric='cos'``.
    metric : str
        * ``'jaccard'``   - ``computeJaccardDistance(a, b)``.
        * ``'euclidean'`` - Euclidean norm of ``a - b``.
        * ``'cos'``       - despite the name, not a cosine distance: the
          unweighted mean of ``computeJaccardDistance`` over the segments
          ``[0:D[0]], [D[0]:D[1]], ...``.

    Returns
    -------
    float
        The distance.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the names above, or if ``metric='cos'``
        is requested with an empty ``D``.
    """
    if metric == 'jaccard':
        return computeJaccardDistance(a, b)
    if metric == 'euclidean':
        return np.linalg.norm(a - b)
    if metric == 'cos':
        bounds = [0, *D]
        if len(bounds) < 2:
            raise ValueError("metric 'cos' needs at least one segment boundary in D")
        score = [
            computeJaccardDistance(a[lo:hi], b[lo:hi])
            for lo, hi in zip(bounds[:-1], bounds[1:])
        ]
        return sum(score) / len(score)
    # Previously an unknown name fell through and returned None.
    raise ValueError('unknown metric: %r' % (metric,))

class EKmeans():
    """K-means-style clustering that measures distance with ``compute_distance``.

    Centers start as randomly chosen rows of ``X``. Every iteration assigns
    each row to its closest center and then moves each center a fraction
    ``lr`` of the way towards the mean of its members.

    Parameters
    ----------
    X : numpy.ndarray
        2-D data matrix, one sample per row.
    D : sequence of int or None
        Segment boundaries forwarded to ``compute_distance``.
    method : str
        Metric name forwarded to ``compute_distance``.
    n_centers : int
        Number of clusters.
    max_iters : int
        Upper bound on the number of iterations run by ``fit``.
    tol : float
        Mean center movement below which ``fit`` may stop.
    verbose : int
        Truthy to print a line per iteration.
    random_state : int or None
        Seed for the initial choice of centers. ``None`` (the default) draws
        from numpy's global random state, as before.

    Attributes
    ----------
    clusters : numpy.ndarray
        Cluster index per sample; -1 until assigned.
    centers, pre_centers : numpy.ndarray
        Current centers and the centers before the latest update.
    debug : list
        Mean center movement recorded after every iteration.
    lr : float
        Smoothing factor applied to center updates (0.34).
    """
    def __init__(self,X,D,method='jaccard',n_centers=2,max_iters=1000,tol=1e-20,verbose=0,random_state=None):
        self.n_centers = n_centers
        self.max_iters = max_iters
        self.D = D
        self.method = method
        self.X = X
        self.clusters = np.full(X.shape[0], -1, dtype=int)
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        seeds = rng.permutation(X.shape[0])[:self.n_centers]
        # Centers are always stored as floats: with an integer X the means
        # written into them would otherwise be truncated.
        self.centers = np.array(self.X[seeds], dtype=float)
        self.pre_centers = []
        self.debug = []
        self.lr = 0.34
        self.tol = tol
        self.verbose = verbose
        return

    def metrics_compactness(self):
        """Sum over clusters of the mean member-to-center distance."""
        s = 0.0
        for c in range(self.n_centers):
            idx_cluster = np.where(self.clusters==c)[0]
            for a in self.X[ idx_cluster ]:
                dst = compute_distance(a=a,b=self.centers[c],D=self.D,metric=self.method)
                s += dst / len(idx_cluster)
        return s

    def __update_centers(self,smooth=True):
        """Move every non-empty cluster's center towards its members' mean.

        With ``smooth`` the center only travels the fraction ``lr`` of the
        way; clusters without members keep their current center.
        """
        self.pre_centers = np.array(self.centers)
        for c in range(self.n_centers):
            members = self.X[ np.where(self.clusters==c)[0] ]
            if len(members):
                self.centers[c] = np.mean(members, 0)
        if smooth:
            self.centers = (self.centers-self.pre_centers)*self.lr + self.pre_centers
        return

    def __log(self,x):
        """Print ``x`` when verbose output is enabled."""
        if self.verbose:
            print(x)
        return

    def fit(self):
        """Run assignment/update iterations until convergence or ``max_iters``."""
        for kiter in range(self.max_iters):
            self.__log('iteration %d'%kiter)
            for i in range(self.X.shape[0]):
                # Infinity as the starting bound so any finite distance wins.
                # A sample keeps label -1 only if no distance compares below
                # infinity (e.g. every distance is NaN).
                d_min_dist = np.inf
                record_node = -1
                for idx,c in enumerate(self.centers):
                    dst = compute_distance(a=self.X[i],b=c,D=self.D,metric=self.method)
                    if dst<d_min_dist:
                        d_min_dist = dst
                        record_node = idx
                self.clusters[i] = record_node
            self.__update_centers()
            move_dist = np.mean(np.linalg.norm(self.pre_centers-self.centers,axis=1))
            self.debug.append(move_dist)
            # Convergence is only tested on iterations 1, 21, 41, ...
            # (schedule kept exactly as before).
            if kiter % 20 == 1 and move_dist<self.tol:
                break
        return


# map_single = aggByUser()
# One customer-by-category spending matrix per class level, class1 first.
mat_userFeature = [
    computeUserFeatureOfClass(level)
    for level in ('class1', 'class2', 'class3', 'class4')
]

# Running total of the column counts: D[k] is the end offset of level k's
# columns once the four matrices are stacked side by side.
D = np.cumsum([level_matrix.shape[1] for level_matrix in mat_userFeature])

# Pairwise customer distances computed over the stacked feature columns.
mat_jaccard = computeJaccardMatrix(np.hstack(mat_userFeature))

def KmeansCluster(n_centers,method='cos',max_iters=500):
    """Fit an ``EKmeans`` model on the notebook's feature matrices.

    Reads the module-level ``mat_userFeature`` and ``D``.

    Parameters
    ----------
    n_centers : int
        Number of clusters.
    method : str
        * ``'cos'``       - all levels stacked side by side, segmented by ``D``.
        * ``'jaccard'``   - the last (finest) level matrix only.
        * ``'euclidean'`` - the last (finest) level matrix only.
    max_iters : int
        Iteration cap passed to ``EKmeans``.

    Returns
    -------
    EKmeans
        The fitted model.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names above.
    """
    if method=='jaccard':
        kmeans = EKmeans(mat_userFeature[-1],None,n_centers=n_centers,max_iters=max_iters)
    elif method=='cos':
        kmeans = EKmeans(np.hstack(mat_userFeature),D,n_centers=n_centers,max_iters=max_iters,method='cos')
    elif method=='euclidean':
        kmeans = EKmeans(mat_userFeature[-1],None,n_centers=n_centers,max_iters=max_iters,method='euclidean')
    else:
        # Previously this fell through to kmeans.fit() with kmeans unbound.
        raise ValueError('unknown clustering method: %r' % (method,))
    kmeans.fit()
    return kmeans


# kmeans = KMeans(n_clusters=4)
# clusters = kmeans.fit_predict(mat_userFeature[-1])
# silhouette_score(mat_userFeature[-1],labels=clusters)
# kmeans.cluster_centers_

array([[ 4.69117647e-01,  2.07983193e-02,  7.94117647e-02, ...,
         2.31092437e-02,  7.54201681e-02,  2.42226891e-01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  8.62500000e-01,  0.00000000e+00, ...,
        -3.46944695e-18,  0.00000000e+00,  2.77555756e-17]])

In [None]:
# Sweep the number of clusters and collect two quality measures per run.
# Each entry of res is {'sil': ..., 'com': ..., 'nn': ...}.
res = []
for nn in range(2,20):
    print('param ',nn)
    # Fit on the stacked features with the segmented ('cos') metric, up to 1000 iterations.
    kmeans = KmeansCluster(nn,'cos',1000)
    # Silhouette is evaluated on the precomputed pairwise distance matrix.
    sil = silhouette_score(mat_jaccard,labels=kmeans.clusters,metric='precomputed')
    # Compactness: sum over clusters of the mean member-to-center distance.
    com = kmeans.metrics_compactness() 
    res.append({'sil':sil,'com':com,'nn':nn})

# tsne = TSNE(n_jobs=-1,method='exact',n_components=2)
# mat_userFeature_t = tsne.fit_transform(np.hstack(mat_userFeature))

# sns.set(style="ticks")
# sns.pairplot(pd.DataFrame(mat_userFeature[:,100:110]),kind='reg')