In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import Binarizer
from sklearn import preprocessing
from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_selection import VarianceThreshold

In [None]:
def km(x, k, b, random_state=None):
    """Fit K-means on a binarized copy of ``x``.

    The original author reports using ``k=6, b=1``.

    Args:
        x (np.array): feature matrix of shape N*D
        k (int): number of clusters
        b (int): binarization threshold passed to ``Binarizer``
        random_state (int or None): optional seed forwarded to ``KMeans`` so
            the random initialisation can be reproduced; ``None`` keeps the
            original unseeded behaviour

    Returns:
        sklearn.cluster.KMeans: the fitted model (cluster assignments of the
        training rows are available as ``model.labels_``)
    """
    # Binarize first; the clustering only ever sees the thresholded features.
    binarized = Binarizer(threshold=b).fit_transform(x)
    model = KMeans(n_clusters=k, n_init=1, init='random',
                   random_state=random_state)
    return model.fit(binarized)

In [None]:
def save_cluster(x, k, model):
    """Write the samples of each cluster to its own CSV file.

    Cluster ``i`` (for ``i`` in ``0 .. k-1``) is written to ``'<i>.txt'`` in
    the current working directory via ``DataFrame.to_csv``.

    Args:
        x (pd.DataFrame or array-like): original data set; its row order must
            match ``model.labels_``. Input that is not a DataFrame is wrapped
            in one.
        k (int): number of clusters
        model: fitted clustering model exposing ``labels_``
    """
    frame = x if isinstance(x, pd.DataFrame) else pd.DataFrame(x)
    labels = np.asarray(model.labels_)
    for i in range(k):
        # Compare against the loop index i (not k): labels run 0..k-1, so
        # comparing with k never matches a real cluster.
        # flatnonzero always returns a 1-D position array, so a cluster with a
        # single member is still written as a one-row table.
        rows = np.flatnonzero(labels == i)
        # labels_ is positional, hence iloc rather than label-based lookup.
        frame.iloc[rows].to_csv(str(i) + '.txt')

## app聚类（线上用测试集聚类）

In [None]:
# Load the test set; the app clustering is fitted on it.
data = pd.read_csv('/data/topic1/test_x.csv')
x = data.iloc[:, 8:308].values  # columns 8..307 as an np.array
# Binarize at threshold 1, then fit K-means (20 clusters, 5 random inits).
binarized = Binarizer(threshold=1).fit_transform(x)
model = KMeans(n_clusters=20, n_init=5, init='random').fit(binarized)

In [None]:
# Print the number of samples assigned to each cluster.
# bincount covers every cluster id (the model has more than 10 clusters) and
# counts correctly for empty or single-member clusters.
cluster_sizes = np.bincount(model.labels_, minlength=model.n_clusters)
for i, size in enumerate(cluster_sizes):
    print(str(i) + ':' + str(size))

In [None]:
# Export the app-cluster id of every user as a feature, for both splits.
trainData = pd.read_csv('/data/topic1/train_x.csv')
testData = pd.read_csv('/data/topic1/test_x.csv')
# The clustering model was fitted on features binarized at threshold 1, so the
# same transform must be applied before predict; feeding raw values would
# measure distances on a different scale than the one used for fitting.
binarizer = Binarizer(threshold=1)
trainData['cluster'] = model.predict(
    binarizer.fit_transform(trainData[trainData.columns[8:308]].values))
testData['cluster'] = model.predict(
    binarizer.fit_transform(testData[testData.columns[8:308]].values))
trainData[['用户标识','cluster']].to_csv('/data/topic1/feature_cluster.csv', index=False)
testData[['用户标识','cluster']].to_csv('/data/topic1/feature_test_cluster.csv', index=False)

## web聚类（线上用训练集聚类）

In [None]:
x = trainData[trainData.columns[1:-1]].values # 转为 np.array
model = KMeans(n_clusters=20, n_init=5, init='random')
model = model.fit(Binarizer(threshold=1).fit_transform(x))  # 二值化

In [None]:
# 输出每个类的数量
for i in range(10):
    print str(i) + ':' + str(len(np.squeeze(np.argwhere(model.labels_==i))))

In [None]:
# 输出为特征
# trainData = pd.read_csv('/data/topic1/train_x.csv')
testData = pd.read_csv('/data/topic1/test_x.csv')
trainData['webcluster'] = model.predict(trainData[trainData.columns[313:]])
testData['webcluster'] = model.predict(testData[testData.columns[313:]])
trainData[['用户标识','webcluster']].to_csv('/data/topic1/feature_webcluster.csv', index=False)
testData[['用户标识','webcluster']].to_csv('/data/topic1/feature_test_webcluster.csv', index=False)