In [2]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from collections import Counter

In [2]:
# Define the SOM class
class SOM(object):
    """
    Self-Organizing Feature Map.

    The competitive layer is an M x N grid of nodes. Every node owns a weight
    vector with one entry per input feature. Training repeatedly draws one
    random sample, finds the node whose weights are closest to it (the
    winner) and moves the winner and its grid neighbours towards the sample.

    Parameters
    ----------
    Round : float
        Neighbourhood radius. Nodes whose Manhattan grid distance to the
        winner is strictly smaller than this value are updated.
    Rate : float
        Learning rate.
    Iters : int
        Requested number of training iterations. ``train`` raises it to
        ``5 * n_samples`` when it is smaller than that.
    T : optional
        Never read by the class. It is kept (now with a default) only so that
        both the three-argument and the four-argument call forms work.
    """
    def __init__(self, Round, Rate, Iters, T=None):
        self.maxRound = Round
        self.minRound = Round / 5
        self.maxRate = Rate
        self.minRate = Rate / 100
        self.steps = Iters

        self.RateList = []                  # learning rate used at every iteration
        self.RoundList = []                 # neighbourhood radius used at every iteration
        self.X = 0                          # training data set (set by train)

        self.gridLocation = 0               # grid coordinates of the competitive-layer nodes
        self.w = 0                          # node weights
        self.gridDist = 0                   # pairwise distances between the nodes on the grid

    # Euclidean distance
    def edist(self, X1, X2):
        """Return the Euclidean distance between X1 and X2, reduced over axis 1.

        X1 and X2 are broadcast against each other, so a single sample can be
        compared with a whole matrix of node weights.
        """
        return np.sqrt(np.sum(np.square(X1 - X2), axis=1))

    # Pairwise distances between the nodes
    def calGdist(self, grid):
        """Return the symmetric matrix of Manhattan distances between grid nodes.

        Parameters
        ----------
        grid : sequence of coordinate vectors, one per node.

        Returns
        -------
        numpy.ndarray of shape (len(grid), len(grid)) with a zero diagonal.
        """
        m = len(grid)
        Gdist = np.zeros((m, m))
        for i in range(m):
            for j in range(i + 1, m):
                # The matrix is symmetric, so each pair is computed once.
                Gdist[i, j] = np.sum(np.abs(grid[i] - grid[j]))
                Gdist[j, i] = Gdist[i, j]
        return Gdist

    # Initialise the competitive layer
    def init_grid(self, M, N):
        """Return the (M*N, 2) array of node coordinates in row-major order."""
        grid = np.zeros((M * N, 2))
        for i in range(M):
            for j in range(N):
                grid[i * N + j, :] = (i, j)
        return grid

    # Schedule for the learning rate and the neighbourhood radius
    def changeRate(self, i):
        """Return the ``(Rate, Round)`` pair to use at iteration ``i``.

        Both values are currently held constant at ``maxRate`` / ``maxRound``;
        the linear decay towards ``minRate`` / ``minRound`` over ``steps``
        iterations is disabled, so ``i`` and the two minima are not used.
        """
        Rate = self.maxRate
        Round = self.maxRound
        return Rate, Round

    # Update the weights
    def update(self, data, nebor, dist):
        """Move the neighbourhood weights towards a sample.

        Parameters
        ----------
        data : 1-D array, the current training sample.
        nebor : 2-D array, weights of the nodes to update (one row per node).
        dist : 1-D array, one distance per row of ``nebor``; it drives the
            Gaussian factor ``exp(-dist**2 / 2)`` that scales each step.

        Returns
        -------
        numpy.ndarray with the shape of ``nebor`` holding the new weights,
        clamped to the interval [0, 1].
        """
        gain = np.exp(-np.asarray(dist, dtype=float) ** 2 / 2)
        # Uses the most recent learning rate recorded by train().
        res = nebor + self.RateList[-1] * ((data - nebor) * gain[:, None])
        # Values above 1 become 1 and values below 0 become 0.
        return np.clip(res, 0, 1)

    # Training
    def train(self, X, M, N):
        """Fit an M x N map to ``X`` and store the node weights in ``self.w``.

        Parameters
        ----------
        X : 2-D numpy array of shape (n_samples, n_features). Weights are
            clamped to [0, 1] during training, so the data is presumably
            expected to be scaled to that range already.
        M, N : int, number of rows and columns of the node grid.

        Notes
        -----
        Randomness comes from the global ``np.random`` state. The learning
        rate and radius of every iteration are appended to ``RateList`` and
        ``RoundList``.
        """
        self.X = X
        n_samples, _ = X.shape

        ## 1 Initialisation
        ### Node positions and the distances between them
        self.gridLocation = self.init_grid(M, N)
        self.gridDist = self.calGdist(self.gridLocation)

        ### Initial weights: M*N samples drawn at random (with replacement)
        idx_sample = np.random.randint(0, n_samples, M * N)
        w = np.array(X[idx_sample])

        ### Number of iterations: at least five times the number of samples
        if self.steps < 5 * n_samples:
            self.steps = 5 * n_samples

        ## 2 Competition
        for i in range(self.steps):
            ### Draw one random sample and measure its distance to every node
            data = X[np.random.randint(0, n_samples, 1)[0], :]
            Xdist = self.edist(data, w)
            ### The winner is the closest node
            winnerPointIdx = np.argmin(Xdist)

            ## 3 Adaptation
            ### Learning rate and neighbourhood radius for this step
            Rate, Round = self.changeRate(i)
            self.RateList.append(Rate)
            self.RoundList.append(Round)
            ### Every node closer than Round to the winner on the grid
            winnerRoundIdx = np.nonzero(self.gridDist[winnerPointIdx] < Round)[0]
            ### Adjust those weights. The Gaussian factor is computed from the
            ### feature-space distance (Xdist), not from the grid distance.
            w[winnerRoundIdx] = self.update(data, w[winnerRoundIdx], Xdist[winnerRoundIdx])

        self.w = w

    # Cluster labels
    def cluster(self, X):
        """Return, for every row of ``X``, the index of the closest node in ``self.w``."""
        return np.array([np.argmin(self.edist(X[i], self.w)) for i in range(X.shape[0])])

In [3]:
# Load the data
# Forward slashes avoid the invalid "\d" / "\p" escape sequences of the old
# literal and resolve on Windows as well as on POSIX systems.
df = pd.read_csv('./dataset_diabetes/proprecessing_data.csv')

In [4]:
## Additional data clean-up
# Drop the rows whose gender is unknown
df = df[df['gender'] != 'Unknown/Invalid']

# Discrete value replacement: switch to an ndarray, transposed so that
# data_set[i] is the i-th column of df
data_set = df.to_numpy().T

# Columns whose entries mix numbers with codes containing 'V', 'E' or '?'
idx_mix = [14, 15, 16]

# Label replacement
# NOTE(review): the original cell selected the columns to encode with a mask
# `T` that is not defined anywhere in this notebook (NameError on a fresh
# kernel). The selection is rebuilt here from the column dtypes: every
# non-numeric column is label-encoded, except the idx_mix columns, which are
# parsed as numbers below. Confirm this matches the intended column split.
le = preprocessing.LabelEncoder()
idx_BN = np.array(
    [i for i, col_dtype in enumerate(df.dtypes)
     if i not in idx_mix and not pd.api.types.is_numeric_dtype(col_dtype)],
    dtype=int,
)
for i in idx_BN:
    classes = np.unique(data_set[i])
    le = le.fit(classes)
    data_set[i] = le.transform(data_set[i])

# Mixed columns: every entry containing one of the invalid characters is
# replaced by '0' so that the whole column can be converted to float
invalid_char = ['V', 'E', '?']
for i in idx_mix:
    subdata_set = np.array([str(j) for j in data_set[i]])
    for c in invalid_char:
        idx_nan = np.flatnonzero(np.char.count(subdata_set, c) != 0)
        subdata_set[idx_nan] = '0'
    data_set[i] = subdata_set.astype(float)

# Min-max normalisation (back to one row per sample first)
data_set = data_set.T
scaler = preprocessing.MinMaxScaler()
data_set = scaler.fit_transform(data_set)

In [16]:
# SOM clustering
M, N = 10, 10
# SOM.__init__ declares a fourth parameter, T, that it never reads. It is
# passed explicitly so the call matches the constructor signature instead of
# raising TypeError for a missing argument.
som_self = SOM(7, 0.01, 400000, None)
som_self.train(data_set, M=M, N=N)
som_self_cluster = som_self.cluster(data_set)

In [17]:
# Save the results
# np.save does not create missing directories, so make sure the target exists.
os.makedirs('./result', exist_ok=True)
np.save('./result/7_01_10_10_cluster_new.npy', som_self_cluster)
np.save('./result/7_01_10_10_w_new.npy', som_self.w)