In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing
from collections import Counter

In [None]:
# SOM class definition
class SOM(object):
    """
    Self-Organizing Feature Map for records that mix nominal and numeric attributes.

    Weights are stored attribute-major: ``w`` has shape (n_attributes, n_nodes),
    i.e. one column per output node. Nominal attributes are compared as strings
    and the string ``'?'`` marks a missing nominal value.
    """
    def __init__(self, R, LR, S, T):
        """
        Parameters
        ----------
        R : number
            Neighbourhood radius: nodes whose grid distance to the winner is
            strictly below ``R`` are updated. It is also used as the ``2 * R``
            denominator of the Gaussian factor in :meth:`update`.
        LR : float
            Learning rate applied to the numeric weight update.
        S : int
            Number of training iterations (one random sample per iteration).
        T : array-like of bool
            One flag per attribute: True = numeric, False = nominal.
        """
        self.R = R
        self.LR = LR
        self.S = S

        self.data = 0           # training data, assigned by train()
        self.w = 0              # weight matrix, assigned by train()
        self.outputDist = 0     # node-to-node grid distances, assigned by train()

        self.T = np.asarray(T, dtype=bool)      # True: numeric attribute, False: nominal attribute
        # flatnonzero always yields 1-D index arrays, so a single (or no)
        # attribute of either kind still supports indexing, len() and iteration.
        self.idx_BN = np.flatnonzero(~self.T)   # indices of nominal attributes
        self.idx_OI = np.flatnonzero(self.T)    # indices of numeric attributes

    # Distance between nominal values
    def BNd(self, x1, x2):    # T: False
        """
        Return the 0/1 mismatch matrix between one sample and every node.

        ``x1`` has shape (n_nominal,), ``x2`` has shape (n_nodes, n_nominal).
        The result has the shape of ``x2``: 0.0 where the values are equal,
        1.0 otherwise.
        """
        return (np.asarray(x1) != np.asarray(x2)).astype(float)

    # Distance between numeric values
    def OId(self, x1, x2):    # T: True
        """
        Return ``|x1 - x2|`` element-wise; NaN differences are replaced by 1.
        """
        return np.nan_to_num(np.abs(x1 - x2), nan= 1)

    # Combined (mixed-type) distance
    def calDist(self, X1, X2):
        """
        Distance from one sample to every node.

        Parameters
        ----------
        X1 : ndarray, shape (n_attributes,)
            A single sample.
        X2 : ndarray, shape (n_attributes, n_nodes)
            Node weights, one column per node.

        Returns
        -------
        ndarray, shape (n_nodes,)
            Sum of the per-attribute distances over the valid attributes,
            multiplied by 10 and divided by the number of valid attributes.
        """
        # Split nominal and numeric attributes
        X1_BN = X1[self.idx_BN].astype(str)
        X2_BN = X2[self.idx_BN].astype(str)
        X1_OI = X1[self.idx_OI].astype(float)
        X2_OI = X2[self.idx_OI].astype(float)

        # delta[a, n] == 0 when attribute a must be ignored for node n because
        # the nominal value is missing ('?') in the sample or in that node.
        # Numeric NaNs are not masked here; OId counts them as distance 1.
        delta = np.ones(X2.shape)
        delta[self.idx_BN[X1_BN == '?']] = 0
        rows, cols = np.nonzero(X2_BN == '?')
        delta[self.idx_BN[rows], cols] = 0
        dSum = np.sum(delta, axis=0)

        # Masked per-node sums. A row-wise sum of the element-wise product
        # gives the same values as the diagonal of the matrix product without
        # building an (n_nodes, n_nodes) intermediate.
        mSum = np.zeros(X2.shape[1])
        mSum += np.sum(self.BNd(X1_BN, X2_BN.T) * delta[self.idx_BN].T, axis=1)
        mSum += np.sum(self.OId(X1_OI, X2_OI.T) * delta[self.idx_OI].T, axis=1)

        return mSum * 10 / dSum

    # Weight update
    def update(self, data, nebor, dist):
        """
        Return updated weights for the neighbourhood of the winning node.

        Parameters
        ----------
        data : ndarray, shape (n_attributes,)
            The current training sample.
        nebor : ndarray, shape (n_attributes, n_neighbours)
            Current weights of the nodes to update.
        dist : ndarray, shape (n_neighbours,)
            Distance from the sample to each of those nodes, as returned by
            :meth:`calDist`. It is not modified.

        Returns
        -------
        ndarray
            A copy of ``nebor`` with the new weights.
        """
        res = nebor.copy()
        data_OI = data[self.idx_OI].astype(float)
        nebor_OI = nebor[self.idx_OI].astype(float)

        # Numeric weights
        gauss = np.exp(-dist ** 2 / (2 * self.R))
        for k, attr in enumerate(self.idx_OI):
            # NOTE(review): np.dot reduces the per-neighbour terms to a single
            # scalar, so every neighbour is shifted by the same amount; the
            # textbook SOM rule uses an element-wise product instead.
            # Behaviour is kept as in the original - confirm the intent.
            step = self.LR * np.dot(data_OI[k] - nebor_OI[k], gauss)
            # Keep the weights inside the [0, 1] range.
            res[attr] = np.clip(nebor_OI[k] + step, 0, 1)

        # Nominal weights: with probability 1 - d, where d is the mean
        # distance / 10 clamped to [0, 1], all neighbours copy the sample value.
        scaled = dist / 10      # out of place, so the caller's array stays intact
        d = min(np.average(scaled), 1)
        d = max(d, 0)
        p = np.array([1 - d, d])
        for attr in self.idx_BN:
            if np.random.choice([True, False], p = p.ravel()):
                res[attr] = np.repeat(data[attr], nebor.shape[1])
        return res

    # Training
    def train(self, data, M, N):
        """
        Train the map on ``data``.

        Parameters
        ----------
        data : ndarray, shape (n_samples, n_attributes)
            Training records.
        M, N : int
            Output grid shape; the map has ``M * N`` nodes and node ``i`` sits
            at row ``i // M``, column ``i % M``.

        Sets ``self.data``, ``self.outputDist`` (Manhattan grid distances
        between nodes) and ``self.w`` (weights, shape (n_attributes, M * N)).
        Uses numpy's global random state.
        """
        self.data = data

        # Manhattan distance between node grid positions. Integer division is
        # required: with `/` the row coordinate becomes fractional.
        node = np.arange(M * N)
        row, col = node // M, node % M
        self.outputDist = (np.abs(row[:, None] - row) + np.abs(col[:, None] - col)).astype(float)

        # Initial weights: randomly drawn training samples, one column per node
        idx_sample = np.random.randint(0, data.shape[0], M*N)
        w = np.array(data[idx_sample]).T

        for _ in range(self.S):
            ## 2 Competition: the node closest to a random sample wins
            sample = self.data[np.random.randint(0, self.data.shape[0], 1)[0], :]
            dist = self.calDist(sample, w)
            winPointIdx = np.argmin(dist)

            ## 3 Adaptation: update every node within grid radius R of the winner
            winR = np.nonzero(self.outputDist[winPointIdx] < self.R)[0]
            w[:, winR] = self.update(sample, w[:, winR], dist[winR])

        self.w = w

    # Cluster labels
    def cluster(self, data):
        """
        Return, for each row of ``data``, the index of the closest node.

        Requires :meth:`train` to have been called so that ``self.w`` is set.
        """
        return np.array([np.argmin(self.calDist(row, self.w)) for row in data])

In [None]:
# Load the data
# The path is built with os.path.join: the previous literal relied on the
# invalid escape sequences "\d" and "\p" and only resolved on Windows.
df = pd.read_csv(os.path.join('.', 'dataset_diabetes', 'proprecessing_data.csv'))

# Attribute type flags passed to SOM: True = numeric attribute, False = nominal
# attribute. One entry per column of the data set (28 in total).
T = np.array([False, False, False, False, False, False, True, False, True, True, True, True, True,
        True, True, True, True, True, False, False, False, False, False, False, False, False, False, 
        False])

In [None]:
## Supplementary data preparation
# Drop the records whose gender is unknown
dropID = [value for value in df.index.values
          if df.loc[value, 'gender'] == 'Unknown/Invalid']
df = df.drop(dropID)

# Convert to an ndarray with one row per attribute
data_set = df.to_numpy().T

# Ordinal variable converted to Nominal type: the bracket labels in row 2
# are replaced by their LabelEncoder codes
le = preprocessing.LabelEncoder().fit(
    ["[0-10)", "[10-20)", "[20-30)", "[30-40)", "[40-50)",
     "[50-60)", "[60-70)", "[70-80)", "[80-90)", "[90-100)"])
data_set[2] = le.transform(data_set[2])

# Min-max normalisation of the interval-based attributes
idx = [6, 8, 9, 10, 11, 12, 13, 17]
idx_mix = [14, 15, 16]
scaler = preprocessing.MinMaxScaler().fit(data_set[idx].T)
data_set[idx] = scaler.transform(data_set[idx].T).T

# Mixed attributes: entries containing 'V', 'E' or '?' are treated as invalid
# and replaced by 0 before the attribute is converted to float and rescaled
invalid_char = ['V', 'E', '?']
for i in idx_mix:
    subdata_set = np.array([str(j) for j in data_set[i]])
    contains_invalid = np.zeros(subdata_set.shape[0], dtype=bool)
    for c in invalid_char:
        contains_invalid |= np.char.count(subdata_set, c) != 0
    subdata_set[contains_invalid] = '0'
    subdata_set = subdata_set.astype(float)
    # The scaler is refitted on each mixed attribute separately
    data_set[i] = scaler.fit_transform(subdata_set.reshape(-1, 1)).ravel()

# Back to one row per record
data_set = data_set.T

In [None]:
# SOM clustering
# SOM.train()/update() draw from numpy's global random state; seeding it makes
# the clustering repeatable across kernel restarts. The value is arbitrary -
# change SEED to obtain a different run.
SEED = 0
np.random.seed(SEED)

M, N = 7, 7                                     # shape of the output-layer grid
som = SOM(5, 0.001, 400000, T)                  # R=5, LR=0.001, S=400000 iterations
som.train(data_set, M= M, N= N)
cluster_label = som.cluster(data_set)

In [None]:
# Save the results
# np.save does not create missing directories; make sure the target exists so
# that the outcome of the long training run is not lost to a FileNotFoundError.
os.makedirs('./result', exist_ok=True)
np.save('./result/5_001_10_10_cluster_s3.npy', cluster_label)
# NOTE(review): som.w is presumably an object-dtype array (mixed str/float
# columns); if so, reading it back needs np.load(..., allow_pickle=True).
np.save('./result/5_001_10_10_w_s3.npy', som.w)