In [2]:
import numpy as np
from itertools import combinations

# 标准化

In [None]:
def normalization(x, method="min-max"):
    """Rescale a collection of numbers with min-max or z-score normalization.

    :param x: iterable of numeric values; it must be non-empty and must not
        be constant (a zero spread cannot be rescaled)
    :param method: "min-max" to map the values onto [0, 1], or "zero-score"
        to subtract the mean and divide by the standard deviation
        (``np.std`` default, i.e. the population standard deviation)
    :return: list of normalized values, or -1 on invalid input (unknown
        method, empty input, or constant input)
    """
    if method not in ("min-max", "zero-score"):
        print("请指定标准化方法：min-max或者zero-score")
        return -1

    # Materialize once: the values are traversed several times below, which
    # would silently exhaust a generator.
    values = list(x)
    if not values:
        print("输入不能为空")
        return -1

    if method == "min-max":
        offset = min(values)
        scale = max(values) - offset
    else:
        offset = np.mean(values)
        scale = np.std(values)

    # A zero spread means every value is identical; dividing by it would
    # raise or produce NaN, so report it as invalid input instead.
    if scale == 0:
        print("输入不能为常数")
        return -1

    return [(v - offset) * 1.0 / scale for v in values]
    

# one-hot编码

In [15]:
def oneHotEncoding(df, old_field):
    """One-hot encode a multi-valued column of a DataFrame in place.

    Every cell of ``df[old_field]`` is treated as a container of category
    values (membership is tested with ``in``). For each distinct value found
    across the column, a new 0/1 indicator column named
    ``<old_field>_<index>`` is added, where ``index`` follows the order in
    which the values are first encountered. The original column is then
    removed.

    :param df: DataFrame holding the feature to encode; it is modified in
        place
    :param old_field: name of the raw column to encode
    :return: 1 once the encoding has been applied (the result itself lives
        in ``df``)
    """
    column = df[old_field]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # numbering of the generated columns is reproducible between runs.
    distinct_vals = list(dict.fromkeys(v for cell in column for v in cell))
    for idx, val in enumerate(distinct_vals):
        # Bind ``val`` as a default argument so the lambda does not depend on
        # the loop variable's later values.
        df[old_field + "_" + str(idx)] = column.map(
            lambda cell, val=val: int(val in cell)
        )
    del df[old_field]
    return 1

# 闵可夫斯基距离

In [None]:
def minkovDist(x, y, p):
    """Compute the Minkowski distance of order ``p`` between two samples.

    The distance is ``(sum(|x_i - y_i| ** p)) ** (1 / p)``; ``p = 1`` gives
    the Manhattan distance and ``p = 2`` the Euclidean distance.

    :param x, y: the two samples, as numpy arrays or array-like sequences of
        equal length
    :param p: order of the distance; must be at least 1
    :return: the distance, or -1 when ``p`` is smaller than 1
    """
    if not p >= 1:
        print("p必须大于等于1")
        return -1
    # The absolute value is essential: without it, odd powers let negative
    # differences cancel positive ones and fractional powers yield NaN.
    diff = np.abs(np.asarray(x) - np.asarray(y))
    # Reduce along the first axis only, like the builtin ``sum`` would.
    return np.sum(diff ** p, axis=0) ** (1.0 / p)

# VDM (value difference metric)

In [16]:
def VDM(*c, p=2):
    """Compute the value difference metric between every pair of attribute values.

    For each pair of keys ``(k1, k2)`` the result is the sum over all groups
    of ``|count(k1 in group) / total(k1) - count(k2 in group) / total(k2)| ** p``,
    where ``total(k)`` is the count of ``k`` summed over all groups.

    :param c: the groups, each a dict mapping an attribute value to its
        count, like {'M': 100, 'F': 80}, {'M': 150, 'F': 160},
        {'M': 50, 'F': 60}. The keys of the first group define the attribute
        values; every group must contain all of them.
    :param p: exponent applied to each per-group difference (keyword-only,
        defaults to 2)
    :return: dict mapping each pair ``(k1, k2)`` of attribute values to its
        distance
    :raises IndexError: if no group is given
    :raises KeyError: if a group lacks one of the first group's keys
    :raises ZeroDivisionError: if an attribute value has a total count of 0
    """
    attributes = list(c[0])
    key_pairs = list(combinations(attributes, 2))

    # Total count of every attribute value across all groups.
    totals = {key: sum(group[key] for group in c) for key in attributes}

    distances = {}
    for k1, k2 in key_pairs:
        distances[(k1, k2)] = sum(
            abs(group[k1] * 1.0 / totals[k1] - group[k2] * 1.0 / totals[k2]) ** p
            for group in c
        )
    return distances