In [2]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
import matplotlib.pyplot as plt
from scipy.interpolate import lagrange   # 导入拉格朗日插值函数
from sklearn.cluster import KMeans      # 导入一维聚类分析函数

### 拉格朗日法插补缺失值

In [3]:
# 数据路径的导入与输出
inputfiles = '../data/catering_sale.xls'   # 销量数据路径
outputfiles = '../tmp/sales.xls'           # 输出数据路径


In [4]:
data = pd.read_excel(inputfiles)          # 读入数据
data[u'销量'][(data[u'销量']<400) | (data[u'销量']>5000)] = None   # 过滤异常值


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [5]:
# 定义列向量插值函数
# s为列向量， n为被插值的位置，k为取前后数据的个数
def ployinterp_column(s, n, k = 5):
    y = s[list(range(n-k, n))+list(range(n+1, n+1+k))]  # 取数
    y = y[y.notnull()]    # 剔除空值
    return lagrange(y.index, list(y))(n)   # 插值并返回结果

# 逐个判断元素是否需要插值
for i in  data.columns:
    for j in range(len(data)):
        if (data[i].isnull())[j]:   # 如果为空即插值
            data[i][j] = ployinterp_column(data[i], j)
data.to_excel(outputfiles)   # 输出结果写入文件

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


### 数据规范化

In [6]:
datafile = '../data/normalization_data.xls'   # 获取参数路径

In [7]:
data  = pd.read_excel(datafile,header=None)  # 读取数据

In [8]:
(data - data.min())/(data.max()-data.min())  # 最小规范化


Unnamed: 0,0,1,2,3
0,0.07438,0.937291,0.92352,1.0
1,0.619835,0.0,0.0,0.850941
2,0.214876,0.119565,0.813322,0.0
3,0.0,1.0,1.0,0.563676
4,1.0,0.942308,0.996711,0.804149
5,0.264463,0.838629,0.814967,0.90931
6,0.636364,0.84699,0.786184,0.929571


In [9]:
(data - data.mean())/data.std()     # 零均值规范化

Unnamed: 0,0,1,2,3
0,-0.905383,0.635863,0.464531,0.798149
1,0.604678,-1.587675,-2.193167,0.36939
2,-0.516428,-1.30403,0.147406,-2.078279
3,-1.111301,0.784628,0.684625,-0.456906
4,1.657146,0.647765,0.675159,0.234796
5,-0.37915,0.401807,0.152139,0.537286
6,0.650438,0.421642,0.069308,0.595564


In [10]:
         
data/10**np.ceil(np.log10(data.abs().max())) # 小数定标规范化

Unnamed: 0,0,1,2,3
0,0.078,0.521,0.602,0.2863
1,0.144,-0.6,-0.521,0.2245
2,0.095,-0.457,0.468,-0.1283
3,0.069,0.596,0.695,0.1054
4,0.19,0.527,0.691,0.2051
5,0.101,0.403,0.47,0.2487
6,0.146,0.413,0.435,0.2571


### 连续属性离散化

In [11]:
datafile = '../data/discretization_data.xls'  # 参数初始化

In [12]:
data = pd.read_excel(datafile)
data = data[u'肝气郁结证型系数'].copy()
k = 4


In [14]:
d1 = pd.cut(data, k, labels= range(k))   # 等宽离散化，依次命名1,2,3,4
w = [1.0*i/k for i in range(k+1)]
w = data.describe(percentiles=w)[4:4+k+1]   # 自动计算四分位数
w[0] = w[0]*(1-e-10)       # 此处e的次方书写错误

d2 = pd.cut(data,w,labels=range(k))
kmodel = KMeans(n_clusters = k, n_jobs = 4)  # 建立聚类分析模型，n_jobs是并行数，一班等于CPU数
kmodel.fit(data.reshape((len(data), 1)))     # 训练模型
c = pd.DataFrame(kmodel.cluster_centers_).sort(0)  # 输出聚类中心，并且排序
w = pd.rolling_mean(c, 2).iloc[1:]   # 相邻两项求中点，作为边界点
w = [0]+list(w[0])+[data.max()]      # 把末尾便捷点加上
d3 = pd.cut(data, w, labels=range(k))

def cluster_plot(d,k): # 自定义作图函数来显示聚类结果
    plt.figure(figsize=(8,3))
    for j in range(0,k):
        plt.plot(data[d==j], [j for i in d[d==j]], 'o')
    plt.ylim(-0.5,k-0.5)
    return plt
cluster_plot(d1,k).show()
cluster_plot(d2, k).show()
cluster_plot(d3, k).show()

NameError: name 'e' is not defined

### 主成分分析降维

In [15]:
inputfiles = '../data/principal_component.xls'
outputfiles = '../tmp/dimention_reducted.xls'

In [16]:
data = pd.read_excel(inputfiles, header=None)
from sklearn.decomposition import PCA   # 导入主成分分析算法


In [18]:
pca = PCA()
pca.fit(data)
pca.components_  # 返回模型各个特征量



array([[ 0.56788461,  0.2280431 ,  0.23281436,  0.22427336,  0.3358618 ,
         0.43679539,  0.03861081,  0.46466998],
       [ 0.64801531,  0.24732373, -0.17085432, -0.2089819 , -0.36050922,
        -0.55908747,  0.00186891,  0.05910423],
       [-0.45139763,  0.23802089, -0.17685792, -0.11843804, -0.05173347,
        -0.20091919, -0.00124421,  0.80699041],
       [-0.19404741,  0.9021939 , -0.00730164, -0.01424541,  0.03106289,
         0.12563004,  0.11152105, -0.3448924 ],
       [-0.06133747, -0.03383817,  0.12652433,  0.64325682, -0.3896425 ,
        -0.10681901,  0.63233277,  0.04720838],
       [ 0.02579655, -0.06678747,  0.12816343, -0.57023937, -0.52642373,
         0.52280144,  0.31167833,  0.0754221 ],
       [-0.03800378,  0.09520111,  0.15593386,  0.34300352, -0.56640021,
         0.18985251, -0.69902952,  0.04505823],
       [-0.10147399,  0.03937889,  0.91023327, -0.18760016,  0.06193777,
        -0.34598258, -0.02090066,  0.02137393]])

In [19]:
pca.explained_variance_ratio_     # 返回各个成分各自的方差百分比

array([7.74011263e-01, 1.56949443e-01, 4.27594216e-02, 2.40659228e-02,
       1.50278048e-03, 4.10990447e-04, 2.07718405e-04, 9.24594471e-05])

In [20]:
pca =PCA(3)
pca.fit(data)
low_d = pca.transform(data)    # 降低维度
pd.DataFrame(low_d).to_excel(outputfiles)  # 保存结果
pca.inverse_transform(low_d)     # 复原数据

array([[41.81945026, 17.92938537,  7.42743613,  6.38423781,  7.51911186,
         7.95581778,  1.89450158, 22.64634237],
       [26.03033486,  8.31048339, 11.0923029 , 10.50941053, 13.73592734,
        19.29219354,  1.55616178, 10.69991334],
       [12.8912027 ,  4.7200299 ,  4.15574756,  3.88084002,  4.15590258,
         5.95354081,  0.63142514,  3.10031979],
       [21.95107023,  7.86983692,  5.61296149,  5.00363184,  5.46598715,
         7.32692984,  1.00043437,  6.90279388],
       [33.2494621 , 16.9295226 ,  6.97070109,  6.54184048,  8.78799069,
         9.47854775,  1.76803069, 25.48379317],
       [35.30223656, 14.31635159, 16.19611986, 15.83211443, 22.51688172,
        30.25654088,  2.46591519, 25.94480913],
       [22.0404299 ,  7.67212745,  9.96458085,  9.59042702, 12.69748404,
        17.7402549 ,  1.39886681, 10.62704002],
       [47.82344306, 16.03581175, 11.11907058,  9.5362307 , 11.08119152,
        14.24461981,  2.12478649, 16.79265084],
       [40.72333307, 17.98533192