In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from scipy.spatial.distance import cdist
from sklearn.feature_selection import SelectPercentile
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import AdaBoostRegressor,ExtraTreesRegressor,RandomForestRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

In [3]:

path = r'../A2_data/A2_output/处理数据.xlsx'
df = pd.read_excel(path,sheet_name='Au20')
X,y = df.iloc[:,3:],df.iloc[:,2]
X,y = X.values,y.values    # X特征参数、y能量标签属性

def xyz(n):
    """xyz[]:第n个样本原子的坐标值"""
    xyz = []
    for i in range(20):
        xyz.append([X[n,0 + 3 * i], X[n,1 + 3 * i], X[n,2 + 3 * i]])
    return xyz

def distmatrix(xyz):
    """计算单个团簇样本的距离矩阵"""
    dist=cdist(xyz,xyz,metric='euclidean')
    return dist

def kulun(Z,sample):
    """采用库伦矩阵对单个团簇样本的坐标降维：c = N*N
    Z：原子序号"""
    c = np.zeros((20,20))
    for i in range(20):
        for j in range(20):
            if i == j:
                c[i,j] = 0.5*(Z**2.4)
            else:
                c[i,j] = (Z**2)/np.linalg.norm(np.asarray(sample[i])-np.asarray(sample[j]))
    return c

def eig_(maxtri):
    """计算矩阵的特征值和特征向量"""
    return np.linalg.eig(maxtri)

"""生成衍生特征"""
def dict_index(dist):
    """距离矩阵相关指标：最大距离，最小距离，平均距离，平均距离，中位数距离"""
    mindist, maxdist, sumdist, meandist, meddist = [], [], [], [], []
    for i in range(dist.shape[0]):
        temp = []
        for j in range(dist.shape[1]):
            if j == i:
                continue
            temp.append(dist[i, j])
        mindist = min(temp)
        maxdist = max(temp)
        meandist = np.mean(temp)
        meddist = np.median(temp)
        sumdist = sum(temp)
    return mindist, maxdist, meandist, meddist, sumdist

def close_matrix(dist, index):
    """根据距离矩阵得到邻接矩阵"""
    matrix = np.where(dist > index, 1, 0)
    return matrix

def netgraph_index(matrix):
    """网络图结构指标分析 """
    G = nx.Graph(matrix)
    Gnum = G.number_of_edges()
    mean_cluster = nx.average_clustering(G)  # 平均聚类系数
    netrans = nx.transitivity(G)  # 网络传递性
    mean_degrcenter = np.average(list(nx.degree_centrality(G).values()))  # 平均度中心性
    mean_closcenter = np.average(list(nx.closeness_centrality(G).values()))  # 平均接近中心性
    mean_betwcenter = np.average(list(nx.betweenness_centrality(G).values()))  # 平均中介中心性
    return Gnum, mean_cluster, netrans, mean_degrcenter, mean_closcenter, mean_betwcenter

r_cut = 0.6
yita = 1
r_s = 3
def f_c(r_ij):
    """分子性质衍生特征的切断函数f_c"""
    if r_cut > r_ij:
        f = (np.cos(np.pi*r_ij/r_cut)+1)/2
    else:
        f = 0
    return f

def feature_vector():
    """特征向量的预处理"""
    data = []
    for i in range(len(y)):
        """库伦矩阵降维"""
        sample = xyz(i)
        c = kulun(79, sample)
        a, b = eig_(c)  # a为特征值、b为特征向量
        a = list(a)
        """距离衍生特征"""
        dist = distmatrix(sample)  # 距离矩阵
        mindist, maxdist, meandist, meddist, sumdist = dict_index(dist)  # 距离指标
        a = a + [mindist, maxdist, meandist, meddist, sumdist]
        """复杂网络衍生特征"""
        closematrix = np.where(dist > meandist, 1, 0)  # 计算邻接矩阵
        Gnum, mean_cluster, netrans, mean_degrcenter, mean_closcenter, mean_betwcenter = netgraph_index(closematrix)  # 网络结构指标
        a = a + [Gnum, mean_cluster, netrans, mean_degrcenter, mean_closcenter, mean_betwcenter]
        # """分子性质衍生特征"""
        # a.append(np.average([np.exp(-yita*(np.linalg.norm(np.asarray(sample[j]) - np.asarray(sample[k])-r_s)**2))*
        #                      f_c(np.linalg.norm(np.asarray(sample[j]) - np.asarray(sample[k])))
        #                      for j in range(20) for k in range(20)]))   # 函数G2
        # a.append(np.average([f_c(np.linalg.norm(np.asarray(sample[j]) - np.asarray(sample[k])))
        #                      for j in range(20) for k in range(20)]))   # 函数G1
        data.append(list(a))
    data = np.array(data,dtype=float)
    return data


In [4]:
data = feature_vector()   # 特征向量
X_train,X_test,y_train,y_test = train_test_split(data,y,random_state=42)

"""提取特征值"""
select = SelectPercentile(percentile=100)   # 百分比
select.fit(X_train,y_train)
X_train_selected = select.transform(X_train)
X_test_selected = select.transform(X_test)

  data = np.array(data,dtype=float)


In [5]:

"""线性回归"""
lr = LinearRegression().fit(X_train_selected,y_train)
print('lr.coef_:{}'.format(lr.coef_))
print('lr.intercept_:{}'.format(lr.intercept_))
print('Training set score of lr:{:.2f}'.format(lr.score(X_train_selected,y_train)))
print('Test set score of lr:{:.2f}'.format(lr.score(X_test_selected,y_test)))

lr.coef_:[ 3.21338552e+08  3.21338552e+08  3.21338552e+08  3.21338552e+08
  3.21338552e+08  3.21338552e+08  3.21338552e+08  3.21338552e+08
  3.21338552e+08  3.21338552e+08  3.21338552e+08  3.21338552e+08
  3.21338552e+08  3.21338552e+08  3.21338552e+08  3.21338552e+08
  3.21338552e+08  3.21338552e+08  3.21338552e+08  3.21338552e+08
 -1.54845646e+01 -4.11397632e-01  3.70298215e-03 -4.24050434e-01
  7.03566762e-02  2.71092072e-02 -1.43324327e+00  2.35158384e+00
  1.42679581e-04 -4.50631726e+00  3.26930705e+01]
lr.intercept_:-115151036514786.03
Training set score of lr:0.73
Test set score of lr:0.69


In [6]:

"""岭回归"""
ridge = Ridge(alpha=10).fit(X_train_selected,y_train)
print('ridge.coef_:{}'.format(ridge.coef_))
print('ridge.intercept_:{}'.format(ridge.intercept_))
print('Training set score of ridge:{:.2f}'.format(ridge.score(X_train_selected,y_train)))
print('Test set score of ridge:{:.2f}'.format(ridge.score(X_test_selected,y_test)))

ridge.coef_:[-9.38142679e-04 -5.78438097e-04 -4.01585618e-04 -3.51324750e-04
 -1.82068644e-04 -1.41933710e-04  5.56759115e-06 -9.13914091e-05
 -6.66016665e-05  6.99106321e-05 -2.02200735e-05  1.00464364e-05
  2.16101796e-04 -1.65786276e-04  4.69393203e-04  7.66751161e-05
  4.92301195e-04  3.72938520e-04  2.15715337e-04  1.00884351e-03
 -3.28724705e-01 -4.06204703e-01  2.80796612e-03 -2.42297139e-01
  5.33513563e-02  3.50671839e-04 -6.30838227e-02  2.94650127e-01
  1.84564126e-06 -1.06440096e-01  5.50016804e-03]
ridge.intercept_:-1513.7796319480365
Training set score of ridge:0.72
Test set score of ridge:0.69


In [7]:
"""Lasso回归"""
lasso = Lasso().fit(X_train_selected,y_train)
print('Training set score of lasso:{:.2f}'.format(lasso.score(X_train_selected,y_train)))
print('Test set score of lasso:{:.2f}'.format(lasso.score(X_test_selected,y_test)))
print("Training set MSE of lasso:{:.2f}".format(mean_squared_error(y_train,lasso.predict(X_train_selected))))
print("Test set MSE of lasso:{:.2f}".format(mean_squared_error(y_test,lasso.predict(X_test_selected))))

Training set score of lasso:0.71
Test set score of lasso:0.68
Training set MSE of lasso:2.39
Test set MSE of lasso:2.73


In [8]:

"""随机森林"""
rf = RandomForestRegressor().fit(X_train,y_train)
print('Training set score:{:.2f}'.format(rf.score(X_train,y_train)))
print('Test set score:{:.2f}'.format(rf.score(X_test,y_test)))
print("Training set MSE:{:.2f}".format(mean_squared_error(y_train,rf.predict(X_train))))
print("Test set MSE:{:.2f}".format(mean_squared_error(y_test,rf.predict(X_test))))

Training set score:0.97
Test set score:0.75
Training set MSE:0.26
Test set MSE:2.14


In [9]:

"""神经网络"""
# 引入标准化函数
x_MinMax = preprocessing.MinMaxScaler()
y_MinMax = preprocessing.MinMaxScaler()
y = np.array(y).reshape(len(y),1)
X_ = x_MinMax.fit_transform(X)
y_ = y_MinMax.fit_transform(y)
X_train_, X_test_, y_train_, y_test_ = train_test_split(X_,y_,test_size = 0.2)
mlp = MLPRegressor(solver='adam',activation='relu',random_state=0,hidden_layer_sizes=[10,100],alpha=0.1).fit(X_train_,y_train_)
#计算训练集mse
pred1_train = mlp.predict(X_train_)
mse_1 = mean_squared_error(pred1_train,y_train_)
print ("Train ERROR = ", mse_1)
#计算测试集mse
pred1_test = mlp.predict(X_test_)
mse_2 = mean_squared_error(pred1_test,y_test_)
print ("Test ERROR = ", mse_2)
print('Accuracy on training set:{:.2f}'.format(mlp.score(X_train_,y_train_)))
print('Accuracy on test set:{:.2f}'.format(mlp.score(X_test_,y_test_)))


  return f(**kwargs)


Train ERROR =  0.010741414908111993
Test ERROR =  0.009878760616176674
Accuracy on training set:0.13
Accuracy on test set:0.02
