In [None]:
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as ConstantKernel

In [None]:
from sklearn.kernel_ridge import KernelRidge

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Load data and split randomly into training and test sets

In [None]:
# Read the scaled data set; 'itr' is the regression target and 'Interface'
# is dropped from the descriptor matrix.
data = pd.read_excel('./results/data_scaled.xlsx')
y_index = ['itr']
y = data[y_index]
data = data.drop(columns='itr')
X = data.drop(columns='Interface')
X_index = X.columns

In [None]:
# Hold out 20% of the rows as a test set; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Optimize hyperparameters

## Decision tree

In [None]:
# Grid-search tree depth and minimum leaf size, scored by negative MSE.
# random_state is fixed so the search is repeatable and uses the same seed as
# the tree that is later refit with the selected hyperparameters.
# cv=5 is spelled out (as in the other searches) instead of relying on the
# library default, which differs between scikit-learn versions.
estimator = DecisionTreeRegressor(random_state=42)
parameters = {'max_depth': [5, 6, 7, 8, 9, 10, 11], 'min_samples_leaf': [1, 2, 3, 4]}
grid_search = GridSearchCV(estimator, parameters, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score of the decision-tree search
# (negative MSE, so values closer to 0 are better).
grid_search.best_score_

In [None]:
# Hyperparameter combination that achieved grid_search.best_score_.
grid_search.best_params_

Optimized hyperparameters: {'max_depth': 8, 'min_samples_leaf': 2}

In [None]:
# Refit the tree with the selected hyperparameters and rank every descriptor
# by the fitted model's feature_importances_.
dt = DecisionTreeRegressor(max_depth=8, min_samples_leaf=2, random_state=42)
dt.fit(X_train, y_train)

# The ranking is built straight from the fitted model in the same cell.
# Previously `importances` was first an array and then rebound to the sorted
# list in a separate cell, so re-running that cell zipped the column names
# with already-sorted (name, value) tuples and silently produced a wrong order.
X_index = X.columns
importances = sorted(
    zip(X_index, dt.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Show the (descriptor, importance) pairs, most important first.
importances

## Gaussian process regressor

In [None]:
# Gaussian-process search: every alpha is combined with an RBF kernel built
# from each listed length scale; 5-fold CV scored by negative MSE.
estimator = GaussianProcessRegressor(random_state=42)
parameters = {
    'alpha': [0.05, 0.1, 0.5, 1],
    'kernel': [
        RBF(length_scale)
        for length_scale in (0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1)
    ],
}
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score of the Gaussian-process search
# (negative MSE, so values closer to 0 are better).
grid_search.best_score_

In [None]:
# Hyperparameter combination that achieved grid_search.best_score_.
grid_search.best_params_

optimized hyperparameters: {'alpha': 0.5, 'kernel': RBF(length_scale=0.5)}

## Kernel ridge regressor

In [None]:
# Kernel-ridge search over a 13 x 13 logarithmic grid, 5-fold CV, negative MSE.
estimator = KernelRidge(kernel='rbf')
alpha_range = np.logspace(-2, 10, num=13)  # 1e-2 ... 1e10, one value per decade
gamma_range = np.logspace(-9, 3, num=13)   # 1e-9 ... 1e3, one value per decade
parameters = dict(alpha=alpha_range, gamma=gamma_range)
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score of the kernel-ridge search
# (negative MSE, so values closer to 0 are better).
grid_search.best_score_

In [None]:
# Hyperparameter combination that achieved grid_search.best_score_.
grid_search.best_params_

optimized hyperparameters: {'alpha': 0.01, 'gamma': 0.1}

## K-nearest neighbors

In [None]:
# K-nearest-neighbours search: neighbour count 1-10, both weighting schemes,
# and Minkowski exponent p from 1 to 5; 5-fold CV scored by negative MSE.
estimator = KNeighborsRegressor()
parameters = {
    'n_neighbors': list(range(1, 11)),
    'weights': ['uniform', 'distance'],
    'p': list(range(1, 6)),
}
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [None]:
# Best mean cross-validated score of the k-nearest-neighbours search
# (negative MSE, so values closer to 0 are better).
grid_search.best_score_

In [None]:
# Hyperparameter combination that achieved grid_search.best_score_.
grid_search.best_params_

optimized hyperparameters: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}

# Performance on the random train/test split using descriptors selected by the decision tree

In [None]:
def performance(estimator, X_train, y_train, X_test, y_test):
    """Fit ``estimator`` on the training split and print its scores on both splits.

    Parameters
    ----------
    estimator : object
        Model exposing ``fit``, ``score`` and ``predict``. It is fitted in
        place, so any earlier fit held by the object is overwritten.
    X_train, y_train
        Features and targets used for fitting and for the training metrics.
    X_test, y_test
        Held-out features and targets used for the test metrics.

    Returns
    -------
    None
        The four metrics are printed on a single line, not returned.
    """
    estimator.fit(X_train, y_train)

    # ``score`` is the estimator's own metric (R^2 for scikit-learn regressors).
    R_square_train = estimator.score(X_train, y_train)
    R_square_test = estimator.score(X_test, y_test)

    # Metric functions take (y_true, y_pred). MSE is symmetric, so the values
    # are unchanged, but the documented order keeps the call correct if the
    # metric is ever swapped for an asymmetric one.
    y_train_predict = estimator.predict(X_train)
    mse_train = mean_squared_error(y_train, y_train_predict)
    y_test_predict = estimator.predict(X_test)
    mse_test = mean_squared_error(y_test, y_test_predict)

    print('R_square_train is %f, R_square_test is %f, mse_train is %f, mse_test is %f' % (
        R_square_train, R_square_test, mse_train, mse_test))

## All descriptors

In [None]:
# The four models, each configured with the hyperparameters recorded from its
# grid search.
dt = DecisionTreeRegressor(random_state=42, max_depth=8, min_samples_leaf=2)
gpr = GaussianProcessRegressor(kernel=RBF(length_scale=0.5), alpha=0.5, random_state=42)
krr = KernelRidge(alpha=0.01, gamma=0.1, kernel='rbf')
knn = KNeighborsRegressor(weights='distance', n_neighbors=3, p=1)

In [None]:
# Fit and report every model on the full descriptor set
# (output order: dt, gpr, krr, knn).
for model in (dt, gpr, krr, knn):
    performance(model, X_train, y_train, X_test, y_test)

## Top20 descriptors

In [None]:
# Keep the 20 highest-ranked descriptors and re-evaluate every model on them
# (output order: dt, gpr, krr, knn).
X_index_selected = [name for name, _ in importances[:20]]
X_train_, X_test_ = X_train[X_index_selected], X_test[X_index_selected]

for model in (dt, gpr, krr, knn):
    performance(model, X_train_, y_train, X_test_, y_test)

## Top15 descriptors

In [None]:
# Keep the 15 highest-ranked descriptors and re-evaluate every model on them
# (output order: dt, gpr, krr, knn).
X_index_selected = [name for name, _ in importances[:15]]
X_train_, X_test_ = X_train[X_index_selected], X_test[X_index_selected]

for model in (dt, gpr, krr, knn):
    performance(model, X_train_, y_train, X_test_, y_test)

## Top10 descriptors

In [None]:
# Keep the 10 highest-ranked descriptors and re-evaluate every model on them
# (output order: dt, gpr, krr, knn).
X_index_selected = [name for name, _ in importances[:10]]
X_train_, X_test_ = X_train[X_index_selected], X_test[X_index_selected]

for model in (dt, gpr, krr, knn):
    performance(model, X_train_, y_train, X_test_, y_test)

## Top5 descriptors

In [None]:
# Keep the 5 highest-ranked descriptors and re-evaluate every model on them
# (output order: dt, gpr, krr, knn).
X_index_selected = [name for name, _ in importances[:5]]
X_train_, X_test_ = X_train[X_index_selected], X_test[X_index_selected]

for model in (dt, gpr, krr, knn):
    performance(model, X_train_, y_train, X_test_, y_test)

# Performance on the random train/test split using descriptors selected by the decision tree and univariate testing

In [None]:
# Descriptor names used for the 15-descriptor evaluation.
dtuv_15 = [
    'T', 'fAC1x', 'fENc', 'fEb', 'fdensity',
    'fheatcap', 'fmelt', 'fthick', 'funit', 'sAC1y',
    'sENc', 'sIPc', 'sheatcap', 'smelt', 'sunit',
]
# Five-descriptor list; every entry also appears in dtuv_15.
dtuv_5 = [
    'fheatcap',
    'fmelt',
    'funit',
    'sENc',
    'sheatcap',
]

## DTUV_15

In [None]:

# Restrict both splits to the dtuv_15 descriptors and re-evaluate every model
# (output order: dt, gpr, krr, knn).
X_train_, X_test_ = X_train[dtuv_15], X_test[dtuv_15]

for model in (dt, gpr, krr, knn):
    performance(model, X_train_, y_train, X_test_, y_test)

## DTUV_5

In [None]:

# Restrict both splits to the dtuv_5 descriptors and re-evaluate every model
# (output order: dt, gpr, krr, knn).
X_train_, X_test_ = X_train[dtuv_5], X_test[dtuv_5]

for model in (dt, gpr, krr, knn):
    performance(model, X_train_, y_train, X_test_, y_test)