In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

In [None]:
import random

In [None]:
# Disable truncation: DataFrame displays show every column and every row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

# Load data

In [None]:
# Read the modelling table from the Excel workbook.  The cells below expect it
# to provide an 'Interface' column, an 'itr' column and the descriptor columns.
# NOTE(review): the file name suggests the values were scaled upstream - confirm.
data = pd.read_excel('./results/data_scaled.xlsx')

# Group validation

In [None]:
# Number of rows recorded for each distinct value of the 'Interface' column.
counts = data['Interface'].value_counts()

In [None]:
# Plain {interface: row count} mapping, used for lookups while building the folds.
interface_counts = dict(counts)

In [None]:
# Distinct interface labels (the mapping's keys), in the mapping's order.
interfaces = list(interface_counts)

In [None]:
# Shuffle with a dedicated, seeded generator so the interface groups (and hence
# the CV folds and every score derived from them) are reproducible.
# Rebuilding the list from interface_counts first keeps this cell idempotent:
# re-running it yields the same order instead of shuffling an already shuffled
# list.
# NOTE(review): the seeded order will differ from whatever unseeded order
# produced previously recorded results - re-run the grid searches once.
interfaces = list(interface_counts)
random.Random(42).shuffle(interfaces)

In [None]:
# Placeholder for the list of held-out interface groups.
# NOTE(review): the next cell assigns `test = []` again before filling it, so
# this cell is redundant.
test = []

In [None]:
# Greedily pack the shuffled interfaces into held-out groups: keep adding
# interfaces to the current group until it covers at least MIN_GROUP_ROWS data
# rows, then close it and start a new one.  Each group becomes the test side
# of one cross-validation fold.
MIN_GROUP_ROWS = 140  # minimum number of data rows per held-out group
curr_count = 0
curr = []
test = []
for interface in interfaces:
    curr_count += interface_counts[interface]
    curr.append(interface)
    if curr_count >= MIN_GROUP_ROWS:
        curr_count = 0
        test.append(curr)
        print(curr)
        curr = []
# Keep the left-over interfaces as a final (smaller) group, but only when there
# are any: an empty group would produce a fold with an empty test set, which
# breaks both GridSearchCV and the manual evaluation loop.
if curr:
    test.append(curr)

In [None]:
# Build one [train, test] split per held-out interface group.
# The indices are *positions* derived from a boolean mask rather than labels
# taken from data.index: both GridSearchCV and the .iloc lookups used later
# interpret them positionally, so label-based indices are only correct while
# `data` carries the default 0..n-1 index.
cv = []
for test_interface in test:
    in_test = data.Interface.isin(test_interface).values
    train_index = np.flatnonzero(~in_test)
    test_index = np.flatnonzero(in_test)
    cv.append([train_index, test_index])

In [None]:
# Target: the 'itr' column.  Features: every other column except the
# 'Interface' label, which is only used to form the CV groups.
y = data['itr']
X = data.drop(columns=['Interface', 'itr'])

# Optimize hyperparameters

## Decision tree

In [None]:
# Grid-search tree depth and minimum leaf size on the interface-grouped folds.
# random_state is pinned because the tree permutes features at every split, so
# ties between equally good splits would otherwise be broken differently from
# run to run; 42 matches the seed used for the final decision tree further down.
# NOTE(review): max_depth skips 4 - confirm this is intentional.
estimator = DecisionTreeRegressor(random_state=42)
parameters = {'max_depth':[2, 3, 5, 6, 7, 8, 9], 'min_samples_leaf':[1, 2, 3, 4, 5, 6, 7]}
grid_search = GridSearchCV(estimator, parameters, scoring='neg_mean_squared_error', cv=cv)
grid_search.fit(X, y)

In [None]:
# Best mean cross-validated score of the decision-tree search; the scorer is a
# negated MSE, so values closer to 0 are better.
grid_search.best_score_

In [None]:
# Hyperparameter combination that achieved the best decision-tree score.
grid_search.best_params_

optimized hyperparameter: {'max_depth': 9, 'min_samples_leaf': 5}

## Gaussian process regressor

In [None]:
# Grid-search the noise term (alpha) and the RBF length scale of the Gaussian
# process on the interface-grouped folds.
# NOTE(review): the regressor keeps its default optimizer, so the length scales
# listed here are presumably only starting values that get re-fitted during
# fit() - confirm this is the intended search.
estimator = GaussianProcessRegressor(random_state=42)
length_scales = (0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100)
parameters = {
    'alpha': [0.05, 0.1, 0.5, 1, 5, 10],
    'kernel': [RBF(scale) for scale in length_scales],
}
grid_search = GridSearchCV(estimator, parameters, cv=cv, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

In [None]:
# Best mean cross-validated score of the Gaussian-process search (negated MSE).
grid_search.best_score_

In [None]:
# Hyperparameter combination that achieved the best Gaussian-process score.
grid_search.best_params_

optimized hyperparameter: {'alpha': 1, 'kernel': RBF(length_scale=0.5)}

## Kernel ridge regressor

In [None]:
# Grid-search the regularisation strength (alpha) and the RBF kernel
# coefficient (gamma) of kernel ridge regression on the grouped folds.
alpha_range = [0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
gamma_range = [0.0001, 0.0005, 0.01, 0.05, 0.1, 0.5, 1]
estimator = KernelRidge(kernel='rbf')
parameters = dict(alpha=alpha_range, gamma=gamma_range)
grid_search = GridSearchCV(
    estimator,
    parameters,
    cv=cv,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X, y)

In [None]:
# Best mean cross-validated score of the kernel-ridge search (negated MSE).
grid_search.best_score_

In [None]:
# Hyperparameter combination that achieved the best kernel-ridge score.
grid_search.best_params_

optimized hyperparameter: {'alpha': 0.05, 'gamma': 0.01}

## K-nearest neighbors

In [None]:
# Grid-search the neighbourhood size (1-19), the weighting scheme and the
# Minkowski exponent p (1-5) of the k-NN regressor on the grouped folds.
estimator = KNeighborsRegressor()
parameters = {
    'n_neighbors': list(range(1, 20)),
    'weights': ['uniform', 'distance'],
    'p': list(range(1, 6)),
}
grid_search = GridSearchCV(estimator, parameters, cv=cv, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

In [None]:
# Best mean cross-validated score of the k-NN search (negated MSE).
grid_search.best_score_

In [None]:
# Hyperparameter combination that achieved the best k-NN score.
grid_search.best_params_

optimized hyperparameter: {'n_neighbors': 6, 'p': 1, 'weights': 'uniform'}

# Performance of group validation by descriptors selected by decision tree

In [None]:
# Final models, configured with the hyperparameters noted after each grid
# search above.
dt = DecisionTreeRegressor(random_state=42, max_depth=9, min_samples_leaf=5)
gpr = GaussianProcessRegressor(kernel=RBF(length_scale=0.5), alpha=1, random_state=42)
krr = KernelRidge(alpha=0.05, gamma=0.01, kernel='rbf')
knn = KNeighborsRegressor(n_neighbors=6, weights='uniform', p=1)

In [None]:
# Per-model MSE histories for the importance-ranked descriptor subsets; each
# evaluation section below appends one value to every list.
dt_dt, gpr_dt, krr_dt, knn_dt = [], [], [], []

In [None]:
# Hard-coded (descriptor name, importance) pairs listed in descending order of
# importance; the "TopN" sections below take the first N names from this list.
# NOTE(review): presumably the feature_importances_ of a fitted decision tree
# (per the section title) - the code that produced them is not in this
# notebook, so confirm they still match the current data.
importances=[('fmelt', 0.5097206136660818),
 ('fthick', 0.1826317239645447),
 ('sheatcap', 0.10621889559441318),
 ('fheatcap', 0.0793520891284305),
 ('sdensity', 0.024888283196101767),
 ('funit', 0.024091762378890956),
 ('sAC1y', 0.021371980756532648),
 ('T', 0.016594557893137206),
 ('sENc', 0.01374813695858899),
 ('sunit', 0.008929258409001887),
 ('interlayer', 0.004503754650566429),
 ('fAC1x', 0.003071434986695499),
 ('fEb', 0.0030248751201656047),
 ('sAC2x', 0.0005730668760265949),
 ('fIPa', 0.0005344525219175029),
 ('sIPc', 0.00044787622369259444),
 ('fENc', 0.00023319897367195038),
 ('sR1', 3.465008293823693e-05),
 ('smelt', 2.9224069554527495e-05),
 ('fdensity', 1.645490476126332e-07),
 ('fmass', 0.0),
 ('sEb', 0.0),
 ('smass', 0.0),
 ('fAC1y', 0.0),
 ('fAC2x', 0.0),
 ('fAC2y', 0.0),
 ('fIPc', 0.0),
 ('sAC1x', 0.0),
 ('sAC2y', 0.0),
 ('sIPa', 0.0),
 ('fR1', 0.0),
 ('fR2', 0.0),
 ('fENa', 0.0),
 ('sR2', 0.0),
 ('sENa', 0.0)]

In [None]:
def performance(best_estimator, X, target=None, folds=None):
    """Evaluate an estimator on a fixed set of train/test folds.

    For every fold the estimator is re-fitted on the training rows and scored
    on the held-out rows; the per-fold scores are then averaged.

    Parameters
    ----------
    best_estimator : estimator exposing ``fit``, ``predict`` and ``score``
        Fitted in place on each fold, so after the call it holds the fit of
        the last fold.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    target : pandas.Series, optional
        Target values aligned row-for-row with ``X``.  Defaults to the
        notebook-level ``y``.
    folds : iterable of (train_index, test_index), optional
        Positional row indices of each split.  Defaults to the notebook-level
        ``cv``.

    Returns
    -------
    (float, float)
        Mean of ``best_estimator.score`` over the folds (R^2 for scikit-learn
        regressors) and mean test MSE over the folds.

    Raises
    ------
    ValueError
        If there are no folds to evaluate.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    target = y if target is None else target
    folds = list(cv if folds is None else folds)
    if not folds:
        raise ValueError("performance() needs at least one (train, test) fold")

    r_squares = []
    mean_squared_errors = []
    for train_index, test_index in folds:
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        best_estimator.fit(X_train, y_train)
        r_squares.append(best_estimator.score(X_test, y_test))
        y_test_predict = best_estimator.predict(X_test)
        # scikit-learn metrics take (y_true, y_pred); MSE is symmetric, but the
        # conventional order stays correct if the metric is ever swapped.
        mean_squared_errors.append(mean_squared_error(y_test, y_test_predict))

    r_square = sum(r_squares) / len(r_squares)
    mse = sum(mean_squared_errors) / len(mean_squared_errors)
    return r_square, mse

In [None]:
# Baseline: score every model on the full descriptor matrix X, then record the
# fold-averaged MSEs as the first entry of the per-model result lists.
# NOTE(review): re-running this cell appends a duplicate entry to each list.
r_square_dt, mse_dt = performance(dt, X)
r_square_gpr, mse_gpr = performance(gpr, X)
r_square_krr, mse_krr = performance(krr, X)
r_square_knn, mse_knn = performance(knn, X)
dt_dt.append(mse_dt)
gpr_dt.append(mse_gpr)
krr_dt.append(mse_krr)
knn_dt.append(mse_knn)

In [None]:
# Fold-averaged R^2 of the decision tree on all descriptors.
r_square_dt

In [None]:
# Fold-averaged MSE of the decision tree on all descriptors.
mse_dt

In [None]:
# Fold-averaged R^2 of the Gaussian process on all descriptors.
r_square_gpr

In [None]:
# Fold-averaged MSE of the Gaussian process on all descriptors.
mse_gpr

In [None]:
# Fold-averaged R^2 of the k-NN regressor on all descriptors.
r_square_knn

In [None]:
# Fold-averaged MSE of the k-NN regressor on all descriptors.
mse_knn

In [None]:
# Fold-averaged R^2 of kernel ridge regression on all descriptors.
r_square_krr

In [None]:
# Fold-averaged MSE of kernel ridge regression on all descriptors.
mse_krr

## Top20 descriptors

In [None]:
# Keep only the 20 highest-ranked descriptors and re-evaluate all four models
# on the same folds.
X_index_20 = [name for name, _ in importances[:20]]
X_selected_20 = X.loc[:, X_index_20]
r_square_dt, mse_dt = performance(dt, X_selected_20)
r_square_gpr, mse_gpr = performance(gpr, X_selected_20)
r_square_krr, mse_krr = performance(krr, X_selected_20)
r_square_knn, mse_knn = performance(knn, X_selected_20)

In [None]:
# Record the top-20 MSEs as the next entry of each per-model result list.
# NOTE(review): re-running this cell appends a duplicate entry.
dt_dt.append(mse_dt)
gpr_dt.append(mse_gpr)
krr_dt.append(mse_krr)
knn_dt.append(mse_knn)

In [None]:
# Fold-averaged MSE of the decision tree on the top-20 descriptors.
mse_dt

In [None]:
# Fold-averaged MSE of the Gaussian process on the top-20 descriptors.
mse_gpr

In [None]:
# Fold-averaged MSE of the k-NN regressor on the top-20 descriptors.
mse_knn

In [None]:
# Fold-averaged MSE of kernel ridge regression on the top-20 descriptors.
mse_krr

## Top15 descriptors

In [None]:
# Keep only the 15 highest-ranked descriptors and re-evaluate all four models
# on the same folds.
X_index_15 = [name for name, _ in importances[:15]]
X_selected_15 = X.loc[:, X_index_15]
r_square_dt, mse_dt = performance(dt, X_selected_15)
r_square_gpr, mse_gpr = performance(gpr, X_selected_15)
r_square_krr, mse_krr = performance(krr, X_selected_15)
r_square_knn, mse_knn = performance(knn, X_selected_15)

In [None]:
# Record the top-15 MSEs as the next entry of each per-model result list.
# NOTE(review): re-running this cell appends a duplicate entry.
dt_dt.append(mse_dt)
gpr_dt.append(mse_gpr)
krr_dt.append(mse_krr)
knn_dt.append(mse_knn)

In [None]:
# Fold-averaged MSE of the decision tree on the top-15 descriptors.
mse_dt

In [None]:
# Fold-averaged MSE of the Gaussian process on the top-15 descriptors.
mse_gpr

In [None]:
# Fold-averaged MSE of the k-NN regressor on the top-15 descriptors.
mse_knn

In [None]:
# Fold-averaged MSE of kernel ridge regression on the top-15 descriptors.
mse_krr

## Top10 descriptors

In [None]:
# Keep only the 10 highest-ranked descriptors and re-evaluate all four models
# on the same folds.
X_index_10 = [name for name, _ in importances[:10]]
X_selected_10 = X.loc[:, X_index_10]
r_square_dt, mse_dt = performance(dt, X_selected_10)
r_square_gpr, mse_gpr = performance(gpr, X_selected_10)
r_square_krr, mse_krr = performance(krr, X_selected_10)
r_square_knn, mse_knn = performance(knn, X_selected_10)

In [None]:
# Record the top-10 MSEs as the next entry of each per-model result list.
# NOTE(review): re-running this cell appends a duplicate entry.
dt_dt.append(mse_dt)
gpr_dt.append(mse_gpr)
krr_dt.append(mse_krr)
knn_dt.append(mse_knn)

In [None]:
# Fold-averaged MSE of the decision tree on the top-10 descriptors.
mse_dt

In [None]:
# Fold-averaged MSE of the Gaussian process on the top-10 descriptors.
mse_gpr

In [None]:
# Fold-averaged MSE of kernel ridge regression on the top-10 descriptors.
mse_krr

In [None]:
# Fold-averaged MSE of the k-NN regressor on the top-10 descriptors.
mse_knn

## Top5 descriptors

In [None]:
# Keep only the 5 highest-ranked descriptors and re-evaluate all four models
# on the same folds.
X_index_5 = [name for name, _ in importances[:5]]
X_selected_5 = X.loc[:, X_index_5]
r_square_dt, mse_dt = performance(dt, X_selected_5)
r_square_gpr, mse_gpr = performance(gpr, X_selected_5)
r_square_krr, mse_krr = performance(krr, X_selected_5)
r_square_knn, mse_knn = performance(knn, X_selected_5)

In [None]:
# Record the top-5 MSEs as the next entry of each per-model result list.
# NOTE(review): re-running this cell appends a duplicate entry.
dt_dt.append(mse_dt)
gpr_dt.append(mse_gpr)
krr_dt.append(mse_krr)
knn_dt.append(mse_knn)

In [None]:
# Fold-averaged MSE of the decision tree on the top-5 descriptors.
mse_dt

In [None]:
# Fold-averaged MSE of the Gaussian process on the top-5 descriptors.
mse_gpr

In [None]:
# Fold-averaged MSE of kernel ridge regression on the top-5 descriptors.
mse_krr

In [None]:
# Fold-averaged MSE of the k-NN regressor on the top-5 descriptors.
mse_knn

# Performance of group validation by descriptors selected by decision tree and univariate testing

In [None]:
# Hard-coded descriptor subsets (15 and 5 column names of X) evaluated in the
# two sections below.
# NOTE(review): per the section title these come from combining decision-tree
# importances with univariate testing; the selection code is not in this
# notebook - confirm the lists are current.
dtuv_15 = ['T',
 'fAC1x',
 'fENc',
 'fEb',
 'fdensity',
 'fheatcap',
 'fmelt',
 'fthick',
 'funit',
 'sAC1y',
 'sENc',
 'sIPc',
 'sheatcap',
 'smelt',
 'sunit']
dtuv_5 = ['fheatcap', 'fmelt', 'funit', 'sENc', 'sheatcap']

In [None]:
# Per-model MSE histories for the dtuv descriptor subsets; each evaluation
# section below appends one value to every list.
dt_dtuv, gpr_dtuv, knn_dtuv, krr_dtuv = [], [], [], []

## Performance of dtuv_15

In [None]:
# Restrict X to the 15 dtuv descriptors and re-evaluate all four models.
# NOTE(review): this rebinds X_selected_15, which the Top15 section also
# assigns; from here on the name refers to the dtuv selection.
X_selected_15 = X.loc[:, dtuv_15]
r_square_dt, mse_dt = performance(dt, X_selected_15)
r_square_gpr, mse_gpr = performance(gpr, X_selected_15)
r_square_krr, mse_krr = performance(krr, X_selected_15)
r_square_knn, mse_knn = performance(knn, X_selected_15)

In [None]:
# Record the dtuv_15 MSEs in the per-model dtuv result lists.
# NOTE(review): re-running this cell appends a duplicate entry.
dt_dtuv.append(mse_dt)
gpr_dtuv.append(mse_gpr)
knn_dtuv.append(mse_knn)
krr_dtuv.append(mse_krr)

In [None]:
# Fold-averaged MSE of the decision tree on the dtuv_15 descriptors.
mse_dt

In [None]:
# Fold-averaged MSE of the Gaussian process on the dtuv_15 descriptors.
mse_gpr

In [None]:
# Fold-averaged MSE of kernel ridge regression on the dtuv_15 descriptors.
mse_krr

In [None]:
# Fold-averaged MSE of the k-NN regressor on the dtuv_15 descriptors.
mse_knn

## Performance of dtuv_5

In [None]:
# Restrict X to the 5 dtuv descriptors and re-evaluate all four models.
# NOTE(review): this rebinds X_selected_5, which the Top5 section also
# assigns; from here on the name refers to the dtuv selection.
X_selected_5 = X.loc[:, dtuv_5]
r_square_dt, mse_dt = performance(dt, X_selected_5)
r_square_gpr, mse_gpr = performance(gpr, X_selected_5)
r_square_krr, mse_krr = performance(krr, X_selected_5)
r_square_knn, mse_knn = performance(knn, X_selected_5)

In [None]:
# Record the dtuv_5 MSEs in the per-model dtuv result lists.
# NOTE(review): re-running this cell appends a duplicate entry.
dt_dtuv.append(mse_dt)
gpr_dtuv.append(mse_gpr)
knn_dtuv.append(mse_knn)
krr_dtuv.append(mse_krr)

In [None]:
# Fold-averaged MSE of the decision tree on the dtuv_5 descriptors.
mse_dt

In [None]:
# Fold-averaged MSE of the Gaussian process on the dtuv_5 descriptors.
mse_gpr

In [None]:
# Fold-averaged MSE of kernel ridge regression on the dtuv_5 descriptors.
mse_krr

In [None]:
# Fold-averaged MSE of the k-NN regressor on the dtuv_5 descriptors.
mse_knn

In [None]:
# Decision-tree MSEs for the importance-ranked subsets; when the notebook is
# run once top to bottom the order is: all, top-20, top-15, top-10, top-5.
dt_dt

In [None]:
# Gaussian-process MSEs for the importance-ranked subsets (same order:
# all, top-20, top-15, top-10, top-5 on a single top-to-bottom run).
gpr_dt

In [None]:
# Kernel-ridge MSEs for the importance-ranked subsets (same order:
# all, top-20, top-15, top-10, top-5 on a single top-to-bottom run).
krr_dt

In [None]:
# k-NN MSEs for the importance-ranked subsets (same order:
# all, top-20, top-15, top-10, top-5 on a single top-to-bottom run).
knn_dt

In [None]:
# Decision-tree MSEs for the dtuv subsets; on a single top-to-bottom run the
# order is: dtuv_15, dtuv_5.
dt_dtuv

In [None]:
# Gaussian-process MSEs for the dtuv subsets (order: dtuv_15, dtuv_5).
gpr_dtuv

In [None]:
# Kernel-ridge MSEs for the dtuv subsets (order: dtuv_15, dtuv_5).
krr_dtuv

In [None]:
# k-NN MSEs for the dtuv subsets (order: dtuv_15, dtuv_5).
knn_dtuv