# Tuning

In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt
from yellowbrick.regressor import PredictionError, ResidualsPlot
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Notebook-wide display defaults: seaborn theme with colour short-codes enabled,
# 11.7 x 8.27 inch figures, and up to 999 DataFrame columns shown without truncation.
sns.set(
    color_codes=True,
    rc={'figure.figsize': (11.7, 8.27)},
)
pd.set_option('display.max_columns', 999)

In [None]:
# Read the predictor and target tables; both files are whitespace-delimited.
# sep=r'\s+' is the supported equivalent of delim_whitespace=True, which newer
# pandas releases deprecate.
predictors = pd.read_csv('Input/Predictors_2000SNP.csv', sep=r'\s+')
target = pd.read_csv('Input/Target_2000SNP.csv', sep=r'\s+')

# Response variable and feature matrix used by every later cell.
y = target['rFitness']
X = predictors.iloc[:, 1:].copy()    # without rs column (first column dropped)

In [None]:
# True if at least one feature value is missing (reduced over columns, then over the result).
X.isna().any().any()

In [None]:
# Hold out 20 % of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Base estimator with library-default hyper-parameters (only the seed is pinned);
# it is handed to GridSearchCV further down.
regr_nn = MLPRegressor(random_state=0)

In [None]:
# scaling
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits, so the test split does not influence it.
# NOTE(review): the scaler is fitted on the whole training split before the
# cross-validated search below, so each CV validation fold has contributed to
# the scaling statistics; wrapping scaler + model in a Pipeline would avoid that.

scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Sanity check: one flag per array (X_train, X_test, y_train, y_test) that is True
# only when every entry is finite. np.isfinite is False for NaN as well as for
# +/-inf, so this single check covers both cases.
tuple(
    np.all(np.isfinite(part))
    for part in (X_train, X_test, y_train, y_test)
)

In [None]:
# Show the element type of the scaled training features (cell output only, nothing is modified).
X_train.dtype

In [None]:
# Make sure the training features are a plain float64 ndarray.
# np.float128 is not defined on every platform (NumPy only exposes it where the
# C long double is wide enough), so referencing it can raise AttributeError, and
# scikit-learn estimators compute in float64/float32, so extended precision
# would only cost memory.
X_train = np.asarray(X_train, dtype=np.float64)

In [None]:
# Show the element type of the training target (cell output only, nothing is modified).
y_train.dtype

In [None]:
# Convert the training target to a plain float64 ndarray.
# np.float128 is not defined on every platform (NumPy only exposes it where the
# C long double is wide enough), so referencing it can raise AttributeError, and
# scikit-learn estimators compute in float64/float32, so extended precision
# would only cost memory.
y_train = np.asarray(y_train, dtype=np.float64)

In [None]:
# Copy the held-out targets and move their original row labels into an
# 'index' column so every test row can be traced back to its source block.
y_test2plot = y_test.copy().reset_index()

# Row-label ranges of the four source blocks:
#   tarMLP: 0-7756
#   tarMLI: 7757-15513
#   tarTHP: 15514-23270
#   tarTHI: 23271-31027


def _locat_for(row_label):
    """Return the location code for an original row label.

    Labels outside the first three ranges (including anything above 31027
    or below 0) fall through to 'THI'.
    """
    if 0 <= row_label <= 7756:
        return 'MLP'
    if 7757 <= row_label <= 15513:
        return 'MLI'
    if 15514 <= row_label <= 23270:
        return 'THP'
    return 'THI'


y_test2plot['locat'] = [_locat_for(row_label) for row_label in y_test2plot['index']]

In [None]:
# Tuning
# Search grid for the MLP: 3 architectures x 2 activations x 2 initial learning
# rates = 12 candidates.
#
# 'learning_rate' ('constant' / 'adaptive') is intentionally not part of the
# grid: scikit-learn only uses that setting when solver='sgd', and the base
# estimator keeps the default solver. Including it doubled the number of fits
# while producing pairs of identical models.

parameter_space = {
    'hidden_layer_sizes': [(400,), (200, 100), (100,)],
    'activation': ['tanh', 'relu'],
    'learning_rate_init': [0.001, 0.05],
}

#removed 'solver':['sgd', 'adam']
#removed 'alpha':[0.0001, 0.05]
#removed 'learning_rate':['constant','adaptive'] (no effect unless solver='sgd')

In [None]:
# Exhaustive search over parameter_space around the base MLP; number of folds,
# scoring and parallelism are left at the library defaults.
tuned_nn = GridSearchCV(
    estimator=regr_nn,
    param_grid=parameter_space,
)

In [None]:
# Run the cross-validated grid search on the scaled training data.
# NOTE(review): likely the most expensive cell of the notebook, since every
# parameter combination is fitted once per fold.
tuned_nn.fit(X_train, y_train)

In [None]:
# Winning hyper-parameter combination
print('Best parameters found:\n', tuned_nn.best_params_)

# Every candidate: mean cross-validation score with +/- two standard deviations
results = tuned_nn.cv_results_
means, stds = results['mean_test_score'], results['std_test_score']
for params, mean, std in zip(results['params'], means, stds):
    print("%0.3f (+/-%0.03f) for %r" % (mean, 2 * std, params))

# adapted from: https://datascience.stackexchange.com/questions/36049/how-to-adjust-the-hyperparameters-of-mlp-classifier-to-get-more-perfect-performa

In [None]:
# Prediction
# Predictions of the tuned search object on the held-out test features;
# y_nn is reused further down to build the actual-vs-predicted plot frame.
y_nn = tuned_nn.predict(X_test)
# Test-set score, displayed as the cell output.
tuned_nn.score(X_test, y_test)

In [None]:
# Metrics
# Score of the tuned search object on the data it was trained on and on the
# held-out data; a large gap between the two points to over-fitting.
train_score = tuned_nn.score(X_train, y_train)
test_score = tuned_nn.score(X_test, y_test)

print('Training set score: %f' % train_score)
print('Test set score: %f' % test_score)


In [None]:
# Test-set R2 and root-mean-squared error computed from explicit predictions.
pr = tuned_nn.predict(X_test)

r2 = r2_score(y_test, pr)
rmse = sqrt(mean_squared_error(y_test, pr))

print('R2 score: %f' % r2)
print('RMSE score: %f' % rmse)

# taken from here: https://github.com/Gurpremm/rxnpredict-using-sklearn-python/blob/master/chemistry_rxn_predict.ipynb

#### Plot results with Yellowbrick
https://www.scikit-yb.org/en/latest/api/regressor/peplot.html

In [None]:
# Residuals plot
# The visualizer draws on the current figure: fit() is given the training split
# and score() the test split, then show() finalises and renders the plot.
f = plt.figure()
visualizer = ResidualsPlot(tuned_nn)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# visualizer.show() already renders the figure. The additional f.show() that
# used to follow was redundant; with the inline backend selected at the top of
# the notebook it can at most emit a "non-GUI backend" warning.
visualizer.show()
#f.savefig("Output/03_Residuals.png", bbox_inches='tight', dpi=600)

In [None]:
# Prediction error plot
# The visualizer draws on the current figure: fit() is given the training split
# and score() the test split, then show() finalises and renders the plot.
f = plt.figure()
visualizer = PredictionError(tuned_nn)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# visualizer.show() already renders the figure. The additional f.show() that
# used to follow was redundant; with the inline backend selected at the top of
# the notebook it can at most emit a "non-GUI backend" warning.
visualizer.show()
#f.savefig("Output/03_PredActual.png", bbox_inches='tight', dpi=600)

In [None]:
# Put the predictions next to the held-out targets. Both frames carry a fresh
# 0..n-1 index, so the column-wise concat lines the rows up by position.
y_rf2plot = pd.DataFrame(y_nn)
df2plot = pd.concat(
    [y_test2plot, y_rf2plot],
    axis='columns',
)

# Positional rename: original row label, true value, location code, prediction.
df2plot.columns = [
    'index',
    'actual',
    'location',
    'pred',
]
df2plot

In [None]:
# Actual vs. predicted values of the test rows, coloured by location code.
sns.set_palette('Paired')
ax = sns.scatterplot(data=df2plot, x='actual', y='pred', hue='location')
ax.set_title("Actual vs predicted beta values")
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
plt.show()
#plt.savefig('Output/05_PredActual_Color.png', bbox_inches='tight', dpi=600)