# NNSL-7-MLI-HL10
## Neural network using scikit-learn, 7 environments, MLI held out as cross-validation data, 10 hidden layers (decreasing size)

## Import packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt
from yellowbrick.regressor import PredictionError, ResidualsPlot
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(11.7,8.27)})
pd.set_option('display.max_columns', 999)



## Neural net

In [None]:
# Load the predictor and target tables; both files are whitespace-delimited.
# sep=r'\s+' is the supported equivalent of the deprecated
# delim_whitespace=True.
predictors = pd.read_csv('Input/Predictors_7locs.csv', sep=r'\s+')
target = pd.read_csv('Input/Target_7locs.csv', sep=r'\s+')

# Place predictors and targets side by side. Both indexes are reset first so
# the rows are paired by position rather than by whatever labels were read.
entire = pd.concat([predictors.reset_index(drop=True), target.reset_index(drop=True)], axis=1, sort=False)
entire

In [None]:
# Split the full table by location: the 'MLI' rows are held out for
# validation, every other row is used for training.
is_mli = entire["locat"] == 'MLI'
MLI = entire[is_mli]     # chosen environment for CV
rest = entire[~is_mli]   # dataset for training

In [None]:
# Target is the 'rFitness' column; predictors are the columns from position 1
# up to (but excluding) the last three columns of the combined table.
X_train, y_train = rest.iloc[:, 1:-3], rest['rFitness']

# Same column selection for the held-out MLI rows.
X_test, y_test = MLI.iloc[:, 1:-3], MLI['rFitness']

***

In [None]:
# A random split was considered but is not used; the MLI rows are the test set:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Multi-layer perceptron with ten hidden layers shrinking from 100 to 10 units,
# using the best parameters found by GridSearchCV.
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver='sgd'; no solver is passed here, so confirm 'adaptive' has the
# intended effect.
regr_nn = MLPRegressor(
    hidden_layer_sizes=tuple(range(100, 0, -10)),  # (100, 90, ..., 20, 10)
    activation='relu',
    learning_rate='adaptive',
    learning_rate_init=0.001,
    random_state=0,
)

In [None]:
# Scaling
# The scaler is fitted on the training predictors only and then applied to
# both sets, so the held-out MLI rows do not influence the scaling statistics.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Build the plotting frame from the held-out targets. reset_index() moves the
# row labels inherited from `entire` into an 'index' column next to 'rFitness'.
y_test2plot = y_test.reset_index()

# Inclusive row-label ranges of `entire` assigned to each location code.
# NOTE(review): the original notes for this cell also listed
# "UKI: 131896-148382" and left 115409-131895 unassigned, while the code has
# always labelled every row outside the ranges below as 'SPA'. That fallback
# is kept unchanged here; confirm which label rows above 115408 should get.
_LOCATION_RANGES = (
    ('MLP', 0, 16486),
    ('MLI', 16487, 32973),
    ('THP', 32974, 49460),
    ('THI', 49461, 65947),
    ('AND', 65948, 82434),
    ('GER', 82435, 98921),
    ('FIN', 98922, 115408),
)
_FALLBACK_LOCATION = 'SPA'


def _location_for_row(row_label):
    """Return the location code whose inclusive range contains *row_label*.

    Labels that fall in none of the ranges get ``_FALLBACK_LOCATION``.
    """
    for code, first, last in _LOCATION_RANGES:
        if first <= row_label <= last:
            return code
    return _FALLBACK_LOCATION


y_test2plot['locat'] = [_location_for_row(x) for x in y_test2plot['index']]

In [None]:
# Training: fit the MLP regressor on the training predictors and the
# 'rFitness' target, as prepared by the earlier cells.
regr_nn.fit(X_train, y_train)

In [None]:
# Prediction: predicted values for the held-out MLI rows.
y_nn = regr_nn.predict(X_test)
# Displayed as the cell output; for scikit-learn regressors score() is
# documented to return the coefficient of determination (R^2).
regr_nn.score(X_test, y_test)


In [None]:
# Metrics
# Compute both scores first, then append them to the metrics file.
train_score = regr_nn.score(X_train, y_train)
test_score = regr_nn.score(X_test, y_test)

# The 'with' block guarantees the file is flushed and closed; previously each
# print() opened its own handle and never closed it.
with open('Output/08CVenv_Metrics.txt', 'a') as metrics_file:
    print('Training set score: %f' % train_score, file=metrics_file)
    print('Test set score: %f' % test_score, file=metrics_file)
#print('Mean of cross validation score: ', scores.mean(), file=open('Output/05_Metrics.txt', 'a'))

In [None]:
# Predict the held-out MLI rows once and derive both error metrics from it.
pr = regr_nn.predict(X_test)
r2 = r2_score(y_test, pr)
rmse = sqrt(mean_squared_error(y_test, pr))

# Append to the metrics file; the 'with' block closes the handle, which the
# previous per-print open() calls never did.
with open('Output/08CVenv_Metrics.txt', 'a') as metrics_file:
    print('R2 score: %f' % r2, file=metrics_file)
    print('RMSE score: %f' % rmse, file=metrics_file)

# taken from here: https://github.com/Gurpremm/rxnpredict-using-sklearn-python/blob/master/chemistry_rxn_predict.ipynb 

#### Plot results with Yellowbrick
Reference: <https://www.scikit-yb.org/en/latest/api/regressor/peplot.html>

In [None]:
# Residuals plot
# A new figure is created first and kept in `f` so it can be shown/saved below.
f = plt.figure()
# Wrap the regressor in Yellowbrick's residuals visualizer.
visualizer = ResidualsPlot(regr_nn)
# NOTE(review): depending on the Yellowbrick version and its is_fitted
# setting, fit() may refit regr_nn rather than reuse the trained model - confirm.
visualizer.fit(X_train, y_train)
# Evaluate on the held-out MLI rows.
visualizer.score(X_test, y_test)
visualizer.show()
f.show()
# Uncomment to write the figure to disk.
#f.savefig("Output/08CVenv_Residuals.png", bbox_inches='tight', dpi=600)

In [None]:
# Prediction error plot
# A new figure is created first and kept in `f` so it can be shown/saved below.
f = plt.figure()
# Wrap the regressor in Yellowbrick's prediction-error visualizer.
visualizer = PredictionError(regr_nn)
# NOTE(review): depending on the Yellowbrick version and its is_fitted
# setting, fit() may refit regr_nn rather than reuse the trained model - confirm.
visualizer.fit(X_train, y_train)
# Evaluate on the held-out MLI rows.
visualizer.score(X_test, y_test)
visualizer.show()
f.show()
# Uncomment to write the figure to disk.
#f.savefig("Output/08CVenv_PredActual.png", bbox_inches='tight', dpi=600)

In [None]:
# Put the predictions next to the actual values and their location labels.
y_nn2plot = pd.DataFrame(y_nn)
df2plot = pd.concat((y_test2plot, y_nn2plot), axis=1)

# Four columns in concat order: original row label, actual value, location
# label and prediction.
plot_columns = ['index', 'Actual', 'Location', 'Pred']
df2plot.columns = plot_columns

# Persist the table so the plotting cells can reload it without refitting.
df2plot.to_csv('Input/08CVenvMLI_predictedValues.csv')

In [None]:
# Reload the saved actual-vs-predicted table and display it as the cell output.
df2plot = pd.read_csv('Input/08CVenvMLI_predictedValues.csv')
df2plot

In [None]:
# Reload the saved table so this cell can be run on its own.
df2plot = pd.read_csv('Input/08CVenvMLI_predictedValues.csv')

# Single-colour palette for the points.
col =['#006e00']

sns.set(rc={'figure.figsize':(13,10)})
sns.set_style("whitegrid")
sns.set_palette(col)

# Actual vs predicted values for the held-out MLI rows.
# The former `sizes=(20)` argument was dropped: `(20)` is a plain int, and
# seaborn only consults `sizes` when a `size=` variable is given, so it never
# affected the plot.
# NOTE(review): if a fixed marker size was intended, pass `s=20` instead.
s = sns.scatterplot(data=df2plot, x='Pred', y='Actual')
plt.title("Neural network | scikit-learn | 1,000 - 1,000 SNPs", size= 16, pad=25)
plt.suptitle("Actual vs predicted selection coefficients from MLI", size = 20)
plt.xlabel("Predicted", size=16)
plt.ylabel("Actual", size=16)
# Same limits on both axes.
plt.xlim(-1,3)
plt.ylim(-1,3)
# Uncomment to write the figure to disk.
#plt.savefig('Output/08CVenvMLI_PredActual.png', bbox_inches='tight')
