# NNSL-2-20-HL20
## NN using scikit-learn, 2 locations, train-test-split (80/20), 20 hidden layers (decreasing size)


## Import packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt
from yellowbrick.regressor import PredictionError, ResidualsPlot
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Notebook-wide display defaults.
# Apply the seaborn theme; color_codes=True remaps matplotlib's one-letter
# colour shorthands to the seaborn palette.
sns.set(color_codes=True)
# Default figure size, in inches (width, height).
sns.set(rc={"figure.figsize": (11.7, 8.27)})
# Let pandas display up to 999 columns before truncating wide frames.
pd.options.display.max_columns = 999



## Neural net

In [None]:
# Load the predictor and target tables. Both files are whitespace-delimited;
# sep=r'\s+' is the documented replacement for delim_whitespace=True, which
# pandas deprecated in version 2.2.
predictors = pd.read_csv('Input/Predictors_2000SNP.csv', sep=r'\s+')
target = pd.read_csv('Input/Target_2000SNP.csv', sep=r'\s+')

# Response variable: the 'rFitness' column of the target table.
y = target['rFitness']
# Feature matrix: every predictor column except the first one (the rs column).
X = predictors.iloc[:, 1:].copy()

In [None]:
# Hold out 20% of the rows as the test split; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# MLP regressor with 20 ReLU hidden layers of decreasing width (100 -> 18).
# Using best parameters found by GridSearchCV.
# NOTE(review): per the scikit-learn docs, learning_rate='adaptive' is only
# used when solver='sgd'; the solver is left at its default here -- confirm
# this is intended.
regr_nn = MLPRegressor(
    hidden_layer_sizes=(
        100, 98, 95, 92, 90, 85, 80, 75, 70, 65,
        60, 55, 50, 45, 40, 35, 30, 25, 20, 18,
    ),
    activation='relu',
    learning_rate='adaptive',
    learning_rate_init=0.001,
    random_state=0,
)

In [None]:
# Scaling
# Learn the standardisation statistics from the training split only, then
# apply that same fitted transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Row-index ranges for each location label:
#   tarMLP  index 0-7756
#   tarMLI  index 7757-15513
#   tarTHP  index 15514-23270
#   tarTHI  index 23271-31027


def _location_of(row_idx):
    """Return the location label for an original row index.

    Any index that falls outside the first three ranges is labelled 'THI'.
    """
    if 0 <= row_idx <= 7756:
        return 'MLP'
    if 7757 <= row_idx <= 15513:
        return 'MLI'
    if 15514 <= row_idx <= 23270:
        return 'THP'
    return 'THI'


# Copy the test targets and move their original row index into an 'index'
# column so that each held-out row can be traced back to its location.
y_test2plot = y_test.copy().reset_index()
y_test2plot['locat'] = [_location_of(row_idx) for row_idx in y_test2plot['index']]
y_test2plot

In [None]:
# Training: fit the MLP regressor on the training split (X_train, y_train).
regr_nn.fit(X_train, y_train)

In [None]:
# Prediction: keep the test-split predictions in y_nn for later use.
y_nn = regr_nn.predict(X_test)
# Score of the fitted model on the test split (shown as the cell output).
regr_nn.score(X_test, y_test)

In [None]:
# Metrics
# Append the training and test scores to the metrics file. The file is opened
# once inside a context manager so the handle is always flushed and closed,
# instead of leaving one unclosed handle per print call.
with open('Output/07b_Metrics.txt', 'a') as metrics_file:
    print('Training set score: %f' % regr_nn.score(X_train, y_train), file=metrics_file)
    print('Test set score: %f' % regr_nn.score(X_test, y_test), file=metrics_file)
#print('Mean of cross validation score: ', scores.mean(), file=open('Output/05_Metrics.txt', 'a'))

In [None]:
# Predict on the test split and append R2 and RMSE to the metrics file.
pr = regr_nn.predict(X_test)
# A single context-managed handle replaces the per-print open() calls, which
# left file handles unclosed.
with open('Output/07b_Metrics.txt', 'a') as metrics_file:
    print('R2 score: %f' % r2_score(y_test, pr), file=metrics_file)
    # RMSE is the square root of the mean squared error.
    print('RMSE score: %f' % sqrt(mean_squared_error(y_test, pr)), file=metrics_file)

# taken from here: https://github.com/Gurpremm/rxnpredict-using-sklearn-python/blob/master/chemistry_rxn_predict.ipynb

#### Plot results with Yellowbrick
https://www.scikit-yb.org/en/latest/api/regressor/peplot.html

In [None]:
# Residuals plot
# Create the figure before the visualizer; presumably the visualizer then
# draws on this figure's current axes -- TODO confirm.
f = plt.figure()
# Wrap the regressor in Yellowbrick's residuals visualizer.
visualizer = ResidualsPlot(regr_nn)
# Fit on the training split, then score on the held-out test split.
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# Render the plot.
visualizer.show()
# NOTE(review): f.show() after visualizer.show() looks redundant -- confirm it is needed.
f.show()
# Saving to disk is currently disabled:
#f.savefig("Output/07b_Residuals.png", bbox_inches='tight')

In [None]:
# Prediction error plot
# Create the figure before the visualizer; presumably the visualizer then
# draws on this figure's current axes -- TODO confirm.
f = plt.figure()
# Wrap the regressor in Yellowbrick's prediction-error visualizer.
visualizer = PredictionError(regr_nn)
# Fit on the training split, then score on the held-out test split.
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
# Render the plot.
visualizer.show()
f.show()
# Write the figure to disk. This must be its own statement: with it fused onto
# the f.show() line the cell does not parse.
f.savefig("Output/07b_PredActual.png", bbox_inches='tight')

In [None]:
# Place the model predictions next to the labelled test targets.
y_rf2plot = pd.DataFrame(y_nn)
df2plot = pd.concat((y_test2plot, y_rf2plot), axis='columns')
# Columns are renamed by position: original row index, observed target,
# location label, predicted value.
df2plot.columns = ['index', 'actual', 'location', 'pred']
df2plot

In [None]:
# Actual vs predicted scatter plot, one colour per location.
sns.set_palette('Paired')
ax = sns.scatterplot(data=df2plot, x='actual', y='pred', hue='location')
# Title and axis labels are set on the axes the scatter plot was drawn on.
ax.set_title("Actual vs predicted beta values")
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
plt.show()
#plt.savefig('Output/07b_PredActual_Color.png', bbox_inches='tight')