# NNSL-7-Chr1-HL10
## Neural network using scikit-learn: 7 environments, chromosome 1 held out for cross-validation, 10 hidden layers (decreasing size)


## Import packages

In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt
from yellowbrick.regressor import PredictionError, ResidualsPlot
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Notebook-wide defaults: seaborn theme with colour codes and an A4-landscape
# sized figure, and show up to 999 columns when a DataFrame is displayed.
sns.set(color_codes=True, rc={'figure.figsize': (11.7, 8.27)})
pd.set_option('display.max_columns', 999)



## Neural net

In [None]:
# Load the predictor and target tables. Both files are whitespace-delimited;
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True and parses the files the same way.
predictors = pd.read_csv('Input/Predictors_7locs.csv', sep=r'\s+')
target = pd.read_csv('Input/Target_7locs.csv', sep=r'\s+')

In [None]:
# Drop all entries from the chromosome that is later used for cross-validation.
# `chr` is the prefix of the 'rs' identifier that marks the held-out chromosome.
# NOTE: the name shadows the builtin chr(); it is kept because later cells read it.
chr = "1_"

# The prefix is anchored at the start of the identifier so that the rows removed
# from the training data are exactly the rows the test split selects with its
# anchored match. An unanchored substring search would also remove identifiers
# that merely contain the prefix (e.g. "11_...").
target_chr = target['rs'].str.startswith(chr)
target_minusChr = target[~target_chr]

predictors_chr = predictors['rs'].str.startswith(chr)
predictors_minusChr = predictors[~predictors_chr]

In [None]:
# Training split: the rows that remain after the held-out chromosome was removed.
X_train = predictors_minusChr.iloc[:, 1:].copy()    # every column except the first (rs)
y_train = target_minusChr['rFitness']

# Test split: rows whose rs identifier matches the held-out prefix at its start.
held_out_targets = target.loc[target.rs.str.match(chr)]
y_test = held_out_targets['rFitness']

held_out_predictors = predictors.loc[predictors.rs.str.match(chr)]
X_test = held_out_predictors.iloc[:, 1:].copy()

***

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# MLP regressor with ten hidden layers shrinking from 100 to 10 units in steps
# of 10; parameters are the best ones found by GridSearchCV.
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver='sgd'; no solver is passed here, so confirm 'adaptive' has the
# intended effect.
regr_nn = MLPRegressor(
    hidden_layer_sizes=tuple(range(100, 0, -10)),
    activation='relu',
    learning_rate='adaptive',
    learning_rate_init=0.001,
    random_state=0,
)

In [None]:
# Scaling
# The scaler is fitted on the training predictors only; the same fitted
# transform is then applied to the training and the test predictors.
scaler = StandardScaler()

X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
import warnings

# Copy of the held-out targets with their original row labels exposed as an
# 'index' column, so each row can be assigned to a location below.
y_test2plot = y_test.copy()
y_test2plot = y_test2plot.reset_index()

# Inclusive row-index range of every location block: (label, first, last).
location_ranges = (
    ('MLP', 0, 17280),
    ('MLI', 17282, 34561),
    ('THP', 34563, 51842),
    ('THI', 51844, 69123),
    ('AND', 69125, 86404),
    ('GER', 86406, 103685),
    ('FIN', 103687, 120966),
    ('SPA', 120968, 138247),
    ('UKI', 138249, 155528),
)


def _location_of(row_index):
    """Return the location label whose range contains row_index, or None."""
    for label, first, last in location_ranges:
        if first <= row_index <= last:
            return label
    return None


_matched = [_location_of(x) for x in y_test2plot['index']]

# Indices outside every range keep the historical 'SPA' label so downstream
# cells see the same values, but the fallback is reported instead of being
# applied silently.
_n_unmatched = sum(label is None for label in _matched)
if _n_unmatched:
    warnings.warn(
        '%d test row(s) fall outside all known location ranges and were labelled SPA'
        % _n_unmatched
    )

y_test2plot['locat'] = ['SPA' if label is None else label for label in _matched]
y_test2plot

In [None]:
# Training
# Fit the MLP regressor on the training predictors and training targets.
regr_nn.fit(X_train, y_train)

In [None]:
# Prediction
# Predict the target for the held-out chromosome rows; y_nn is reused later
# when the predictions are put next to the actual values.
y_nn = regr_nn.predict(X_test)
# Last expression of the cell: the regressor's score on the test split
# (the coefficient of determination R^2 for scikit-learn regressors).
regr_nn.score(X_test, y_test)

In [None]:
# Metrics
# Compute both scores first, then append them to the metrics log through a
# single handle that the `with` block flushes and closes once the lines are
# written.
train_score = regr_nn.score(X_train, y_train)
test_score = regr_nn.score(X_test, y_test)
with open('Output/08CV_Metrics.txt', 'a') as metrics_file:
    print('Training set score: %f' % train_score, file=metrics_file)
    print('Test set score: %f' % test_score, file=metrics_file)
#print('Mean of cross validation score: ', scores.mean(), file=open('Output/05_Metrics.txt', 'a'))

In [None]:
# Predict on the test split and append R2 and RMSE to the metrics log.
pr = regr_nn.predict(X_test)
r2_value = r2_score(y_test, pr)
rmse_value = sqrt(mean_squared_error(y_test, pr))
# One handle for both lines; the `with` block flushes and closes it once the
# metrics are written.
with open('Output/08CV_Metrics.txt', 'a') as metrics_file:
    print('R2 score: %f' % r2_value, file=metrics_file)
    print('RMSE score: %f' % rmse_value, file=metrics_file)

# taken from here: https://github.com/Gurpremm/rxnpredict-using-sklearn-python/blob/master/chemistry_rxn_predict.ipynb 

#### Plot results with Yellowbrick
https://www.scikit-yb.org/en/latest/api/regressor/peplot.html

In [None]:
# Residuals plot
# Yellowbrick ResidualsPlot wrapped around the MLP regressor: fitted with the
# training split, then scored on the held-out chromosome and shown.
# NOTE(review): visualizer.fit() may refit regr_nn depending on the
# yellowbrick version — confirm if training time matters.
f = plt.figure()
visualizer = ResidualsPlot(regr_nn)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()
# NOTE(review): the figure was already rendered by visualizer.show();
# presumably this second show() is redundant — confirm.
f.show()
#f.savefig("Output/08CV_Residuals.png", bbox_inches='tight', dpi=600)

In [None]:
# Prediction error plot
# Yellowbrick PredictionError wrapped around the MLP regressor: fitted with the
# training split, then scored on the held-out chromosome and shown.
# NOTE(review): visualizer.fit() may refit regr_nn depending on the
# yellowbrick version — confirm if training time matters.
f = plt.figure()
visualizer = PredictionError(regr_nn)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()
# NOTE(review): the figure was already rendered by visualizer.show();
# presumably this second show() is redundant — confirm.
f.show()
#f.savefig("Output/08CV_PredActual.png", bbox_inches='tight', dpi=600)

In [None]:
# Put the predictions next to the held-out targets (row index, actual value,
# location label), give the four columns their plotting names and save the
# table as CSV.
y_nn2plot = pd.DataFrame(y_nn)
frames_side_by_side = [y_test2plot, y_nn2plot]
df2plot = pd.concat(frames_side_by_side, axis='columns')
df2plot.columns = [
    'index',
    'Actual',
    'Location',
    'Predicted',
]
df2plot
df2plot.to_csv('Input/08CVchr_predictedValues.csv')

In [None]:
# Reload the table of actual and predicted values saved earlier in the notebook.
df2plot = pd.read_csv('Input/08CVchr_predictedValues.csv')

# One fixed colour per location, keyed by name. seaborn orders string hue
# levels by first appearance in the data, so a purely positional palette would
# give a location whichever colour matches its position in the file rather than
# the colour intended for it.
location_colours = {
    'MLP': '#984ea3',
    'MLI': '#006e00',
    'THP': '#ebac23',
    'THI': '#b80058',
    'AND': '#008cf9',
    'SPA': '#ff9287',
    'GER': '#00bbad',
    'FIN': '#878500',
    'UKI': '#5954d6',
}
# The default palette is still set from the same colours, in the same order,
# for any later plot that relies on it.
col = list(location_colours.values())

sns.set(rc={'figure.figsize':(13,10)})
sns.set_style("whitegrid")
sns.set_palette(col)
# NOTE(review): `sizes` only takes effect together with a `size=` variable;
# presumably a fixed marker size (s=20) was intended — confirm.
s = sns.scatterplot(x='Predicted', y='Actual', hue='Location', palette=location_colours,
                    sizes=(20), data=df2plot)
plt.title("Neural network | scikit-learn | 1,000 - 1,000 SNPs", size= 16, pad=25)
plt.suptitle("Actual vs predicted selection coefficients from all locations of chromosome 1", size = 20)
plt.xlabel("Predicted", size=16)
plt.ylabel("Actual", size=16)
# Same fixed range on both axes.
plt.xlim(-1,3)
plt.ylim(-1,3)
plt.setp(s.get_legend().get_texts(), fontsize='16') # for legend text
plt.setp(s.get_legend().get_title(), fontsize='18') # for legend title
#plt.savefig('Output//08CVchr_PredActual.png', bbox_inches='tight')

In [None]:
# Order the rows by location name before the faceted plot below.
df2plot = df2plot.sort_values('Location')

In [None]:
# One fixed colour per location, keyed by name so that every location keeps its
# colour even when some locations are missing from the data (a positional
# palette would shift colours in that case).
location_colours = {
    'AND': '#008cf9',
    'FIN': '#878500',
    'GER': '#00bbad',
    'MLI': '#006e00',
    'MLP': '#984ea3',
    'SPA': '#ff9287',
    'THI': '#b80058',
    'THP': '#ebac23',
    'UKI': '#5954d6',
}
# The default palette is still set from the same colours, in the same order.
col = list(location_colours.values())

# plot separated: one scatter panel per location, at most five panels per row
sns.set_style("whitegrid")
sns.set_palette(col)

p = sns.relplot(
    data=df2plot, x='Predicted', y='Actual',
    col="Location", hue="Location", palette=location_colours,
    kind="scatter", col_wrap=5)
# Leave room above the panels for the figure-level title.
plt.subplots_adjust(top=0.9)
p.fig.suptitle(" NNSL-7-Chr1-HL10 | Actual vs predicted selection coefficients from all locations of chromosome 1", size = 20)

#plt.savefig('Output/08CVchr_PredActual_grid.png', bbox_inches='tight')