# RF based on 10K randomly selected SNPs


## Import packages

In [None]:
import random
import pandas as pd
pd.set_option('display.max_columns', 999)

import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Settings for seaborn
# Configure everything in one call: every `sns.set(...)` re-applies its default
# "darkgrid" style, so a later `sns.set(rc=...)` would silently undo an
# earlier `sns.set_style("whitegrid")`.
sns.set(style="whitegrid", color_codes=True, rc={'figure.figsize': (13, 10)})

## Select random 10k SNPs
Extract SNP list from beta table, generate list of random 10k SNPs & create new beta table:

In [None]:
# Load the tab-separated beta table, keep the `.assoc_y` version of
# clim-bio18 under its plain name and drop the `.assoc_x` version.
beta_table_path = '/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_55climvars_rFit.txt'
df = (
    pd.read_csv(beta_table_path, sep='\t')
    .rename(columns={'clim-bio18.assoc_y': 'clim-bio18'})
    .drop(columns=['clim-bio18.assoc_x'])
)
df

In [None]:
# Draw a random subset of SNP ids (without replacement), save the list, and
# write the matching rows of the beta table to a new file.
mySNPs = df['rs'].tolist()
numberToSelect = 10000
# random.sample raises ValueError if fewer than `numberToSelect` ids exist.
randomSNPs = random.sample(mySNPs, numberToSelect)

dfSNPs = pd.DataFrame({'randomSNPs': randomSNPs})
dfSNPs.to_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/randomSNPsList.txt', sep='\t', index=False)

# 'rs' stays a regular column: the filter below selects on df['rs'].
# (A bare `df.set_index('rs')` used to sit here; its result was discarded, so
# it only cost a full copy of the table.)
randomDF = df.loc[df['rs'].isin(randomSNPs)]
randomDF.to_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/randomSNPs_55climvars_4rFit.txt', sep='\t', index=False)

In [None]:
# Display the rows of the beta table whose 'rs' id is in the random sample
randomDF

In [None]:
# Display the one-column table of sampled SNP ids
dfSNPs

### Prepare the target dataset


In [None]:
def _betas_for_location(table, fitness_column, location):
    """Return the 'rs' ids plus one fitness column, renamed to 'beta' and tagged with `location`.

    The result has the columns ['rs', 'beta', 'locat'] and is independent of
    `table` (no shared data is modified).
    """
    subset = table[['rs', fitness_column]].rename(columns={fitness_column: 'beta'})
    return subset.assign(locat=location)


# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/randomSNPs_55climvars_4rFit.txt', sep=r'\s+')

dfmlp = _betas_for_location(df, 'rFitness2_mlp', 'MLP')
dfmli = _betas_for_location(df, 'rFitness2_mli', 'MLI')
dfthi = _betas_for_location(df, 'rFitness2_thi', 'THI')
dfthp = _betas_for_location(df, 'rFitness2_thp', 'THP')

# Stack the four locations in the order MLP, MLI, THI, THP with a fresh
# 0..N-1 index. pd.concat replaces DataFrame.append, which no longer exists
# in pandas >= 2.0.
target = pd.concat([dfmlp, dfmli, dfthi, dfthp], ignore_index=True, sort=False)
target

### Prepare the predictors dataset
#### Select the climate data
Get closest accession to Madrid and Tübingen --> Manual selection from 2029gaccessions.tsv: 

- Tuebingen 48.544886 9.043042: **1813** 9792 9792 0 0 0 1 Lu4-2 GER 48.54 9.09 0 NA CS77058 Arabidopsis thaliana NA 10
- Madrid 40.408049 -3.835350: **1845** 9825 9825 0 0 0 1 IP-Boa-0 Spain 40.4 -3.88 Carlos Alonso-Blanco NA CS76714 Arabidopsis thaliana NA 134



<img src='Tuebingen.png' width="600">

<img src='Madrid.png' width="600">

#### Prepare predictors
In a stepwise manner:
- predictors part 1 repeats the table of the 10k random SNPs four times (once per location block of the target) and drops the four `rFitness2_*` columns, which serve as the target.
- the annotation information is then joined onto predictors part 1 and numerically encoded
- predictors part 2 is generated from the climate data

In [None]:
# Predictors part 1
# Repeat the 10k-SNP table four times, drop the four fitness columns (they
# are the target) and save the result.

# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
df = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/randomSNPs_55climvars_4rFit.txt', sep=r'\s+')
pred1 = pd.concat([df] * 4, ignore_index=True)
pred1 = pred1.drop(columns=['rFitness2_mlp', 'rFitness2_mli', 'rFitness2_thp', 'rFitness2_thi'])
# Written with the default index=True, so the first column of the file is the row index.
pred1.to_csv(r'/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/pred1.txt', sep='\t')
pred1

In [None]:
# add annotation to predictors pred1 dataset

df = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/515g2.ann.txt', sep='\t')
# The first column of pred1.txt is the row index written by to_csv; skip it.
pred1 = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/pred1.txt', sep='\t').iloc[:, 1:]
pred1 = (
    pred1.join(df.set_index('rs'), on='rs')  # look up annotation rows by SNP id
    .drop(columns=['chr', 'ps', 'allel1', 'allel2'])
    .iloc[:, 1:]  # positional: discards whichever column is first after the drop
)

In [None]:
# encode annotation numerically
# Each distinct value of 'ann' is replaced by an integer class code.

lb = LabelEncoder()
lb.fit(pred1['ann'])
pred1['ann'] = lb.transform(pred1['ann'])

In [None]:
# Display predictors part 1 after the annotation join and label encoding
pred1

In [None]:
# Predictors part 2: climate of the accession closest to each field site.
# NOTE(review): iloc is positional and 0-based -- confirm that the accession
# numbers 1813/1845 picked from 2029gaccessions.tsv match these row positions.
TUEBINGEN_ROW = 1813  # accession close to Tübingen
MADRID_ROW = 1845     # accession close to Madrid

# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
pred2 = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/natvar/climate/2029gclimate.tsv', sep=r'\s+')

# Half of the pred1 rows belong to each site. Repeating the row position in a
# single iloc call gives the same frame as concatenating thousands of
# one-row frames, without the per-frame overhead.
rows_per_site = len(pred1) // 2
pred2a = pred2.iloc[[TUEBINGEN_ROW] * rows_per_site].reset_index(drop=True)
pred2b = pred2.iloc[[MADRID_ROW] * rows_per_site].reset_index(drop=True)
pred2_fin = pd.concat([pred2b, pred2a], axis=0)  # Madrid first, then Tübingen
pred2_fin = pred2_fin.iloc[:, :-12]  # drop the last 12 columns

In [None]:
# Place the SNP/annotation predictors and the climate predictors side by side.
# Both frames get a fresh 0..N-1 index first; without reset_index, NAs were
# introduced in the combined DF.
predictors = pd.concat(
    [frame.reset_index(drop=True) for frame in (pred1, pred2_fin)],
    axis=1,
    sort=False,
)
predictors

## Random Forest
### Input variable preparation and distribution plots

In [None]:
# Model inputs: X is an independent copy of the predictors table,
# y is the 'beta' column of the target table.
X, y = predictors.copy(), target['beta']

In [None]:
# check distribution of target
# distplot returns the Axes it drew on, so label that Axes directly.
target_ax = sns.distplot(y)
target_ax.set_xlabel('Combined beta values as target variable')
target_ax.set_title('Distribution of target')
#plt.savefig('Output/02_TargetVarDist.png', bbox_inches='tight', dpi=600)

In [None]:
# check distribution of selected predictors
# One panel each for the encoded annotation, the bio1 climate variable and
# the clim-bio19 beta column.

a = X['ann']
b = X['bio1']
c = X['clim-bio19']

sns.set()
fig, (ax1, ax2, ax3) = plt.subplots(1,3, figsize = (24, 6))
fig.suptitle('Distribution of beta values from selected predictor variables')
sns.distplot(a, ax=ax1)
sns.distplot(b, ax=ax2)
sns.distplot(c, ax=ax3)
ax1.set_xlabel('Annotation')
ax2.set_xlabel('Worldclim | bio1')
# The third panel shows clim-bio19, so label it bio19 (it used to read bio1).
ax3.set_xlabel('Betas | bio19')
# plt.show() instead of fig.show(): the latter is meant for GUI backends and
# only emits a warning under the inline backend.
plt.show()
#fig.savefig('Output/02_PredVarDist.png', bbox_inches='tight', dpi=600)

### Packages

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import QuantileTransformer, quantile_transform
from sklearn import metrics
from sklearn.metrics import r2_score, accuracy_score
from scipy.stats import spearmanr, pearsonr
from yellowbrick.regressor import PredictionError, ResidualsPlot
from yellowbrick.features import Rank1D

***

### Sklearn Random Forest Regression
Using all predictor variables

In [None]:
# Fit regression model
# Hold out 20 % of the rows for testing (fixed seed), then set up a
# 500-tree random forest with out-of-bag scoring enabled.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
regr_rf = RandomForestRegressor(
    n_estimators=500,
    oob_score=True,
    random_state=0,
)


In [None]:
# Build a plotting frame for the test targets: the original row label goes
# into an 'index' column next to the beta values.
y_test2plot = y_test.reset_index()

# `target` was stacked as MLP, MLI, THI, THP with a fresh 0..N-1 index, and
# y_test keeps those row labels. Looking each label up in `target` gives the
# location directly, rather than assuming that every location block holds
# exactly 10000 rows (0-9999, 10000-19999, ...).
y_test2plot['locat'] = target.loc[y_test2plot['index'], 'locat'].values
y_test2plot

In [None]:
# Training
# Fit the forest on the training split only.
regr_rf.fit(X=X_train, y=y_train)

In [None]:
# Prediction
# Predicted beta values for the held-out test rows.
y_rf = regr_rf.predict(X=X_test)

In [None]:
# Predict on both splits, then measure how well the test predictions agree
# with the observed values (R2, rank correlation, linear correlation).
predicted_train, predicted_test = (
    regr_rf.predict(split) for split in (X_train, X_test)
)
test_score = r2_score(y_true=y_test, y_pred=predicted_test)
pearson = pearsonr(y_test, predicted_test)
spearman = spearmanr(y_test, predicted_test)

In [None]:
# Metrics --> printed to file
# Open the file once in append mode and close it when done. Each print used
# to open its own handle that was never closed, so buffered output could
# stay unflushed.

mse = metrics.mean_squared_error(y_test, y_rf)

with open('Output/02_Metrics.txt', 'a') as metrics_file:
    print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_rf), file=metrics_file)
    print('Mean Squared Error (MSE):', mse, file=metrics_file)
    print('Root Mean Squared Error (RMSE):', np.sqrt(mse), file=metrics_file)

    print('Further statistics:', file=metrics_file)
    print(f'Out-of-bag R2 score estimate: {regr_rf.oob_score_:>5.3}', file=metrics_file)
    print(f'Test data R2 score: {test_score:>5.3}', file=metrics_file)
    print(f'Test data Spearman correlation: {spearman[0]:.3}', file=metrics_file)
    print(f'Test data Pearson correlation: {pearson[0]:.3}', file=metrics_file)


#### Plot results with Yellowbrick
https://www.scikit-yb.org/en/latest/api/regressor/peplot.html

In [None]:
# Residuals plot
# Wrap the forest in Yellowbrick's ResidualsPlot, feed it the training split
# via fit() and the test split via score(), then render the figure.
# NOTE(review): depending on the Yellowbrick version, fit() may refit
# regr_rf on X_train -- confirm if the refit cost matters.

f = plt.figure()
visualizer = ResidualsPlot(regr_rf)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()
#f.savefig("Output/02_Residuals.png", bbox_inches='tight', dpi=600)

In [None]:
# Prediction error plot
# Same sequence as the residuals plot, with Yellowbrick's PredictionError
# visualizer: fit() on the training split, score() on the test split, show().
# NOTE(review): depending on the Yellowbrick version, fit() may refit
# regr_rf on X_train -- confirm if the refit cost matters.

f = plt.figure()
visualizer = PredictionError(regr_rf)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()
#f.savefig("Output/02_PredActual.png", bbox_inches='tight', dpi=600)


In [None]:
# Put the test predictions next to the observed values and their location.
# Both frames carry a default 0..n-1 index, so the index-based join lines
# them up row by row.
y_rf2plot = pd.DataFrame(y_rf)
df2plot = y_test2plot.join(y_rf2plot)
df2plot.columns = ['index', 'actual', 'Location', 'pred']
#df2plot.to_csv('Input/02_RF_10k_predictedValues.csv')

In [None]:
#df2plot = pd.read_csv('Input/02_RF_10k_predictedValues.csv')

# order MLI - MLP - THP - THI
col = ['#006e00','#984ea3', '#ebac23','#b80058']
# Pin each location to its colour. Without an explicit mapping, seaborn
# assigns palette colours in order of first appearance in the shuffled test
# data, which need not match the order stated above.
location_order = ['MLI', 'MLP', 'THP', 'THI']
location_colors = dict(zip(location_order, col))

sns.set(rc={'figure.figsize':(13,10)})
sns.set_style("whitegrid")
sns.set_palette(col)
# `sizes=(20)` was dropped: it has no effect unless a `size=` variable is given.
s = sns.scatterplot(x='pred', y='actual', hue='Location', hue_order=location_order,
                    palette=location_colors, data=df2plot) #alpha=0.8
plt.title("Random forest | 10k random SNPs", size= 16, pad=25)
plt.suptitle("Actual vs predicted selection coefficients from MAD and TUE", size = 20)
plt.xlabel("Predicted", size=16)
plt.ylabel("Actual", size=16)
plt.xlim(-1,3)
plt.ylim(-1,3)
plt.setp(s.get_legend().get_texts(), fontsize='16') # for legend text
plt.setp(s.get_legend().get_title(), fontsize='18') # for legend title
#plt.savefig('Output/02_RF_10k_PredActual.png', bbox_inches='tight')

In [None]:
# plot separated
# One scatter panel per location. The colour of each location is pinned
# explicitly (MLI, MLP, THP, THI -> the four entries of `col`), so it does
# not depend on the order in which locations first appear in df2plot.
sns.set_style("whitegrid")
sns.set_palette(col)

location_order = ['MLI', 'MLP', 'THP', 'THI']

sns.relplot(
    data=df2plot, x='pred', y='actual',
    col="Location", hue="Location",
    hue_order=location_order,
    palette=dict(zip(location_order, col)),
    kind="scatter")

Feature Importance:
https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [None]:
importance = regr_rf.feature_importances_         # get importance

# summarize feature importance
#for i,v in enumerate(importance):
#	print('Feature: %0d, Score: %.5f' % (i,v))

labels = list(X.columns.values)
# One bar per feature, placed at integer positions 0..n-1; the tick labels
# are then replaced by the feature names in the same order.
positions = list(range(len(importance)))

plt.figure(figsize=(18,8))
plt.title("Feature importance | 10k random SNPs | MAD and TUE", size=20, pad=25)
plt.xlabel("Features", size=18)
plt.ylabel("Score", size=18)
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
imp = sns.barplot(x=positions, y=importance, palette="viridis")
imp.set_xticklabels(labels,  rotation='vertical')
#plt.savefig('Output/02_Features.png', bbox_inches='tight', dpi=600)