# RF including further locations with CrossValidation using SPA dataset

The following locations are to be considered:

| latitude | longitude | location | nearest neighbour accession |
| --- | --- | --- | :---: |
| 36.76539 | -5.499419 | Andalucia | 1600 |
| 51.49702 | 11.970655 | Germany | 1059 |
| 65.00307 | 25.472679 | Finland | 309 |
| 39.48083 | -0.340985 | Spain| 1576|
| 52.62779 | 1.293458 | UK | 578 |
| 48.544886 | 9.043042 | Tuebingen | 1813 |
| 40.408049 | -3.83535 | Madrid | 1845 |

## Import packages

In [None]:
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Show up to 999 columns when a data frame is displayed.
pd.set_option('display.max_columns', 999)

# Apply the seaborn theme once: colour shorthand codes enabled and a default
# figure size of 11.7 x 8.27.
sns.set(color_codes=True, rc={'figure.figsize': (11.7, 8.27)})

### Select specific SNPs
For each location, select the 1000 SNPs with the highest and the 1000 SNPs with the lowest selection coefficient.

In [None]:
# Load the tab-separated per-SNP beta table.
betas = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_55climvars_rFit.txt', sep='\t')

# Keep the '.assoc_y' variant of clim-bio18 under its plain name and discard
# the '.assoc_x' variant.
betas = (
    betas
    .rename(columns={'clim-bio18.assoc_y': 'clim-bio18'})
    .drop(['clim-bio18.assoc_x'], axis=1)
)

# Remove every column whose name matches 'rFitness'.
betas_rfitness_cols = list(betas.filter(regex='rFitness'))
betas = betas[betas.columns.drop(betas_rfitness_cols)]

In [None]:
# Load the tab-separated per-SNP fitness table.
betas2 = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_Fitness.txt', sep='\t')

# Complete the column name 'Fitness_Andaluci' -> 'Fitness_Andalucia'.
betas2 = betas2.rename(columns={'Fitness_Andaluci': 'Fitness_Andalucia'})

# Remove every column whose name matches 'randomized'.
betas2_randomized_cols = list(betas2.filter(regex='randomized'))
betas2 = betas2[betas2.columns.drop(betas2_randomized_cols)]
betas2

In [None]:
# One two-column frame (SNP id + fitness column) per location.
AND = betas2.loc[:, ['rs', 'Fitness_Andalucia']]
SPA = betas2.loc[:, ['rs', 'Fitness_Spain']]
UKI = betas2.loc[:, ['rs', 'Fitness_UnitedKingdom']]
FIN = betas2.loc[:, ['rs', 'Fitness_Finland']]
GER = betas2.loc[:, ['rs', 'Fitness_Germany']]

In [None]:
# Sort & select

def _top_and_bottom(ranked, n):
    """Return the first ``n`` and the last ``n`` rows of ``ranked`` (index kept)."""
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and yields the same row order and index.
    return pd.concat([ranked.iloc[:n, :], ranked.iloc[-n:, :]])


# Rank every location by its fitness column, highest value first.
# NOTE(review): sort_values places NaN last by default, so the "lowest" tail
# may contain rows with missing fitness -- confirm this is intended.
AND = AND.sort_values(by=['Fitness_Andalucia'], ascending=False)
SPA = SPA.sort_values(by=['Fitness_Spain'], ascending=False)
UKI = UKI.sort_values(by=['Fitness_UnitedKingdom'], ascending=False)
FIN = FIN.sort_values(by=['Fitness_Finland'], ascending=False)
GER = GER.sort_values(by=['Fitness_Germany'], ascending=False)

# Number of SNPs taken from each end of every ranking.
x = 1000

selAND = _top_and_bottom(AND, x)
selANDSNPs = selAND['rs'].tolist()

selSPA = _top_and_bottom(SPA, x)
selSPASNPs = selSPA['rs'].tolist()

selUKI = _top_and_bottom(UKI, x)
selUKISNPs = selUKI['rs'].tolist()

selFIN = _top_and_bottom(FIN, x)
selFINSNPs = selFIN['rs'].tolist()

selGER = _top_and_bottom(GER, x)
selGERSNPs = selGER['rs'].tolist()

In [None]:
# Pool the selected SNP ids of all five locations into one list
# (ids selected for several locations appear more than once).
mySNPs = [*selANDSNPs, *selSPASNPs, *selUKISNPs, *selFINSNPs, *selGERSNPs]
len(mySNPs)

In [None]:
# Collect SNP ids that occur more than once; the trailing ';' suppresses the
# notebook output.
from collections import Counter
snp_counts = Counter(mySNPs)
[snp for snp, count in snp_counts.items() if count > 1];

In [None]:
# remove duplicates, keeping the order of first occurrence.
# list(set(...)) returns string ids in an order that changes between Python
# sessions (hash randomisation), which would change the row order of every
# downstream table and make the random-forest fit non-reproducible.
mySNPs = list(dict.fromkeys(mySNPs))

# check again for duplicates (expected result: empty list)
from collections import Counter
[k for k, v in Counter(mySNPs).items() if v > 1]

In [None]:
# number of unique selected SNP ids
len(mySNPs)

In [None]:
# row count expected once every selected SNP is repeated for each of the 5 locations
len(mySNPs)*5

In [None]:
# Build one target frame per location: the selected SNP ids, that location's
# fitness value renamed to 'rFitness', and a 'locat' tag column.

target = pd.DataFrame({'rs': mySNPs})

e = (
    target.join(AND.set_index('rs'), on='rs')
    .rename(columns={'Fitness_Andalucia': 'rFitness'})
    .assign(locat='AND')
)

f = (
    target.join(GER.set_index('rs'), on='rs')
    .rename(columns={'Fitness_Germany': 'rFitness'})
    .assign(locat='GER')
)

g = (
    target.join(FIN.set_index('rs'), on='rs')
    .rename(columns={'Fitness_Finland': 'rFitness'})
    .assign(locat='FIN')
)

h = (
    target.join(SPA.set_index('rs'), on='rs')
    .rename(columns={'Fitness_Spain': 'rFitness'})
    .assign(locat='SPA')
)

i = (
    target.join(UKI.set_index('rs'), on='rs')
    .rename(columns={'Fitness_UnitedKingdom': 'rFitness'})
    .assign(locat='UKI')
)

In [None]:
# Stack the five per-location frames in the order AND, GER, FIN, SPA, UKI.
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
target = pd.concat([e, f, g, h, i], ignore_index=True, sort=False)
target

In [None]:
# total number of missing values over all columns of target
target.isna().sum().sum()

In [None]:
# rows of target whose rFitness value is missing
rfitness_missing = target['rFitness'].isna()
nullDF = target[rfitness_missing]

In [None]:
# display the rows with a missing rFitness value
nullDF

In [None]:
# SNP ids of all rows with a missing rFitness value
nullSNPs = nullDF['rs'].tolist()

# check for duplicates among the SNPs with missing values.
# (The check previously inspected mySNPs, which is already de-duplicated,
# so it could never report anything.)
from collections import Counter
[k for k, v in Counter(nullSNPs).items() if v > 1]

In [None]:
# number of rows (SNP/location pairs) with a missing rFitness value
len(nullSNPs)

In [None]:
# Remove every row whose SNP id is in nullSNPs; such a SNP is removed for all
# locations, not only the one where its value is missing.
in_null_snps = target['rs'].isin(nullSNPs)
newtarget = target[~in_null_snps]
newtarget

In [None]:
# total number of missing values left in newtarget
newtarget.isna().sum().sum()

In [None]:
# One row per selected SNP with its columns from betas, then the whole table
# stacked five times (one copy per location block of target).
snp_ids = pd.DataFrame({'rs': mySNPs})
snp_betas = snp_ids.join(betas.set_index('rs'), on='rs')
predictors = pd.concat([snp_betas] * 5, ignore_index=True)
predictors

In [None]:
# Attach the annotation table by SNP id, then discard its 'chr', 'ps',
# 'allel1' and 'allel2' columns.
annot = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/515g2.ann.txt', sep='\t')
predictors = (
    predictors
    .join(annot.set_index('rs'), on='rs')
    .drop(columns=['chr', 'ps', 'allel1', 'allel2'])
)
predictors

In [None]:
# Replace the 'ann' labels by integer codes.
lb = LabelEncoder()
predictors['ann'] = lb.fit_transform(predictors['ann'])

# Show which integer code each original label received.
ann_codes = lb.transform(lb.classes_)
lbMapping = {label: code for label, code in zip(lb.classes_, ann_codes)}
lbMapping

In [None]:
# prepare climate data

# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which is
# deprecated since pandas 2.2.
clim = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/natvar/climate/2029gclimate.tsv', sep=r'\s+')

# predictors holds five stacked copies of the SNP table, one per location.
rows_per_location = len(predictors) // 5


def _repeat_row(frame, position, times):
    """Return ``frame``'s row at ``position`` repeated ``times`` times with a fresh 0..times-1 index."""
    # A single positional take replaces concatenating `times` one-row frames.
    return frame.iloc[[position] * times].reset_index(drop=True)


climA = _repeat_row(clim, 1600, rows_per_location)  # 1600 = accession close to location Andalusia
climG = _repeat_row(clim, 1059, rows_per_location)  # 1059 = accession close to location Germany
climF = _repeat_row(clim, 309, rows_per_location)   # 309 = accession close to location Finland
climS = _repeat_row(clim, 1576, rows_per_location)  # 1576 = accession close to location Spain
climU = _repeat_row(clim, 578, rows_per_location)   # 578 = accession close to location United Kingdom

# The block order must match the target frame: AND, GER, FIN, SPA, UKI.
climFin = pd.concat([climA, climG, climF, climS, climU], axis=0)
# Discard the last 12 columns of the climate table.
climFin = climFin.iloc[:, :-12]
climFin


In [None]:
# finalize predictors dataset
# Put the SNP-level predictors and the per-location climate rows side by side.
# Both frames get a fresh 0..n-1 index first so that rows are paired by position.
predictors = pd.concat([predictors.reset_index(drop=True), climFin.reset_index(drop=True)], axis=1, sort=False)  # without reset_index, NAs were introduced in DF
# Drop four columns by position (the 2nd to 5th, directly after 'rs').
# NOTE(review): columns matching 'rFitness' are already removed when betas is
# loaded -- confirm that positions 1-4 still hold the columns meant to be dropped.
cols=[1,2,3,4]    #drop rFitness columns
predictors = predictors.drop(predictors.columns[cols], axis=1)
predictors

In [None]:
# total number of missing values over all columns of predictors
predictors.isnull().sum().sum()

In [None]:
# Remove the rows of every SNP listed in nullSNPs, i.e. the same SNPs that
# were removed from the target frame.
predictor_in_null = predictors['rs'].isin(nullSNPs)
predictors = predictors[~predictor_in_null]
predictors

In [None]:
# Join predictors and targets column-wise; both sides get a fresh 0..n-1
# index so that rows are paired by position.
predictor_part = predictors.reset_index(drop=True)
target_part = newtarget.reset_index(drop=True)
entire = pd.concat([predictor_part, target_part], axis=1, sort=False)
entire

#drop one location --> for Cross Validation

In [None]:
# discard every row that still contains at least one missing value
entire = entire.dropna(axis=0, how='any')

In [None]:
# held-out data for cross validation: all rows tagged 'AND'
is_held_out = entire['locat'] == 'AND'
AND = entire[is_held_out]

In [None]:
# training data for the random forest: every row not tagged 'AND'
rest = entire[entire['locat'].ne('AND')]
rest

## Random Forest
### Input variable preparation and distribution plots

In [None]:
# Features: every column except the first one ('rs') and the last three
# (presumably the 'rs', 'rFitness' and 'locat' columns contributed by the
# target frame).
X_train = rest.iloc[:, 1:-3]
X_test = AND.iloc[:, 1:-3]

# Response: the rFitness value of each row.
y_train = rest['rFitness']
y_test = AND['rFitness']


### Packages

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import QuantileTransformer, quantile_transform
from sklearn import metrics
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr
from yellowbrick.regressor import PredictionError, ResidualsPlot
from yellowbrick.features import Rank1D
import pandas as pd

***

In [None]:
# Random-forest regressor: 500 trees, fixed seed, out-of-bag score enabled
regr_rf = RandomForestRegressor(n_estimators=500, oob_score=True, random_state=0)

In [None]:
# Training: fit the forest on all rows that are not held out
regr_rf.fit(X_train, y_train)

In [None]:
# Prediction of test set (the held-out 'AND' rows)
predicted_test = regr_rf.predict(X_test)

In [None]:
# Agreement between actual and predicted values of the held-out rows:
# R2 score plus Spearman and Pearson correlation.
test_score = r2_score(y_true=y_test, y_pred=predicted_test)
spearman = spearmanr(y_test, predicted_test)
pearson = pearsonr(y_test, predicted_test)

In [None]:
# Metrics
# Compute each error measure once; RMSE reuses the MSE value.
mae = metrics.mean_absolute_error(y_test, predicted_test)
mse = metrics.mean_squared_error(y_test, predicted_test)

# Append all metric lines through a single handle that is closed (and
# flushed) when the block ends, instead of opening the file once per line
# and never closing it explicitly.
with open('Output/08CVAND_Metrics.txt', 'a') as metrics_file:
    print('Mean Absolute Error (MAE):', mae, file=metrics_file)
    print('Mean Squared Error (MSE):', mse, file=metrics_file)
    print('Root Mean Squared Error (RMSE):', np.sqrt(mse), file=metrics_file)

    print(f'Out-of-bag R2 score estimate: {regr_rf.oob_score_:>5.3}', file=metrics_file)
    print(f'Test data R2 score: {test_score:>5.3}', file=metrics_file)
    print(f'Test data Spearman correlation: {spearman[0]:.3}', file=metrics_file)
    print(f'Test data Pearson correlation: {pearson[0]:.3}', file=metrics_file)

#### Plot results with Yellowbrick
https://www.scikit-yb.org/en/latest/api/regressor/peplot.html

In [None]:
# Residuals plot
# Create the figure first and keep a handle to it so it can be saved below.
f = plt.figure()
# Yellowbrick visualizer wrapping the random forest.
visualizer = ResidualsPlot(regr_rf)
# fit() receives the training rows, score() the held-out rows.
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()
# NOTE(review): assumes the visualizer drew into the figure created above -- confirm the PNG is not blank.
f.savefig("Output/08CVAND_Residuals.png", bbox_inches='tight', dpi=600)

In [None]:
# Prediction error plot
# Create the figure first and keep a handle to it so it can be saved below.
f = plt.figure()
# Yellowbrick visualizer wrapping the random forest.
visualizer = PredictionError(regr_rf)
# fit() receives the training rows, score() the held-out rows.
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()
# NOTE(review): assumes the visualizer drew into the figure created above -- confirm the PNG is not blank.
f.savefig("Output/08CVAND_PredActual.png", bbox_inches='tight', dpi=600)

In [None]:
# Table of actual vs. predicted rFitness for the held-out rows.
# Fixes: `predicted_CV` was never defined (the predictions live in
# `predicted_test`), and the old concat produced only two columns with
# mismatched indexes while four column names were assigned.

# reset_index() moves the original row labels into a column and gives the
# frame a 0..n-1 index that lines up with the freshly built prediction frame.
actual2plot = AND[['rFitness', 'locat']].reset_index()
predicted_CV2plot = pd.DataFrame(predicted_test, columns=['pred'])

df2plot = pd.concat([actual2plot, predicted_CV2plot], axis=1)
df2plot.columns = ['index', 'actual', 'location', 'pred']
df2plot

In [None]:
# Nine qualitative colours taken from
# https://colorbrewer2.org/#type=qualitative&scheme=Set1&n=9
colors = [
    '#984ea3', '#e41a1c', '#ffff33',
    '#ff7f00', '#4daf4a', '#a65628',
    '#377eb8', '#f781bf', '#999999',
]

plt.figure(figsize=(18, 8))
sns.set_palette(colors)

# One point per held-out row, coloured by location.
sns.scatterplot(data=df2plot, x='actual', y='pred', hue='location', s=40)
plt.title("Actual vs predicted beta values")
plt.xlabel("Actual")
plt.ylabel("Predicted")
#plt.savefig('Output/08CV_PredActual_Color.png', bbox_inches='tight', dpi=600)

In [None]:
# Feature importances of the fitted forest, one value per training column.
importance = regr_rf.feature_importances_

# The forest was fitted on X_train, so its columns name the features.
# (`X` only exists in a commented-out line and raised a NameError here.)
labels = list(X_train.columns)

plt.figure(figsize=(18, 8))
# x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally. Bars are placed by position so that identical column names
# are not merged into a single bar.
imp = sns.barplot(x=list(range(len(importance))), y=importance)
imp.set_xticklabels(labels, rotation='vertical')
plt.title("Feature importance")
#plt.savefig('Output/08CV_Features.png', bbox_inches='tight', dpi=600)
plt.show()
