# RF including further locations

The following locations are to be considered:

| latitude | longitude | location | nearest neighbour accession |
| --- | --- | --- | :---: |
| 36.76539 | -5.499419 | Andalucia | 1600 |
| 51.49702 | 11.970655 | Germany | 1059 |
| 65.00307 | 25.472679 | Finland | 309 |
| 39.48083 | -0.340985 | Spain | 1576 |
| 52.62779 | 1.293458 | UK | 578 |
| 48.544886 | 9.043042 | Tuebingen | 1813 |
| 40.408049 | -3.83535 | Madrid | 1845 |

## Import packages

In [None]:
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Notebook-wide display defaults: seaborn theme, default figure size and the
# number of DataFrame columns pandas shows before truncating.
sns.set(color_codes=True)
# NOTE(review): 11.7 x 8.27 is presumably A4 landscape in inches — confirm.
sns.set(rc={'figure.figsize':(11.7,8.27)})
pd.set_option('display.max_columns', 999)

### Select specific SNPs
For each location, select the 1000 SNPs with the highest and the 1000 SNPs with the lowest selection coefficients.

In [None]:
# Load the tab-separated per-SNP table and keep a single 'clim-bio18' column:
# the '.assoc_y' variant is renamed to it, the '.assoc_x' variant is dropped.
betas = (
    pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_55climvars_rFit.txt', sep='\t')
    .rename(columns={'clim-bio18.assoc_y': 'clim-bio18'})
    .drop(['clim-bio18.assoc_x'], axis=1)
)

In [None]:
betas

In [None]:
# One two-column frame (SNP id 'rs' + coefficient) per rFitness2_* column:
# mlp, mli, thp and thi.
# NOTE(review): M/T presumably refer to Madrid/Tuebingen (see the location
# table at the top of the notebook) — confirm.
MLP, MLI, THP, THI = (
    betas[['rs', 'rFitness2_' + site]]
    for site in ('mlp', 'mli', 'thp', 'thi')
)

In [None]:
# Load the second tab-separated coefficient table, repair the truncated
# 'Fitness_Andaluci' header and discard every column whose name matches
# the regex 'randomized'.
betas2 = pd.read_csv(
    '/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_Fitness.txt',
    sep='\t',
).rename(columns={'Fitness_Andaluci': 'Fitness_Andalucia'})
betas2 = betas2[betas2.columns.drop(list(betas2.filter(regex='randomized')))]
betas2

In [None]:
# One two-column frame (SNP id 'rs' + Fitness_<location>) per location in betas2.
AND, SPA, UKI, FIN, GER = (
    betas2[['rs', 'Fitness_' + site]]
    for site in ('Andalucia', 'Spain', 'UnitedKingdom', 'Finland', 'Germany')
)
GER

In [None]:
# Sort & select
#
# Each location frame is sorted by its coefficient column (descending) and the
# x rows at the top and the x rows at the bottom are kept.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.


def _top_and_bottom(ranked, n):
    """Return the first n and the last n rows of an already sorted frame.

    Row labels are preserved. If the frame has fewer than 2*n rows the two
    slices overlap and rows are repeated.
    """
    return pd.concat([ranked.iloc[:n, :], ranked.iloc[-n:, :]])


MLP = MLP.sort_values(by=['rFitness2_mlp'], ascending=False)
MLI = MLI.sort_values(by=['rFitness2_mli'], ascending=False)
THP = THP.sort_values(by=['rFitness2_thp'], ascending=False)
THI = THI.sort_values(by=['rFitness2_thi'], ascending=False)

AND = AND.sort_values(by=['Fitness_Andalucia'], ascending=False)
SPA = SPA.sort_values(by=['Fitness_Spain'], ascending=False)
UKI = UKI.sort_values(by=['Fitness_UnitedKingdom'], ascending=False)
FIN = FIN.sort_values(by=['Fitness_Finland'], ascending=False)
GER = GER.sort_values(by=['Fitness_Germany'], ascending=False)

# number of SNPs taken from each end of every ranking
x = 1000

# get the first and last x objects (highest and lowest betas) per location,
# plus the plain list of their SNP ids
selMLP = _top_and_bottom(MLP, x)
selMLPSNPs = selMLP['rs'].tolist()

selMLI = _top_and_bottom(MLI, x)
selMLISNPs = selMLI['rs'].tolist()

selTHP = _top_and_bottom(THP, x)
selTHPSNPs = selTHP['rs'].tolist()

selTHI = _top_and_bottom(THI, x)
selTHISNPs = selTHI['rs'].tolist()

selAND = _top_and_bottom(AND, x)
selANDSNPs = selAND['rs'].tolist()

selSPA = _top_and_bottom(SPA, x)
selSPASNPs = selSPA['rs'].tolist()

selUKI = _top_and_bottom(UKI, x)
selUKISNPs = selUKI['rs'].tolist()

selFIN = _top_and_bottom(FIN, x)
selFINSNPs = selFIN['rs'].tolist()

selGER = _top_and_bottom(GER, x)
selGERSNPs = selGER['rs'].tolist()

In [None]:
MLP.isnull().sum().sum() 
#MLP.dtypes

In [None]:
#selGER.isnull().sum().sum() 

In [None]:
# Flatten the nine per-location SNP id lists into one list
# (duplicates are still present at this point).
mySNPs = [
    snp
    for selection in (
        selMLPSNPs, selMLISNPs, selTHPSNPs, selTHISNPs,
        selANDSNPs, selSPASNPs, selUKISNPs, selFINSNPs, selGERSNPs,
    )
    for snp in selection
]
len(mySNPs)

In [None]:
# check for duplicates: builds the list of SNP ids that occur more than once
# in mySNPs. The trailing semicolon keeps the notebook from echoing the list.
from collections import Counter
[k for k,v in Counter(mySNPs).items() if v>1];

In [None]:
# remove duplicates
# dict.fromkeys keeps the first occurrence of every SNP id in its original
# position. list(set(...)) would give an order that depends on Python's
# per-process string hash seed, so the row order of every table built from
# mySNPs would change between kernel sessions.
mySNPs = list(dict.fromkeys(mySNPs))

# check again for duplicates (expected to be an empty list now)
from collections import Counter
[k for k,v in Counter(mySNPs).items() if v>1]

In [None]:
len(mySNPs)

In [None]:
len(mySNPs)*9

In [None]:
# create now target dataframe with selected SNPs
#
# For every location the selected SNP ids are left-joined with that location's
# coefficient column, the column is renamed to the common name 'rFitness' and
# a 'locat' label column is added.


def _location_rows(snps, site_df, value_col, label):
    """Return snps joined with site_df on 'rs', in the common rFitness/locat layout."""
    rows = snps.join(site_df.set_index('rs'), on='rs')
    rows = rows.rename(columns={value_col: 'rFitness'})
    rows['locat'] = label
    return rows


target = pd.DataFrame(mySNPs, columns=['rs'])

a = _location_rows(target, MLP, 'rFitness2_mlp', 'MLP')
b = _location_rows(target, MLI, 'rFitness2_mli', 'MLI')
c = _location_rows(target, THP, 'rFitness2_thp', 'THP')
d = _location_rows(target, THI, 'rFitness2_thi', 'THI')
e = _location_rows(target, AND, 'Fitness_Andalucia', 'AND')
f = _location_rows(target, GER, 'Fitness_Germany', 'GER')
g = _location_rows(target, FIN, 'Fitness_Finland', 'FIN')
h = _location_rows(target, SPA, 'Fitness_Spain', 'SPA')
i = _location_rows(target, UKI, 'Fitness_UnitedKingdom', 'UKI')

In [None]:
# Stack the nine per-location frames in the order MLP, MLI, THP, THI, AND,
# GER, FIN, SPA, UKI with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
target = pd.concat([a, b, c, d, e, f, g, h, i], ignore_index=True, sort=False)
target

In [None]:
# count total number of NaNs
target.isnull().sum().sum()

In [None]:
# extract locations where rFitness is NaN
nullDF = target[target['rFitness'].isnull()]

In [None]:
nullDF

In [None]:
# SNP ids that have a missing rFitness value for at least one location
nullSNPs = nullDF['rs'].tolist()
#nullSNPs

# check for duplicates: SNP ids that are missing at more than one location.
# This inspects nullSNPs; mySNPs was already de-duplicated, so checking it
# here could only ever return an empty list.
from collections import Counter
[k for k,v in Counter(nullSNPs).items() if v>1]

In [None]:
len(nullSNPs)

In [None]:
# Remove every row (at all locations) whose SNP id appears in nullSNPs.
# The original row labels of target are kept.
has_missing_fitness = target['rs'].isin(nullSNPs)
newtarget = target[~has_missing_fitness]
newtarget

In [None]:
newtarget.isnull().sum().sum()

In [None]:
# this is to extract the indices for each location - for later colorization of the plot
# Rows of newtarget whose 'locat' label is 'MLP'.
tarMLP = newtarget[newtarget["locat"] == 'MLP']
# NOTE(review): the range below is presumably the index range seen in one run;
# it depends on how many SNPs were selected — confirm before reusing it.
tarMLP #0-17280

In [None]:
tarMLI = newtarget[newtarget["locat"] == 'MLI']
tarMLI #17281-34561

In [None]:
tarTHP = newtarget[newtarget["locat"] == 'THP']
tarTHP #34562-51842

In [None]:
tarTHI = newtarget[newtarget["locat"] == 'THI']
tarTHI #51843-69123

In [None]:
tarSPA = newtarget[newtarget["locat"] == 'SPA']
tarSPA #120967-138247

In [None]:
tarUKI = newtarget[newtarget["locat"] == 'UKI']
tarUKI #138248-155528

In [None]:
tarGER = newtarget[newtarget["locat"] == 'GER']
tarGER #86405-103685

In [None]:
tarFIN = newtarget[newtarget["locat"] == 'FIN']
tarFIN #103686-120966

In [None]:
tarAND = newtarget[newtarget["locat"] == 'AND']
tarAND #69124-86404

In [None]:
# Feature table: the selected SNP ids left-joined with all columns of betas,
# then stacked nine times (once per location in target).
snp_features = pd.DataFrame(mySNPs, columns=['rs']).join(betas.set_index('rs'), on='rs')
predictors = pd.concat([snp_features for _ in range(9)], ignore_index=True)
predictors

In [None]:
# Attach the annotation table to the predictors by SNP id, then discard the
# 'chr', 'ps', 'allel1' and 'allel2' columns.
annot = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/515g2.ann.txt', sep='\t')
annotated = predictors.join(annot.set_index('rs'), on='rs')
predictors = annotated.drop(['chr', 'ps', 'allel1', 'allel2'], axis=1)
predictors

In [None]:
# Replace the 'ann' column by integer codes from a LabelEncoder
lb = LabelEncoder()
ann_codes = lb.fit_transform(predictors['ann'])
predictors['ann'] = ann_codes

# Keep (and display) the class -> code mapping so the codes can be read back
class_codes = lb.transform(lb.classes_)
lbMapping = {label: code for label, code in zip(lb.classes_, class_codes)}
lbMapping

In [None]:
# prepare climate data
#
# For every location one row of the climate table is repeated once per
# predictor row of that location, so the result can be placed side by side
# with predictors.

# Whitespace-delimited table; sep=r'\s+' is the documented equivalent of the
# deprecated delim_whitespace=True.
clim = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/natvar/climate/2029gclimate.tsv', sep=r'\s+')

# predictors is nine stacked copies of the SNP table, so this is the number of
# rows per location (integer arithmetic, no float round trip).
rows_per_location = len(predictors) // 9


def _repeat_clim_row(row_pos, n_rows):
    """Return the clim row at position row_pos repeated n_rows times."""
    return pd.concat([clim.iloc[[row_pos]]] * n_rows, ignore_index=True)


# Tuebingen and Madrid each cover two location labels (THP/THI and MLP/MLI),
# hence twice the number of rows.
climT = _repeat_clim_row(1813, 2 * rows_per_location)  # 1813 = accession close to Tübingen
climM = _repeat_clim_row(1845, 2 * rows_per_location)  # 1845 = accession close to Madrid

climA = _repeat_clim_row(1600, rows_per_location)  # 1600 = accession close to location Andalusia
climG = _repeat_clim_row(1059, rows_per_location)  # 1059 = accession close to location Germany
climF = _repeat_clim_row(309, rows_per_location)   # 309 = accession close to location Finland
climS = _repeat_clim_row(1576, rows_per_location)  # 1576 = accession close to location Spain
climU = _repeat_clim_row(578, rows_per_location)   # 578 = accession close to location United Kingdom

# Same order as the stacked target: Madrid (MLP, MLI), Tübingen (THP, THI),
# then AND, GER, FIN, SPA, UKI. This is not alphabetical.
climFin = pd.concat([climM, climT, climA, climG, climF, climS, climU], axis=0)
# NOTE(review): the last 12 columns are dropped; which variables these are is
# not visible here — confirm against the climate file.
climFin = climFin.iloc[:, :-12]
climFin


In [None]:
# finalize predictors dataset
# Both parts get a fresh 0..n-1 index first; without that the column-wise
# concat aligns on mismatching labels and introduces NAs.
snp_part = predictors.reset_index(drop=True)
clim_part = climFin.reset_index(drop=True)
predictors = pd.concat([snp_part, clim_part], axis=1, sort=False)

# Columns at positions 1-4 are removed by label.
# NOTE(review): presumably these are the four rFitness2_* columns — confirm.
cols = [1, 2, 3, 4]
predictors = predictors.drop(predictors.columns[cols], axis=1)
predictors

In [None]:
predictors.isnull().sum().sum()

In [None]:
# Remove the predictor rows of every SNP listed in nullSNPs, i.e. the same
# SNP ids that were removed from the target.
in_null_list = predictors['rs'].isin(nullSNPs)
predictors = predictors[~in_null_list]
predictors

## Random Forest
### Input variable preparation and distribution plots

In [None]:
# drop all entries from the chromosome that is later used for Cross Validation
#
# SNP ids that start with this prefix form the held-out set.
# NOTE: the name shadows the builtin chr(); it is kept because later cells
# read it.
chr = "1_"

# The prefix is anchored at the start of the id (str.startswith) so the
# training exclusion uses the same rule as the test selection, which matches
# from the start of the id as well. An unanchored substring search would also
# drop ids that merely contain the text somewhere else.
target_chr = newtarget['rs'].str.startswith(chr)
target_minusChr = newtarget[~target_chr]

predictors_chr = predictors['rs'].str.startswith(chr)
predictors_minusChr = predictors[~predictors_chr]

In [None]:
target_minusChr

In [None]:
# Training set: everything outside the held-out chromosome.
X_train = predictors_minusChr.iloc[:, 1:].copy()    # first column (rs) removed
y_train = target_minusChr['rFitness']

# Test set: rows whose SNP id matches the held-out prefix from its start.
held_out_target = newtarget.rs.str.match(chr)
y_test = newtarget.loc[held_out_target]['rFitness']

held_out_predictors = predictors.rs.str.match(chr)
X_test = predictors.loc[held_out_predictors].iloc[:, 1:].copy()


### Packages

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import QuantileTransformer, quantile_transform
from sklearn import metrics
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr
from yellowbrick.regressor import PredictionError, ResidualsPlot
from yellowbrick.features import Rank1D
import pandas as pd

***

In [None]:
# Fit regression model
# Only the estimator is created here; fitting happens in a later cell.
# 500 trees, out-of-bag scoring enabled, fixed random_state.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
regr_rf = RandomForestRegressor(oob_score=True, random_state=0, n_estimators = 500)

In [None]:
# Test targets as a frame with the original row label in column 'index'
# and the value in column 'rFitness'.
y_test2plot = y_test.reset_index()

# Location label of every test row, looked up in newtarget by row label.
# y_test is a slice of newtarget, so the labels line up one to one. This
# replaces a chain of hard-coded index ranges (0-17280 -> MLP, ...) that was
# only valid for one particular number of selected SNPs.
y_test2plot['locat'] = newtarget.loc[y_test.index, 'locat'].values
y_test2plot

In [None]:
# Training
regr_rf.fit(X_train, y_train)

In [None]:
# Prediction
y_rf = regr_rf.predict(X_test)

In [None]:
# Predictions for both splits, then R2 and rank/linear correlation on the test split
predicted_train = regr_rf.predict(X_train)
predicted_test = regr_rf.predict(X_test)

test_score = r2_score(y_test, predicted_test)
spearman, pearson = (
    correlation(y_test, predicted_test) for correlation in (spearmanr, pearsonr)
)

In [None]:
# Metrics
# All values are appended to Output/10_Metrics.txt. The file is opened once
# and closed by the with-block, instead of opening a new, never closed handle
# for every line.
mae = metrics.mean_absolute_error(y_test, y_rf)
mse = metrics.mean_squared_error(y_test, y_rf)

with open('Output/10_Metrics.txt', 'a') as metrics_out:
    print('Mean Absolute Error (MAE):', mae, file=metrics_out)
    print('Mean Squared Error (MSE):', mse, file=metrics_out)
    print('Root Mean Squared Error (RMSE):', np.sqrt(mse), file=metrics_out)

    print(f'Out-of-bag R2 score estimate: {regr_rf.oob_score_:>5.3}', file=metrics_out)
    print(f'Test data R2 score: {test_score:>5.3}', file=metrics_out)
    print(f'Test data Spearman correlation: {spearman[0]:.3}', file=metrics_out)
    print(f'Test data Pearson correlation: {pearson[0]:.3}', file=metrics_out)

#### Plot results with Yellowbrick
https://www.scikit-yb.org/en/latest/api/regressor/peplot.html

In [None]:
# Residuals plot
# NOTE: `f` is rebound to the new figure here; the Germany frame that was
# stored under the same name earlier is no longer reachable through it.
f = plt.figure()
visualizer = ResidualsPlot(regr_rf)
# NOTE(review): whether fit() refits the already trained regr_rf depends on
# Yellowbrick's fitted-model handling — confirm.
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()
#f.savefig("Output/10_Residuals.png", bbox_inches='tight', dpi=600)

In [None]:
# Prediction error plot
# Uses the training split for fit() and the held-out split for score().
f = plt.figure()
visualizer = PredictionError(regr_rf)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()
#f.savefig("Output/10_PredActual.png", bbox_inches='tight', dpi=600)

In [None]:
# Put the predictions next to the actual test values and their location label.
# The column-wise concat aligns on the row index; y_test2plot is expected to
# carry a 0..n-1 index here, like the freshly built prediction frame.
y_rf2plot = pd.DataFrame(y_rf)
df2plot = pd.concat([y_test2plot, y_rf2plot], axis=1)
# Positional renaming: original row label, actual value, location, prediction.
df2plot.columns = ['index', 'Actual', 'Location', 'Predicted']
#df2plot.to_csv('Input/10_RF_Chr1CV_predictedValues.csv')

In [None]:
#df2plot = pd.read_csv('Input/10_RF_Chr1CV_predictedValues.csv')

# Order MLP       MLI        THP        THI         AND       GER       FIN         SPA        UKI
col =['#984ea3','#006e00','#ebac23', '#b80058', '#008cf9', '#00bbad', '#878500', '#ff9287', '#5954d6']
# Explicit location -> colour mapping, so the colours no longer depend on the
# order in which the locations happen to appear in df2plot.
location_colors = dict(zip(
    ['MLP', 'MLI', 'THP', 'THI', 'AND', 'GER', 'FIN', 'SPA', 'UKI'], col))

sns.set(rc={'figure.figsize':(13,10)})
sns.set_style("whitegrid")
sns.set_palette(col)
# s=20 sets the marker size. The earlier sizes=(20) had no effect, because
# `sizes` is only consulted when a `size` variable is given.
s = sns.scatterplot(x='Predicted', y='Actual', hue='Location',
                    palette=location_colors, s=20, data=df2plot)
plt.title("Random forest | 1,000 - 1,000 SNPs", size= 16, pad=25)
plt.suptitle("Actual vs predicted selection coefficients from all locations of chromosome 1", size = 20)
plt.xlabel("Predicted", size=16)
plt.ylabel("Actual", size=16)
# fixed, identical axis ranges for both axes
plt.xlim(-1,3)
plt.ylim(-1,3)
plt.setp(s.get_legend().get_texts(), fontsize='16') # for legend text
plt.setp(s.get_legend().get_title(), fontsize='18') # for legend title
#plt.savefig('Output/10_PredActual.png', bbox_inches='tight')


In [None]:
df2plot

In [None]:
df2plot = df2plot.sort_values('Location')

In [None]:
# Same colours as in the combined scatter plot, listed in alphabetical
# location order.
# NOTE(review): this relies on df2plot having been sorted by 'Location' so
# that the hue levels appear in this order — confirm.
# Order AND       FIN        GER       MLI        MLP        SPA        THI        THP          UKI
col =['#008cf9','#878500','#00bbad', '#006e00', '#984ea3', '#ff9287', '#b80058', '#ebac23', '#5954d6']

# plot separated
sns.set_style("whitegrid")
sns.set_palette(col)

# One scatter panel per location, at most five panels per row.
p =sns.relplot(
    data=df2plot, x='Predicted', y='Actual',
    col="Location", hue="Location",
    kind="scatter", col_wrap=5)
# leave room above the panels for the figure title
plt.subplots_adjust(top=0.9)
p.fig.suptitle("RF-7-Chr1 | Actual vs predicted selection coefficients from all locations of chromosome 1", size = 20)

#plt.savefig('Output/10_PredActual_grid.png', bbox_inches='tight')

Further strategy for feature importance:
https://machinelearningmastery.com/calculate-feature-importance-with-python/

In [None]:
# Importance score of every feature of the fitted forest, in the column
# order of X_train.
importance = regr_rf.feature_importances_         # get importance

# summarize feature importance
#for i,v in enumerate(importance):
#	print('Feature: %0d, Score: %.5f' % (i,v))

labels = list(X_train.columns.values)

plt.figure(figsize=(18,8))
plt.title("Feature importance | 1,000 - 1,000 SNPs | all locations | chromosome 1", size=20, pad=25)
plt.xlabel("Features", size=18)
plt.ylabel("Score", size=18)
# x and y are passed by keyword: seaborn 0.12 and later no longer accept them
# as positional arguments. Bars are placed by feature position; the feature
# names are put on the ticks afterwards.
imp = sns.barplot(x=list(range(len(importance))), y=importance, palette='viridis')
imp.set_xticklabels(labels,  rotation='vertical')
#plt.savefig('Output/10_Features.png', bbox_inches='tight', dpi=600)
plt.show()
