# Preparation of data frames for NN analyses: 2000 SNPs per dataset, selected from both tails of the relative-fitness beta distribution, using all 7 locations

The following locations are to be considered:

| latitude | longitude | location | nearest neighbour accession |
| --- | --- | --- | :---: |
| 36.76539 | -5.499419 | Andalucia | 1600 |
| 51.49702 | 11.970655 | Germany | 1059 |
| 65.00307 | 25.472679 | Finland | 309 |
| 39.48083 | -0.340985 | Spain| 1576|
| 52.62779 | 1.293458 | UK | 578 |
| 48.544886 | 9.043042 | Tuebingen | 1813 |
| 40.408049 | -3.83535 | Madrid | 1845 |

## Import packages

In [None]:
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Notebook-wide plotting and display defaults.
figure_size_inches = (11.7, 8.27)  # width x height in inches (A4 landscape)
sns.set(color_codes=True)
sns.set(rc={'figure.figsize': figure_size_inches})
# Wide frames are inspected below; lift the column display limit to 999.
pd.set_option('display.max_columns', 999)

### Select specific SNPs
For each dataset, select the 1000 SNPs with the highest and the 1000 SNPs with the lowest selection coefficient (beta).

In [None]:
# Load the tab-separated beta table and tidy up the duplicated bio18 column:
# the `_y` variant is kept under the plain name, the `_x` variant is dropped.
# The rFitness columns are intentionally left in `betas`.
betas_path = '/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_55climvars_rFit.txt'
betas = (
    pd.read_csv(betas_path, sep='\t')
    .rename(columns={'clim-bio18.assoc_y': 'clim-bio18'})
    .drop(columns=['clim-bio18.assoc_x'])
)

In [None]:
# Extract one two-column frame (SNP id + beta) per rFitness2 dataset:
# mlp, mli, thp and thi.
_rfitness_columns = ('rFitness2_mlp', 'rFitness2_mli', 'rFitness2_thp', 'rFitness2_thi')
MLP, MLI, THP, THI = (betas[['rs', column]] for column in _rfitness_columns)

In [None]:
# Load the second tab-separated beta table, repair the truncated Andalucia
# column name and discard every column whose name matches 'randomized'.
betas2 = pd.read_csv(
    '/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_Fitness.txt',
    sep='\t',
).rename(columns={'Fitness_Andaluci': 'Fitness_Andalucia'})
randomized_columns = betas2.filter(regex='randomized').columns
betas2 = betas2.drop(columns=randomized_columns)

In [None]:
# One two-column frame (SNP id + beta) per Fitness_* dataset in `betas2`.
_fitness_columns = (
    'Fitness_Andalucia',
    'Fitness_Spain',
    'Fitness_UnitedKingdom',
    'Fitness_Finland',
    'Fitness_Germany',
)
AND, SPA, UKI, FIN, GER = (betas2[['rs', column]] for column in _fitness_columns)

In [None]:
# Sort & select
#
# Every dataset is sorted by its beta in descending order; the first and the
# last `x` rows (highest and lowest betas) are then stacked and their SNP ids
# collected.
# NOTE(review): sort_values places NaN last, so missing betas would end up in
# the "lowest" tail -- presumably not an issue for the NA-free input files;
# confirm with the isnull() check in this notebook.


def _both_tails(sorted_df, n):
    """Return the first and the last *n* rows of *sorted_df* as one frame.

    The original row index is kept. Uses pd.concat because DataFrame.append
    was deprecated in pandas 1.4 and removed in pandas 2.0.
    """
    return pd.concat([sorted_df.iloc[:n, :], sorted_df.iloc[-n:, :]])


MLP = MLP.sort_values(by=['rFitness2_mlp'], ascending=False)
MLI = MLI.sort_values(by=['rFitness2_mli'], ascending=False)
THP = THP.sort_values(by=['rFitness2_thp'], ascending=False)
THI = THI.sort_values(by=['rFitness2_thi'], ascending=False)

AND = AND.sort_values(by=['Fitness_Andalucia'], ascending=False)
SPA = SPA.sort_values(by=['Fitness_Spain'], ascending=False)
UKI = UKI.sort_values(by=['Fitness_UnitedKingdom'], ascending=False)
FIN = FIN.sort_values(by=['Fitness_Finland'], ascending=False)
GER = GER.sort_values(by=['Fitness_Germany'], ascending=False)

# number of SNPs taken from each tail
x = 1000

# get the first and last x objects (highest and lowest betas) per dataset
selMLP = _both_tails(MLP, x)
selMLPSNPs = selMLP['rs'].tolist()

selMLI = _both_tails(MLI, x)
selMLISNPs = selMLI['rs'].tolist()

selTHP = _both_tails(THP, x)
selTHPSNPs = selTHP['rs'].tolist()

selTHI = _both_tails(THI, x)
selTHISNPs = selTHI['rs'].tolist()

selAND = _both_tails(AND, x)
selANDSNPs = selAND['rs'].tolist()

selSPA = _both_tails(SPA, x)
selSPASNPs = selSPA['rs'].tolist()

selUKI = _both_tails(UKI, x)
selUKISNPs = selUKI['rs'].tolist()

selFIN = _both_tails(FIN, x)
selFINSNPs = selFIN['rs'].tolist()

selGER = _both_tails(GER, x)
selGERSNPs = selGER['rs'].tolist()

In [None]:
# Total number of missing values in the MLP frame.
MLP.isna().sum().sum()

In [None]:
# Pool the SNP ids selected for all nine datasets into one flat list
# (duplicates are still included at this point).
mySNPs = [
    snp
    for selection in (
        selMLPSNPs, selMLISNPs, selTHPSNPs, selTHISNPs,
        selANDSNPs, selSPASNPs, selUKISNPs, selFINSNPs, selGERSNPs,
    )
    for snp in selection
]
len(mySNPs)

In [None]:
# List every SNP id that was selected more than once.
# The trailing semicolon suppresses the (potentially long) cell output.
from collections import Counter
snp_counts = Counter(mySNPs)
[snp for snp, count in snp_counts.items() if count > 1];

In [None]:
# remove duplicates, keeping the first occurrence of every SNP
# dict.fromkeys preserves insertion order, so the row order of everything
# derived from mySNPs is the same on every run; list(set(...)) yields an
# arbitrary order that can differ between interpreter sessions.
mySNPs = list(dict.fromkeys(mySNPs))

# check again for duplicates (an empty list is expected now)
from collections import Counter
[k for k,v in Counter(mySNPs).items() if v>1]

In [None]:
# Number of unique SNPs left after de-duplication.
len(mySNPs)

In [None]:
# Expected row count of the stacked target table: one row per unique SNP for
# each of the 9 datasets (MLP, MLI, THP, THI, AND, GER, FIN, SPA, UKI).
len(mySNPs)*9

In [None]:
# create now target dataframe with selected SNPs


def _dataset_frame(snps, dataset, beta_column, label):
    """Look up *dataset*'s beta for every SNP in *snps* and tag it with *label*.

    Returns a frame with the columns rs, rFitness and locat. SNPs that are
    absent from *dataset* get NaN in rFitness (left join on 'rs').
    """
    frame = snps.join(dataset.set_index('rs'), on='rs')
    frame = frame.rename(columns={beta_column: 'rFitness'})
    frame['locat'] = label
    return frame


target = pd.DataFrame(mySNPs, columns=['rs'])

a = _dataset_frame(target, MLP, 'rFitness2_mlp', 'MLP')
b = _dataset_frame(target, MLI, 'rFitness2_mli', 'MLI')
c = _dataset_frame(target, THP, 'rFitness2_thp', 'THP')
d = _dataset_frame(target, THI, 'rFitness2_thi', 'THI')
e = _dataset_frame(target, AND, 'Fitness_Andalucia', 'AND')
f = _dataset_frame(target, GER, 'Fitness_Germany', 'GER')
g = _dataset_frame(target, FIN, 'Fitness_Finland', 'FIN')
h = _dataset_frame(target, SPA, 'Fitness_Spain', 'SPA')
i = _dataset_frame(target, UKI, 'Fitness_UnitedKingdom', 'UKI')

In [None]:
# Stack the nine per-dataset frames (order: MLP, MLI, THP, THI, AND, GER, FIN,
# SPA, UKI) into one long table with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
target = pd.concat([a, b, c, d, e, f, g, h, i], ignore_index=True, sort=False)
target

In [None]:
# Total number of missing cells in the stacked target table.
target.isna().sum().sum()

In [None]:
# Keep only the rows whose rFitness value is missing.
missing_fitness = target['rFitness'].isna()
nullDF = target[missing_fitness]

In [None]:
# Display the rows whose rFitness is missing.
nullDF

In [None]:
# SNP ids that have a missing rFitness in at least one dataset; an id appears
# once per dataset in which it is missing.
nullSNPs = nullDF['rs'].tolist()

# check for duplicates, i.e. SNPs that are missing in more than one dataset
# (the check previously inspected mySNPs, which is already duplicate-free)
from collections import Counter
[k for k,v in Counter(nullSNPs).items() if v>1]

In [None]:
# Number of (SNP, dataset) rows with a missing rFitness.
len(nullSNPs)

In [None]:
# Remove every row -- in all datasets -- whose SNP has a missing rFitness
# in at least one dataset.
has_missing_fitness = target.rs.isin(nullSNPs)
newtarget = target[~has_missing_fitness]

In [None]:
# Remaining missing cells after dropping the incomplete SNPs.
newtarget.isna().sum().sum()

In [None]:
# Write the final target table (tab-separated, without the index column),
# then display it.
target_path = r'Input/Target_7locs.csv'
newtarget.to_csv(target_path, sep='\t', index=False)
newtarget

In [None]:
# Rows of the final target table that belong to the MLP dataset.
is_mlp = newtarget["locat"] == 'MLP'
tarMLP = newtarget[is_mlp]
tarMLP  # index range noted from an earlier run: 0-17280

In [None]:
# Rows of the final target table that belong to the MLI dataset.
is_mli = newtarget["locat"] == 'MLI'
tarMLI = newtarget[is_mli]
tarMLI  # index range noted from an earlier run: 17282-34561

In [None]:
# Rows of the final target table that belong to the THP dataset.
is_thp = newtarget["locat"] == 'THP'
tarTHP = newtarget[is_thp]
tarTHP  # index range noted from an earlier run: 34563-51842

In [None]:
# Rows of the final target table that belong to the THI dataset.
is_thi = newtarget["locat"] == 'THI'
tarTHI = newtarget[is_thi]
tarTHI  # index range noted from an earlier run: 51844-69123

In [None]:
# Rows of the final target table that belong to the SPA dataset.
is_spa = newtarget["locat"] == 'SPA'
tarSPA = newtarget[is_spa]
tarSPA  # index range noted from an earlier run: 120968-138247

In [None]:
# Rows of the final target table that belong to the UKI dataset.
is_uki = newtarget["locat"] == 'UKI'
tarUKI = newtarget[is_uki]
tarUKI  # index range noted from an earlier run: 138249-155528

In [None]:
# Rows of the final target table that belong to the GER dataset.
is_ger = newtarget["locat"] == 'GER'
tarGER = newtarget[is_ger]
tarGER  # index range noted from an earlier run: 86406-103685

In [None]:
# Rows of the final target table that belong to the FIN dataset.
is_fin = newtarget["locat"] == 'FIN'
tarFIN = newtarget[is_fin]
tarFIN  # index range noted from an earlier run: 103687-120966

In [None]:
# Rows of the final target table that belong to the AND dataset.
is_and = newtarget["locat"] == 'AND'
tarAND = newtarget[is_and]
tarAND  # index range noted from an earlier run: 69125-86404

In [None]:
# Predictor table: all `betas` columns for the selected SNPs, stacked nine
# times (once per dataset) so that its rows line up with the target table.
snp_betas = pd.DataFrame(mySNPs, columns=['rs']).join(betas.set_index('rs'), on='rs')
predictors = pd.concat([snp_betas] * 9, ignore_index=True)

In [None]:
# Attach the SNP annotation to the predictors (left join on 'rs') and discard
# the columns chr, ps, allel1 and allel2.
annot_path = '/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/515g2.ann.txt'
annot = pd.read_csv(annot_path, sep='\t')
predictors = (
    predictors
    .join(annot.set_index('rs'), on='rs')
    .drop(columns=['chr', 'ps', 'allel1', 'allel2'])
)

In [None]:
# Replace the annotation labels in 'ann' by integer codes.
lb = LabelEncoder()
ann_codes = lb.fit_transform(predictors['ann'])
predictors['ann'] = ann_codes

# Show which integer code each annotation label was mapped to.
class_codes = lb.transform(lb.classes_)
lbMapping = {label: code for label, code in zip(lb.classes_, class_codes)}
lbMapping

In [None]:
# prepare climate data
#
# For every dataset the climate record of one accession is repeated once per
# SNP, so that the climate block lines up row by row with the predictors.


def _repeat_climate_row(climate, position, times):
    """Return *times* copies of the row at integer *position* of *climate*.

    The result carries a fresh 0..times-1 index. A single positional take
    replaces concatenating *times* one-row frames, which gives the same rows
    but is far slower.
    """
    return climate.iloc[[position] * times].reset_index(drop=True)


# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True
clim = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/natvar/climate/2029gclimate.tsv', sep=r'\s+')

# predictors holds 9 stacked copies of the SNP list, so this is the SNP count
rows_per_dataset = len(predictors) // 9

# Tuebingen and Madrid each cover two datasets (THP/THI and MLP/MLI), hence twice the rows
climT = _repeat_climate_row(clim, 1813, 2 * rows_per_dataset)  # 1813 = accession close to Tübingen
climM = _repeat_climate_row(clim, 1845, 2 * rows_per_dataset)  # 1845 = accession close to Madrid

climA = _repeat_climate_row(clim, 1600, rows_per_dataset)  # 1600 = accession close to location Andalusia
climG = _repeat_climate_row(clim, 1059, rows_per_dataset)  # 1059 = accession close to location Germany
climF = _repeat_climate_row(clim, 309, rows_per_dataset)   # 309 = accession close to location Finland
climS = _repeat_climate_row(clim, 1576, rows_per_dataset)  # 1576 = accession close to location Spain
climU = _repeat_climate_row(clim, 578, rows_per_dataset)   # 578 = accession close to location United Kingdom

# Concatenate in the order of the target table: Madrid (MLP, MLI), Tübingen
# (THP, THI), then AND, GER, FIN, SPA, UKI.
climFin = pd.concat([climM, climT, climA, climG, climF, climS, climU], axis=0)
# Drop the last 12 columns.
# NOTE(review): which variables these are is not visible here -- confirm against the climate file.
climFin = climFin.iloc[:, :-12]

In [None]:
# Put the climate columns next to the SNP predictors. Both indexes are reset
# first; otherwise the column-wise concat aligns on mismatching labels and
# introduces NAs.
predictors_reset = predictors.reset_index(drop=True)
climate_reset = climFin.reset_index(drop=True)
predictors = pd.concat([predictors_reset, climate_reset], axis=1, sort=False)

# Drop the columns at positions 1-4 (the rFitness columns according to the
# original note; presumably rFitness2_mlp/mli/thp/thi -- verify column order).
cols = [1, 2, 3, 4]
predictors = predictors.drop(columns=predictors.columns[cols])

In [None]:
# Total number of missing cells in the predictor table.
predictors.isna().sum().sum()

In [None]:
# Drop the rows of every SNP that has a missing rFitness in the target table,
# then write the predictors (tab-separated, without the index column).
keep_rows = ~predictors.rs.isin(nullSNPs)
predictors = predictors[keep_rows]
predictors.to_csv(r'Input/Predictors_7locs.csv', sep='\t', index=False)