# Preparation of dataframe by selection of 10k random SNPs from all 7 locations for NN analyses

The following locations are considered:

| latitude | longitude | location | nearest neighbour accession |
| --- | --- | --- | :---: |
| 36.76539 | -5.499419 | Andalucia | 1600 |
| 51.49702 | 11.970655 | Germany | 1059 |
| 65.00307 | 25.472679 | Finland | 309 |
| 39.48083 | -0.340985 | Spain | 1576 |
| 52.62779 | 1.293458 | UK | 578 |
| 48.544886 | 9.043042 | Tuebingen | 1813 |
| 40.408049 | -3.83535 | Madrid | 1845 |

## Import packages

In [None]:
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Notebook-wide display configuration.
# Enable seaborn's theme with colour shorthand codes remapped (color_codes=True).
sns.set(color_codes=True)
# Apply the seaborn theme again, this time overriding the default figure size
# through matplotlib's rc parameters.
sns.set(rc={'figure.figsize':(11.7,8.27)})
# Let pandas display up to 999 columns before truncating wide frames.
pd.set_option('display.max_columns', 999)

### Select specific SNPs
Selection of 10k random SNPs

In [None]:
# Load the tab-separated table of per-SNP values for the climate variables
# (55 of them, judging by the file name), then tidy the duplicated
# clim-bio18 columns: the "_y" variant is kept under the plain name and the
# "_x" variant is discarded.
betas = (
    pd.read_csv(
        '/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_55climvars_rFit.txt',
        sep='\t',
    )
    .rename(columns={'clim-bio18.assoc_y': 'clim-bio18'})
    .drop(columns=['clim-bio18.assoc_x'])
)

In [None]:
# Load the tab-separated fitness table.
betas2 = pd.read_csv(
    '/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_Fitness.txt',
    sep='\t',
)
# The file spells this column 'Fitness_Andaluci'; give it the full name.
betas2 = betas2.rename(columns={'Fitness_Andaluci': 'Fitness_Andalucia'})
# Remove every column whose name matches the regex 'randomized'.
randomized_columns = list(betas2.filter(regex='randomized'))
betas2 = betas2.drop(columns=randomized_columns)

In [None]:
# Join the fitness table (left) with the climate table (right) on the shared
# SNP identifier column 'rs'; the default join type (inner) keeps only SNPs
# present in both tables.
total = betas2.merge(betas, on='rs')
total

In [None]:
# Read the pre-drawn list of random SNP identifiers (column 'randomSNPs' of a
# tab-separated file) into a plain Python list.
randomSNPs = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/randomSNPsList.txt', sep='\t')
randomSNPs = randomSNPs['randomSNPs'].tolist()

# Keep only the rows of the merged table whose 'rs' value is in that list.
# NOTE: a bare `total.set_index('rs')` call used to sit here. set_index
# returns a new frame and its result was discarded, so it only cost a copy of
# the table. 'rs' has to stay a regular column anyway: it is read as a column
# on the next line and by the later cells.
randomDF = total.loc[total['rs'].isin(randomSNPs)]

In [None]:
def _beta_block(column, label):
    """Return a three-column table ('rs', 'beta', 'locat') built from randomDF.

    *column* is the randomDF column whose values become 'beta'; *label* is the
    constant written into every row of 'locat'.
    """
    return (
        randomDF[['rs', column]]
        .rename(columns={column: 'beta'})
        .assign(locat=label)
    )


# One long-format table per fitness column, all with the same layout so they
# can be stacked afterwards.
MLP = _beta_block('rFitness2_mlp', 'MLP')
MLI = _beta_block('rFitness2_mli', 'MLI')
THI = _beta_block('rFitness2_thi', 'THI')
THP = _beta_block('rFitness2_thp', 'THP')
AND = _beta_block('Fitness_Andalucia', 'AND')
SPA = _beta_block('Fitness_Spain', 'SPA')
UKI = _beta_block('Fitness_UnitedKingdom', 'UKI')
FIN = _beta_block('Fitness_Finland', 'FIN')
GER = _beta_block('Fitness_Germany', 'GER')
GER

In [None]:
# Stack the nine per-location tables into one long table with a fresh
# 0..n-1 index. The order of the list fixes the row layout of `target`
# (MLP, MLI, THP, THI, AND, FIN, GER, SPA, UKI).
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
target = pd.concat(
    [MLP, MLI, THP, THI, AND, FIN, GER, SPA, UKI],
    ignore_index=True,
    sort=False,
)
target

In [None]:
# Total number of missing cells in `target`: per-column NaN counts, summed
# across columns.
target.isna().sum().sum()

In [None]:
# Write the stacked target table as tab-separated text; the row index is not
# written to the file.
target.to_csv(
    path_or_buf=r'Input/Target_randomSNPs.csv',
    index=False,
    sep='\t',
)

In [None]:
# Rows of `target` labelled 'MLP' (original note: rows 0-9995).
tarMLP = target.loc[target["locat"].eq('MLP')]
tarMLP

In [None]:
# Rows of `target` labelled 'MLI' (original note: rows 9996-19991).
tarMLI = target.loc[target["locat"].eq('MLI')]
tarMLI

In [None]:
# Rows of `target` labelled 'THP' (original note: rows 19992-29987).
tarTHP = target.loc[target["locat"].eq('THP')]
tarTHP

In [None]:
# Rows of `target` labelled 'THI' (original note: rows 29988-39983).
tarTHI = target.loc[target["locat"].eq('THI')]
tarTHI

In [None]:
# Rows of `target` labelled 'SPA' (original note: rows 69972-79967).
tarSPA = target.loc[target["locat"].eq('SPA')]
tarSPA

In [None]:
# Rows of `target` labelled 'UKI' (original note: rows 79968-89963).
tarUKI = target.loc[target["locat"].eq('UKI')]
tarUKI

In [None]:
# Rows of `target` labelled 'GER' (original note: rows 59976-69971).
tarGER = target.loc[target["locat"].eq('GER')]
tarGER

In [None]:
# Rows of `target` labelled 'FIN' (original note: rows 49980-59975).
tarFIN = target.loc[target["locat"].eq('FIN')]
tarFIN

In [None]:
# Rows of `target` labelled 'AND' (original note: rows 39984-49979).
tarAND = target.loc[target["locat"].eq('AND')]
tarAND

In [None]:
# Stack nine copies of the selected-SNP table under a fresh 0..n-1 index, one
# copy for each of the nine blocks that make up `target`.
predictors = pd.concat([randomDF for _ in range(9)], ignore_index=True)
predictors

In [None]:
# add annotation to predictors dataset
# Read the tab-separated annotation table and key it by SNP identifier.
annot = pd.read_csv(
    '/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/515g2.ann.txt',
    sep='\t',
)
annot_by_rs = annot.set_index('rs')

# Attach the annotation columns to each predictor row via its 'rs' value,
# then remove the four columns that are not used further.
predictors = predictors.join(annot_by_rs, on='rs')
predictors = predictors.drop(columns=['chr', 'ps', 'allel1', 'allel2'])

In [None]:
# encode annotation numerically
# Fit the encoder on the 'ann' column and overwrite the column with the
# resulting integer codes.
lb = LabelEncoder()
encoded_ann = lb.fit_transform(predictors['ann'])
predictors['ann'] = encoded_ann

# print encoding
# Mapping from each original annotation label to the code assigned to it.
lbMapping = {
    label: code
    for label, code in zip(lb.classes_, lb.transform(lb.classes_))
}
lbMapping

In [None]:
# prepare climate data

# Whitespace-delimited climate table. sep=r'\s+' is the documented equivalent
# of the deprecated delim_whitespace=True.
clim = pd.read_csv('/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/natvar/climate/2029gclimate.tsv', sep=r'\s+')


def _repeat_climate_row(position, times):
    """Return the row of `clim` at integer *position*, repeated *times* times.

    The result has a fresh 0..times-1 index. A single positional take replaces
    concatenating *times* one-row frames, which yields the same rows with far
    less overhead.
    """
    return clim.iloc[[position] * times].reset_index(drop=True)


# `predictors` is nine stacked copies of the SNP table, so one ninth of its
# length is the number of rows per target block. Integer division keeps the
# count an exact int instead of going through float arithmetic.
rows_per_block = len(predictors) // 9

# NOTE(review): .iloc selects by row position in `clim`; confirm that the
# numbers below are row positions and not values of an accession-ID column.

# Tuebingen and Madrid each cover two target blocks, hence twice the rows.
climT = _repeat_climate_row(1813, 2 * rows_per_block)  # 1813 = accession close to Tuebingen
climM = _repeat_climate_row(1845, 2 * rows_per_block)  # 1845 = accession close to Madrid

climA = _repeat_climate_row(1600, rows_per_block)  # 1600 = accession close to location Andalusia
climG = _repeat_climate_row(1059, rows_per_block)  # 1059 = accession close to location Germany
climF = _repeat_climate_row(309, rows_per_block)   # 309 = accession close to location Finland
climS = _repeat_climate_row(1576, rows_per_block)  # 1576 = accession close to location Spain
climU = _repeat_climate_row(578, rows_per_block)   # 578 = accession close to location United Kingdom

In [None]:
# Stack the climate blocks row-wise in the order that matches the target
# table: Madrid first, then Tuebingen, then the remaining locations in
# alphabetical order.
climate_blocks = [climM, climT, climA, climF, climG, climS, climU]
climFin = pd.concat(climate_blocks, axis=0).iloc[:, :-12]  # the last 12 columns are dropped
climFin

In [None]:
# finalize predictors dataset
# Give both frames a fresh 0..n-1 index so the column-wise concat pairs rows
# by position (original note: without reset_index, NAs were introduced).
snp_part = predictors.reset_index(drop=True)
climate_part = climFin.reset_index(drop=True)
predictors = pd.concat([snp_part, climate_part], axis=1, sort=False)

# Drop the columns at positions 1-9, which the original note identifies as
# the rFitness columns. NOTE(review): this is positional and depends on the
# column order of the input files.
cols = list(range(1, 10))
predictors = predictors.drop(columns=predictors.columns[cols])
predictors

In [None]:
# Total number of missing cells in `predictors`: per-column NaN counts,
# summed across columns.
predictors.isna().sum().sum()

In [None]:
# Write the final predictors table as tab-separated text; the row index is
# not written to the file. The stray `.ipynb_checkpoints/` text that trailed
# this call was not valid Python and has been removed.
predictors.to_csv(r'Input/Predictors_randomSNPs.csv', sep='\t', index=False)