# EDA before SNP selection

In [None]:
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Global plotting defaults, applied in ONE call: every sns.set(...) call re-applies
# its own `style` argument (default "darkgrid"), so setting the rc values in a
# separate, later sns.set(rc=...) call would silently discard "whitegrid".
sns.set(style="whitegrid", color_codes=True, rc={'figure.figsize': (13, 10)})
# Let wide frames display all of their columns (up to 999).
pd.set_option('display.max_columns', 999)

In [None]:
# Read the tab-separated table, then tidy the two 'clim-bio18.assoc_*' columns:
# the '_y' copy becomes plain 'clim-bio18' and the '_x' copy is discarded.
betas = (
    pd.read_csv(
        '/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_55climvars_rFit.txt',
        sep='\t',
    )
    .rename(columns={'clim-bio18.assoc_y': 'clim-bio18'})
    .drop(columns=['clim-bio18.assoc_x'])
)

## Correlation matrix of climate-related selection coefficients

In [None]:
# Independent copy of every column from position 5 onward, used below as the
# input of the correlation heatmap.
# NOTE(review): presumably the first five columns are the SNP id plus the
# rFitness2_* columns, leaving only climate coefficients here — confirm.
betas4corr = betas.iloc[:,5:].copy()
betas4corr

In [None]:
# Put the columns into plain string-sorted order so the positional relabelling
# that follows lines up with them.
betas4corr = betas4corr.reindex(columns=sorted(betas4corr.columns))

In [None]:
# Inspect the sorted column labels before they are overwritten by position.
betas4corr.columns

In [None]:
# Positional relabelling with zero-padded indices. The labels are generated in
# the order a plain string sort gives unpadded numbers ('1', '10', ..., '19',
# '2', ..., '9'), i.e. the column order produced by the preceding sort.
betas4corr.columns = [
    'clim-%s%02d' % (group, index)
    for group, count in (('bio', 19), ('prec', 12), ('tmax', 12), ('tmin', 12))
    for index in sorted(range(1, count + 1), key=str)
]

In [None]:
# Inspect the zero-padded labels that were just assigned.
betas4corr.columns

In [None]:
# Sort the columns again; with zero-padded labels the string order is now the
# numeric order (01, 02, ..., 19).
betas4corr = betas4corr.reindex(columns=sorted(betas4corr.columns))

In [None]:
# Inspect the column order after the second sort.
betas4corr.columns

In [None]:
# Positional relabelling: swap the 'clim-' prefix for 'betas-' while keeping the
# numerically ordered, zero-padded suffixes (bio01-19, prec/tmax/tmin01-12).
betas4corr.columns = [
    'betas-%s%02d' % (group, index)
    for group, count in (('bio', 19), ('prec', 12), ('tmax', 12), ('tmin', 12))
    for index in range(1, count + 1)
]

In [None]:
# Pairwise column correlations, computed once and reused for mask and plot.
betas_corr = betas4corr.corr()
# Boolean mask hiding the upper triangle and the diagonal. It is built from
# positions only: a mask derived from the correlation values themselves would
# leave any exactly-zero coefficient in the upper triangle visible.
matrix = np.triu(np.ones(betas_corr.shape, dtype=bool))
# sns.set() resets the style, so "whitegrid" has to be re-applied after it.
sns.set(rc={'figure.figsize':(14,14)})
sns.set_style("whitegrid")
sns.heatmap(betas_corr, vmin=-1, vmax=1, center= 0, cmap= 'PRGn', mask=matrix, cbar_kws={"shrink": .70})
plt.title('Correlation heatmap of climate-related correlation coefficients', fontsize=20)
#plt.savefig('corr_climcorrcoeff.png', bbox_inches='tight')

## Histogram of fitness-related selection coefficients
### Before SNP selection

In [None]:
# Extract the mlp, mli, thp and thi datasets: one two-column frame each, holding
# the 'rs' column next to the matching rFitness2_* column.
MLP, MLI, THP, THI = (
    betas[['rs', 'rFitness2_' + tag]] for tag in ('mlp', 'mli', 'thp', 'thi')
)

In [None]:
# Read the tab-separated per-location table and repair the truncated
# 'Fitness_Andaluci' header.
betas2 = pd.read_csv(
    '/Carnegie/DPB/Data/Shared/Labs/Moi/Everyone/deepselection/randomForest/betas_woNAs_Fitness.txt',
    sep='\t',
).rename(columns={'Fitness_Andaluci': 'Fitness_Andalucia'})
# Discard every column whose name contains 'randomized'.
betas2 = betas2.drop(columns=betas2.filter(regex='randomized').columns)

In [None]:
# One two-column frame per location: the 'rs' column next to that location's
# Fitness_* column.
AND, SPA, UKI, FIN, GER = (
    betas2[['rs', 'Fitness_' + place]]
    for place in ('Andalucia', 'Spain', 'UnitedKingdom', 'Finland', 'Germany')
)

In [None]:
# Coefficient series for the four rFitness2 experiments.
a, b, c, d = (
    MLP['rFitness2_mlp'],
    MLI['rFitness2_mli'],
    THP['rFitness2_thp'],
    THI['rFitness2_thi'],
)

color = '#1b7837'
sns.set_style("whitegrid")
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(24, 6))
fig.suptitle('Density plots and histograms of fitness-related selection coefficients', size=20)
# One panel per experiment; the explicit x label replaces the column name that
# distplot would otherwise put there.
for panel, coefficients, label in zip(
    (ax1, ax2, ax3, ax4), (a, b, c, d), ('MLP', 'MLI', 'THP', 'THI')
):
    sns.distplot(coefficients, ax=panel, color=color)
    panel.set_xlabel(label)
fig.show()
#fig.savefig('FitnessBetasDist.png', bbox_inches='tight', dpi=100)

In [None]:
# Coefficient series for the five locations, in panel order.
e, f, g, h, i = (
    AND['Fitness_Andalucia'],
    FIN['Fitness_Finland'],
    GER['Fitness_Germany'],
    SPA['Fitness_Spain'],
    UKI['Fitness_UnitedKingdom'],
)

sns.set_style("whitegrid")
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(1, 5, figsize=(24, 6))
# `color` is the green defined in the preceding plotting cell.
for panel, coefficients, label in zip(
    (ax1, ax2, ax3, ax4, ax5), (e, f, g, h, i), ('AND', 'FIN', 'GER', 'SPA', 'UKI')
):
    sns.distplot(coefficients, ax=panel, color=color)
    panel.set_xlabel(label)
fig.show()
#fig.savefig('FitnessBetasDist_5locs.png', bbox_inches='tight', dpi=100)

## Histogram of fitness-related selection coefficients
### After SNP selection

In [None]:
# Sort & select


def _select_extremes(ranked, n):
    """Return the first ``n`` and the last ``n`` rows of ``ranked``, stacked in that order.

    ``ranked`` is expected to be sorted already, so the two ends are the
    extreme values. Original index labels are kept. ``pd.concat`` is used
    because ``DataFrame.append`` was removed in pandas 2.0.
    If ``ranked`` has fewer than ``2 * n`` rows the two slices overlap and the
    shared rows appear twice.
    """
    return pd.concat([ranked.iloc[:n, :], ranked.iloc[-n:, :]])


# Rank every dataset by its coefficient column, largest first.
MLP = MLP.sort_values(by=['rFitness2_mlp'], ascending=False)
MLI = MLI.sort_values(by=['rFitness2_mli'], ascending=False)
THP = THP.sort_values(by=['rFitness2_thp'], ascending=False)
THI = THI.sort_values(by=['rFitness2_thi'], ascending=False)

AND = AND.sort_values(by=['Fitness_Andalucia'], ascending=False)
SPA = SPA.sort_values(by=['Fitness_Spain'], ascending=False)
UKI = UKI.sort_values(by=['Fitness_UnitedKingdom'], ascending=False)
FIN = FIN.sort_values(by=['Fitness_Finland'], ascending=False)
GER = GER.sort_values(by=['Fitness_Germany'], ascending=False)

# Number of rows taken from each end of every ranking.
x=1000

# get the first and last 1000 objects (highest and lowest betas)
selMLP = _select_extremes(MLP, x)
selMLPSNPs = selMLP['rs'].tolist()

selMLI = _select_extremes(MLI, x)
selMLISNPs = selMLI['rs'].tolist()

selTHP = _select_extremes(THP, x)
selTHPSNPs = selTHP['rs'].tolist()

selTHI = _select_extremes(THI, x)
selTHISNPs = selTHI['rs'].tolist()

selAND = _select_extremes(AND, x)
selANDSNPs = selAND['rs'].tolist()

selSPA = _select_extremes(SPA, x)
selSPASNPs = selSPA['rs'].tolist()

selUKI = _select_extremes(UKI, x)
selUKISNPs = selUKI['rs'].tolist()

selFIN = _select_extremes(FIN, x)
selFINSNPs = selFIN['rs'].tolist()

selGER = _select_extremes(GER, x)
selGERSNPs = selGER['rs'].tolist()

In [None]:
# Total number of missing cells in MLP, counted across all columns at once.
MLP.isna().values.sum()
#MLP.dtypes

In [None]:
# Display the rows selected for Germany (first and last `x` rows of the ranking).
selGER

In [None]:
# check distribution of selection

# Coefficient series of the selected rows for the four rFitness2 experiments.
a, b, c, d = (
    selMLP['rFitness2_mlp'],
    selMLI['rFitness2_mli'],
    selTHP['rFitness2_thp'],
    selTHI['rFitness2_thi'],
)

color = '#1b7837'
sns.set_style("whitegrid")
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(24, 6))
fig.suptitle('Density plots and histograms of selected fitness-related selection coefficients', size=20)
# One panel per experiment, labelled with the experiment's short name.
for panel, coefficients, label in zip(
    (ax1, ax2, ax3, ax4), (a, b, c, d), ('MLP', 'MLI', 'THP', 'THI')
):
    sns.distplot(coefficients, ax=panel, color=color)
    panel.set_xlabel(label)
fig.show()
#fig.savefig('SelFitnessBetasDist.png', bbox_inches='tight', dpi=100)

In [None]:
# Coefficient series of the selected rows for the five locations, in panel order.
e, f, g, h, i = (
    selAND['Fitness_Andalucia'],
    selFIN['Fitness_Finland'],
    selGER['Fitness_Germany'],
    selSPA['Fitness_Spain'],
    selUKI['Fitness_UnitedKingdom'],
)

sns.set_style("whitegrid")
fig, (ax1, ax2, ax3, ax4, ax5) = plt.subplots(1, 5, figsize=(24, 6))
# `color` is the green defined in the preceding plotting cell.
for panel, coefficients, label in zip(
    (ax1, ax2, ax3, ax4, ax5), (e, f, g, h, i), ('AND', 'FIN', 'GER', 'SPA', 'UKI')
):
    sns.distplot(coefficients, ax=panel, color=color)
    panel.set_xlabel(label)
fig.show()
#fig.savefig('SelFitnessBetasDist_5locs.png', bbox_inches='tight', dpi=100)

## Density plot & histogram of target

In [None]:
# Whitespace-delimited table. sep=r'\s+' is the documented equivalent of
# delim_whitespace=True, which is deprecated since pandas 2.2.
target = pd.read_csv('/home/esehr/NN_notebook/Input/Target_7locs.csv', sep=r'\s+')

In [None]:
# Display the loaded target table.
target

In [None]:
sns.set_style("whitegrid")
# distplot draws on the current axes and returns them; title and x label are
# then set on that same Axes object. `color` comes from an earlier plotting cell.
target_ax = sns.distplot(target['rFitness'], color=color)
target_ax.set_title('Density plot and histogram of the combined selection coefficients of selected SNPs', size=20)
target_ax.set_xlabel('All locations combined')
#plt.savefig('TargetDist.png', bbox_inches='tight')

## EDA of features after SNP selection

In [None]:
# Whitespace-delimited table. sep=r'\s+' is the documented equivalent of
# delim_whitespace=True, which is deprecated since pandas 2.2.
predictors = pd.read_csv('/home/esehr/NN_notebook/Input/Predictors_7locs.csv', sep=r'\s+')

In [None]:
# Summary statistics for the numeric predictor columns.
predictors.describe()

In [None]:
# check distribution of predictors
color = '#1b7837'

# The two example features: the 'bio1' column and the 'clim-bio1' column.
b, c = predictors['bio1'], predictors['clim-bio1']

sns.set_style("whitegrid")
fig, (ax2, ax3) = plt.subplots(1, 2, figsize=(13, 4))
fig.suptitle('Density plots and histograms of selected features after SNP selection', size=12)
for panel, feature, label in zip(
    (ax2, ax3),
    (b, c),
    ("Annual mean temperature (bio1)", "Correlation coefficients from bio1 GWAS"),
):
    sns.distplot(feature, ax=panel, color=color)
    panel.set_xlabel(label, size=9)

# Leave room above the panels for the figure-level title.
fig.subplots_adjust(top=0.9)
fig.show()
#fig.savefig('PredVarDist_v2.png', bbox_inches='tight', dpi=100)


In [None]:
color = '#1b7837'

a = predictors['ann']

# Tick labels, one per integer annotation code; list position == code (0..22).
# NOTE(review): assumes 'ann' holds integer codes in this alphabetical order — confirm.
annotation_labels = [
    '3_prime_UTR_variant', '5_prime_UTR_premature_start_codon_gain_variant', '5_prime_UTR_variant',
    'initiator_codon_variant', 'intergenic_region', 'intron_variant', 'missense_variant',
    'missense_variant&splice_region_variant', 'non_coding_transcript_exon_variant', 'splice_acceptor_variant&intron_variant',
    'splice_donor_variant&intron_variant', 'splice_region_variant', 'splice_region_variant&intron_variant',
    'splice_region_variant&non_coding_transcript_exon_variant', 'splice_region_variant&stop_retained_variant',
    'splice_region_variant&synonymous_variant', 'start_lost', 'stop_gained', 'stop_gained&splice_region_variant',
    'stop_lost', 'stop_lost&splice_region_variant', 'stop_retained_variant', 'synonymous_variant',
]

sns.set_style("whitegrid")
# Unit-wide bins centred on every code: edges -0.5, 0.5, ..., 22.5. Edges running
# only from 0.5 to 21.5 would silently drop the first and last category (codes
# 0 and 22) even though both are labelled on the axis.
# `fig` is the Axes returned by distplot (name kept from the original cell).
fig = sns.distplot(a, color = color, bins=np.arange(-0.5, len(annotation_labels)), kde=False)
fig.set_title('Distribution of the annotation categories after SNP selection', size=20, pad=20)
fig.set_xlabel("Annotation (categorical)", size= 16)
fig.set_ylabel('Number of SNPs annotated', size = 16)

fig.xaxis.tick_bottom()
# One tick per code, derived from the label list so the two cannot drift apart.
fig.set_xticks(list(range(len(annotation_labels))))
fig.set_xticklabels(annotation_labels, rotation=30, ha='right')

#fig.figure.savefig('Annotation_afterSNPsel.png', bbox_inches='tight',dpi=100)


## Heatmap after SNP selection

In [None]:
# subset with sel coeff
# Independent copy of the 55 columns at positions 1..55 of `predictors`.
# NOTE(review): presumably these are the 'clim-*' coefficient columns — confirm
# against the file's column order.
climbetas4corr = predictors.iloc[:,1:56].copy()
climbetas4corr

In [None]:
# Put the columns into plain string-sorted order so the positional relabelling
# that follows lines up with them.
climbetas4corr = climbetas4corr.reindex(columns=sorted(climbetas4corr.columns))

In [None]:
# Positional relabelling with zero-padded indices. The labels are generated in
# the order a plain string sort gives unpadded numbers ('1', '10', ..., '19',
# '2', ..., '9'), i.e. the column order produced by the preceding sort.
climbetas4corr.columns = [
    'clim-%s%02d' % (group, index)
    for group, count in (('bio', 19), ('prec', 12), ('tmax', 12), ('tmin', 12))
    for index in sorted(range(1, count + 1), key=str)
]

In [None]:
# Sort the columns again; with zero-padded labels the string order is now the
# numeric order (01, 02, ..., 19).
climbetas4corr = climbetas4corr.reindex(columns=sorted(climbetas4corr.columns))

In [None]:
# Positional relabelling: swap the 'clim-' prefix for 'betas-' while keeping the
# numerically ordered, zero-padded suffixes (bio01-19, prec/tmax/tmin01-12).
climbetas4corr.columns = [
    'betas-%s%02d' % (group, index)
    for group, count in (('bio', 19), ('prec', 12), ('tmax', 12), ('tmin', 12))
    for index in range(1, count + 1)
]

In [None]:
# subset with clima
# Independent copy of every column from position 57 onward.
# NOTE(review): position 56 falls in neither this slice nor the 1:56 slice taken
# for climbetas4corr — presumably a non-climate column; confirm this is intended.
clim4corr = predictors.iloc[:,57:].copy()
clim4corr

In [None]:
# Pairwise column correlations, computed once and reused for mask and plot.
clim_corr = clim4corr.corr()
# Boolean mask hiding the upper triangle and the diagonal. It is built from
# positions only: a mask derived from the correlation values themselves would
# leave any exactly-zero coefficient in the upper triangle visible.
matrix = np.triu(np.ones(clim_corr.shape, dtype=bool))
# sns.set() resets the style, so "whitegrid" has to be re-applied after it.
sns.set(rc={'figure.figsize':(14,14)})
sns.set_style("whitegrid")
sns.heatmap(clim_corr, vmin=-1, vmax=1, center= 0, cmap= 'PRGn', mask=matrix, cbar_kws={"shrink": .70})
plt.title('Correlation heatmap of climate variables', fontsize=20)
#plt.savefig('corr_clima.png', bbox_inches='tight', dpi=100)

In [None]:
# Pairwise column correlations, computed once and reused for mask and plot.
climbetas_corr = climbetas4corr.corr()
# Boolean mask hiding the upper triangle and the diagonal. It is built from
# positions only: a mask derived from the correlation values themselves would
# leave any exactly-zero coefficient in the upper triangle visible.
matrix = np.triu(np.ones(climbetas_corr.shape, dtype=bool))
# sns.set() resets the style, so "whitegrid" has to be re-applied after it.
sns.set(rc={'figure.figsize':(14,14)})
sns.set_style("whitegrid")
sns.heatmap(climbetas_corr, vmin=-1, vmax=1, center= 0, cmap= 'PRGn', mask=matrix, cbar_kws={"shrink": .70})
plt.title('Correlation heatmap of climate-related correlation coefficients after SNP selection', fontsize=20)
#plt.savefig('corr_climabetas_afterSNPsel.png', bbox_inches='tight', dpi=100)