In [1]:
#Initial configuration, probably overkill in imports.
import sys, os, re
import numpy as np
import pandas as pd
import allel
import zarr
import dask
import numcodecs
import warnings
from pathlib import Path


%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina', 'png')
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
sns.set()
sns.set_style("white")
sns.set_context("notebook")

#Dask setup
from multiprocessing.pool import ThreadPool
import dask
dask.config.set(pool=ThreadPool(int(os.environ['SLURM_CPUS_PER_TASK'])))

<dask.config.set at 0x2adc3a8aa390>

In [2]:
#Opening the zarr data
callset = zarr.open_group('/faststorage/project/primatediversity/people/kmt/baboon_flagship/steps/callset.zarr', mode='r')
callset.tree(expand=False)

Tree(nodes=(Node(disabled=True, name='/', nodes=(Node(disabled=True, name='chr1', nodes=(Node(disabled=True, n…

In [3]:
#Sample information and chromosome lengths
chromosome_lengths = dict()
for line in open('../data/macFas5.chrom.sizes.txt'): # NB MACAQUE REF GENOME...
    chrom, length = line.split()
    chromosome_lengths[chrom] = int(length)
    
chromosomes = [f'chr{x}' for x in range(1, 21)] + ['chrX']
meta_data = pd.read_excel('../data/Papio-Genomes_JR_120720_MR-CR-KM_geoloc.xlsx')
with pd.option_context("display.max_rows", 1000):
    display(meta_data)
baboon_samples = [x for x in meta_data.PGDP_ID if x.startswith('PD')] #  NB: to not get the SciAdvPaper samples
#Meta data for the sample present in the zarr data structure - Kasper has removed some of the samples.
samples_list = list(callset['chr1/samples'][:])
meta_data_samples = meta_data.loc[meta_data.PGDP_ID.isin(samples_list)].copy()
samples_callset_index = [samples_list.index(s) for s in meta_data_samples.PGDP_ID]
meta_data_samples['callset_index'] = samples_callset_index

Unnamed: 0.1,Unnamed: 0,PGDP_ID,Provider_ID,Provider,Genus,Species,Origin,Sex,address,longitude,latitude
0,0,PD_0067,1043,Roos,Theropithecus,gelada,captive,M,"SDSU Captive Wildlife Research Facility, Brook...",-96.79328,44.334031
1,1,PD_0199,09SNF1101115,Knauf/Chuma/Roos,Papio,anubis,"Serengeti, Tanzania",F,"Serengeti, Mara, Lake Zone, Tanzania",34.742544,-1.996626
2,2,PD_0200,11SNF1101115,Knauf/Chuma/Roos,Papio,anubis,"Serengeti, Tanzania",F,"Serengeti, Mara, Lake Zone, Tanzania",34.742544,-1.996626
3,3,PD_0201,19SNM1131115,Knauf/Chuma/Roos,Papio,anubis,"Serengeti, Tanzania",M,"Serengeti, Mara, Lake Zone, Tanzania",34.742544,-1.996626
4,4,PD_0202,20SNF1131115,Knauf/Chuma/Roos,Papio,anubis,"Serengeti, Tanzania",F,"Serengeti, Mara, Lake Zone, Tanzania",34.742544,-1.996626
5,5,PD_0203,21SNF1151115,Knauf/Chuma/Roos,Papio,anubis,"Serengeti, Tanzania",F,"Serengeti, Mara, Lake Zone, Tanzania",34.742544,-1.996626
6,6,PD_0204,03ANM1080815,Knauf/Chuma/Roos,Papio,anubis,"Arusha, Tanzania",M,"Arusha, Northern Zone, Tanzania",36.708585,-3.427534
7,7,PD_0205,05ANF1030116,Knauf/Chuma/Roos,Papio,anubis,"Arusha, Tanzania",F,"Arusha, Northern Zone, Tanzania",36.708585,-3.427534
8,8,PD_0206,09ANF1040116,Knauf/Chuma/Roos,Papio,anubis,"Arusha, Tanzania",F,"Arusha, Northern Zone, Tanzania",36.708585,-3.427534
9,9,PD_0207,12ANM1050116,Knauf/Chuma/Roos,Papio,anubis,"Arusha, Tanzania",M,"Arusha, Northern Zone, Tanzania",36.708585,-3.427534


In [4]:
#Make an ABBA-BABA with hamadryas as the outgroup
mikumi_cynocephalus = meta_data_samples[meta_data_samples.Origin == 
                                        "Mikumi, Tanzania"].callset_index.values
lake_manyara_anubis = meta_data_samples[meta_data_samples.Origin == 
                                        "Lake Manyara, Tanzania"].callset_index.values
woreda_anubis = meta_data_samples[meta_data_samples.Origin == 
                                  "Gog Woreda, Gambella region, Ethiopia"].callset_index.values
filoha_hamadryas = meta_data_samples[meta_data_samples.Origin == 
                                     "Filoha, Ethiopia"].callset_index.values

In [5]:
filoha_hamadryas

array([62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
       79, 80, 81, 82, 83, 84, 85])

In [6]:
gt = allel.GenotypeArray(callset["chrX/calldata/GT"])
pos = callset["chrX/variants/POS"]

In [7]:
mikumi_count = gt.take(mikumi_cynocephalus, axis = 1).count_alleles(max_allele=1)
lake_manyara_count = gt.take(lake_manyara_anubis, axis = 1).count_alleles(max_allele=1)
woreda_count = gt.take(woreda_anubis, axis = 1).count_alleles(max_allele=1)
filoha_count = gt.take(filoha_hamadryas, axis = 1).count_alleles(max_allele=1)
filoha_count

Unnamed: 0,0,1,Unnamed: 3
0,0,0,
1,0,0,
2,10,0,
...,...,...,...
3609322,6,0,
3609323,0,0,
3609324,0,0,


In [8]:
#Testing sequence diversity
pi = allel.sequence_diversity(pos, mikumi_count)
pi

0.0014564299730477784

In [9]:
allel.average_patterson_d(lake_manyara_count, woreda_count, mikumi_count, filoha_count, blen = 10000)

(0.1673691558458336,
 0.014569376962792507,
 11.487735973423122,
 array([ 0.12667842,  0.0337346 ,  0.12740012,  0.08000351,  0.14990918,
         0.08312016,  0.04535607,  0.06932767,  0.13357799,  0.1685413 ,
         0.12464086,  0.08595914,  0.05558469,  0.14766083,  0.23757195,
         0.31850284, -0.06618342, -0.04724267,  0.07914198,  0.08623492,
         0.03202523,  0.00222388,  0.04361587, -0.10635684,  0.10359266,
         0.1203602 ,  0.05361671, -0.22372639,  0.29902231,  0.29847778,
         0.20310076,  0.20447041, -0.08733963, -0.07459904, -0.03965144,
        -0.07342793,  0.35511579,  0.36252399,  0.20029863,  0.37843387,
         0.57228171,  0.55085692,  0.3400627 ,  0.44956058,  0.54970724,
         0.41025873, -0.15203907, -0.09232994,  0.18725703,  0.30301955,
         0.41908221,  0.18939836,  0.15709748, -0.09450006, -0.02117213,
        -0.00703513,  0.51355069,  0.55205641,  0.61317952, -0.28718812,
        -0.08537813,  0.06338935, -0.06517229, -0.05343339,

In [28]:
#Investigating f3
allel.average_patterson_f3(lake_manyara_count, mikumi_count, woreda_count, blen = 10000)

(0.14280580108909086,
 0.009479432285604229,
 15.064805231634002,
 array([ 1.19083505e-01,  2.21740912e-01,  1.55282231e-01,  1.62326236e-01,
         8.69591714e-02,  9.09653313e-02,  1.64625626e-01,  8.82317132e-02,
         1.09012345e-01,  1.09082914e-01,  3.88888140e-02,  1.41179889e-01,
         1.53288058e-01,  1.85238741e-01,  1.35226750e-01,  2.39876529e-02,
         2.10923554e-01,  1.78473247e-01,  7.54151607e-01,  2.44431532e-01,
         2.05809016e-01,  1.70332089e-01,  2.95107495e-01,  2.38791105e-01,
         2.11121362e-01,  1.13382311e-01,  5.47507270e-01,  3.97177327e-01,
         8.30060512e-02,  2.05849626e-01,  4.63994674e-01,  4.21274099e-01,
         6.65294974e-01,  4.60877242e-01,  3.14958217e-01,  3.53106601e-01,
         3.04991845e-01,  8.72683180e-02,  2.96664570e-01,  1.76492493e-01,
        -5.82008148e-02, -5.40526031e-02,  3.41156113e-01,  2.51884855e-01,
        -9.43017785e-03,  1.06721719e-01,  1.99888197e-01,  2.38371259e-01,
         1.72339813e-0

In [29]:
allel.average_patterson_f3(mikumi_count, lake_manyara_count, woreda_count, blen = 10000)

(0.7734290287244459,
 0.022760626167553918,
 33.98100838838065,
 array([0.32196523, 0.25720799, 0.27467697, 0.30983076, 0.31335194,
        0.20715077, 0.3947313 , 0.40339475, 0.32040513, 0.36907323,
        0.41945466, 0.57514113, 0.32594333, 0.39768303, 0.37257653,
        1.05981629, 0.31547392, 0.27495024, 0.59422978, 0.13948422,
        0.1430369 , 0.75317818, 0.27147054, 0.32618577, 0.47542574,
        0.76040802, 0.26115857, 0.96844745, 0.55771334, 0.70327803,
        1.03691023, 1.27616276, 0.88859606, 3.43786476, 1.75188663,
        1.71377381, 1.45366492, 0.6280995 , 0.94259987, 0.7114941 ,
        0.80970929, 0.52501449, 0.72033612, 0.78271162, 0.65795908,
        1.12063039, 2.05905123, 1.04236384, 2.82061155, 1.09061097,
        1.19098977, 1.25114657, 1.49102158, 1.84778891, 1.74140659,
        1.46442667, 0.91723067, 0.63418082, 2.19190673, 1.40790824,
        1.02007883, 1.09099707, 2.14262876, 1.2981288 , 1.20713083,
        0.96017494, 0.84786068, 0.93078893, 0.375413

In [30]:
allel.average_patterson_f3(woreda_count, mikumi_count, lake_manyara_count, blen = 10000)

(0.20948881837483668,
 0.01012578390778788,
 20.68865188933332,
 array([ 0.11115011,  0.0954951 ,  0.15523905,  0.09344038,  0.19689869,
         0.12606198,  0.119804  ,  0.08391003,  0.11095224,  0.11879884,
         0.11236032,  0.09182086,  0.03728728,  0.04974823,  0.08260423,
         0.40541348,  0.03227444,  0.06588215,  0.06348235,  0.00737498,
         0.02170409,  0.17303337,  0.08405362,  0.0496834 ,  0.05152897,
         0.27115217,  0.06472292,  0.06819421,  0.24356033,  0.32065129,
         0.14828036,  0.10850284,  0.20838347,  0.12844818,  0.01486831,
         0.47215969,  0.64616134,  0.3594008 ,  0.20092802,  0.1956656 ,
         0.34305666,  0.45764035,  0.28815892,  0.13941766,  0.25841381,
         0.21468208,  0.11568729,  0.19240576,  0.27943053,  0.23308735,
         0.42087053,  0.27570792,  0.30488356,  0.5507773 ,  0.33211982,
         0.40437998,  0.29148495,  0.37307275,  0.4955184 ,  0.37735253,
         0.0685394 ,  0.19035562,  0.84597836,  0.13702604, 