In [2]:
#Libraries and paths
#Setup as advised from http://alimanfoo.github.io/2016/06/10/scikit-allel-tour.html
import numpy as np
import sys
import scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
sns.set_context('notebook')
import h5py
import zarr
import numcodecs
import allel; print('scikit-allel', allel.__version__)

vcf_dir = "/faststorage/project/primatediversity/data/variants/"
vcf_suffix = ".variable.filtered.HF.snps.vcf.gz"
metainfo = "data/New_Papio.xlsx"

#Specific ID for first data exploration
ID = "PD_0793"

ModuleNotFoundError: No module named 'numpy'

In [2]:
#Chcking scikit version
print(allel.__version__)

1.3.2


In [3]:
#Loading data and visualizing structure
callset = allel.read_vcf(vcf_dir+ID+vcf_suffix)

In [4]:
#Size of dataset
!ls -lh {vcf_dir+ID+vcf_suffix}
!zcat {vcf_dir+ID+vcf_suffix} | wc -l

-rw-r--r-- 1 mroussel primatediversity 1.2G Jul 15 23:23 /faststorage/project/primatediversity/data/variants/PD_0793.variable.filtered.HF.snps.vcf.gz
40404268


In [5]:
#Looking at vcf data structure
gt = allel.GenotypeArray(callset["calldata/GT"])
print(callset.keys())
callset["samples"]

dict_keys(['samples', 'calldata/GT', 'variants/ALT', 'variants/CHROM', 'variants/FILTER_PASS', 'variants/ID', 'variants/POS', 'variants/QUAL', 'variants/REF'])


array(['PD_0793'], dtype=object)

In [6]:
#Looking at the gt data structure
print(gt.count_het())
gt.count_alleles()

7158651


Unnamed: 0,0,1,2,Unnamed: 4
0,0,2,0,
1,0,2,0,
2,1,1,0,
...,...,...,...,...
40401288,0,2,0,
40401289,0,2,0,
40401290,1,1,0,


In [7]:
#Selecting fields and chromosome to investigate
data_18 = allel.read_vcf(vcf_dir+ID+vcf_suffix,
                            region="chr18", fields=["calldata/GT"])
gt_data_18 = allel.GenotypeArray(data_18["calldata/GT"])
print(gt_data_18.count_het())
gt_data_18.count_alleles()


210424


Unnamed: 0,0,1,2,Unnamed: 4
0,1,1,0,
1,1,1,0,
2,0,2,0,
...,...,...,...,...
1129875,1,1,0,
1129876,0,2,0,
1129877,1,1,0,


In [8]:
#Setting up zarr - needs zarr from conda.
#allel.vcf_to_zarr(vcf_dir+ID+vcf_suffix, "test.zarr", fields="*", overwrite=True)
#Here I just save in the notebooks folder, as that is the working directory
#Should maybe change it to somewhere else.
#I also take all fields, for larger analysis make sure to only select needed fields.

In [9]:
#Working with zarr - needs two other libraries
#import zarr
#import numcodecs
print('zarr', zarr.__version__, 'numcodecs', numcodecs.__version__)
zarr_data = zarr.open_group("test.zarr", mode="r")

zarr 2.6.1 numcodecs 0.7.2


In [10]:
#Size of zarr storage
!du -hs {"test.zarr"}
#The size is actually larger with the zarr storage - why?

1.6G	test.zarr


In [11]:
#The tree does not show the graphic as desired - it showed an un-interactable tree before i installed ipytree
#I can make it do that again, but no interactivity
print(zarr_data.tree(expand=True))

/
 ├── calldata
 │   ├── AD (40401291, 1, 4) int16
 │   ├── DP (40401291, 1) int16
 │   ├── GQ (40401291, 1) int8
 │   ├── GT (40401291, 1, 2) int8
 │   ├── PGT (40401291, 1) object
 │   ├── PID (40401291, 1) object
 │   ├── PL (40401291, 1, 3) int32
 │   ├── PS (40401291, 1) int32
 │   ├── RGQ (40401291, 1) int32
 │   └── SB (40401291, 1, 4) int32
 ├── samples (1,) object
 └── variants
     ├── AC (40401291, 3) int32
     ├── AF (40401291, 3) float32
     ├── ALT (40401291, 3) object
     ├── AN (40401291,) int32
     ├── BaseQRankSum (40401291,) float32
     ├── CHROM (40401291,) object
     ├── DP (40401291,) int32
     ├── ExcessHet (40401291,) float32
     ├── FILTER_LowQual (40401291,) bool
     ├── FILTER_PASS (40401291,) bool
     ├── FS (40401291,) float32
     ├── ID (40401291,) object
     ├── InbreedingCoeff (40401291,) float32
     ├── MLEAC (40401291, 3) int32
     ├── MLEAF (40401291, 3) float32
     ├── MQ (40401291,) float32
     ├── MQRankSum (40401291,) float32
     

In [12]:
#Only keeping the genotype data and loading genotype from Zarr, should be quicker than from vcf
gt_zarr = allel.GenotypeArray(zarr_data["calldata/GT"])

In [13]:
#Looking at this dataset.
gt_zarr.count_missing() # 0
gt_zarr.count_called() # 40401291
gt_zarr.count_hom() #33242640
gt_zarr.count_het() #7158651

7158651

In [14]:
#Taking a subset

In [26]:
#Converting the individuals i want to investigate into zarr
individuals = ["PD_0206", "PD_0214"]
chromosomes = list(range(1, 21))+["X"]
for ind in individuals:
    for chrom in chromosomes:
        allel.vcf_to_zarr(vcf_dir+ind+vcf_suffix, "{}_zarr".format(ind), fields="*", group="chr{}".format(chrom),
                         region="chr{}".format(chrom), log = sys.stdout)

[vcf_to_zarr] 65536 rows in 0.37s; chunk in 0.37s (177493 rows/s); chr1 :3827886
[vcf_to_zarr] 131072 rows in 1.12s; chunk in 0.75s (87801 rows/s); chr1 :8243903
[vcf_to_zarr] 196608 rows in 1.68s; chunk in 0.56s (116390 rows/s); chr1 :12351433
[vcf_to_zarr] 262144 rows in 2.23s; chunk in 0.55s (118287 rows/s); chr1 :16606819
[vcf_to_zarr] 327680 rows in 2.79s; chunk in 0.56s (118004 rows/s); chr1 :20736245
[vcf_to_zarr] 393216 rows in 3.32s; chunk in 0.54s (122217 rows/s); chr1 :25383120
[vcf_to_zarr] 458752 rows in 3.90s; chunk in 0.57s (114516 rows/s); chr1 :31113681
[vcf_to_zarr] 524288 rows in 4.46s; chunk in 0.56s (116776 rows/s); chr1 :36778184
[vcf_to_zarr] 589824 rows in 5.06s; chunk in 0.60s (109411 rows/s); chr1 :41991228
[vcf_to_zarr] 655360 rows in 5.65s; chunk in 0.60s (110044 rows/s); chr1 :46301823
[vcf_to_zarr] 720896 rows in 6.22s; chunk in 0.56s (116069 rows/s); chr1 :50267113
[vcf_to_zarr] 786432 rows in 6.77s; chunk in 0.56s (117861 rows/s); chr1 :54767706
[vcf_to_

In [27]:
#Data treatment, first try. Ignoring callability here
df = pd.DataFrame(columns = ["chr", "pos", "het"])
chromosomes = list(range(19, 21))+["X"]
for ind in individuals:
    zarr_data = zarr.open_group("{}_zarr".format(ind), mode="r")
    print(zarr_data.tree(expand=True))
    for chrom in chromosomes:
        pos = allel.SortedIndex(zarr_data["chr{}/variants/POS".format(chrom)])
        print(chrom)


/
 ├── chr1
 │   ├── calldata
 │   │   ├── AD (3025870, 1, 4) int16
 │   │   ├── DP (3025870, 1) int16
 │   │   ├── GQ (3025870, 1) int8
 │   │   ├── GT (3025870, 1, 2) int8
 │   │   ├── PGT (3025870, 1) object
 │   │   ├── PID (3025870, 1) object
 │   │   ├── PL (3025870, 1, 3) int32
 │   │   ├── PS (3025870, 1) int32
 │   │   ├── RGQ (3025870, 1) int32
 │   │   └── SB (3025870, 1, 4) int32
 │   ├── samples (1,) object
 │   └── variants
 │       ├── AC (3025870, 3) int32
 │       ├── AF (3025870, 3) float32
 │       ├── ALT (3025870, 3) object
 │       ├── AN (3025870,) int32
 │       ├── BaseQRankSum (3025870,) float32
 │       ├── CHROM (3025870,) object
 │       ├── DP (3025870,) int32
 │       ├── ExcessHet (3025870,) float32
 │       ├── FILTER_LowQual (3025870,) bool
 │       ├── FILTER_PASS (3025870,) bool
 │       ├── FS (3025870,) float32
 │       ├── ID (3025870,) object
 │       ├── InbreedingCoeff (3025870,) float32
 │       ├── MLEAC (3025870, 3) int32
 │       ├── MLEAF 

In [17]:
#First visualization