# Assess Number of Variants Dropoff with different Read Depth Cutoffs

So we have 2 types of data on the 17 vcf batch files from ukbb.

The first one is a bit longer & more comprehensive. It contains:
- `snpid \t avg_read_depth \t first_quartile_read_depth \t min_read_depth \t allele_balance`
  - we can establish the avg & 25% read depths with this
  - note this ignores missing data

The second one makes heavy use of early termination algorithms to establish an early cutoff for 90% read depths < 7
- `snpid \t 90%_read_depth \t allele_balance`
  - we can establish the 10% read depth cutoff with this
  - note this marks missing data as a read-depth of 0

# imports, globals, and data

In [12]:
import pandas as pd
import numpy as np
# Root folder holding every read-depth QC output used in this notebook.
DATA_OUT = "/mnt/speliotes-lab/Projects/UK_ATLAS/IndivProj/craut/UKB_500K_exome_QC/"
# Path prefixes that the loading cells pass to glob(prefix + "*"): one for the
# old-format runs (avg / 25% / min read depth) and one for the updated runs
# (10% read depth). The loading cells refer to these as `*_folder`, so those
# are the names that must exist for a clean top-to-bottom run.
# NOTE(review): neither prefix ends with "/", so prefix + "*" matches paths
# that merely start with the prefix rather than the contents of a directory
# of that name -- confirm this is the intended layout.
old_runs_folder = "{}old".format(DATA_OUT)
updated_runs_folder = "{}updated_runs/v1_6_full_runs".format(DATA_OUT)
# Previous names kept as aliases so anything still using them keeps working.
old_runs_files = old_runs_folder
updated_runs_files = updated_runs_folder
from glob import glob

In [9]:
# Read every old-format run output into one record per variant line.
# Each line is whitespace-separated:
#   snpid, avg read depth, 25% read depth, min read depth, allele balance
old_data_list = []
for path in glob(old_runs_folder + "*"):
    with open(path, "r") as handle:
        for raw in handle:
            cols = raw.strip().split()
            record = {
                "snpid": cols[0],
                "avg_rd": float(cols[1]),
                "rd_25": float(cols[2]),
                "min_rd": float(cols[3]),
                "ab": float(cols[4]),
            }
            old_data_list.append(record)

# One row per variant; show the size and a preview.
old_data_df = pd.DataFrame(old_data_list)
print(old_data_df.shape)
old_data_df.head()

(450264, 5)


Unnamed: 0,snpid,avg_rd,rd_25,min_rd,ab
0,chr21_39198667_T_A,10.530052,7.0,0.0,0.285714
1,chr21_39198677_CTT_C,12.713595,9.0,0.0,0.470588
2,chr21_39198680_T_A,12.968998,10.0,0.0,0.2
3,chr21_39198683_C_T,13.121108,10.0,0.0,0.384615
4,chr21_39198684_G_A,13.264188,11.0,0.0,0.461538


In [10]:
# Read every updated-format run output into one record per variant.
# Each line is whitespace-separated: snpid(s), 10% read depth, allele balance.
# The first column may hold several ";"-joined variant IDs; each one gets its
# own row carrying the same read depth and allele balance.
new_data_list = []
for path in glob(updated_runs_folder + "*"):
    with open(path, "r") as handle:
        for raw in handle:
            cols = raw.strip().split()
            snp_ids = cols[0].split(";")
            rd_10 = float(cols[1])
            allele_balance = float(cols[2])
            for snp in snp_ids:
                new_data_list.append({"snpid": snp,
                                      "rd_10": rd_10,
                                      "ab": allele_balance})

# One row per variant; show the size and a preview.
new_data_df = pd.DataFrame(new_data_list)
print(new_data_df.shape)
new_data_df.head()

(450264, 3)


Unnamed: 0,snpid,rd_10,ab
0,chr21_39198667_T_A,0.0,0.0
1,chr21_39198677_CTT_C,7.0,0.470588
2,chr21_39198680_T_A,7.0,0.2
3,chr21_39198683_C_T,7.0,0.384615
4,chr21_39198684_G_A,7.0,0.461538


# Filter out based on different thresholds for read depth and allele balance

So recall the filters for read depth and allele balance:
- SNPs:
  - Read Depth >= 7
  - allele balance >= 0.15
- InDels:
  - Read Depth >= 10
  - allele balance >= 0.2

We will try different definitions of read depth to see how many variants each one retains. Note that the correct definition is the updated version (rd_10), but it is worth checking how much loss to expect before spending the money to calculate this for sure.
- min_read_depth
- avg_read_depth
- rd_25
- rd_10

In [33]:
# min_read_depth
# Variant retention when "read depth" means the minimum read depth.
# IDs look like chr_pos_ref_alt. A variant counts as a SNP only when "_" sits
# at positions -2 and -4 (both alleles are a single character); everything
# else, including multi-base substitutions, is treated as an InDel.
indel_mask = np.array(
    [snp[-2] != "_" or snp[-4] != "_" for snp in old_data_df.snpid], dtype=bool
)
# Per-variant thresholds chosen in one vectorized step:
# InDels need read depth >= 10 and allele balance >= 0.2,
# SNPs need read depth >= 7 and allele balance >= 0.15.
rd_mask = old_data_df.min_rd.to_numpy() >= np.where(indel_mask, 10, 7)
ab_mask = old_data_df.ab.to_numpy() >= np.where(indel_mask, 0.2, 0.15)
min_mask = rd_mask & ab_mask
# Prints: retained variants, total variants, retained fraction.
print(np.sum(min_mask), len(min_mask), np.sum(min_mask) / len(min_mask))

193253 450264 0.4291993141801254


In [34]:
# avg_read_depth
# Variant retention when "read depth" means the average read depth.
# IDs look like chr_pos_ref_alt. A variant counts as a SNP only when "_" sits
# at positions -2 and -4 (both alleles are a single character); everything
# else, including multi-base substitutions, is treated as an InDel.
indel_mask = np.array(
    [snp[-2] != "_" or snp[-4] != "_" for snp in old_data_df.snpid], dtype=bool
)
# Per-variant thresholds chosen in one vectorized step:
# InDels need read depth >= 10 and allele balance >= 0.2,
# SNPs need read depth >= 7 and allele balance >= 0.15.
rd_mask = old_data_df.avg_rd.to_numpy() >= np.where(indel_mask, 10, 7)
ab_mask = old_data_df.ab.to_numpy() >= np.where(indel_mask, 0.2, 0.15)
avg_mask = rd_mask & ab_mask
# Prints: retained variants, total variants, retained fraction.
print(np.sum(avg_mask), len(avg_mask), np.sum(avg_mask) / len(avg_mask))

439305 450264 0.9756609455785938


In [35]:
# 25%_read_depth
# Variant retention when "read depth" means the first-quartile read depth.
# IDs look like chr_pos_ref_alt. A variant counts as a SNP only when "_" sits
# at positions -2 and -4 (both alleles are a single character); everything
# else, including multi-base substitutions, is treated as an InDel.
indel_mask = np.array(
    [snp[-2] != "_" or snp[-4] != "_" for snp in old_data_df.snpid], dtype=bool
)
# Per-variant thresholds chosen in one vectorized step:
# InDels need read depth >= 10 and allele balance >= 0.2,
# SNPs need read depth >= 7 and allele balance >= 0.15.
rd_mask = old_data_df.rd_25.to_numpy() >= np.where(indel_mask, 10, 7)
ab_mask = old_data_df.ab.to_numpy() >= np.where(indel_mask, 0.2, 0.15)
rd25_mask = rd_mask & ab_mask
# Prints: retained variants, total variants, retained fraction.
print(np.sum(rd25_mask), len(rd25_mask), np.sum(rd25_mask) / len(rd25_mask))

426589 450264 0.9474197359771156


In [36]:
# 10%_read_depth
# Variant retention when "read depth" means the 10% read depth from the
# updated runs.
# IDs look like chr_pos_ref_alt. A variant counts as a SNP only when "_" sits
# at positions -2 and -4 (both alleles are a single character); everything
# else, including multi-base substitutions, is treated as an InDel.
indel_mask = np.array(
    [snp[-2] != "_" or snp[-4] != "_" for snp in new_data_df.snpid], dtype=bool
)
# Per-variant thresholds chosen in one vectorized step:
# InDels need read depth >= 10 and allele balance >= 0.2,
# SNPs need read depth >= 7 and allele balance >= 0.15.
rd_mask = new_data_df.rd_10.to_numpy() >= np.where(indel_mask, 10, 7)
ab_mask = new_data_df.ab.to_numpy() >= np.where(indel_mask, 0.2, 0.15)
rd10_mask = rd_mask & ab_mask
# Prints: retained variants, total variants, retained fraction.
print(np.sum(rd10_mask), len(rd10_mask), np.sum(rd10_mask) / len(rd10_mask))

402898 450264 0.8948039372457047


Note that we start with 26 million variants. If we use the 90% cutoff we should eliminate about 11% of them, leaving us with ~23 million variants, which is miles better than the 16.9 million I have been using for now.

To check that this dropoff is what I would expect, I can verify the retention rate on the rest of the chr 1 variants and ensure that a similar variant retention rate is observed.

# Check with previous Chr 1 batch job

Note that the previous chr1 job was run using the outdated code ...

In [58]:
# Read every output file of the earlier chr1 batch job (old format) into one
# record per variant line, echoing each file path as it is processed.
# Each line is whitespace-separated:
#   snpid, avg read depth, 25% read depth, min read depth, allele balance
old_chr1_data_list = []
chr1_pattern = "{}chr1_b0-95/output/".format(DATA_OUT) + "*"
for path in glob(chr1_pattern):
    print(path)
    with open(path, "r") as handle:
        for raw in handle:
            cols = raw.strip().split()
            record = {
                "snpid": cols[0],
                "avg_rd": float(cols[1]),
                "rd_25": float(cols[2]),
                "min_rd": float(cols[3]),
                "ab": float(cols[4]),
            }
            old_chr1_data_list.append(record)

# One row per variant; show the size and a preview.
old_chr1_data_df = pd.DataFrame(old_chr1_data_list)
print(old_chr1_data_df.shape)
old_chr1_data_df.head()

/mnt/speliotes-lab/Projects/UK_ATLAS/IndivProj/craut/UKB_500K_exome_QC/chr1_b0-95/output/ukb23157_c1_b76_v1.ouput
/mnt/speliotes-lab/Projects/UK_ATLAS/IndivProj/craut/UKB_500K_exome_QC/chr1_b0-95/output/ukb23157_c1_b61_v1.ouput
/mnt/speliotes-lab/Projects/UK_ATLAS/IndivProj/craut/UKB_500K_exome_QC/chr1_b0-95/output/ukb23157_c1_b90_v1.ouput
/mnt/speliotes-lab/Projects/UK_ATLAS/IndivProj/craut/UKB_500K_exome_QC/chr1_b0-95/output/ukb23157_c1_b9_v1.ouput
/mnt/speliotes-lab/Projects/UK_ATLAS/IndivProj/craut/UKB_500K_exome_QC/chr1_b0-95/output/ukb23157_c1_b5_v1.ouput
/mnt/speliotes-lab/Projects/UK_ATLAS/IndivProj/craut/UKB_500K_exome_QC/chr1_b0-95/output/ukb23157_c1_b27_v1.ouput
/mnt/speliotes-lab/Projects/UK_ATLAS/IndivProj/craut/UKB_500K_exome_QC/chr1_b0-95/output/ukb23157_c1_b59_v1.ouput
/mnt/speliotes-lab/Projects/UK_ATLAS/IndivProj/craut/UKB_500K_exome_QC/chr1_b0-95/output/ukb23157_c1_b47_v1.ouput
/mnt/speliotes-lab/Projects/UK_ATLAS/IndivProj/craut/UKB_500K_exome_QC/chr1_b0-95/output/u

Unnamed: 0,snpid,avg_rd,rd_25,min_rd,ab
0,chr1_193138000_T_C,7.204891,5.0,0.0,0.5
1,chr1_193138001_G_A,7.350802,5.0,0.0,0.222222
2,chr1_193138006_G_A,8.338117,6.0,0.0,0.5
3,chr1_193138010_A_G,8.52995,6.0,0.0,0.833333
4,chr1_193138016_C_T,8.900486,7.0,0.0,0.222222


In [59]:
# min_read_depth
# chr1 variant retention when "read depth" means the minimum read depth.
# IDs look like chr_pos_ref_alt. A variant counts as a SNP only when "_" sits
# at positions -2 and -4 (both alleles are a single character); everything
# else, including multi-base substitutions, is treated as an InDel.
indel_mask = np.array(
    [snp[-2] != "_" or snp[-4] != "_" for snp in old_chr1_data_df.snpid], dtype=bool
)
# Per-variant thresholds chosen in one vectorized step:
# InDels need read depth >= 10 and allele balance >= 0.2,
# SNPs need read depth >= 7 and allele balance >= 0.15.
rd_mask = old_chr1_data_df.min_rd.to_numpy() >= np.where(indel_mask, 10, 7)
ab_mask = old_chr1_data_df.ab.to_numpy() >= np.where(indel_mask, 0.2, 0.15)
chr1_min_mask = rd_mask & ab_mask
# Prints: retained variants, total variants, retained fraction.
print(np.sum(chr1_min_mask), len(chr1_min_mask), np.sum(chr1_min_mask) / len(chr1_min_mask))

1134919 2678597 0.4236990484197511


In [61]:
# avg_read_depth
# chr1 variant retention when "read depth" means the average read depth.
# IDs look like chr_pos_ref_alt. A variant counts as a SNP only when "_" sits
# at positions -2 and -4 (both alleles are a single character); everything
# else, including multi-base substitutions, is treated as an InDel.
indel_mask = np.array(
    [snp[-2] != "_" or snp[-4] != "_" for snp in old_chr1_data_df.snpid], dtype=bool
)
# Per-variant thresholds chosen in one vectorized step:
# InDels need read depth >= 10 and allele balance >= 0.2,
# SNPs need read depth >= 7 and allele balance >= 0.15.
rd_mask = old_chr1_data_df.avg_rd.to_numpy() >= np.where(indel_mask, 10, 7)
ab_mask = old_chr1_data_df.ab.to_numpy() >= np.where(indel_mask, 0.2, 0.15)
chr1_avg_mask = rd_mask & ab_mask
# Prints: retained variants, total variants, retained fraction.
print(np.sum(chr1_avg_mask), len(chr1_avg_mask), np.sum(chr1_avg_mask) / len(chr1_avg_mask))

2615755 2678597 0.9765392106390024


In [63]:
# 25pct_read_depth
# chr1 variant retention when "read depth" means the first-quartile read depth.
# IDs look like chr_pos_ref_alt. A variant counts as a SNP only when "_" sits
# at positions -2 and -4 (both alleles are a single character); everything
# else, including multi-base substitutions, is treated as an InDel.
indel_mask = np.array(
    [snp[-2] != "_" or snp[-4] != "_" for snp in old_chr1_data_df.snpid], dtype=bool
)
# Per-variant thresholds chosen in one vectorized step:
# InDels need read depth >= 10 and allele balance >= 0.2,
# SNPs need read depth >= 7 and allele balance >= 0.15.
rd_mask = old_chr1_data_df.rd_25.to_numpy() >= np.where(indel_mask, 10, 7)
ab_mask = old_chr1_data_df.ab.to_numpy() >= np.where(indel_mask, 0.2, 0.15)
chr1_rd25_mask = rd_mask & ab_mask
# Prints: retained variants, total variants, retained fraction.
print(np.sum(chr1_rd25_mask), len(chr1_rd25_mask), np.sum(chr1_rd25_mask) / len(chr1_rd25_mask))

2532480 2678597 0.9454501741023379
