In [1]:
from collections import defaultdict
import numpy as np
from cyvcf2 import VCF
import pandas as pd
from functools import reduce

In [2]:
### Loading coding TR coordinates ###
# Tab-separated table read without a header row, so pandas labels the
# columns with integers (0, 1, 2, ...) instead of names.
coordinates = pd.read_csv(
    "TR_intersect.txt",
    header=None,
    sep="\t",
)

In [4]:
# Per-sample tallies over coding TR calls on chromosomes 1-22:
#   one_nr_per_sample_coding  -> calls where exactly one allele is non-reference
#   both_nr_per_sample_coding -> calls where both alleles are non-reference
one_nr_per_sample_coding = defaultdict(int)
both_nr_per_sample_coding = defaultdict(int)

# Build the coordinate lookups once. The previous code rebuilt
# list(coordinates[...]) and scanned it linearly for every VCF record;
# a set gives the same membership answer in constant time.
coding_starts = set(coordinates[1].tolist())
coding_ends = set(coordinates[2].tolist())

for chrom in range(1,23):
    vcf_addr = "/gymreklab-tscc/helia/ensembl/experiments/coding_regions/heterozygosity/coding_calls_" + str(chrom) + ".vcf.gz"
    vcf = VCF(vcf_addr)
    samples = vcf.samples
    for record in vcf:
        # NOTE(review): start and end are matched independently and without
        # the chromosome, so a record whose POS equals one TR's start and whose
        # END equals another TR's end (possibly on a different chromosome)
        # also passes. Kept as in the original; confirm this is intended.
        if int(record.POS) not in coding_starts or int(record.INFO['END']) not in coding_ends:
            continue
        # record.genotypes holds one entry per sample, in the order of vcf.samples.
        for sample, genotype in zip(samples, record.genotypes):
            first_allele, second_allele = genotype[0], genotype[1]
            # -1 marks a missing allele. Skip the call if either allele is
            # missing; previously only the first allele was checked, so a
            # half-missing call was counted as carrying a non-reference allele.
            if first_allele == -1 or second_allele == -1:
                continue
            if first_allele != 0 and second_allele != 0:
                both_nr_per_sample_coding[sample] += 1
            elif first_allele != 0 or second_allele != 0:
                one_nr_per_sample_coding[sample] += 1

                

In [None]:
# Per-sample tallies over the whole-genome genotype dumps (chromosomes 1-22):
#   one_nr_per_sample_whole  -> calls where exactly one allele is non-reference
#   both_nr_per_sample_whole -> calls where both alleles are non-reference
one_nr_per_sample_whole = defaultdict(int)
both_nr_per_sample_whole = defaultdict(int)


for i in range(1,23):
    gt_file = "/gymreklab-tscc/helia/ensembl/experiments/stats/info/gt_chr" + str(i) + ".txt"
    with open(gt_file) as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            # The third whitespace-separated field is used as the sample key
            # and the fourth as the genotype string.
            sample = fields[2]
            # Accept both unphased ("a/b") and phased ("a|b") genotypes;
            # splitting on "/" alone raised IndexError on phased calls.
            GT = fields[3].replace("|", "/").split("/")
            # Skip the call if any allele is missing. Previously only the
            # first allele was checked, so "0/." or "1/." was counted as
            # carrying a non-reference allele while "./1" was skipped.
            if "." in GT:
                continue
            if GT[0] != '0' and GT[1] != '0':
                both_nr_per_sample_whole[sample] += 1
            elif GT[0] != '0' or GT[1] != '0':
                one_nr_per_sample_whole[sample] += 1

In [None]:
### Merging all data ###
def _counts_to_frame(counts, value_column):
    """Convert a {sample: count} mapping into a frame with columns 'sample' and `value_column`.

    An empty mapping yields an empty frame that still has both columns, so it
    can take part in the merge below.
    """
    return pd.DataFrame(list(counts.items()), columns=['sample', value_column])


# Each table gets its final column name before merging. Merging frames that
# all carry the generic column labels 0 and 1 makes the default '_x'/'_y'
# suffixes collide on the third merge, which newer pandas rejects with a
# MergeError (and older pandas turns into duplicate column labels).
one_nr_per_sample_coding_df = _counts_to_frame(one_nr_per_sample_coding, 'coding_one_allele_non_ref')
both_nr_per_sample_coding_df = _counts_to_frame(both_nr_per_sample_coding, 'coding_two_alleles_non_ref')

one_nr_per_sample_whole_df = _counts_to_frame(one_nr_per_sample_whole, 'wg_one_allele_non_ref')
both_nr_per_sample_whole_df = _counts_to_frame(both_nr_per_sample_whole, 'wg_two_alleles_non_ref')

all_df_list = [one_nr_per_sample_coding_df, both_nr_per_sample_coding_df,
               one_nr_per_sample_whole_df, both_nr_per_sample_whole_df]

# Inner join on the sample ID: only samples present in all four tables are
# kept. The resulting columns are 'sample' followed by the four count columns
# in the order of all_df_list.
all_df = reduce(lambda left, right: pd.merge(left, right, on='sample', how='inner'),
                all_df_list)



In [28]:
#### Loading pedigree ####

# Whitespace-delimited pedigree table. sep=r"\s+" is the documented
# equivalent of delim_whitespace=True, which is deprecated in recent pandas.
pedigree = pd.read_csv("/gymreklab-tscc/helia/TR_1000G/1000G.ped", sep=r"\s+")

# Columns kept after attaching the population labels to the per-sample counts.
family_columns = ['sample', 'Population', 'Superpopulation',
                  'coding_one_allele_non_ref', 'coding_two_alleles_non_ref',
                  'wg_one_allele_non_ref', 'wg_two_alleles_non_ref']

# pd.merge defaults to an inner join, so samples without a matching
# 'SampleID' row in the pedigree are dropped.
all_df_family = pd.merge(all_df, pedigree, left_on="sample",
                         right_on="SampleID")[family_columns]
all_df_family

Unnamed: 0,sample,Population,Superpopulation,coding_one_allele_non_ref,coding_two_alleles_non_ref,wg_one_allele_non_ref,wg_two_alleles_non_ref
0,NA11892,CEU,EUR,182,135,240974,254957
1,HG02813,GWD,AFR,219,151,275529,281471
2,HG03049,GWD,AFR,221,149,278451,280919
3,NA10865,CEU,EUR,190,130,242162,254203
4,HG04158,BEB,SAS,192,128,243226,262862
...,...,...,...,...,...,...,...
3197,NA20811,TSI,EUR,163,144,243658,257386
3198,HG01466,CLM,AMR,194,143,258105,255721
3199,HG00538,CHS,EAS,192,139,226100,273319
3200,NA10864,CEU,EUR,202,140,243909,251783


In [29]:
# Mean of each per-sample count column across all rows of all_df_family,
# printed space-separated on a single line in the order listed here.
mean_columns = ('coding_one_allele_non_ref', 'coding_two_alleles_non_ref',
                'wg_one_allele_non_ref', 'wg_two_alleles_non_ref')
print(*(np.mean(all_df_family[column]) for column in mean_columns))

191.76951905059337 144.86039975015615 250696.10149906308 269333.4009993754


In [None]:
# NOTE(review): bare reference to np.mean -- it is not called, so this cell
# only evaluates to the function object and computes nothing. Looks like a
# leftover scratch cell; confirm whether it can be removed.
np.mean