In this notebook, I investigate the heterozygous (het) calls on the male X chromosome and remove them based on the state present in females from the same species.
I adapt some of the code from Primatediversity_shared_het.

In [1]:
%run ../scripts/notebook_settings.py

In [2]:
# Per-sample metadata (space-delimited); the cells below read its Species,
# Sex and callset_index columns.
meta_data_samples = pd.read_csv("../data/metadata_with_x_missing.txt", sep=" ")

# Location of the chrX group inside the zarr callset.
zarr_dir = "/faststorage/project/baboondiversity/data/PG_panu3_zarr_12_03_2021/callset.zarr/chrX"

# Open the group read-only and wrap its genotype calls in a scikit-allel array.
callset = zarr.open_group(zarr_dir, mode="r")
gt = allel.GenotypeArray(callset["calldata/GT"])

In [3]:
def _safe_fraction(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def count_het_species(gt, species, metadata=None):
    """Tabulate how heterozygous calls are shared between males and females.

    For every site, the number of heterozygous males and the number of
    heterozygous females of `species` are counted, and the sites are binned
    by that pair of counts. A short summary is printed as a side effect.

    Parameters
    ----------
    gt : genotype array
        Must support `take(indices, axis=1)` and `is_het()`; samples are
        selected along axis 1.
    species : str
        Value matched against the `Species` column of the metadata.
    metadata : pandas.DataFrame, optional
        Sample table with `Species`, `Sex` and `callset_index` columns.
        Defaults to the notebook-level `meta_data_samples`.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_males + 1, n_females + 1); entry [i, j] is the
        number of sites with exactly i het males and j het females.
    """
    if metadata is None:
        # Fall back to the notebook-level table so existing calls keep working.
        metadata = meta_data_samples

    in_species = metadata.Species == species
    females = metadata.loc[in_species & (metadata.Sex == "F")].callset_index.values
    males = metadata.loc[in_species & (metadata.Sex == "M")].callset_index.values

    # Number of heterozygous samples per site, separately for each sex.
    m_sum = gt.take(males, axis=1).is_het().sum(axis=1)
    f_sum = gt.take(females, axis=1).is_het().sum(axis=1)

    matrix = np.zeros((len(males) + 1, len(females) + 1))
    # Unbuffered accumulation: every site adds 1 to its (male, female) bin,
    # even when many sites fall into the same bin.
    np.add.at(matrix, (m_sum, f_sum), 1)

    n_sites = matrix.sum()
    n_male_het = matrix[1:, :].sum()    # at least one het male
    n_female_het = matrix[:, 1:].sum()  # at least one het female
    n_both_het = matrix[1:, 1:].sum()   # het in at least one male and one female
    n_all_het = matrix[len(males), len(females)]

    print(species)
    print("{} males and {} females".format(len(males), len(females)))
    print("Out of {} sites, {} are present in males,".format(
        n_sites, _safe_fraction(n_male_het, n_sites)))
    print("{} are in females".format(_safe_fraction(n_female_het, n_sites)))
    print("With {} of male sites overlapping".format(
        _safe_fraction(n_both_het, n_male_het)))
    print("{} sites are het in all".format(_safe_fraction(n_all_het, n_sites)))
    return matrix

In [4]:
# Print the male/female het-sharing summary for every species in the metadata,
# using all chrX sites.
# `df` is overwritten on each iteration, so the matrix displayed below is the
# one returned for the last species in the loop.
for species in meta_data_samples.Species.unique():
    df = count_het_species(gt, species)
df

cynocephalus
38 males and 24 females
Out of 2923212.0 sites, 0.08423405486841187 are present in males,
0.3749060964445959 are in females
With 0.3710413671548202 of male sites overlapping
0.0012883088876208774 sites are het in all
anubis
42 males and 52 females
Out of 2923212.0 sites, 0.07334124244153349 are present in males,
0.29615060419839545 are in females
With 0.31978338743982987 of male sites overlapping
0.001240416363917499 sites are het in all
kindae
18 males and 9 females
Out of 2923212.0 sites, 0.06085668778042783 are present in males,
0.17061677360383032 are in females
With 0.2319488243196906 of male sites overlapping
0.0018226526163685698 sites are het in all
gelada
2 males and 0 females
Out of 2923212.0 sites, 0.046292229232775456 are present in males,
0.0 are in females
With 0.0 of male sites overlapping
0.031158191742507897 sites are het in all
hamadryas
20 males and 6 females
Out of 2923212.0 sites, 0.05886504297327734 are present in males,
0.110604020508947 are in femal

array([[2599872.,  137074.,   73899.,   16869.],
       [  79898.,    2734.,    2641.,   10225.]])

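Now I remove the putative pseudoautosomal region (PAR) by keeping only the sites located between positions 2,500,000 and 140,000,000.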

In [13]:
# Bounds of the chrX interval that is kept once the putative PAR is removed.
NONPAR_START = 2500000
NONPAR_END = 140000000

# Reopen the callset so this cell does not depend on the handle created earlier.
callset = zarr.open_group(zarr_dir, mode="r")
gt_zarr = callset["calldata/GT"]
pos = allel.SortedIndex(callset['variants/POS'])
# Locate the variant rows whose position falls inside the retained interval.
loc_region = pos.locate_range(NONPAR_START, NONPAR_END)
# Select the rows on the zarr array before wrapping them as a GenotypeArray.
gt_nonpar = allel.GenotypeArray(gt_zarr[loc_region])

In [11]:
# Display the genotype array restricted to the retained (non-PAR) interval.
gt_nonpar

Unnamed: 0,0,1,2,3,4,...,222,223,224,225,226,Unnamed: 12
0,0/0,0/0,0/1,0/0,0/0,...,0/0,0/0,0/0,0/1,0/1,
1,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2,0/0,0/0,0/1,0/0,0/0,...,0/0,0/0,0/0,0/1,0/1,
...,...,...,...,...,...,...,...,...,...,...,...,...
2655056,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2655057,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,
2655058,0/0,0/0,0/0,0/0,0/0,...,0/0,0/0,0/0,0/0,0/0,


In [7]:
# Length of the full chrX genotype array, for comparison with the subset.
len(gt)

2923212

In [14]:
# Same per-species summary as before, now on the non-PAR subset.
# `df` ends up holding the matrix of the last species in the loop.
for species in meta_data_samples.Species.unique():
    df = count_het_species(gt_nonpar, species)

cynocephalus
38 males and 24 females
Out of 2774845.0 sites, 0.07232728314554507 are present in males,
0.38014303501637026 are in females
With 0.4039024001355277 of male sites overlapping
0.001236825840722635 sites are het in all
anubis
42 males and 52 females
Out of 2774845.0 sites, 0.062125632242521656 are present in males,
0.29948123228504653 are in females
With 0.34951185980544003 of male sites overlapping
0.0011957424648944355 sites are het in all
kindae
18 males and 9 females
Out of 2774845.0 sites, 0.05066084772302597 are present in males,
0.1738197989437248 are in females
With 0.26005861597996816 of male sites overlapping
0.0017337905360479595 sites are het in all
gelada
2 males and 0 females
Out of 2774845.0 sites, 0.038015096338714416 are present in males,
0.0 are in females
With 0.0 of male sites overlapping
0.024557407711061338 sites are het in all
hamadryas
20 males and 6 females
Out of 2774845.0 sites, 0.04971881312289515 are present in males,
0.11216806704518631 are in f

Removing the PAR actually increases the fraction of male het sites that are also het in females, but it decreases the fraction of sites with a het call in males.

In [9]:
# Recompute the summary for kindae alone on the non-PAR subset and keep its matrix in `df`.
df = count_het_species(gt_nonpar, "kindae")

kindae
18 males and 9 females
Out of 2774845.0 sites, 0.05066084772302597 are present in males,
0.1738197989437248 are in females
With 0.26005861597996816 of male sites overlapping
0.0017337905360479595 sites are het in all


Now, I will create a "reference" allele count for each female population, and then go through the males one by one to decide on the states for each site.

In [107]:
def state_decision(gt, species, meta_data_samples):
    """Print, per male, how many of its het sites are fixed among the females.

    Female allele counts are computed once for `species`; a site counts as
    fixed in females when those counts report an allelism of 1. Each male of
    the species is then compared against that mask. Nothing is returned; all
    results are printed.

    Parameters
    ----------
    gt : genotype array
        Must support `take(indices, axis=1)`; the result must provide
        `count_alleles`, `count_called`, `is_hom_ref` and `is_het`.
    species : str
        Value matched against the `Species` column.
    meta_data_samples : pandas.DataFrame
        Sample table with `Species`, `Sex` and `callset_index` columns.
    """
    same_species = meta_data_samples.Species == species
    female_idx = meta_data_samples.loc[
        same_species & (meta_data_samples.Sex == "F")
    ].callset_index.values
    male_idx = meta_data_samples.loc[
        same_species & (meta_data_samples.Sex == "M")
    ].callset_index.values

    female_counts = gt.take(female_idx, axis=1).count_alleles()
    # Column vector, so it lines up with the (n_sites, 1) het mask of one male.
    females_fixed = (female_counts.allelism() == 1).reshape(-1, 1)

    print("Total sites:", len(female_counts))
    print("Variants in all females: ", female_counts.is_variant().sum())

    for male in male_idx:
        male_gt = gt.take([male], axis=1)

        # Called genotypes that are not homozygous reference.
        n_called = male_gt.count_called().sum()
        n_hom_ref = male_gt.is_hom_ref().sum()
        print("Variants in male: ", n_called - n_hom_ref)

        male_het = male_gt.is_het()
        n_het = male_het.sum()
        n_het_fixed = (male_het & females_fixed).sum()
        print("""Fixed female: {}, Het sites in male {}, percentage {}""".format(
            n_het_fixed, n_het, n_het_fixed / n_het))
        print()

In [108]:
# Report, for each "ursinus (grayfoot)" male, how many of its het sites are
# fixed among the females of that species, using the non-PAR genotypes.
state_decision(gt_nonpar, "ursinus (grayfoot)", meta_data_samples)

Total sites: 2774845
Variants in all females:  465119
Variants in male:  385471
Fixed female: 56649, Het sites in male 70629, percentage 0.8020643078622096
240762


In [16]:
# Show the hierarchy of groups and arrays stored in the opened zarr callset.
callset.tree()

Tree(nodes=(Node(disabled=True, name='/', nodes=(Node(disabled=True, name='calldata', nodes=(Node(disabled=Tru…