In [1]:
#Initial configuration, probably overkill in imports.
import sys, os, re
from functools import reduce
import glob
import numpy as np
import allel
import zarr
import dask
import numcodecs
import warnings
from pathlib import Path
from horizonplot import horizonplot

#os.environ["MODIN_ENGINE"] = "ray"

#import modin.pandas as pd
import pandas as pd

%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina', 'png')
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
# sns.set() is only an alias of sns.set_theme(), so a single call that also
# selects the "white" style and "notebook" context covers the whole setup.
sns.set_theme(style="white", context="notebook")

In [2]:
# All inputs live below one data-release directory.
pgdp_root = "/faststorage/project/primatediversity/data/PGDP_16_7_2020/"

# A sample's VCF path is vcf_dir + <sample ID> + vcf_suffix.
vcf_dir = pgdp_root + "variants/"
vcf_suffix = ".variable.filtered.HF.snps.vcf.gz"

# Metadata spreadsheet shipped with the same release.
metadata = pgdp_root + "metadata/PGDP_md.16.7.2020.xlsx"

In [3]:
metadata_dir = "/faststorage/project/primatediversity/data/PGDP_16_7_2020/metadata/"

# Every "*simplified.csv" file in the metadata directory (order as returned by glob).
X_vs_A = glob.glob(metadata_dir + "*simplified.csv")
# Coverage-based sexing table.
sex_via_cov = metadata_dir + "sex_via_coverage_Lukas.csv"

In [4]:
# Load the coverage-based sexing table; the full slice on the last line is
# only there so the notebook displays the frame.
d_cov = pd.read_csv(sex_via_cov)
d_cov[:]

Unnamed: 0,PGDP_ID,species,autosomal_coverage,X_coverage,X_over_autosome_ratio
0,PD_0001,Pithecia_pithecia.fasta,31,30,0.967742
1,PD_0002,Callithrix_jacchus.fasta,24,12,0.500000
2,PD_0003,Pithecia_pithecia.fasta,30,16,0.533333
3,PD_0004,Atele_fusciceps.fasta,31,31,1.000000
4,PD_0005,Atele_fusciceps.fasta,30,15,0.500000
...,...,...,...,...,...
673,SAMN01920547,Pongo_pygmaeus.fasta,25,26,1.040000
674,SAMN01920548,Pongo_pygmaeus.fasta,26,26,1.000000
675,SAMN01920549,Pongo_pygmaeus.fasta,30,31,1.033330
676,SAMN01920550,Pongo_pygmaeus.fasta,29,30,1.034480


In [5]:
hetero = "/faststorage/project/primatediversity/data/PGDP_16_7_2020/metadata/sex_Marjo_heterozygosity.csv"
# The file is tab-separated despite its .csv extension. The separator is passed
# by keyword because pandas 2.x no longer accepts it as a positional argument.
d_hetero = pd.read_csv(hetero, sep="\t")
# List the reference assemblies present in the sample sheet.
d_hetero.reference_file.unique()

array(['Atele_fusciceps.fasta', 'Aotus_nancymaae.fasta',
       'Pithecia_pithecia.fasta', 'Cebus_albifrons.fasta',
       'Cercopithecus_mitis.fasta', 'Gorilla_gorilla_gorilla.fasta',
       'Hoolock_hoolock.fasta', 'Macaca_mulatta.fasta',
       'Nycticebus_pygmaeus.fasta', 'Pan_troglodytes.fasta',
       'rheMac10.fa', nan, 'Papio_anubis.fasta',
       'Pygathrix_nemaeus.fasta', 'Sapajus_apella.fasta',
       'Rhinopithecus_roxellana.fasta', 'Trachypithecus_phayrei.fasta'],
      dtype=object)

A small test to understand merge, fillna and sort_values before I run it on the real dataset.

In [6]:
# Two tiny frames that share position 1 and each have one private position.
d1 = dict(pos=[1, 10], ID_1=[0, 1])
d2 = dict(pos=[1, 9], ID_2=[0, 1])
df1, df2 = pd.DataFrame(data=d1), pd.DataFrame(data=d2)

In [7]:
# Outer merge keeps every position; positions missing in one frame become 0.
merge_df = df1.merge(df2, on="pos", how="outer").fillna(0)
merge_df.sort_values("pos")

Unnamed: 0,pos,ID_1,ID_2
0,1,0.0,0.0
2,9,0.0,1.0
1,10,1.0,0.0


The following code extracts the het sites and their positions. As the sites might vary among individuals, there is a pos-based merge to generate the final dataframe.

In [8]:
simplified_csv = "/faststorage/project/primatediversity/data/PGDP_16_7_2020/metadata/{}_X_vs_A_simplified.csv"


def _het_sites_for_sample(path, contig, ID):
    """Return the heterozygous sites of one sample on one contig.

    The result has a "pos" column and a column named after the sample that is
    True on every row. Returns None when the VCF has no records in the region.
    """
    vcf_callset = allel.read_vcf(path, region=contig, fields=["calldata/GT", "variants/POS"])
    if vcf_callset is None:
        return None
    het_state = allel.GenotypeArray(vcf_callset["calldata/GT"]).is_het()
    dfgt = pd.DataFrame(het_state, columns=[ID])
    dfpos = pd.DataFrame(vcf_callset["variants/POS"], columns=["pos"])
    is_het = dfgt[ID]
    return pd.concat([dfpos.loc[is_het], dfgt.loc[is_het]], axis=1)


def count_het_species(d, genus):
    """Collect heterozygous X-linked sites for every sample of one reference.

    Parameters
    ----------
    d : pandas.DataFrame
        Sample sheet rows for this reference; only the PDGP_ID column is read.
    genus : str
        Reference file name. The last six characters are stripped before it is
        inserted into `simplified_csv` (i.e. a ".fasta" suffix is expected).

    Returns
    -------
    pandas.DataFrame
        One row per (contig, position) that is heterozygous in at least one
        sample, with a "pos" column and one column per sample (True where that
        sample is heterozygous, False otherwise). A frame with zero rows is
        returned when nothing could be collected.

    Sample IDs without a VCF file are printed once; (contig, ID) pairs whose
    VCF has no records on that contig are printed as well.
    """
    contigs = pd.read_csv(simplified_csv.format(genus[:-6]), sep="\t")
    x_contigs = contigs.loc[contigs.decision == "X"].scaffold.values

    # The file check does not depend on the contig, so do it once per sample.
    sample_paths = []
    for ID in d.PDGP_ID:
        path = vcf_dir + ID + vcf_suffix
        if os.path.exists(path):
            sample_paths.append((ID, path))
        else:
            print(ID)

    df_list = []
    for contig in x_contigs:
        df_list_contig = []
        for ID, path in sample_paths:
            sites = _het_sites_for_sample(path, contig, ID)
            if sites is None:
                print(contig, ID)
            else:
                df_list_contig.append(sites)
        # Samples differ in their site lists, so align them on position.
        # Merging per contig keeps equal positions on different contigs apart.
        # A contig without any calls is skipped; reduce() fails on an empty list.
        if df_list_contig:
            df_list.append(reduce(lambda x, y: pd.merge(x, y, on="pos", how="outer"), df_list_contig))

    if not df_list:
        # pd.concat raises on an empty list; the caller checks len(df) > 0.
        return pd.DataFrame(columns=["pos"])
    return pd.concat(df_list, ignore_index=True).fillna(False)

# References that are listed in the sample sheet but not processed here.
skipped_references = ("rheMac10.fa", "Papio_anubis.fasta")

# Map each processed reference to its table of heterozygous X sites.
genus_dict = {}
for genus in d_hetero.reference_file.unique():
    d = d_hetero.loc[d_hetero.reference_file == genus]
    print(genus)
    # An empty selection also covers the NaN reference entry.
    if genus in skipped_references or len(d) == 0:
        continue
    df = count_het_species(d, genus)
    if len(df) > 0:
        genus_dict[genus] = df


Atele_fusciceps.fasta
Aotus_nancymaae.fasta
Pithecia_pithecia.fasta
tarseq_214 PD_0001
tarseq_56 PD_0001
tarseq_87 PD_0001
Cebus_albifrons.fasta
Contig110 PD_0368
Cercopithecus_mitis.fasta
Gorilla_gorilla_gorilla.fasta
Hoolock_hoolock.fasta
Macaca_mulatta.fasta
Nycticebus_pygmaeus.fasta
Pan_troglodytes.fasta
rheMac10.fa
nan
Papio_anubis.fasta
Pygathrix_nemaeus.fasta
Sapajus_apella.fasta
Rhinopithecus_roxellana.fasta
Trachypithecus_phayrei.fasta


Now I will split the dataframe based on Marjorlaine's sexing and generate an (M+1) × (F+1) matrix, with M being the number of males and F the number of females, and count the number of sites falling into each class.

I choose to subsample so that there is an equal number of males/females. This removes information, but makes it much more comparable when looking at males/females.

In [9]:
np.set_printoptions(suppress=True)

def generate_shared_het_matrix(d_hetero, df):
    """Count sites by the number of heterozygous males and females.

    Parameters
    ----------
    d_hetero : pandas.DataFrame
        Sample sheet for one reference; the sex_coverage and PDGP_ID columns
        are read. Every selected ID must be a column of `df`.
    df : pandas.DataFrame
        One row per site and one True/False column per sample.

    Returns
    -------
    numpy.ndarray
        Square float matrix of side n + 1, where n is the size of the smaller
        sex group. Entry [m, f] is the number of sites that are heterozygous in
        exactly m of the first n males and f of the first n females. Entry
        [0, 0] is forced to 0.

    The sizes of the male and female groups are printed before subsampling.
    """
    # Use the sample sheet that was passed in, not a notebook-level global.
    d_m = d_hetero.loc[d_hetero.sex_coverage == "male"]
    d_f = d_hetero.loc[d_hetero.sex_coverage == "female"]
    min_s = min(len(d_m), len(d_f))
    print(len(d_m), len(d_f))

    # Keep the first min_s samples of each sex so both groups are equally large.
    m_l = d_m.PDGP_ID.tolist()[:min_s]
    f_l = d_f.PDGP_ID.tolist()[:min_s]

    # Positional integer arrays avoid any dependence on the index labels of df.
    m_sum = df.loc[:, m_l].sum(axis=1).to_numpy(dtype=int)
    f_sum = df.loc[:, f_l].sum(axis=1).to_numpy(dtype=int)

    matrix = np.zeros((min_s+1, min_s+1))
    # Unbuffered add, so repeated (m, f) pairs are all counted.
    np.add.at(matrix, (m_sum, f_sum), 1)
    # Sites that are heterozygous in none of the retained samples are ignored.
    matrix[0, 0] = 0
    return matrix

def summarize_matrix(matrix, genus):
    """Return the singleton counts of a shared-heterozygosity matrix.

    Rows of `matrix` index the number of heterozygous males and columns the
    number of heterozygous females, so male singletons are the total of row 1
    and female singletons the total of column 1.

    Returns a dict with the keys "genus", "singleton_male" and
    "singleton_female".
    """
    singleton_male = matrix.sum(axis=1)[1]
    # Female counts run along the columns: sum over rows and take index 1.
    singleton_female = matrix.sum(axis=0)[1]
    d = {"genus": genus, "singleton_male": singleton_male, "singleton_female": singleton_female}
    return d

# Gorilla samples only, without the two IDs that are excluded throughout.
excluded_ids = ["PD_0001", "PD_0368"]
gorilla_reference = "Gorilla_gorilla_gorilla.fasta"
d = d_hetero.loc[~d_hetero.PDGP_ID.isin(excluded_ids)]
d = d.loc[d.reference_file == gorilla_reference]
n_matrix = generate_shared_het_matrix(d, genus_dict[gorilla_reference])
#summarize_matrix(n_matrix, "Gorilla_gorilla_gorilla.fasta")

5 11


In [10]:
def summarize_matrix(matrix, genus):
    """Summarise a shared-heterozygosity matrix as one row per sex.

    `matrix` is square with side n + 1; rows index the number of heterozygous
    males and columns the number of heterozygous females. `genus` is a
    reference file name whose last six characters are dropped for the label.

    Returns a two-row DataFrame (male first, then female). Per-sex columns are
    derived from the marginal counts of that sex; total_het, both_f_m, het_all
    and above_hw_m_f are computed from the whole matrix and are therefore
    identical in both rows.
    """
    n_max = len(matrix) - 1
    # First count that belongs to the "above_hw" class: 1 + ceil(n_max / 2).
    hw_start = 1 + n_max // 2 + (n_max % 2 > 0)

    # Quantities that do not depend on the sex.
    grand_total = matrix.sum()
    shared_both = matrix[1:, 1:].sum()
    shared_all = matrix[n_max:, n_max:].sum()
    shared_above_hw = matrix[hw_start:, 1:].sum()

    records = []
    for sex_label, axis in (("male", 1), ("female", 0)):
        # marginal[k] is the number of sites with k heterozygous individuals of this sex.
        marginal = matrix.sum(axis=axis)
        het_sites = sum(marginal[1:])
        # Weight each class by its count to get the number of het calls.
        het_calls = sum(marginal[1:] * list(range(1, n_max + 1)))
        hw_calls = sum(marginal[hw_start:] * list(range(hw_start, n_max + 1)))
        records.append({
            "genus": genus[:-6],
            "n": n_max,
            "sex": sex_label,
            "s_het": het_sites,
            "average_het": het_calls / n_max,
            "singletons": marginal[1] / het_sites,
            "all_shared_het": marginal[n_max] / het_sites,
            "above_hw": sum(marginal[hw_start:]) / het_sites,
            "weight_above_hw": hw_calls / het_calls,
            "total_het": grand_total,
            "both_f_m": shared_both / grand_total,
            "het_all": shared_all / grand_total,
            "above_hw_m_f": shared_above_hw / grand_total,
        })
    return pd.DataFrame(records)
# Gorilla samples only, without the two IDs that are excluded throughout.
excluded_ids = ["PD_0001", "PD_0368"]
gorilla_reference = "Gorilla_gorilla_gorilla.fasta"
d = d_hetero.loc[~d_hetero.PDGP_ID.isin(excluded_ids)]
d = d.loc[d.reference_file == gorilla_reference]
n_matrix = generate_shared_het_matrix(d, genus_dict[gorilla_reference])
summarize_matrix(n_matrix, gorilla_reference)

5 11


Unnamed: 0,genus,n,sex,s_het,average_het,singletons,all_shared_het,above_hw,weight_above_hw,total_het,both_f_m,het_all,above_hw_m_f
0,Gorilla_gorilla_gorilla,5,male,178461.0,114810.6,0.246844,0.352794,0.49659,0.727194,551534.0,0.032045,0.000225,0.005882
1,Gorilla_gorilla_gorilla,5,female,390747.0,153233.6,0.41177,0.01328,0.086731,0.183705,551534.0,0.032045,0.000225,0.005882


In [11]:
# Rows 4 and up of the count matrix (sites heterozygous in four or more males),
# restricted to the columns with at least one heterozygous female.
n_matrix[4:, 1:]

array([[404., 577., 496., 268.,  73.],
       [382., 363., 361., 196., 124.]])

In [12]:
# Build one two-row summary per reference and collect them for concatenation.
excluded_ids = ["PD_0001", "PD_0368"]
df_list = []
for genus, het_sites in genus_dict.items():
    print(genus)
    keep = ~d_hetero.PDGP_ID.isin(excluded_ids) & (d_hetero.reference_file == genus)
    d = d_hetero.loc[keep]
    n_matrix = generate_shared_het_matrix(d, het_sites)
    df = summarize_matrix(n_matrix, genus)
    df_list.append(df)


Atele_fusciceps.fasta
16 14
Aotus_nancymaae.fasta
17 6
Pithecia_pithecia.fasta
21 23
Cebus_albifrons.fasta
12 7
Cercopithecus_mitis.fasta
1 3
Gorilla_gorilla_gorilla.fasta
5 11
Hoolock_hoolock.fasta
3 6
Macaca_mulatta.fasta
35 29
Nycticebus_pygmaeus.fasta
1 3
Pan_troglodytes.fasta
2 2
Pygathrix_nemaeus.fasta
7 3
Sapajus_apella.fasta
11 3
Rhinopithecus_roxellana.fasta
6 4
Trachypithecus_phayrei.fasta
5 1


In [13]:
# Stack the per-genus summaries; ignore_index already yields a clean 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)
# Store the count-like columns as integers (fractions are truncated).
df = df.astype({"average_het": int, "s_het": int, "total_het": int})

# All three exports overwrite their target. The tab-separated file used to be
# opened in append mode, which added a second header and a duplicate set of
# rows every time this cell was re-run.
df.to_csv("../results/primate_diversity_het_statistics.txt", header=True,
          index=False, sep='\t', float_format='%.5f')
df.to_csv("../results/primate_diversity_het_statistics.csv", header=True,
          index=False, float_format='%.5f')
df.to_html("../results/primate_diversity_het_statistics.html", float_format='%.5f')

Unnamed: 0,genus,n,sex,s_het,average_het,singletons,all_shared_het,above_hw,weight_above_hw,total_het,both_f_m,het_all,above_hw_m_f
0,Atele_fusciceps,14,male,77537,13911,0.566916,0.003908,0.080349,0.321127,399595,0.091703,1.5e-05,0.014802
1,Atele_fusciceps,14,female,358702,44904,0.603473,5e-05,0.008458,0.044298,399595,0.091703,1.5e-05,0.014802
2,Aotus_nancymaae,6,male,24786,10411,0.477447,0.1262,0.284394,0.587242,105367,0.084761,0.003568,0.035172
3,Aotus_nancymaae,6,female,89512,24792,0.672837,0.009317,0.103327,0.271532,105367,0.084761,0.003568,0.035172
4,Pithecia_pithecia,21,male,127094,15714,0.498765,0.000889,0.016185,0.09121,686421,0.087024,0.0,0.002785
5,Pithecia_pithecia,21,female,619062,45311,0.675259,2e-06,0.001205,0.010409,686421,0.087024,0.0,0.002785
6,Cebus_albifrons,7,male,18654,4522,0.684089,0.002359,0.065884,0.217316,60073,0.102375,0.000166,0.017063
7,Cebus_albifrons,7,female,47569,9177,0.727217,0.000799,0.004225,0.017854,60073,0.102375,0.000166,0.017063
8,Cercopithecus_mitis,1,male,21040,21040,1.0,1.0,0.0,0.0,143761,0.01579,0.01579,0.0
9,Cercopithecus_mitis,1,female,124991,124991,1.0,1.0,0.0,0.0,143761,0.01579,0.01579,0.0


In [31]:
# Show the columns from position 10 onwards (both_f_m, het_all, above_hw_m_f).
# NOTE(review): this cell's execution count is out of sequence with its neighbours.
df.iloc[:, 10:]

Unnamed: 0,both_f_m,het_all,above_hw_m_f
0,0.091703,1.5e-05,0.014802
1,0.091703,1.5e-05,0.014802
2,0.084761,0.003568,0.035172
3,0.084761,0.003568,0.035172
4,0.087024,0.0,0.002785
5,0.087024,0.0,0.002785
6,0.102375,0.000166,0.017063
7,0.102375,0.000166,0.017063
8,0.01579,0.01579,0.0
9,0.01579,0.01579,0.0


In [20]:
# Count matrix for a single reference, using the same exclusions as above.
genus = "Rhinopithecus_roxellana.fasta"
excluded_ids = ["PD_0001", "PD_0368"]
keep = ~d_hetero.PDGP_ID.isin(excluded_ids) & (d_hetero.reference_file == genus)
d = d_hetero.loc[keep]
n_matrix = generate_shared_het_matrix(d, genus_dict[genus])
n_matrix

6 4


array([[     0., 144346.,  80228.,  40864.,     43.],
       [ 17578.,   2977.,   1322.,    657.,     42.],
       [  7685.,   1737.,   1285.,    809.,     79.],
       [  5063.,   1090.,   1046.,    843.,    129.],
       [  7694.,   1109.,   1342.,   1395.,    233.]])