In [1]:
%run ../scripts/notebook_settings.py
import sgkit as sg
import xarray as xr
import glob

Defining the current paths and sizes.

In [2]:
metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
zarr_path = "../zarr_data/"
metadata_folders = glob.glob(metadata_path+"*_individuals.txt")

size_cutoff = 1000000
window_size = 100000
missing_filter = 0.5

In [3]:
def read_beds(long_form):
    bed_path_x = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024/{}/filteredVCF/pos_bed_cov_based/{}_batch*_fploidy2_mploidy1.bed".format(long_form, long_form)
    bed_path_all = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024/{}/filteredVCF/pos_bed_cov_based/{}_batch*_fploidy2_mploidy2.bed".format(long_form, long_form)
    bed_l = []
    for b in glob.glob(bed_path_all):
        bed_file = pd.read_csv(b, sep="\t", names=["chrom", "start", "end"])
        bed_l.append(bed_file)
    bed_files = pd.concat(bed_l)
    bed_l = []
    for b in glob.glob(bed_path_x):
        #print(b)
        bed_file = pd.read_csv(b, sep="\t", names=["chrom", "start", "end"])
        bed_l.append(bed_file)
    if len(bed_l) > 0:
        bed_x = pd.concat(bed_l)
        bed_files = bed_files.loc[~(bed_files.chrom.isin(bed_x.chrom.unique()))]
        bed_files = pd.concat([bed_files, bed_x]).sort_values(by=["chrom", "start", "end"])
    return bed_files

def pos_windows(bed_l, window_size, chrom_order):
    # Input a bed file and the window size of intervals desired. Multiple chromosomes accepted.
    # It has to be sorted.
    df_l = []
    for c in chrom_order:
        #print(c)
        frac_l = []
        b = bed_l.loc[bed_l["chrom"] == c].copy()
        b["w_s"] = b.end-b.start
        w_start = b.start.iloc[0]
        current_pos, callable_bases = 0, 0
        for i, j, k in zip(b.start, b.end, b.w_s):
            # Nothing called in the current window under investigation.
            while i-window_size >= current_pos:
                frac_l.append(callable_bases/window_size)
                callable_bases = 0
                current_pos += window_size
            # Window starts in current. We know this is true because of the previous while loop.
            callable_bases += min(k, current_pos+window_size-i)
            # Everything called in current.
            while j-window_size >= current_pos:
                frac_l.append(callable_bases/window_size)
                callable_bases = 0
                current_pos += window_size
                if j-window_size >= current_pos:
                    callable_bases += window_size
                else:
                # Window stops in current. Again, know this is true.
                    callable_bases += j-current_pos
        # Last window.
        frac_l.append(callable_bases/(window_size))
        df_l.append(pd.DataFrame({"chrom": c, "window_start": list(range(0, len(frac_l)*window_size, window_size)),
                                  "window_end": list(range(window_size, (len(frac_l)+1)*window_size, window_size)),
                                  "callable_frac": frac_l}))
    return pd.concat(df_l)

def haploid_double(ds, variable, dim):
    unmasked = ~ds[f"{variable}_mask"]
    overwrite = ds.call_genotype[:,:,0]
    overwrite_2 = ds.call_genotype_mask[:,:,0]
    return ds.assign(**{
        f"{variable}": ds[variable].where(
        unmasked, 
        overwrite),
        f"{variable}_mask": ds[f"{variable}_mask"].where(
        unmasked,
        overwrite_2)})

In [4]:
# Input checks: Is there a zarr/bed pair for all species?
bed_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024/{}/filteredVCF/pos_bed_cov_based/{}_batch0_fploidy2_mploidy2.bed"
fvcf_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024/{}/filteredVCF/bcf_step1/{}_all_chr.sorted.bcf"
for x in glob.glob(metadata_path+"*_individuals.txt"):
    short_form = x.split("/")[-1].split("_")[0]
    long_form = x.split("/")[-1]
    # Loading the various metadata files. Metadata, contig information, callability bed.
    metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
    metadata_df = pd.read_csv(metadata_path+"{}_individuals.txt".format(short_form), sep="\t")
    metadata_df["SEX_I"] = [0 if x == "F" else 1 for x in metadata_df.GENETIC_SEX]
    regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
    for f in metadata_df.GVCF_FOLDER.unique():
        r_df = regions_df.loc[regions_df.REFERENCE_FOLDER == f]
        if not os.path.exists(bed_path.format(f, f)):
            print(f, len(metadata_df.loc[metadata_df.GVCF_FOLDER == f]))
        if not os.path.exists(fvcf_path.format(f, f)):
            print(f)
        # for b, f, m in zip(r_df.BATCH, r_df.FEMALE_PLOIDY, r_df.MALE_PLOIDY):
        #    print(b,f,m)

Chlorocebus_aethiops_ssp 26
Chlorocebus_aethiops_ssp
Chlorocebus_cynosuros_ssp 16
Chlorocebus_cynosuros_ssp
Chlorocebus_tantalus_ssp 11
Chlorocebus_tantalus_ssp
Allenopithecus_nigroviridis_ssp 3
Pithecia_pissinatti_ssp 3
Pithecia_pissinatti_ssp
Nasalis_larvatus_ssp 3


In [5]:
# Pi implementation
for x in glob.glob(zarr_path+"*"):
    print(x)
    # The name used to load all the files, short and long version
    short_form = x.split("/")[-1].split("_")[0]
    long_form = x.split("/")[-1]
    print(long_form)
    # Loading the various metadata files. Metadata, contig information, callability bed.
    metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
    metadata_df = pd.read_csv(metadata_path+"{}_individuals.txt".format(short_form), sep="\t")
    metadata_df["SEX_I"] = [0 if x == "F" else 1 for x in metadata_df.GENETIC_SEX]
    regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
    regions_df["LENGTH"] = regions_df["END"]-regions_df["START"]
    regions_df["chr_type"] = ["chrX" if x == 2 and y == 1 else "aut" for x, y in zip(regions_df.FEMALE_PLOIDY, regions_df.MALE_PLOIDY)]
    large_contigs = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2)].CONTIG_ID.unique()
    large_x = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2) &
                        (regions_df.MALE_PLOIDY == 1)].CONTIG_ID
    bed_files = read_beds(long_form)
    # Loading the genetic data.
    df_l = []
    for c in glob.glob(x+"/*"):
        print(c)
        ds = sg.load_dataset(c)
        # This implementation is the pi implementation.
        # Probably problematic in some cases with population structure, but it is easier to implement
        ds["sample_cohort"] = np.repeat([0], ds.sizes["samples"])
        # Subsetting and windowing the sgkit dataset. The rechunking handles what otherwise would cause an error.
        #ds["call_genotype"] = ds["call_genotype"].clip(0)
        ds = ds.sel(contigs=[ds.variant_contig[0].values])
        missing_rate = ds.call_genotype_mask[:,:,0].sum(axis=1).values/ds.call_genotype_mask[:,:,0].count(axis=1).values
        ds = ds.isel(variants=(missing_rate <= missing_filter))
        ds = sg.window_by_position(ds, size=window_size)
        ds = (sg.diversity(ds.chunk({"variants": len(ds.variants)//100})))

../zarr_data/Leontopithecus_chrysomelas_ssp
Leontopithecus_chrysomelas_ssp
../zarr_data/Leontopithecus_chrysomelas_ssp/HiC_scaffold_23
../zarr_data/Leontopithecus_chrysomelas_ssp/HiC_scaffold_1
../zarr_data/Saguinus_labiatus_ssp
Saguinus_labiatus_ssp
../zarr_data/Saguinus_labiatus_ssp/CM063191.1
../zarr_data/Saguinus_labiatus_ssp/JASSZA010000432.1


ValueError: Dimension 0 has 3 blocks, adjust_chunks specified with 100 blocks

In [20]:
ds.variants.chunk().count

<bound method DataArrayAggregations.count of <xarray.DataArray 'variants' (variants: 5748)> Size: 46kB
array([   0,    1,    2, ..., 5745, 5746, 5747])
Dimensions without coordinates: variants>

In [9]:
large_x

20    NC_044996.1
Name: CONTIG_ID, dtype: object

In [7]:
bed_files.chrom.unique()

array(['NC_044976.1', 'NC_044977.1', 'NC_044978.1', 'NC_044979.1',
       'NC_044980.1', 'NC_044981.1', 'NC_044982.1', 'NC_044983.1',
       'NC_044984.1', 'NC_044985.1', 'NC_044986.1', 'NC_044987.1',
       'NC_044988.1', 'NC_044989.1', 'NC_044990.1', 'NC_044991.1',
       'NC_044992.1', 'NC_044993.1', 'NC_044994.1', 'NC_044995.1',
       'NC_044996.1'], dtype=object)

In [24]:
ds = (sg.diversity(ds.chunk({"variants": len(ds.variants)//50})))



In [13]:
regions_df = pd.read_csv(metadata_path+"Semnopithecus_regions_and_batches.txt".format(short_form), sep="\t")
regions_df.loc[regions_df.FEMALE_PLOIDY == 0]

Unnamed: 0,REFERENCE_FOLDER,REGION,CONTIG_ID,START,END,BATCH,FEMALE_PLOIDY,MALE_PLOIDY
9,Trachypithecus_francoisi_ssp,NW_022680469.1,NW_022680469.1,0,1129610,0,0,1


In [32]:
# Pi implementation
for x in glob.glob(zarr_path+"*"):
    print(x)
    # The name used to load all the files, short and long version
    short_form = x.split("/")[-1].split("_")[0]
    long_form = x.split("/")[-1]
    print(long_form)
    # Loading the various metadata files. Metadata, contig information, callability bed.
    metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
    metadata_df = pd.read_csv(metadata_path+"{}_individuals.txt".format(short_form), sep="\t")
    metadata_df["SEX_I"] = [0 if x == "F" else 1 for x in metadata_df.GENETIC_SEX]
    regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
    regions_df["LENGTH"] = regions_df["END"]-regions_df["START"]
    regions_df["chr_type"] = ["chrX" if x == 2 and y == 1 else "aut" for x, y in zip(regions_df.FEMALE_PLOIDY, regions_df.MALE_PLOIDY)]
    large_contigs = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2)].CONTIG_ID.unique()
    large_x = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2) &
                        (regions_df.MALE_PLOIDY == 1)].CONTIG_ID
    bed_files = read_beds(long_form)
    # Loading the genetic data.
    df_l = []
    for c in glob.glob(x+"/*"):
        print(c)
        ds = sg.load_dataset(c)
        # This implementation is the pi implementation.
        # Probably problematic in some cases with population structure, but it is easier to implement
        ds["sample_cohort"] = np.repeat([0], ds.sizes["samples"])
        # Subsetting and windowing the sgkit dataset. The rechunking handles what otherwise would cause an error.
        #ds["call_genotype"] = ds["call_genotype"].clip(0)
        ds = ds.sel(contigs=[ds.variant_contig[0].values])
        missing_rate = ds.call_genotype_mask[:,:,0].sum(axis=1).values/ds.call_genotype_mask[:,:,0].count(axis=1).values
        ds = ds.isel(variants=(missing_rate <= missing_filter))
        ds = sg.window_by_position(ds, size=window_size)
        ds = (sg.diversity(ds.chunk({"variants": 50000})))
        df_sub = pd.DataFrame({"window_start": list(range(0, len(ds.window_start)*window_size, window_size)),
                               "pi": ds.stat_diversity[:,0]})
        df_sub["chrom"] = c.split("/")[-1]
        df_l.append(df_sub)
    df_het = pd.concat(df_l)
    bed_files = read_beds(long_form)
    intervals_callable = pos_windows(bed_files, window_size, df_het["chrom"].unique())
    output_df = pd.merge(df_het, intervals_callable, on=["chrom", "window_start"])
    output_df["chr_type"] = output_df["chrom"].map(dict(zip(regions_df.CONTIG_ID, regions_df.chr_type)))
    output_df["species"] = long_form
    output_df.to_csv("../results/window_stats/{}_100kb_pi.txt".format(long_form), sep="\t")

../zarr_data/Leontopithecus_chrysomelas_ssp
Leontopithecus_chrysomelas_ssp
../zarr_data/Leontopithecus_chrysomelas_ssp/HiC_scaffold_23
../zarr_data/Leontopithecus_chrysomelas_ssp/HiC_scaffold_1
../zarr_data/Saguinus_labiatus_ssp
Saguinus_labiatus_ssp
../zarr_data/Saguinus_labiatus_ssp/CM063191.1
../zarr_data/Saguinus_labiatus_ssp/JASSZA010000432.1
../zarr_data/Saguinus_labiatus_ssp/CM063169.1


In [31]:

ds = sg.load_dataset("../zarr_data/Leontopithecus_chrysomelas_ssp/HiC_scaffold_1")
        # This implementation is the pi implementation.
        # Probably problematic in some cases with population structure, but it is easier to implement
ds["sample_cohort"] = np.repeat([0], ds.sizes["samples"])
        # Subsetting and windowing the sgkit dataset. The rechunking handles what otherwise would cause an error.
        #ds["call_genotype"] = ds["call_genotype"].clip(0)
ds = ds.sel(contigs=[ds.variant_contig[0].values])
missing_rate = ds.call_genotype_mask[:,:,0].sum(axis=1).values/ds.call_genotype_mask[:,:,0].count(axis=1).values
missing_rate

array([0.  , 0.25, 0.25, ..., 0.  , 0.  , 0.25])

In [33]:
missing_rate.max()

0.75

In [23]:
missing_rate = ds.call_genotype_mask[:,:,0].sum(axis=1).values/ds.call_genotype_mask[:,:,0].count(axis=1).values
missing_rate

array([0., 0., 0., ..., 0., 0., 0.])

In [34]:
sg.Fst(ds)["stat_Fst"].values

array([[[nan]],

       [[nan]],

       [[nan]],

       ...,

       [[nan]],

       [[nan]],

       [[nan]]])

In [4]:
# Heterozygosity implementation. No missing filter - I count all sites which have a het call.
for x in glob.glob(zarr_path+"*")[:1]:
    print(x)
    # The name used to load all the files, short and long version
    short_form = x.split("/")[-1].split("_")[0]
    long_form = x.split("/")[-1]
    print(long_form)
    # Loading the various metadata files. Metadata, contig information, callability bed.
    metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
    metadata_df = pd.read_csv(metadata_path+"{}_individuals.txt".format(short_form), sep="\t")
    metadata_df["SEX_I"] = [0 if x == "F" else 1 for x in metadata_df.GENETIC_SEX]
    regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
    regions_df["LENGTH"] = regions_df["END"]-regions_df["START"]
    regions_df["chr_type"] = ["chrX" if x == 2 and y == 1 else "aut" for x, y in zip(regions_df.FEMALE_PLOIDY, regions_df.MALE_PLOIDY)]
    large_contigs = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2)].CONTIG_ID.unique()
    large_x = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2) &
                        (regions_df.MALE_PLOIDY == 1)].CONTIG_ID
    bed_files = read_beds(long_form)
    # Loading the genetic data.
    df_l = []
    for c in glob.glob(x+"/*"):
        print(c)
        ds = sg.load_dataset(c)
        # Probably problematic in some cases with population structure, but it is easier to implement
        ds["sample_cohort"] = ds["samples"]
        # Subsetting and windowing the sgkit dataset. The rechunking handles what otherwise would cause an error.
        if c.split("/")[-1] in list(large_x):
            ds = haploid_double(ds, "call_genotype", "samples")
        ds["call_genotype"] = ds["call_genotype"].clip(0)
        ds = ds.sel(contigs=[ds.variant_contig[0].values])
        ds = sg.window_by_position(ds, size=window_size)
        ds = (sg.diversity(ds.chunk({"variants": 50000})))
        df_sub = pd.DataFrame(ds.stat_diversity, columns=ds.sample_id)
        df_sub["window_start"] = list(range(0, len(ds.window_start)*window_size, window_size))
        df_sub["chrom"] = c.split("/")[-1]
        df_l.append(df_sub)
    df_het = pd.concat(df_l)
    bed_files = read_beds(long_form)
    intervals_callable = pos_windows(bed_files, window_size, df_het["chrom"].unique())
    output_df = pd.merge(df_het, intervals_callable, on=["chrom", "window_start"])
    output_df["chr_type"] = output_df["chrom"].map(dict(zip(regions_df.CONTIG_ID, regions_df.chr_type)))
    output_df["species"] = long_form
    output_df.to_csv("../results/window_stats/{}_100kb_het.txt".format(long_form), sep="\t")

../zarr_data/Leontopithecus_chrysomelas_ssp
Leontopithecus_chrysomelas_ssp
../zarr_data/Leontopithecus_chrysomelas_ssp/HiC_scaffold_23
../zarr_data/Leontopithecus_chrysomelas_ssp/HiC_scaffold_1


In [9]:
ds.stat_diversity[:,0]

Unnamed: 0,Array,Chunk
Bytes,15.16 kiB,1.37 kiB
Shape,"(1941,)","(175,)"
Dask graph,13 chunks in 32 graph layers,13 chunks in 32 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 15.16 kiB 1.37 kiB Shape (1941,) (175,) Dask graph 13 chunks in 32 graph layers Data type float64 numpy.ndarray",1941  1,

Unnamed: 0,Array,Chunk
Bytes,15.16 kiB,1.37 kiB
Shape,"(1941,)","(175,)"
Dask graph,13 chunks in 32 graph layers,13 chunks in 32 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [15]:
pd.DataFrame({"het": ds.stat_diversity[:,0], "GVCF_ID": ds.sample_id[0].values})

Unnamed: 0,het,GVCF_ID
0,154.0,PD_0840
1,83.0,PD_0840
2,100.0,PD_0840
3,39.0,PD_0840
4,110.0,PD_0840
...,...,...
1936,97.0,PD_0840
1937,92.0,PD_0840
1938,96.0,PD_0840
1939,111.0,PD_0840


In [21]:
output_df

Unnamed: 0,PD_0840,PD_0841,PD_0844,SAMN31952328,window_start,chrom,window_end,callable_frac,chr_type,species
0,0.0,235.0,0.0,0.0,0,HiC_scaffold_23,100000,0.93982,chrX,Leontopithecus_chrysomelas_ssp
1,0.0,194.0,0.0,0.0,100000,HiC_scaffold_23,200000,0.94975,chrX,Leontopithecus_chrysomelas_ssp
2,0.0,194.0,0.0,0.0,200000,HiC_scaffold_23,300000,0.96536,chrX,Leontopithecus_chrysomelas_ssp
3,0.0,170.0,0.0,0.0,300000,HiC_scaffold_23,400000,0.95552,chrX,Leontopithecus_chrysomelas_ssp
4,0.0,222.0,0.0,0.0,400000,HiC_scaffold_23,500000,0.94898,chrX,Leontopithecus_chrysomelas_ssp
...,...,...,...,...,...,...,...,...,...,...
3227,97.0,27.0,49.0,34.0,193600000,HiC_scaffold_1,193700000,0.96172,aut,Leontopithecus_chrysomelas_ssp
3228,92.0,21.0,129.0,24.0,193700000,HiC_scaffold_1,193800000,0.95889,aut,Leontopithecus_chrysomelas_ssp
3229,96.0,15.0,124.0,20.0,193800000,HiC_scaffold_1,193900000,0.96756,aut,Leontopithecus_chrysomelas_ssp
3230,111.0,15.0,93.0,102.0,193900000,HiC_scaffold_1,194000000,0.96973,aut,Leontopithecus_chrysomelas_ssp


In [11]:
# Fst implementation.
for x in glob.glob(zarr_path+"*"):
    print(x)
    # The name used to load all the files, short and long version
    short_form = x.split("/")[-1].split("_")[0]
    long_form = x.split("/")[-1]
    print(long_form)
    # Loading the various metadata files. Metadata, contig information, callability bed.
    metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
    metadata_df = pd.read_csv(metadata_path+"{}_individuals.txt".format(short_form), sep="\t")
    metadata_df["SEX_I"] = [0 if x == "F" else 1 for x in metadata_df.GENETIC_SEX]
    regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
    regions_df["LENGTH"] = regions_df["END"]-regions_df["START"]
    regions_df["chr_type"] = ["chrX" if x == 2 and y == 1 else "aut" for x, y in zip(regions_df.FEMALE_PLOIDY, regions_df.MALE_PLOIDY)]
    large_contigs = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2)].CONTIG_ID.unique()
    large_x = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2) &
                        (regions_df.MALE_PLOIDY == 1)].CONTIG_ID
    # Skipping the large samples sizes and the singulars for the Fst calc
    print(len(metadata_df.loc[metadata_df.GVCF_FOLDER == long_form]), long_form)
    if ((len(metadata_df.loc[metadata_df.GVCF_FOLDER == long_form]) > 20) or
    (len(metadata_df.loc[metadata_df.GVCF_FOLDER == long_form]) == 1)):
        print("Skipping")
        continue
    # Loading the genetic data.
    df_l = []
    for c in glob.glob(x+"/*"):
        print(c)
        ds = sg.load_dataset(c)
        # This implementation is the pi implementation.
        # Probably problematic in some cases with population structure, but it is easier to implement
        ds["sample_cohort"] = ds["samples"]
        # Subsetting and windowing the sgkit dataset. The rechunking handles what otherwise would cause an error.
        #ds["call_genotype"] = ds["call_genotype"].clip(0)
        ds = ds.sel(contigs=[ds.variant_contig[0].values])
        if c.split("/")[-1] in list(large_x):
            ds = haploid_double(ds, "call_genotype", "samples")
        missing_rate = ds.call_genotype_mask[:,:,0].sum(axis=1).values/ds.call_genotype_mask[:,:,0].count(axis=1).values
        ds = ds.isel(variants=(missing_rate <= 0))
        ds = sg.window_by_genome(ds)
        ds = (sg.Fst(ds.chunk({"variants": 50000})))
        df_sub = pd.DataFrame(ds.stat_Fst[0,:,], columns=ds.sample_id)
        df_sub["chrom"] = c.split("/")[-1]
        df_sub["variants_used"] = len(ds.variants)
        df_l.append(df_sub)
    output_df = pd.concat(df_l)
    output_df["chr_type"] = output_df["chrom"].map(dict(zip(regions_df.CONTIG_ID, regions_df.chr_type)))
    output_df["species"] = long_form
    output_df.to_csv("../results/window_stats/{}_Fst.txt".format(long_form), sep="\t")

../zarr_data/Leontopithecus_chrysomelas_ssp
Leontopithecus_chrysomelas_ssp
4 Leontopithecus_chrysomelas_ssp
../zarr_data/Leontopithecus_chrysomelas_ssp/HiC_scaffold_23
../zarr_data/Leontopithecus_chrysomelas_ssp/HiC_scaffold_1
../zarr_data/Saguinus_labiatus_ssp
Saguinus_labiatus_ssp
2 Saguinus_labiatus_ssp
../zarr_data/Saguinus_labiatus_ssp/CM063191.1
../zarr_data/Saguinus_labiatus_ssp/JASSZA010000432.1
../zarr_data/Saguinus_labiatus_ssp/CM063169.1
../zarr_data/Plecturocebus_cupreus_ssp
Plecturocebus_cupreus_ssp
5 Plecturocebus_cupreus_ssp
../zarr_data/Plecturocebus_cupreus_ssp/CM080837.1
../zarr_data/Plecturocebus_cupreus_ssp/CM080815.1
../zarr_data/Papio_hamadryas_ssp
Papio_hamadryas_ssp
47 Papio_hamadryas_ssp
Skipping
../zarr_data/Macaca_fascicularis_ssp
Macaca_fascicularis_ssp
159 Macaca_fascicularis_ssp
Skipping
../zarr_data/Saimiri_macrodon_ssp
Saimiri_macrodon_ssp
3 Saimiri_macrodon_ssp
../zarr_data/Saimiri_macrodon_ssp/NW_024100917.1
../zarr_data/Saimiri_macrodon_ssp/NW_0241009

../zarr_data/Trachypithecus_francoisi_ssp/NW_022680461.1
../zarr_data/Trachypithecus_francoisi_ssp/NW_022680471.1
../zarr_data/Trachypithecus_francoisi_ssp/NW_022680465.1
../zarr_data/Trachypithecus_francoisi_ssp/NW_022681437.1
../zarr_data/Trachypithecus_francoisi_ssp/NW_022681445.1
../zarr_data/Nomascus_gabriellae_ssp
Nomascus_gabriellae_ssp
1 Nomascus_gabriellae_ssp
Skipping
../zarr_data/Cercopithecus_hamlyni_ssp
Cercopithecus_hamlyni_ssp
2 Cercopithecus_hamlyni_ssp
../zarr_data/Cercopithecus_hamlyni_ssp/CM053363.1
../zarr_data/Cercopithecus_hamlyni_ssp/CM053398.1
../zarr_data/Leontocebus_nigricollis_ssp
Leontocebus_nigricollis_ssp
1 Leontocebus_nigricollis_ssp
Skipping
../zarr_data/Chiropotes_albinasus_ssp
Chiropotes_albinasus_ssp
4 Chiropotes_albinasus_ssp
../zarr_data/Chiropotes_albinasus_ssp/CM052647.1
../zarr_data/Chiropotes_albinasus_ssp/CM052624.1
../zarr_data/Macaca_tonkeana_ssp
Macaca_tonkeana_ssp
8 Macaca_tonkeana_ssp
../zarr_data/Macaca_tonkeana_ssp/NC_092145.1
../zarr_da

17 Macaca_silenus_ssp
../zarr_data/Macaca_silenus_ssp/NW_027257673.1
../zarr_data/Macaca_silenus_ssp/NC_092125.1
../zarr_data/Macaca_silenus_ssp/NC_092145.1
../zarr_data/Allochrocebus_preussi_ssp
Allochrocebus_preussi_ssp
1 Allochrocebus_preussi_ssp
Skipping
../zarr_data/Cercopithecus_petaurista_ssp
Cercopithecus_petaurista_ssp
1 Cercopithecus_petaurista_ssp
Skipping
../zarr_data/Mico_argentatus_ssp
Mico_argentatus_ssp
4 Mico_argentatus_ssp
../zarr_data/Mico_argentatus_ssp/NC_071464.1
../zarr_data/Mico_argentatus_ssp/NC_071442.1
../zarr_data/Plecturocebus_hoffmannsi_ssp
Plecturocebus_hoffmannsi_ssp
1 Plecturocebus_hoffmannsi_ssp
Skipping
../zarr_data/Alouatta_juara_ssp
Alouatta_juara_ssp
2 Alouatta_juara_ssp
../zarr_data/Alouatta_juara_ssp/CAJZLT010000001.1
../zarr_data/Alouatta_juara_ssp/CAJZLT010000072.1
../zarr_data/Alouatta_juara_ssp/CAJZLT010000040.1
../zarr_data/Alouatta_juara_ssp/CAJZLT010000060.1
../zarr_data/Alouatta_juara_ssp/CAJZLT010000017.1
../zarr_data/Alouatta_juara_ssp/

ValueError: No objects to concatenate

In [46]:
# Fst implementation medium
for x in glob.glob(zarr_path+"*"):
    print(x)
    # The name used to load all the files, short and long version
    short_form = x.split("/")[-1].split("_")[0]
    long_form = x.split("/")[-1]
    print(long_form)
    # Loading the various metadata files. Metadata, contig information, callability bed.
    metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
    metadata_df = pd.read_csv(metadata_path+"{}_individuals.txt".format(short_form), sep="\t")
    metadata_df["SEX_I"] = [0 if x == "F" else 1 for x in metadata_df.GENETIC_SEX]
    regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
    regions_df["LENGTH"] = regions_df["END"]-regions_df["START"]
    regions_df["chr_type"] = ["chrX" if x == 2 and y == 1 else "aut" for x, y in zip(regions_df.FEMALE_PLOIDY, regions_df.MALE_PLOIDY)]
    large_contigs = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2)].CONTIG_ID.unique()
    large_x = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2) &
                        (regions_df.MALE_PLOIDY == 1)].CONTIG_ID
    # Skipping the large samples sizes and the singulars for the Fst calc
    print(len(metadata_df.loc[metadata_df.GVCF_FOLDER == long_form]), long_form)
    if ((len(metadata_df.loc[metadata_df.GVCF_FOLDER == long_form]) > 50) or
    (len(metadata_df.loc[metadata_df.GVCF_FOLDER == long_form]) <= 20)):
        print("Skipping")
        continue
    # Loading the genetic data.
    df_l = []
    for c in glob.glob(x+"/*"):
        print(c)
        ds = sg.load_dataset(c)
        # This implementation is the pi implementation.
        # Probably problematic in some cases with population structure, but it is easier to implement
        ds["sample_cohort"] = ds["samples"]
        # Subsetting and windowing the sgkit dataset. The rechunking handles what otherwise would cause an error.
        #ds["call_genotype"] = ds["call_genotype"].clip(0)
        ds = ds.sel(contigs=[ds.variant_contig[0].values])
        if c.split("/")[-1] in list(large_x):
            ds = haploid_double(ds, "call_genotype", "samples")
        missing_rate = ds.call_genotype_mask[:,:,0].sum(axis=1).values/ds.call_genotype_mask[:,:,0].count(axis=1).values
        ds = ds.isel(variants=(missing_rate <= 0))
        ds = sg.window_by_genome(ds)
        ds = (sg.Fst(ds.chunk({"variants": 50000})))
        df_sub = pd.DataFrame(ds.stat_Fst[0,:,], columns=ds.sample_id)
        df_sub["chrom"] = c.split("/")[-1]
        df_sub["variants_used"] = len(ds.variants)
        df_l.append(df_sub)
    output_df = pd.concat(df_l)
    output_df["chr_type"] = output_df["chrom"].map(dict(zip(regions_df.CONTIG_ID, regions_df.chr_type)))
    output_df["species"] = long_form
    output_df.to_csv("../results/window_stats/{}_Fst.txt".format(long_form), sep="\t")

../zarr_data/Leontopithecus_chrysomelas_ssp
Leontopithecus_chrysomelas_ssp
4 Leontopithecus_chrysomelas_ssp
Skipping
../zarr_data/Saguinus_labiatus_ssp
Saguinus_labiatus_ssp
2 Saguinus_labiatus_ssp
Skipping
../zarr_data/Plecturocebus_cupreus_ssp
Plecturocebus_cupreus_ssp
5 Plecturocebus_cupreus_ssp
Skipping
../zarr_data/Papio_hamadryas_ssp
Papio_hamadryas_ssp
47 Papio_hamadryas_ssp
../zarr_data/Papio_hamadryas_ssp/NC_044976.1
../zarr_data/Papio_hamadryas_ssp/NC_044996.1
../zarr_data/Macaca_fascicularis_ssp
Macaca_fascicularis_ssp
159 Macaca_fascicularis_ssp
Skipping
../zarr_data/Saimiri_macrodon_ssp
Saimiri_macrodon_ssp
3 Saimiri_macrodon_ssp
Skipping
../zarr_data/Cheirogaleus_crossleyi_ssp
Cheirogaleus_crossleyi_ssp
1 Cheirogaleus_crossleyi_ssp
Skipping
../zarr_data/Cheirogaleus_medius_ssp
Cheirogaleus_medius_ssp
5 Cheirogaleus_medius_ssp
Skipping
../zarr_data/Plecturocebus_caligatus_ssp
Plecturocebus_caligatus_ssp
3 Plecturocebus_caligatus_ssp
Skipping
../zarr_data/Callimico_goeldii_

../zarr_data/Pongo_abelii_ssp/NC_072008.2
../zarr_data/Hapalemur_occidentalis_ssp
Hapalemur_occidentalis_ssp
1 Hapalemur_occidentalis_ssp
Skipping
../zarr_data/Allochrocebus_solatus_ssp
Allochrocebus_solatus_ssp
1 Allochrocebus_solatus_ssp
Skipping
../zarr_data/Cebuella_niveiventris_ssp
Cebuella_niveiventris_ssp
2 Cebuella_niveiventris_ssp
Skipping
../zarr_data/Hoolock_hoolock_ssp
Hoolock_hoolock_ssp
11 Hoolock_hoolock_ssp
Skipping
../zarr_data/Chlorocebus_pygerythrus_ssp
Chlorocebus_pygerythrus_ssp
58 Chlorocebus_pygerythrus_ssp
Skipping
../zarr_data/Cercopithecus_roloway_ssp
Cercopithecus_roloway_ssp
1 Cercopithecus_roloway_ssp
Skipping
../zarr_data/Colobus_guereza_ssp
Colobus_guereza_ssp
9 Colobus_guereza_ssp
Skipping
../zarr_data/Callithrix_geoffroyi_ssp
Callithrix_geoffroyi_ssp
1 Callithrix_geoffroyi_ssp
Skipping
../zarr_data/Trachypithecus_obscurus_ssp
Trachypithecus_obscurus_ssp
3 Trachypithecus_obscurus_ssp
Skipping
../zarr_data/Saguinus_imperator_ssp
Saguinus_imperator_ssp
2 S

3 Nasalis_larvatus_ssp
Skipping
../zarr_data/Propithecus_perrieri_ssp
Propithecus_perrieri_ssp
2 Propithecus_perrieri_ssp
Skipping
../zarr_data/Macaca_radiata_ssp
Macaca_radiata_ssp
13 Macaca_radiata_ssp
Skipping
../zarr_data/Lepilemur_mustelinus_ssp
Lepilemur_mustelinus_ssp
1 Lepilemur_mustelinus_ssp
Skipping
../zarr_data/Prolemur_simus_ssp
Prolemur_simus_ssp
8 Prolemur_simus_ssp
Skipping
../zarr_data/Saguinus_oedipus_ssp
Saguinus_oedipus_ssp
1 Saguinus_oedipus_ssp
Skipping
../zarr_data/Piliocolobus_gordonorum_ssp
Piliocolobus_gordonorum_ssp
1 Piliocolobus_gordonorum_ssp
Skipping
../zarr_data/Alouatta_caraya_ssp
Alouatta_caraya_ssp
2 Alouatta_caraya_ssp
Skipping
../zarr_data/Lagothrix_lagotricha_ssp
Lagothrix_lagotricha_ssp
1 Lagothrix_lagotricha_ssp
Skipping
../zarr_data/Leontopithecus_rosalia_ssp
Leontopithecus_rosalia_ssp
3 Leontopithecus_rosalia_ssp
Skipping
../zarr_data/Daubentonia_madagascariensis_ssp
Daubentonia_madagascariensis_ssp
35 Daubentonia_madagascariensis_ssp
../zarr_d

2 Cercopithecus_pogonias_ssp
Skipping
../zarr_data/Microcebus_griseorufus_ssp
Microcebus_griseorufus_ssp
1 Microcebus_griseorufus_ssp
Skipping
../zarr_data/Saguinus_mystax_ssp
Saguinus_mystax_ssp
1 Saguinus_mystax_ssp
Skipping
../zarr_data/Tarsius_wallacei_ssp
Tarsius_wallacei_ssp
1 Tarsius_wallacei_ssp
Skipping
../zarr_data/Arctocebus_calabarensis_ssp
Arctocebus_calabarensis_ssp
1 Arctocebus_calabarensis_ssp
Skipping
../zarr_data/Trachypithecus_auratus_ssp
Trachypithecus_auratus_ssp
2 Trachypithecus_auratus_ssp
Skipping
../zarr_data/Papio_ursinus_ssp
Papio_ursinus_ssp
8 Papio_ursinus_ssp
Skipping
../zarr_data/Plecturocebus_miltoni_ssp
Plecturocebus_miltoni_ssp
1 Plecturocebus_miltoni_ssp
Skipping
../zarr_data/Eulemur_flavifrons_ssp
Eulemur_flavifrons_ssp
10 Eulemur_flavifrons_ssp
Skipping
../zarr_data/Cercopithecus_wolfi_ssp
Cercopithecus_wolfi_ssp
3 Cercopithecus_wolfi_ssp
Skipping
../zarr_data/Aotus_vociferans_ssp
Aotus_vociferans_ssp
5 Aotus_vociferans_ssp
Skipping
../zarr_data/Tra

In [10]:
output_df

Unnamed: 0,PD_0120,PD_0348,chrom,variants_used,chr_type,species
0,,,CM063191.1,1535859,chrX,Saguinus_labiatus_ssp
1,,,CM063191.1,1535859,chrX,Saguinus_labiatus_ssp
0,,-0.5171,JASSZA010000432.1,4587,aut,Saguinus_labiatus_ssp
1,-0.5171,,JASSZA010000432.1,4587,aut,Saguinus_labiatus_ssp
0,,0.184585,CM063169.1,3030007,aut,Saguinus_labiatus_ssp
1,0.184585,,CM063169.1,3030007,aut,Saguinus_labiatus_ssp


In [7]:
for x in glob.glob(zarr_path+"*")[4:5]:
    print(x)
    # The name used to load all the files, short and long version
    short_form = x.split("/")[-1].split("_")[0]
    long_form = x.split("/")[-1]
    print(long_form)
    # Loading the various metadata files. Metadata, contig information, callability bed.
    metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
    metadata_df = pd.read_csv(metadata_path+"{}_individuals.txt".format(short_form), sep="\t")
    metadata_df["SEX_I"] = [0 if x == "F" else 1 for x in metadata_df.GENETIC_SEX]
    regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
    regions_df["LENGTH"] = regions_df["END"]-regions_df["START"]
    regions_df["chr_type"] = ["chrX" if x == 2 and y == 1 else "aut" for x, y in zip(regions_df.FEMALE_PLOIDY, regions_df.MALE_PLOIDY)]
    large_contigs = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2)].CONTIG_ID.unique()
    large_x = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2) &
                        (regions_df.MALE_PLOIDY == 1)].CONTIG_ID
    # Loading the genetic data.
    df_l = []
    for c in glob.glob(x+"/*"):
        print(c)
        ds = sg.load_dataset(c)

../zarr_data/Macaca_fascicularis_ssp
Macaca_fascicularis_ssp
../zarr_data/Macaca_fascicularis_ssp/CP141342.1
../zarr_data/Macaca_fascicularis_ssp/CP141341.1
../zarr_data/Macaca_fascicularis_ssp/CP141361.1


In [None]:
metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
zarr_path = "../zarr_data/"
metadata_folders = glob.glob(metadata_path+"*_individuals.txt")

for folder in metadata_folders[:0]:
    metadata_df = pd.read_csv(folder, sep="\t")
    short_form = folder.split("/")[-1].split("_")[0]
    regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
    metadata_df["SEX_I"] = [0 if x == "F" else 1 for x in metadata_df.GENETIC_SEX]
    female_df = metadata_df[pd.to_numeric(metadata_df['AVG_COVERAGE_X'], errors='coerce').notnull()]
    female_df = female_df.loc[(female_df.GENETIC_SEX == "F") & (female_df.AVG_COVERAGE_A >= 10)].sort_values(by="AVG_COVERAGE_A", ascending=False)
    for GVCF_FOLDER in female_df.GVCF_FOLDER.unique():
        # Check that zarr data exists.
        zarr_chroms = glob.glob(zarr_path+GVCF_FOLDER+"/*")
        for c in zarr_chroms:
            ds = sg.load_dataset(c, drop_variables=["variant_allele"])
            print(c)
        reference = metadata_df.loc[metadata_df.GVCF_FOLDER == GVCF_FOLDER].REFERENCE_FOLDER.unique()[0]
        regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
        regions_df["LENGTH"] = regions_df["END"]-regions_df["START"]
        large_contigs = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2)].CONTIG_ID.unique()
        large_x = regions_df.loc[(regions_df.LENGTH >= size_cutoff) & (regions_df.FEMALE_PLOIDY == 2) &
                        (regions_df.MALE_PLOIDY == 1)].CONTIG_ID
        bed_files = read_beds(GVCF_FOLDER)

In [14]:
metadata_path+"{}_individuals.txt".format(short_form)

'/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/Pongo_individuals.txt'

In [None]:
# Loading the genetic data.
    ds = sg.load_dataset("../data/{}".format(long_form))
    sex_map = dict(zip(metadata_df.GVCF_ID, metadata_df.SEX_I))
    # If you want to split based on sex. I will make every individual unique
    #ds["sample_cohort"] = xr.DataArray(pd.Series(ds.sample_id.values).map(sex_map), dims="samples")
    ds["sample_cohort"] = ds["samples"]
    # Only keeping contigs which are above the cutoff (1Mb or more)
    kept_contigs =  [x for x in ds.contig_id.values if (x == large_contigs).any()]
    contig_IDs = pd.Series(kept_contigs).map(dict(zip(ds.contig_id.values, range(len(ds.contig_id.values))))).values
    # Subsetting and windowing the sgkit dataset. The rechunking handles what otherwise would cause an error.
    ds_autx = ds.sel(variants=(ds.variant_contig.isin(contig_IDs).compute()), contigs=contig_IDs)
    ds_autx = sg.window_by_position(ds_autx, size=window_size)
    ds_autx["call_genotype"] = ds_autx["call_genotype"].clip(0)
    ds_autx_diversity = sg.diversity(ds_autx.chunk({"variants": 50000}))
    # Generating window callability
    contig_l, window_start_l = [], []
    v_c = pd.Series(ds_autx.window_contig).value_counts(sort=False)
    for i in range(len(v_c)):
        window_start_l.extend(list(range(0, v_c[i]*window_size, window_size)))
        contig_l.extend([kept_contigs[i]]*v_c[i])
    window_df = pd.DataFrame({"chrom": contig_l, "window_start": window_start_l})
    intervals_callable = pos_windows(bed_files, window_size, kept_contigs)
    window_df_call = window_df.merge(intervals_callable, on=["chrom", "window_start"])
    # Merge and save
    output_df = pd.merge(window_df_call, pd.DataFrame(ds_autx_diversity["stat_diversity"],
                                      columns=ds_autx_diversity["sample_id"]),
                         left_index=True, right_index=True)
    output_df.to_csv("../results/window_stats/{}_10kb_het.txt".format(long_form), sep="\t")

In [3]:
df_l = []
for p in df_paths:
    d = pd.read_csv(p, sep="\t", index_col=[0])
    d["species"] = p.split("/")[-1].split("_1")[0]
    df_l.append(pd.melt(d, id_vars=["chrom", "window_start", "window_end", "callable_frac", "species"], var_name="GVCF_ID", value_name="het"))
het_df = pd.concat(df_l)

In [4]:
# Standardized parts.
short_form = "Papio"
long_form = "Papio_papio_ssp"
size_cutoff = 1000000
window_size = 100000 # Should maybe decrease to 25 or 10kb.

metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
metadata_df = pd.read_csv(metadata_path+"{}_individuals.txt".format(short_form), sep="\t")
metadata_df["SEX_I"] = [0 if x == "F" else 1 for x in metadata_df.GENETIC_SEX]

regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
regions_df["LENGTH"] = regions_df["END"]-regions_df["START"]
large_contigs = regions_df.loc[(regions_df.LENGTH >= 1000000) & (regions_df.FEMALE_PLOIDY == 2)].CONTIG_ID.unique()
large_x = regions_df.loc[(regions_df.LENGTH >= 1000000) & (regions_df.FEMALE_PLOIDY == 2) &
                        (regions_df.MALE_PLOIDY == 1)].CONTIG_ID

In [5]:
large_x

20    NC_044996.1
Name: CONTIG_ID, dtype: object

In [6]:
# Standardized parts.
short_form = "Gorilla"
long_form = "Gorilla_Gorilla_ssp"
size_cutoff = 1000000
window_size = 100000 # Should maybe decrease to 25 or 10kb.

metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
metadata_df = pd.read_csv(metadata_path+"{}_individuals.txt".format(short_form), sep="\t")
metadata_df["SEX_I"] = [0 if x == "F" else 1 for x in metadata_df.GENETIC_SEX]

regions_df = pd.read_csv(metadata_path+"{}_regions_and_batches.txt".format(short_form), sep="\t")
regions_df["LENGTH"] = regions_df["END"]-regions_df["START"]
large_contigs = regions_df.loc[(regions_df.LENGTH >= 1000000) & (regions_df.FEMALE_PLOIDY == 2)].CONTIG_ID.unique()
large_x = regions_df.loc[(regions_df.LENGTH >= 1000000) & (regions_df.FEMALE_PLOIDY == 2) &
                        (regions_df.MALE_PLOIDY == 1)].CONTIG_ID

In [7]:
large_x

23    NC_073247.2
25    NC_073247.2
Name: CONTIG_ID, dtype: object

In [8]:
het_df["chr_type"] = ["aut" if x not in ["NC_044996.1", "NC_073247.2"] else "chrX" for x in het_df["chrom"]]

In [9]:
het_df_high_call = het_df.loc[het_df.callable_frac >= 0.95]

In [59]:
metadata_path = "/home/eriks/primatediversity/data/gVCFs_recalling_10_12_2024_metadata/"
metadata_df_gorilla = pd.read_csv(metadata_path+"Gorilla_individuals.txt".format(short_form), sep="\t")

In [47]:
df_mean_het = het_df_high_call.groupby(["GVCF_ID", "species", "chr_type"])["het"].mean().reset_index()