# Generate data by subsampling source vcfs

In [1]:
from pathlib import Path
import data
import random
import pysam

## Choose the source file
This is the file we will subsample

In [75]:
in_file = data.xy_vcf
base_name = in_file.name.split(".")[0] # "xy" or "xx"
f = pysam.VariantFile(in_file)

[W::hts_idx_load3] The index file is older than the data file: /Users/vinter/projects/WINGS/varpile/data/source/xy.vcf.gz.tbi


## Template header

make the tempalte header based on the source file.
We want to limit the number of chromosomes as well.
This header does not have a sample attahced to it

In [65]:
# contigs_to_keep = [f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY", "chrM"] # all human chromosomes
contigs_to_keep =  ["chr21", "chr22", "chrX", "chrY", "chrM"]

# create a header template
header_template = pysam.VariantHeader()
for record in f.header.records:
    if record.type == "CONTIG":
        if record.get("ID") not in contigs_to_keep:
            continue

    header_template.add_record(record)

In [16]:
def write_header(out_path: Path | str, header_template: pysam.VariantHeader, sample_name: str):
    """Helper function Creates the file from the tempalte header."""
    header = header_template.copy()
    header.add_sample(sample_name)
    with open(out_path, "w") as out:
        out.write(str(header))


In [17]:
def subsample(records, sample_size):
    elements = [(i, x) for i, x in enumerate(records)]
    elements = random.sample(elements, sample_size)
    elements = sorted(elements, key=lambda x: x[0])
    return [x[1] for x in elements]

## Write the files



In [79]:
random.seed(42)  #For the sake of reproducibility
number_of_files = 10
percentage = 0.25 # 10 % of variants per chromosome

sample_names = [f"{base_name}_sample_{i}" for i in range(1, number_of_files + 1)]
out_files = {name: data.vcfs / f"{name}.vcf" for name in sample_names}

# Generate files with header
for name, vcf_path in out_files.items():
    write_header(vcf_path, header_template, name)

print(f.filename)

# Populate files, one contig at a time
for contig in contigs_to_keep:
    # collect variants:
    records = list(f.fetch(contig))[:100_000] # 100K variants

    # per file subsample the records for given chromosome and write to the file
    for name, vcf_path in out_files.items():
        with vcf_path.open("a") as out:
            # Calc the sample size
            sample_size = int(len(records) * percentage)
            if sample_size < 5:
                sample_size = min(5, len(records)) # if low number of variants try to get at least 5
            sampled_records = subsample(records, sample_size)
            for record in sampled_records:
                out.write(str(record))

b'/Users/vinter/projects/WINGS/varpile/data/source/xy.vcf.gz'


In [77]:
%%sh
# bgzip and index all .vcf files
find vcfs/*.vcf -exec bgzip -kf {} \; -exec bcftools index --tbi {}.gz \;

# Make multisample vcf


In [68]:
%%sh
bcftools merge vcfs/xx_sample_1.vcf.gz vcfs/xy_sample_1.vcf.gz > vcfs/merged.vcf
find vcfs/merged.vcf -exec bgzip -kf {} \; -exec bcftools index --tbi {}.gz \;

[E::hts_open_format] Failed to open file "vcfs/xx_sample_1.vcf.gz" : No such file or directory
Failed to open vcfs/xx_sample_1.vcf.gz: No such file or directory
index: "vcfs/merged.vcf.gz" is in a format that cannot be usefully indexed


## Cleanup

In [80]:
%%sh
rm vcfs/*.vcf