In [20]:
import csv
import re
import shutil
import subprocess
from enum import Enum
from pathlib import Path
from typing import Union

import polars as pl
from polars import selectors as cs

In [21]:
import vcf_processing.models as models
import vcf_processing.preprocessing as pp

In [22]:
# dev: for POG1298

# Directory containing the sample VCF inputs read below; the path is relative,
# so it resolves against the kernel's current working directory.
DATA_PATH = Path("../data/")

In [23]:
# Split each input VCF into two parts. Each call yields a pair: the raw
# metadata (written back out verbatim with `writelines` further down) and the
# record table (used below through `collect_schema()` / `collect()`).
indels_vcf_path = DATA_PATH / "sample.indels.vcf"
snvs_vcf_path = DATA_PATH / "sample.snvs.vcf"

metadata_raw_indels, data_indels = pp.split_vcf(indels_vcf_path)
metadata_raw_snvs, data_snvs = pp.split_vcf(snvs_vcf_path)

In [24]:
# Parse the raw metadata of both files (indels first, then SNVs).
# NOTE(review): the structure returned by `parse_vcf_metadata` is not visible
# in this notebook — see vcf_processing.preprocessing.
metadata_indels, metadata_snvs = (
    pp.parse_vcf_metadata(raw_metadata)
    for raw_metadata in (metadata_raw_indels, metadata_raw_snvs)
)

In [25]:
# Show the column names of the indel frame as a set (order is irrelevant here;
# the point is to see which sample columns sit next to the fixed VCF fields).
indel_column_names = data_indels.collect_schema().names()
set(indel_column_names)

{'#CHROM',
 'ALT',
 'F111184',
 'F111188',
 'FILTER',
 'FORMAT',
 'ID',
 'INFO',
 'NORMAL',
 'POS',
 'QUAL',
 'REF',
 'TUMOR'}

In [26]:
def make_concat_compatible(vcf_1: pl.LazyFrame, vcf_2: pl.LazyFrame):
    """
    Make two VCFs compatible for combining using bcftools concat.

    Both frames are reduced to the same columns in the same order: the nine
    fixed VCF columns, followed by every sample column that is present in
    *both* inputs. Sample columns that appear in only one input are dropped.

    Parameters
    ----------
    vcf_1, vcf_2
        Lazy frames holding the records of the two VCFs. Each must contain the
        nine fixed VCF columns listed in ``VCF_HEADER``.

    Returns
    -------
    tuple
        ``(vcf_1_compatible, vcf_2_compatible)`` — the two inputs restricted to
        the shared column list. The shared sample columns keep the order in
        which they appear in ``vcf_1``.
    """

    VCF_HEADER = [
        "#CHROM",
        "POS",
        "ID",
        "REF",
        "ALT",
        "QUAL",
        "FILTER",
        "INFO",
        "FORMAT",
    ]

    vcf_1_columns = vcf_1.collect_schema().names()
    vcf_2_columns = set(vcf_2.collect_schema().names())
    fixed_columns = set(VCF_HEADER)

    # Walk vcf_1's schema instead of iterating a set intersection: set order
    # is arbitrary (and changes between interpreter sessions), which would
    # make the sample-column order of the output non-reproducible.
    shared_samples = [
        name
        for name in vcf_1_columns
        if name in vcf_2_columns and name not in fixed_columns
    ]

    # Build the column list once so both outputs are guaranteed to match.
    compatible_columns = VCF_HEADER + shared_samples

    return vcf_1.select(compatible_columns), vcf_2.select(compatible_columns)


In [None]:
import tempfile

# Re-assemble each VCF on disk as "raw metadata lines + tab-separated records".
# NOTE: TemporaryDirectory deletes the directory and everything in it when the
# outer `with` exits, so anything that needs these files must run inside it.
with tempfile.TemporaryDirectory() as temp_dir:
    scratch_dir = Path(temp_dir)

    # Indels: metadata first, then the records including the header row.
    with (scratch_dir / "vcf_1.vcf").open("w") as vcf_1:
        vcf_1.writelines(metadata_raw_indels)
        data_indels.collect().write_csv(vcf_1, include_header=True, separator="\t")

    # SNVs: same layout.
    with (scratch_dir / "vcf_2.vcf").open("w") as vcf_2:
        vcf_2.writelines(metadata_raw_snvs)
        data_snvs.collect().write_csv(vcf_2, include_header=True, separator="\t")

In [None]:
def vcf_concat(
        vcf_1_path: Union[str, Path],
        vcf_2_path: Union[str, Path],
):
    """
    Combine two VCF files using bcftools concat.

    Steps, each run as its own subprocess (no shell is involved, so shell
    operators such as ``&&`` cannot be used to chain commands):

    1. ``bgzip <path>`` for both inputs. bgzip compresses in place, producing
       ``<path>.gz`` next to the original.
    2. ``bcftools index <path>.gz`` for both compressed files.
    3. ``bcftools concat -a -O z -o concat.vcf.gz <1>.gz <2>.gz``; the result
       is written to ``concat.vcf.gz`` in the current working directory.

    Parameters
    ----------
    vcf_1_path, vcf_2_path
        Paths to the two uncompressed VCF files to combine.

    Raises
    ------
    FileNotFoundError
        If ``bgzip`` or ``bcftools`` cannot be found on ``PATH``.
    subprocess.CalledProcessError
        If any of the external commands exits with a non-zero status.
    """

    # Fail early with a clear message instead of a bare error from subprocess.
    missing_tools = [
        tool for tool in ("bgzip", "bcftools") if shutil.which(tool) is None
    ]
    if missing_tools:
        raise FileNotFoundError(
            "Required executable(s) not found on PATH: " + ", ".join(missing_tools)
        )

    vcf_paths = [Path(vcf_1_path), Path(vcf_2_path)]

    # bgzip appends ".gz" to the full file name, whatever the input suffix is,
    # so derive the compressed name the same way rather than via with_suffix().
    gz_paths = [path.with_name(path.name + ".gz") for path in vcf_paths]

    # bcftools concat requires bgzipped VCFs
    for path in vcf_paths:
        subprocess.run(["bgzip", str(path)], check=True)

    # bcftools concat also requires bgzipped VCFs to be indexed
    for gz_path in gz_paths:
        subprocess.run(["bcftools", "index", str(gz_path)], check=True)

    subprocess.run(
        [
            "bcftools",
            "concat",
            "-a",
            # Ask for compressed VCF explicitly so the content matches the
            # ".vcf.gz" output name on every bcftools version.
            "-O",
            "z",
            "-o",
            "concat.vcf.gz",
            *(str(gz_path) for gz_path in gz_paths),
        ],
        check=True,
    )

In [None]:
# Find rows where the TUMOR column has a different number of ":"-separated
# fields than the FORMAT column, keep one row per distinct TUMOR value, and
# show only those whose TUMOR value has fewer than 10 fields.
format_field_count = pl.col("FORMAT").str.split(":").list.len()
tumor_field_count = pl.col("TUMOR").str.split(":").list.len()

(
    df.select(pl.col("FORMAT", "TUMOR"))
    .filter(format_field_count != tumor_field_count)
    .unique(subset=["TUMOR"])
    .filter(tumor_field_count < 10)
)

FORMAT,TUMOR
str,str
"""GT:AD:AF:DP:F1R2:F2R1:SB""","""."""


In [190]:
# Show the first record whose FORMAT string contains "PS", restricted to the
# FORMAT column and the four sample columns, as a plain tuple.
rows_with_ps = df.filter(pl.col("FORMAT").str.contains("PS"))
sample_view = rows_with_ps.select(
    pl.col("FORMAT", "NORMAL", "TUMOR", "F111183", "F111704")
)
sample_view.row(0)

('GT:DP:DP2:TAR:TIR:TOR:DP50:FDP50:SUBDP50:BCN50:AD:AF:F1R2:F2R1:PGT:PID:PS:SB',
 '.:54:54:53,54:0,0:1,1:50.50:0.29:0.00:0.00',
 '.:118:118:90,92:21,22:8,8:120.73:0.00:0.00:0.00',
 '0|0:54:.:.:.:.:.:.:.:.:54,0:0.018:25,0:25,0:0|1:2081028_TG_T:2081028:25,29,0,0',
 '0|1:122:.:.:.:.:.:.:.:.:98,24:0.202:49,12:45,12:0|1:2081028_TG_T:2081028:46,52,10,14')

In [189]:
# List the column names of `df` to see which sample columns it carries.
df.columns

['#CHROM',
 'POS',
 'ID',
 'REF',
 'ALT',
 'QUAL',
 'FILTER',
 'INFO',
 'FORMAT',
 'NORMAL',
 'TUMOR',
 'F111183',
 'F111704']

In [184]:
# Read the SNV VCF records eagerly as a tab-separated table, skipping the
# leading metadata lines so the "#CHROM ..." line is used as the header row.
# NOTE(review): `count_vcf_metadata_rows` is not defined in the visible cells
# of this notebook — confirm where it comes from before a fresh-kernel run.
snvs_vcf_path = DATA_PATH / "sample.snvs.vcf"
metadata_row_count = count_vcf_metadata_rows(snvs_vcf_path)

df = pl.read_csv(
    str(snvs_vcf_path),
    separator="\t",
    skip_lines=metadata_row_count,
    has_header=True,
)

df.head()

#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,F111183,F111704
str,i64,str,str,str,str,str,str,str,str,str
"""chr1""",630074,"""rs7349151""","""T""","""C""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:PGT:PID:…","""0|0:51,0:0.019:51:18,0:22,0:0|…","""0|1:115,18:0.132:133:33,10:64,…"
"""chr1""",1395944,""".""","""A""","""C""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:48,0:0.020:48:21,0:25,0:23…","""0/1:98,14:0.125:112:49,6:46,8:…"
"""chr1""",1415696,""".""","""C""","""A""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:59,0:0.017:59:31,0:28,0:28…","""0/1:106,30:0.220:136:57,18:49,…"
"""chr1""",1777972,""".""","""G""","""A""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:44,0:0.022:44:13,0:31,0:28…","""0/1:89,12:0.129:101:39,7:49,5:…"
"""chr1""",1959772,"""rs1015196821""","""C""","""T""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:55,0:0.018:55:16,0:39,0:25…","""0/1:73,76:0.505:149:36,36:37,3…"
