In [1]:
import csv
import os
import re
import subprocess
import tempfile
from enum import Enum
from pathlib import Path
from typing import Union

import polars as pl
from polars import selectors as cs

In [2]:
import vcf_processing.models as models
import vcf_processing.preprocessing as pp

In [3]:
# Development configuration (POG1297): every input file and scratch directory
# used below is resolved relative to this data directory.

DATA_PATH = Path("..") / "data"

In [4]:
# pp.read_vcf_metadata returns a pair, unpacked here as (raw metadata, header).
# Each file gets its own header name so the SNV result does not overwrite the
# header read from the indel file.
metadata_raw_indels, header_indels = pp.read_vcf_metadata(DATA_PATH / "sample.indels.vcf")
metadata_raw_snvs, header_snvs = pp.read_vcf_metadata(DATA_PATH / "sample.snvs.vcf")

In [5]:
# Run the metadata parser over the raw metadata of each short-read VCF
# (indels first, then SNVs).
metadata_indels, metadata_snvs = (
    pp.parse_vcf_metadata(raw_metadata)
    for raw_metadata in (metadata_raw_indels, metadata_raw_snvs)
)

In [6]:
# Pass the indel and SNV VCFs to pp.vcf_concat, with intermediate files going to
# short_read_tmp/. The call is the last expression, so its return value is the
# cell output.
short_read_vcfs = [
    DATA_PATH / "sample.indels.vcf",
    DATA_PATH / "sample.snvs.vcf",
]
pp.vcf_concat(*short_read_vcfs, temp_dir=DATA_PATH / "short_read_tmp/")

Checking the headers and starting positions of 2 files


PosixPath('../data/short_read_tmp/concat.vcf.gz')

In [7]:
# One {old_name: new_name} mapping per long-read VCF, listed in the same order
# as the files handed to pp.vcf_merge below.
sample_rename = [{"SAMPLE": new_name} for new_name in ("F138871", "F138872")]

long_read_vcfs = (
    DATA_PATH / "sample.longread_F138871.vcf",
    DATA_PATH / "sample.longread_F138872.vcf",
)

# NOTE(review): the output recorded for this cell is a CalledProcessError:
# `bcftools reheader` received the text "SAMPLE F138871" as its -s argument and
# reported that it could not read it as a --samples file.
pp.vcf_merge(
    *long_read_vcfs,
    sample_rename=sample_rename,
    temp_dir=DATA_PATH / "long_read_tmp/",
)

Error reading the --samples file "SAMPLE F138871"


CalledProcessError: Command '['bcftools', 'reheader', PosixPath('../data/long_read_tmp/sample.longread_F138871.vcf.gz'), '-s', 'SAMPLE F138871', '-o', PosixPath('../data/long_read_tmp/vcf_1_compatible.vcf.gz')]' returned non-zero exit status 255.

In [None]:
# Manual reproduction of the sample-renaming step.
# `bcftools reheader -s` takes the path of a samples file, not the rename text
# itself, so each mapping is written to a temporary file as
# "old_name new_name" lines before bcftools is invoked.
outputs = ["vcf_1_compatible.vcf.gz", "vcf_2_compatible.vcf.gz"]

vcf_1_path = DATA_PATH / "long_read_tmp/sample.longread_F138871.vcf.gz"
vcf_2_path = DATA_PATH / "long_read_tmp/sample.longread_F138872.vcf.gz"
temp_dir = DATA_PATH / "long_read_tmp/"

for file_idx, vcf_path in enumerate([vcf_1_path, vcf_2_path]):
    # One newline-terminated "old new" pair per sample to rename in this file.
    rename_args = "".join(
        f"{old_name} {new_name}\n"
        for old_name, new_name in sample_rename[file_idx].items()
    )

    # delete=False: the file has to outlive the `with` block so that bcftools
    # can open it by name; it is removed explicitly below.
    with tempfile.NamedTemporaryFile(
        mode="w",
        dir=temp_dir,
        prefix="sample_file_",
        suffix=".txt",
        delete=False,
    ) as sample_file:
        sample_file.write(rename_args)

    try:
        subprocess.run(
            [
                "bcftools",
                "reheader",
                vcf_path,
                "-s",
                sample_file.name,
                "-o",
                temp_dir / outputs[file_idx],
            ],
            check=True,
        )
    finally:
        # Remove by path (the file object itself is not a valid argument to
        # os.remove), and do so even when bcftools exits with an error.
        os.remove(sample_file.name)

In [11]:
# Show the final extension of the concatenated output path; `.suffix` reports
# only the last component of a multi-part extension.
Path(DATA_PATH, "short_read_tmp/concat.vcf.gz").suffix

'.gz'

In [None]:
# Count the ';'-separated entries in the INFO value of the row at index 1 of the
# collected preview.
# NOTE(review): `data_concat` is not defined in any cell visible here — confirm
# where it is created before relying on Restart & Run All.
info_value = data_concat.collect().head().select("INFO").row(1)[0]
info_entries = [entry.split("=") for entry in info_value.split(";")]
len(info_entries)

In [None]:
# pp.vcf_split returns a pair, unpacked here as (metadata, data) for the
# long-read VCF.
long_read_vcf = DATA_PATH / "sample.longread_1.vcf"
metadata_longread, data_longread = pp.vcf_split(long_read_vcf)

In [None]:
# Run `bcftools head` on the long-read VCF to inspect its header.
# check=True makes a failing bcftools call raise CalledProcessError instead of
# silently returning a CompletedProcess with a non-zero return code, in line
# with the other bcftools invocation in this notebook.
subprocess.run(
    [
        "bcftools",
        "head",
        DATA_PATH / "sample.longread_1.vcf",
    ],
    check=True,
)