In [1]:
import csv
import re
import subprocess
from enum import Enum
from pathlib import Path
from typing import Union

import polars as pl
from polars import selectors as cs

In [2]:
import vcf_processing.models as models
import vcf_processing.preprocessing as pp

In [3]:
# dev: for POG1297

# Base directory for the input VCF files and for the temporary output
# directories created by this notebook. It is a relative path, so it is
# resolved against the kernel's current working directory.
DATA_PATH = Path("../data/")

In [4]:
# Read the raw metadata and the header of each input VCF.
# pp.read_vcf_metadata returns a pair, unpacked here as (raw metadata, header).
# Each header gets its own name: previously the SNV header was also assigned to
# `header_indels`, silently overwriting the indel header.
metadata_raw_indels, header_indels = pp.read_vcf_metadata(DATA_PATH / "sample.indels.vcf")
metadata_raw_snvs, header_snvs = pp.read_vcf_metadata(DATA_PATH / "sample.snvs.vcf")

In [5]:
# Load the SNV records into a frame and display it.
# The second argument is the raw metadata line count plus one; presumably it
# tells read_vcf_data where the tabular part of the file starts -- TODO confirm.
snvs_vcf_path = DATA_PATH / "sample.snvs.vcf"
snvs_metadata_offset = len(metadata_raw_snvs) + 1
data_snvs = pp.read_vcf_data(snvs_vcf_path, snvs_metadata_offset)
data_snvs

#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,F111183,F111704
str,i64,str,str,str,str,str,str,str,str,str
"""chr1""",630074,"""rs7349151""","""T""","""C""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:PGT:PID:…","""0|0:51,0:0.019:51:18,0:22,0:0|…","""0|1:115,18:0.132:133:33,10:64,…"
"""chr1""",1395944,""".""","""A""","""C""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:48,0:0.020:48:21,0:25,0:23…","""0/1:98,14:0.125:112:49,6:46,8:…"
"""chr1""",1415696,""".""","""C""","""A""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:59,0:0.017:59:31,0:28,0:28…","""0/1:106,30:0.220:136:57,18:49,…"
"""chr1""",1777972,""".""","""G""","""A""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:44,0:0.022:44:13,0:31,0:28…","""0/1:89,12:0.129:101:39,7:49,5:…"
"""chr1""",1959772,"""rs1015196821""","""C""","""T""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:55,0:0.018:55:16,0:39,0:25…","""0/1:73,76:0.505:149:36,36:37,3…"
…,…,…,…,…,…,…,…,…,…,…
"""chrUn_GL000224v1""",53004,""".""","""A""","""C""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:34,0:0.028:34:12,0:21,0:15…","""0/1:114,23:0.167:137:49,11:65,…"
"""chrUn_KI270743v1""",74211,""".""","""A""","""T""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:50,0:0.019:50:16,0:33,0:20…","""0/1:111,20:0.155:131:50,7:59,1…"
"""chrUn_KI270743v1""",204754,""".""","""C""","""A""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:43,0:0.022:43:24,0:19,0:16…","""0/1:115,30:0.209:145:45,12:70,…"
"""chrUn_KI270746v1""",7651,""".""","""T""","""A""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""","""0/0:64,0:0.015:64:30,0:33,0:33…","""0/1:152,13:0.085:165:65,6:85,7…"


In [6]:
# Parse the raw metadata of both files (indels first, then SNVs).
metadata_indels, metadata_snvs = (
    pp.parse_vcf_metadata(raw_metadata)
    for raw_metadata in (metadata_raw_indels, metadata_raw_snvs)
)

In [7]:
# Load the indel records into a frame and display it.
# The second argument is the raw metadata line count plus one; presumably it
# tells read_vcf_data where the tabular part of the file starts -- TODO confirm.
indels_vcf_path = DATA_PATH / "sample.indels.vcf"
indels_metadata_offset = len(metadata_raw_indels) + 1
data_indels = pp.read_vcf_data(indels_vcf_path, indels_metadata_offset)
data_indels

#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,NORMAL,TUMOR,F111183,F111704
str,i64,str,str,str,str,str,str,str,str,str,str,str
"""chr1""",2081028,""".""","""TG""","""T""",""".""","""PASS""","""SOMATIC;QSI=75;TQSI=1;NT=ref;Q…","""GT:DP:DP2:TAR:TIR:TOR:DP50:FDP…",""".:54:54:53,54:0,0:1,1:50.50:0.…",""".:118:118:90,92:21,22:8,8:120.…","""0|0:54:.:.:.:.:.:.:.:.:54,0:0.…","""0|1:122:.:.:.:.:.:.:.:.:98,24:…"
"""chr1""",4031552,"""rs113283042;rs140543974;rs5768…","""G""","""GT""",""".""","""PASS""","""SOMATIC;QSI=59;TQSI=1;NT=ref;Q…","""DP:DP2:TAR:TIR:TOR:DP50:FDP50:…","""45:45:42,42:0,0:3,3:44.36:0.25…","""115:115:81,82:21,22:11,10:111.…",""".""","""."""
"""chr1""",6481562,""".""","""A""","""AATAT""",""".""","""PASS""","""SOMATIC;QSI=51;TQSI=1;NT=ref;Q…","""DP:DP2:TAR:TIR:TOR:DP50:FDP50:…","""43:43:36,38:3,3:4,3:44.67:1.32…","""111:111:57,62:36,39:21,19:110.…",""".""","""."""
"""chr1""",7156275,""".""","""CA""","""C""",""".""","""PASS""","""SOMATIC;QSI=59;TQSI=1;NT=ref;Q…","""DP:DP2:TAR:TIR:TOR:DP50:FDP50:…","""39:39:28,31:0,0:12,9:39.78:2.4…","""103:103:52,54:25,25:26,25:103.…",""".""","""."""
"""chr1""",11633506,"""rs573016490""","""AT""","""A""",""".""","""PASS""","""SOMATIC;QSI=61;TQSI=2;NT=ref;Q…","""GT:DP:DP2:TAR:TIR:TOR:DP50:FDP…",""".:34:34:28,28:0,0:5,5:31.28:1.…",""".:109:109:56,58:30,30:21,23:10…","""0/0:24:.:.:.:.:.:.:.:.:24,0:0.…","""0/1:70:.:.:.:.:.:.:.:.:42,28:0…"
…,…,…,…,…,…,…,…,…,…,…,…,…
"""chrX""",153296394,""".""","""T""","""TA""",""".""","""PASS""","""AS_FilterStatus=SITE;AS_SB_TAB…","""GT:AD:AF:DP:F1R2:F2R1:SB""",""".""",""".""","""0/0:12,0:0.068:12:5,0:6,0:3,9,…","""0/1:14,46:0.761:60:10,20:3,21:…"
"""chrX""",153557881,"""rs781955274""","""A""","""AGC""",""".""","""PASS""","""SOMATIC;QSI=53;TQSI=1;NT=ref;Q…","""DP:DP2:TAR:TIR:TOR:DP50:FDP50:…","""18:18:16,17:0,0:2,1:20.00:2.61…","""105:105:62,69:25,26:20,15:107.…",""".""","""."""
"""chrX""",154696298,"""rs35105330""","""C""","""CA""",""".""","""PASS""","""SOMATIC;QSI=75;TQSI=1;NT=ref;Q…","""GT:DP:DP2:TAR:TIR:TOR:DP50:FDP…",""".:24:24:17,19:0,0:7,5:29.31:4.…",""".:101:101:11,14:49,53:42,35:10…","""0/0:9:.:.:.:.:.:.:.:.:9,0:0.07…","""0/1:26:.:.:.:.:.:.:.:.:6,20:0.…"
"""chrUn_KI270749v1""",78140,""".""","""ATATATG""","""A""",""".""","""PASS""","""SOMATIC;QSI=51;TQSI=1;NT=ref;Q…","""DP:DP2:TAR:TIR:TOR:DP50:FDP50:…","""75:75:75,78:0,0:0,0:72.19:0.00…","""260:260:242,259:9,9:10,12:259.…",""".""","""."""


In [8]:
# Use concat here: according to the bcftools docs, concat is for files that SHARE
# the same samples, whereas merge is for files whose sample sets are DIFFERENT.
# NOTE(review): the indel preview shows extra NORMAL/TUMOR sample columns that the
# SNV preview does not have; presumably pp.vcf_concat reconciles the sample columns
# first (it writes *_compatible.vcf.gz files into temp_dir) -- confirm.
pp.vcf_concat(
    DATA_PATH / "sample.indels.vcf", 
    DATA_PATH / "sample.snvs.vcf", 
    temp_dir=DATA_PATH / "short_read_tmp/")

CompletedProcess(args=['bcftools', 'concat', '-a', '../data/short_read_tmp/vcf_1_compatible.vcf.gz', '../data/short_read_tmp/vcf_2_compatible.vcf.gz', '-o', PosixPath('../data/short_read_tmp/concat.vcf.gz'), '-O', 'b'], returncode=0, stdout=b'', stderr=b'Checking the headers and starting positions of 2 files\n')


PosixPath('../data/short_read_tmp/concat.vcf.gz')

In [9]:
# Each entry pairs the key "SAMPLE" with a long-read sample ID
# (presumably one entry per input file, in the order the files are passed -- confirm).
long_read_ids = ("F138871", "F138872")
sample_rename = [{"SAMPLE": sample_id} for sample_id in long_read_ids]

# Merge (not concat): according to the docs, merge is used when the samples
# between files are DIFFERENT.
long_read_temp_dir = DATA_PATH / "long_read_tmp/"
pp.vcf_merge(
    DATA_PATH / "sample.longread_F138871.vcf",
    DATA_PATH / "sample.longread_F138872.vcf",
    sample_rename=sample_rename,
    temp_dir=long_read_temp_dir,
)

PosixPath('../data/long_read_tmp/merge.vcf.gz')

In [10]:
# TODO (open question): should the short-read and long-read results be merged,
# since the samples are technically different, or concatenated, since they come
# from the same source and were only sequenced with different technologies?
# Starting with concat since it's faster.

# Build the intermediate paths from DATA_PATH instead of repeating "../data/",
# so they stay in sync with the temp_dir values passed to vcf_concat / vcf_merge
# in the earlier cells if DATA_PATH is ever changed.
concat_vcf = DATA_PATH / "short_read_tmp" / "concat.vcf.gz"
merge_vcf = DATA_PATH / "long_read_tmp" / "merge.vcf.gz"

# Rename the samples in both files to the same NORMAL/TUMOR labels to make
# concatenation possible.
# NOTE(review): if pp.vcf_rename rewrites the file in place, re-running this cell
# would look for sample names that no longer exist -- confirm it is safe to re-run.
sample_rename_short = {
    "F111183": "NORMAL",
    "F111704": "TUMOR",
}
pp.vcf_rename(concat_vcf, sample_rename_short)

sample_rename_long = {
    "F138871": "NORMAL",
    "F138872": "TUMOR",
}
pp.vcf_rename(merge_vcf, sample_rename_long)

# Then concatenate the two renamed files; the call's return value is the cell output.
pp.vcf_concat(
    concat_vcf,
    merge_vcf,
    temp_dir=DATA_PATH / "final_tmp/",
)

CompletedProcess(args=['bcftools', 'concat', '-a', '../data/final_tmp/vcf_1_compatible.vcf.gz', '../data/final_tmp/vcf_2_compatible.vcf.gz', '-o', PosixPath('../data/final_tmp/concat.vcf.gz'), '-O', 'b'], returncode=0, stdout=b'', stderr=b'Checking the headers and starting positions of 2 files\n[W::bcf_hdr_merge] Trying to combine "AF" tag definitions of different lengths\n')


PosixPath('../data/final_tmp/concat.vcf.gz')