Skip to content

Commit 3bd2898

Browse files
authored
Merge pull request #4 from codefitz/codex/add-tests-for-vcf-header-and-duplicates
Improve VCF header validation
2 parents 6900c4f + 7429d26 commit 3bd2898

File tree

2 files changed

+109
-2
lines changed

2 files changed

+109
-2
lines changed

tests/test_vcf_validation.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import tempfile
2+
import os
3+
import unittest
4+
from vcf_validation import validate_vcf
5+
6+
class TestVCFValidation(unittest.TestCase):
    """Header-level checks for ``validate_vcf``.

    Each test writes a small VCF document to a temporary ``.vcf`` file and
    runs ``validate_vcf`` on it. A rejected file is signalled by
    ``SystemExit``; an accepted file returns normally.
    """

    def _write_temp_vcf(self, content):
        """Write *content* to a new temporary ``.vcf`` file and return its path.

        The file is created with ``delete=False`` so it can be reopened by
        path; removal is registered as a test cleanup instead.
        """
        with tempfile.NamedTemporaryFile(
            delete=False, mode='w', suffix='.vcf'
        ) as handle:
            handle.write(content)
        vcf_path = handle.name
        self.addCleanup(os.remove, vcf_path)
        return vcf_path

    def _assert_rejected(self, content):
        """Assert that ``validate_vcf`` exits when given *content*."""
        vcf_path = self._write_temp_vcf(content)
        with self.assertRaises(SystemExit):
            validate_vcf(vcf_path)

    def test_valid_file(self):
        # Complete header (fileformat, contig, #CHROM line with FORMAT and two
        # distinct samples) followed by one record.
        content = (
            "##fileformat=VCFv4.2\n"
            "##contig=<ID=1>\n"
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tS1\tS2\n"
            "1\t1\tGAIN1\tA\t<CNV>\t30\tPASS\tSVTYPE=CNV\tCN:GT\t0/1\t1/1\n"
        )
        vcf_path = self._write_temp_vcf(content)
        # Should not raise SystemExit
        validate_vcf(vcf_path)

    def test_missing_fileformat(self):
        # No "##fileformat" meta line anywhere in the file.
        self._assert_rejected(
            "##contig=<ID=1>\n"
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tS1\n"
            "1\t1\tGAIN1\tA\t<CNV>\t30\tPASS\tSVTYPE=CNV\tCN:GT\t0/1\n"
        )

    def test_missing_chrom_header(self):
        # Meta lines are followed directly by a record; no "#CHROM" line.
        self._assert_rejected(
            "##fileformat=VCFv4.2\n"
            "##contig=<ID=1>\n"
            "1\t1\tGAIN1\tA\t<CNV>\t30\tPASS\tSVTYPE=CNV\tCN:GT\t0/1\n"
        )

    def test_missing_format_with_genotypes(self):
        # The ninth header column is a sample name rather than "FORMAT".
        self._assert_rejected(
            "##fileformat=VCFv4.2\n"
            "##contig=<ID=1>\n"
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tS1\n"
            "1\t1\tGAIN1\tA\t<CNV>\t30\tPASS\tSVTYPE=CNV\t0/1\n"
        )

    def test_duplicate_sample_names(self):
        # The sample name "S1" appears twice in the header line.
        self._assert_rejected(
            "##fileformat=VCFv4.2\n"
            "##contig=<ID=1>\n"
            "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tS1\tS1\n"
            "1\t1\tGAIN1\tA\t<CNV>\t30\tPASS\tSVTYPE=CNV\tCN:GT\t0/1\t0/1\n"
        )
63+
if __name__ == '__main__':
64+
unittest.main()

vcf_validation.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,29 @@
1919
# 1.0.0 : WMF : Created.
2020
# 1.0.1 : WMF : Updated with Congenica rules.
2121
# 1.0.2 : WMF : Added support for bgzipped files. Updated error message for Alternate Alleles.
22+
# 1.0.3 : WMF : Added header validations and duplicate sample detection.
23+
#
24+
# Header line syntax
25+
# ------------------
26+
# The header line names the 8 fixed, mandatory columns. These columns are as follows:
27+
# 1. #CHROM
28+
# 2. POS
29+
# 3. ID
30+
# 4. REF
31+
# 5. ALT
32+
# 6. QUAL
33+
# 7. FILTER
34+
# 8. INFO
35+
# If genotype data is present in the file, these are followed by a FORMAT column header, then an arbitrary number
36+
# of sample IDs. Duplicate sample IDs are not allowed. The header line is tab-delimited.
37+
#
38+
# Congenica Strict Rules
39+
# contig ID must not start with "chr": ID=1-22 is valid, ID=chr* fails validation
40+
# Must have a FORMAT field (9 columns)
41+
# INFO Must contain SVTYPE=CNV
42+
# ALT must be <CNV> for CNV types
43+
# ID must contain "LOSS" or "GAIN"
44+
# FORMAT field must have "CN"
2245

2346
import sys
2447
import re
@@ -119,20 +142,33 @@ def validate_vcf(vcf_file, strict=False, report=False):
119142
with open_func(vcf_file, 'rt') as file:
120143

121144
line_number = 0
145+
fileformat_found = False
146+
header_found = False
122147
for line in file:
123148
line_number += 1
124149
if line.startswith("##"):
125-
if strict and line.startswith("##contig"):
150+
if line.startswith("##fileformat"):
151+
fileformat_found = True
152+
if line.startswith("##contig"):
126153
contig_info = line.split('<',1)[1].split('>')[0]
127154
id_info = [x for x in contig_info.split(',') if x.startswith('ID=')]
128155
if id_info:
129156
contig_id = id_info[0].split('=')[1]
130157
if contig_id.startswith("chr"):
131158
print(f"Error: Contig ID starts with 'chr' on line {line_number}: {line.strip()}")
132159
sys.exit(1)
133-
134160
continue
135161
elif line.startswith("#CHROM"):
162+
header_found = True
163+
header_fields = line.strip().split('\t')
164+
if len(header_fields) > 8 and header_fields[8] != "FORMAT":
165+
print("Error: FORMAT column missing from header line")
166+
sys.exit(1)
167+
if len(header_fields) > 9:
168+
sample_names = header_fields[9:]
169+
if len(sample_names) != len(set(sample_names)):
170+
print("Error: Duplicate sample names in header line")
171+
sys.exit(1)
136172
continue
137173
else:
138174
fields = line.strip().split('\t')
@@ -150,6 +186,13 @@ def validate_vcf(vcf_file, strict=False, report=False):
150186
validate_info(fields[7], line_number, line)
151187
validate_format(fields[8], line_number, line)
152188

189+
if not fileformat_found:
190+
print("Error: Missing ##fileformat header")
191+
sys.exit(1)
192+
if not header_found:
193+
print("Error: Missing #CHROM header line")
194+
sys.exit(1)
195+
153196
if report:
154197
print("VCF file validation completed. No structural errors found.")
155198

0 commit comments

Comments
 (0)