1919# 1.0.0 : WMF : Created.
2020# 1.0.1 : WMF : Updated with Congenica rules.
2121# 1.0.2 : WMF : Added support for bgzipped files. Updated error message for Alternate Alleles.
22+ # 1.0.3 : WMF : Added header validations and duplicate sample detection.
23+ #
24+ # Header line syntax
25+ # ------------------
26+ # The header line names the 8 fixed, mandatory columns. These columns are as follows:
27+ # 1. #CHROM
28+ # 2. POS
29+ # 3. ID
30+ # 4. REF
31+ # 5. ALT
32+ # 6. QUAL
33+ # 7. FILTER
34+ # 8. INFO
35+ # If genotype data is present in the file, these are followed by a FORMAT column header, then an arbitrary number
36+ # of sample IDs. Duplicate sample IDs are not allowed. The header line is tab-delimited.
37+ #
38+ # Congenica Strict Rules
39+ # ##contig IDs must not start with "chr" (e.g. ID=1 ... ID=22 pass; ID=chr1 fails)
40+ # Must have a FORMAT field (9 columns)
41+ # INFO must contain SVTYPE=CNV
42+ # ALT must be <CNV> for CNV types
43+ # ID must contain "LOSS" or "GAIN"
44+ # FORMAT field must have "CN"
2245
2346import sys
2447import re
@@ -119,20 +142,33 @@ def validate_vcf(vcf_file, strict=False, report=False):
119142 with open_func (vcf_file , 'rt' ) as file :
120143
121144 line_number = 0
145+ fileformat_found = False
146+ header_found = False
122147 for line in file :
123148 line_number += 1
124149 if line .startswith ("##" ):
125- if strict and line .startswith ("##contig" ):
150+ if line .startswith ("##fileformat" ):
151+ fileformat_found = True
152+ if line .startswith ("##contig" ):
126153 contig_info = line .split ('<' ,1 )[1 ].split ('>' )[0 ]
127154 id_info = [x for x in contig_info .split (',' ) if x .startswith ('ID=' )]
128155 if id_info :
129156 contig_id = id_info [0 ].split ('=' )[1 ]
130157 if contig_id .startswith ("chr" ):
131158 print (f"Error: Contig ID starts with 'chr' on line { line_number } : { line .strip ()} " )
132159 sys .exit (1 )
133-
134160 continue
135161 elif line .startswith ("#CHROM" ):
162+ header_found = True
163+ header_fields = line .strip ().split ('\t ' )
164+ if len (header_fields ) > 8 and header_fields [8 ] != "FORMAT" :
165+ print ("Error: FORMAT column missing from header line" )
166+ sys .exit (1 )
167+ if len (header_fields ) > 9 :
168+ sample_names = header_fields [9 :]
169+ if len (sample_names ) != len (set (sample_names )):
170+ print ("Error: Duplicate sample names in header line" )
171+ sys .exit (1 )
136172 continue
137173 else :
138174 fields = line .strip ().split ('\t ' )
@@ -150,6 +186,13 @@ def validate_vcf(vcf_file, strict=False, report=False):
150186 validate_info (fields [7 ], line_number , line )
151187 validate_format (fields [8 ], line_number , line )
152188
189+ if not fileformat_found :
190+ print ("Error: Missing ##fileformat header" )
191+ sys .exit (1 )
192+ if not header_found :
193+ print ("Error: Missing #CHROM header line" )
194+ sys .exit (1 )
195+
153196 if report :
154197 print ("VCF file validation completed. No structural errors found." )
155198
0 commit comments