-
Notifications
You must be signed in to change notification settings - Fork 1
/
filterVcfOnVcf.pl
executable file
·1502 lines (1358 loc) · 55.1 KB
/
filterVcfOnVcf.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env perl
use warnings;
use strict;
use Getopt::Long;
use Parallel::ForkManager;
use Pod::Usage;
use Data::Dumper;
use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
use POSIX qw(strftime);
use FindBin qw($RealBin);
use lib "$RealBin/lib";
use lib "$RealBin/lib/dapPerlGenomicLib";
use VcfReader 0.3;
use VcfhacksUtils;
=head1 NAME
filterVcfOnVcf.pl - filter a VCF file using variants from one or more other VCF files.
=head1 SYNOPSIS
filterVcfOnVcf.pl -i <input vcf file> -f <vcf to use to filter input> [options]
filterVcfOnVcf.pl -i <input vcf file> -d <directory containing vcfs to use to filter input> [options]
filterVcfOnVcf.pl -h (show help message)
filterVcfOnVcf.pl -m (show manual page)
=head1 ARGUMENTS
=over 8
=item B<-i --input>
Input VCF file.
=item B<-o --output>
Output file. Optional - defaults to STDOUT.
=item B<-f --filter>
A VCF file to use to filter input. Variants from the input VCF that match a variant in this file will be filtered from the output. By default, if any sample matches a variant it will be filtered but combinations of --reject, --not_samples, --allele_frequency_filter, --threshold and --filter_homozygotes can be used to modify this behaviour. However, if more than one allele is present for a variant in your input VCF it will only be filtered if ALL alleles are matched in your filter VCF.
Alternatively, if you want to filter ANY matching variant regardless of genotypes in your --filter VCF, simply add the -w/--info_filter argument without specifying any values for --allele_frequency_filter, --threshold or --filter_homozygotes.
=item B<-s --samples>
Samples from input file to check. If specified only variant alleles from these samples will be used to compare against variants in filter VCFs. Any line in your input without a variant call in these samples will be filtered. Default is to look at all alleles for each variant.
=item B<-r --reject>
Samples from filter VCF files to use for filtering. If specified only variant alleles from these samples will be used to compare with the input VCF. Default is to look at all alleles.
=item B<-x --not_samples>
Samples from filter VCF files to ignore. If specified variant alleles from all samples except these will be used to compare with the input VCF. Default is to look at all alleles.
=item B<-w --info_filter>
Use this flag to use metrics written to the INFO field of your filter VCF for filtering on frequency, homozygosity or threshold counts rather than checking sample genotypes. Filtering on allele frequency (see --allele_frequency_filter option below) can be performed as long as your VCF contains allele frequency (AF) INFO tags and optionally population specific allele count (AC) and allele number (AN) tags (see --population_ids option below). This option is particularly useful for filtering against VCFs from ExAC using global and population allele frequencies. Alternatively, you can pre-process your own filter VCFs with the sampleCallsToInfo.pl script to reduce genotype information to AF, PGTS and GTC fields. Converting filter VCFs containing many hundreds/thousands of samples with sampleCallsToInfo.pl for use with this option will speed up your analyses by many orders of magnitude.
This option can also be used without using --allele_frequency_filter/--threshold/--filter_homozygotes arguments in order to filter any matching variant regardless of sample genotypes.
=item B<-y --allele_frequency_filter>
Reject variants if the allele frequency in your --filter VCF is equal to or greater than this value. If samples from the filter VCF have been specified using the --reject or --not_samples argument, only the relevant samples will be used for calculating allele frequency. Variants will only be counted if the genotype quality is greater than the value given for --un_quality. The value must be a float between 0 and 1.
=item B<--population_ids>
If using the --info_filter option and --allele_frequency_filter option, by default the filter VCF will be scanned for allele counts (e.g. AFR_AC) and allele number (e.g. AFR_AN) INFO tags for the population codes as used by EXAC (AFR, AMR, EAS, FIN, NFE, SAS). An alternative list of population IDs to search can be entered here. To disable this feature use this option specifying 'disable' as an argument.
=item B<-n --minimum_allele_number>
Minimum number of allele counts for a variant found in a --filter VCF before it will be used for filtering on allele frequency. If filtering using population allele counts (see --population_ids above) this sets the minimum number of alleles that have to be called in a population (default = 100) before the frequency value will be used for filtering. If filtering using samples in a VCF the default is 0 (i.e. no minimum allele number).
=item B<-l --threshold>
Filter lines if the number of samples containing at least one matching variant allele is equal to or greater than this value.
=item B<-z --filter_homozygotes>
Filter lines if a sample is homozygous for a variant. If --reject or --not_samples options are used only the relevant samples will be checked. If this option is used with --allele_frequency_filter or --threshold options lines will be filtered if the variant is homozygous in a sample regardless of allele frequency; if there is no homozygous sample lines will be filtered n frequency as normal. If used without --allele_frequency_filter or --threshold options lines will only be filtered if the variant is homozygous in at least one sample.
=item B<-q --quality>
Minimum variant quality as defined by the QUAL field. Variants in the input will only be printed if they have an equal or higher variant quality score. Variants in filter VCFs will only be used for filtering if they have an equal or higher variant quality score.
=item B<-g --genotype_quality>
Minimum genotype quality. Only applicable if --samples, --reject or --threshold arguments are used. Sample alleles will only be counted if they have a genotype quality score equal to or higher than this value. Default is 0 (i.e. no filtering on genotype quality).
=item B<-a --aff_quality>
Minimum genotype qualities to consider for samples specified by --samples argument only. Any sample call with a genotype quality below this threshold will be considered a no call. Default is 0.
=item B<-u --un_quality>
Minimum genotype qualities to consider for samples specified by --reject argument only. Any sample call with a genotype quality below this threshold will be considered a no call. Default is 0.
=item B<-p --print_matching>
Use this flag to reverse the script so that only matching lines are printed.
=item B<--annotation>
Use this option to specify an annotation for allele frequencies and allele numbers calculated from samples/INFO fields in your --filter VCF. These annotations will be added to the INFO field of the output as 'annotation_AF' and 'annotation_AN' where 'annotation' is the value you supply to this option. For example, if specifying '--annotation controls' it would add 'controls_AF' and 'controls_AN' INFO fields giving the allele frequency and allele numbers respectively for the samples used in your --filter VCF. These annotations can be used by programs such as findBiallelic.pl and getFunctionalVariants.pl for filtering using custom allele frequencies.
=item B<-t --forks>
Number of forks to create for parallelising your analysis. By default no forking is done. To speed up your analysis you may specify the number of parallel processes to use here. (N.B. forking only occurs if a value of 2 or more is given here as creating 1 fork only results in increased overhead with no performance benefit).
=item B<-c --cache>
Cache size. Variants are processed in batches to allow for efficient parallelisation. When forks are used the default is to process up to 10,000 variants at once or 1,000 x no. forks if more than 10 forks are used. If you find this program comsumes too much memory when forking you may want to set a lower number here. When using forks you may get improved performance by specifying a higher cache size, however the increase in memory usage is proportional to your cache size multiplied by the number of forks.
=item B<-b --progress>
Show a progress bar.
=item B<-h --help>
Show help message.
=item B<-m --manual>
Show manual page.
=back
=head1 DESCRIPTION
Filter variants from a VCF using another VCFs. You may specify samples or a number of samples with matching variants to filter with. You can also use minimum genotype or variant qualities to filter with. Alternatively, filtering can be performed on certain annotations in the INFO field of each variant using the --info_filter option.
By default, each variant from the given --input file will be assessed and if all alternative/variant alleles for a given variant are represented in the --filter VCF provided, the variant will be filtered. You may use --reject or --not_samples arguments to only filter variant alleles if present in specific samples in the --filter VCF. You may also use the --samples argument to only compare variants from specific samples in your --input VCF and filter any variants that do not have alleles represented by these samples. See above for details of all available options.
=head1 EXAMPLES
filterVcfOnVcf.pl -i input.vcf -f controls.vcf -o filtered.vcf
(filter variants in input.vcf if present in controls.vcf - at least one sample in controls.vcf must carry the variant for it to be filtered)
filterVcfOnVcf.pl -i input.vcf -f controls.vcf -o filtered.vcf -y 0.01
(filter variants in input.vcf if present at an allele frequency of 1% or higher in controls.vcf)
filterVcfOnVcf.pl -i input.vcf -f controls.vcf -o filtered.vcf -w
(filter variants in input.vcf if present in controls.vcf - genotypes of samples in controls.vcf will not be checked)
filterVcfOnVcf.pl -i input.vcf -f ExAC.r0.3.sites.vep.vcf.gz -o filtered.vcf -y 0.01 -w
(filter variants in input.vcf if present at an allele frequency of 1% or higher in ExAC using INFO fields for filtering)
=head1 AUTHOR
David A. Parry
=head1 COPYRIGHT AND LICENSE
Copyright 2013, 2014, 2015 David A. Parry
This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
=cut
my $vcf;
my $filter_vcf;
my @dirs;
my @samples
; #samples in vcf input to check calls for - filter only if they contain same allele as in filter_vcf. Default is to simply check alleles in ALT field and ignore samples.
my @reject
; #if specified will only check alleles for these samples in filter_vcfs
my @ignore_samples; #these samples will be ignored in either VCF
my $filter_with_info
; #use allele counts/genotype counts in INFO field to filter with, not samples
my $threshold =
0; #only filter if we see the allele this many times in filter_vcf
my $filter_homozygotes
; #flag to filter if any of the reject samples are homozygous
my $maf = 0;
my $print_matching = 0
; #flag telling the script to invert so we print matching lines and filter non-matching lines
my $out;
my $min_qual = 0;
my $minGQ = 0;
my $aff_quality; #will convert to $minGQ value if not specified
my $unaff_quality; #will convert to $minGQ value if not specified
my $help;
my $man;
my $progress;
my $regex; #match this regex if looking in dir
my @pop_acs = ();
#if we have only one vcf and find population allele counts and allele numbers
#we'll use those to calculate population specific frequencies and filter on MAF
#if --allele_frequency_filter argument is specified
my $MIN_AN ;
# min number of alleles required before @pop_acs values are used for filtering if using info
# or min number of alleles for freq filtering on samples
my @pop_ids = ();
# population IDs to look for for @pop_acs
my $annotate_af = '';
my $forks = 0;
my $buffer_size;
my %opts = (
a => \$aff_quality,
annotation => \$annotate_af,
b => \$progress,
c => \$buffer_size,
f => \$filter_vcf,
g => \$minGQ,
h => \$help,
i => \$vcf,
l => \$threshold,
m => \$man,
n => \$MIN_AN,
o => \$out,
p => \$print_matching,
population_ids => \@pop_ids,
q => \$min_qual,
r => \@reject,
s => \@samples,
t => \$forks,
u => \$unaff_quality,
w => \$filter_with_info,
x => \@ignore_samples,
y => \$maf,
z => \$filter_homozygotes,
);
GetOptions(
\%opts,
'annotation=s',
'a|aff_quality=i',
'b|progress',
'c|cache=i',
'f|filter=s{,}',
'g|genotype_quality=f',
'h|?|help',
'i|input=s',
'l|threshold=i',
'n|minimum_allele_number=i',
'm|manual',
'o|output=s',
'population_ids=s{,}',
'p|print_matching',
'q|quality=f',
'r|reject=s{,}',
's|samples=s{,}',
't|forks=i',
'u|un_quality=i',
'w|info_filter',
'x|not_samples=s{,}',
'y|allele_frequency_filter=f',
'z|filter_homozygotes',
) or pod2usage
(
-message => "Syntax error.",
-exitval => 2
);
pod2usage( -verbose => 2, -exitval => 0 ) if $man;
pod2usage( -verbose => 1, -exitval => 0 ) if $help;
if ( not $vcf or ( not $filter_vcf ) ){
pod2usage( -message => "Syntax error", exitval => 2 );
}
if ( @reject and @ignore_samples ){
pod2usage
(
-message => "Syntax error - cannot use --reject ".
"and --not_samples argument together",
exitval => 2,
);
}
if (( @reject or @ignore_samples ) and $filter_with_info){
pod2usage
(
-message => "Syntax error - cannot use --reject or --not_samples ".
"arguments with --info_filter option",
exitval => 2,
);
}
if ( $min_qual < 0 ){
pod2usage
(
-message => "Variant quality scores must be 0 or greater.\n",
-exitval => 2
);
}
if ( $minGQ < 0 ){
pod2usage
(
-message => "Genotype quality scores must be 0 or greater.\n",
-exitval => 2
);
}
if ( $maf < 0 or $maf > 1 ){
pod2usage
(
-message => "--allele_frequency_filter (-y) value ".
"must be between 0 and 1.\n",
-exitval => 2
);
}
if ( defined $aff_quality ) {
pod2usage
(
-message => "Genotype quality scores must be 0 or greater.\n",
-exitval => 2
) if $aff_quality < 0;
}
else {
$aff_quality = $minGQ;
}
if ( defined $unaff_quality ) {
pod2usage(
-message => "Genotype quality scores must be 0 or greater.\n",
-exitval => 2
) if $unaff_quality < 0;
}
else {
$unaff_quality = $minGQ;
}
if (not @pop_ids){
@pop_ids = qw /POPMAX AFR AMR EAS FIN NFE SAS/
;#default are gnomAD/ExAC pop codes except for OTH and ASJ
}
elsif ( grep{ /^disable$/i } @pop_ids ){
@pop_ids = ();
my $time = strftime( "%H:%M:%S", localtime );
print STDERR "[INFO - $time] Filtering on population allele count / allele".
" number disabled.\n";
}
if (not defined $MIN_AN){
#if using --info_filter we use require at least 100 alleles by default
# before filtering on freq
if ($filter_with_info) {
$MIN_AN = 100;
}else{
$MIN_AN = 0;
}
}
if ( $forks < 2 ) {
$forks = 0; #no point having overhead of forks for one fork
}
else {
if ( not $buffer_size ) {
$buffer_size = 10000 > $forks * 1000 ? 10000 : $forks * 1000;
}
my $time = strftime( "%H:%M:%S", localtime );
print STDERR "[INFO - $time] Processing in batches of $buffer_size ".
"variants split among $forks forks.\n";
}
my $time = strftime( "%H:%M:%S", localtime );
my $total_variants = 0;
print STDERR "[INFO - $time] Initializing input VCF...\n";
my ($header, $first_var, $VCF) = VcfReader::getHeaderAndFirstVariant($vcf);
die "[ERROR] Header not ok for input ($vcf) "
if not VcfReader::checkHeader( header => $header );
my %sample_to_col = ();
if (@samples) {
%sample_to_col = VcfReader::getSamples(
get_columns => 1,
header => $header,
);
foreach my $samp (@samples){
if (not exists $sample_to_col{$samp}){
die "ERROR - Sample $samp was not found in input VCF ($vcf)\n";
}
}
}
$time = strftime( "%H:%M:%S", localtime );
print STDERR "[INFO - $time] Finished initializing input VCF\n";
print STDERR "[INFO - $time] Initializing filter VCF ($filter_vcf)...\n";
my (
$filter_vcf_index,
$filter_vcf_samples,
$filter_info
) = initializeFilterVcfs( $filter_vcf );
print STDERR "[INFO - $time] Finished initializing filter VCF.\n";
if ($filter_with_info) {
check_filter_vcf_info_fields();
}
#only retain samples from filter vcfs if not specified by --not_samples argument
#if --reject argument is used only keep samples specified by --reject
#if neither argument is used keep all
if ( keys %{ $filter_vcf_samples } == 0 ) {
if (@reject) {
die
"Filter VCF $filter_vcf has no samples - cannot run with --reject argument.\n";
}
if (@ignore_samples) {
die "Filter VCF $filter_vcf has no samples - cannot run with ".
"--not_samples argument.\n";
}
if ( $threshold or $maf or $filter_homozygotes ) {
if ( not $filter_with_info ) {
die "Filter VCF $filter_vcf has no samples. Use of ".
"--allele_frequency_filter, --threshold or ".
"--filter_homozygotes options is not allowed with ".
"filter VCFs without samples unless used with the ".
"--info_filter option.\n";
}
}
}
foreach my $ignore (@ignore_samples) {
if ( exists $filter_vcf_samples->{$ignore} ) {
delete $filter_vcf_samples->{$ignore};
}else{
die "[ERROR] Sample $ignore not found in --filter VCF ($filter_vcf).\n";
}
}
foreach my $rej (@reject){
if (not exists $filter_vcf_samples->{$rej} ) {
die "[ERROR] Sample $rej not found in --filter VCF ($filter_vcf).\n";
}
}
if (@reject) {
foreach my $samp ( keys %{ $filter_vcf_samples } ) {
if ( not grep { $_ eq $samp } @reject ) {
delete $filter_vcf_samples->{$samp};
}
}
}
my $OUT;
if ($out) {
open( $OUT, ">$out" ) or die "[ERROR] Can't open $out for writing: $!\n";
}
else {
$OUT = \*STDOUT;
}
printHeader();
my $prev_chrom = 0;
my $progressbar;
my $next_update = 0;
if ($progress) {
($progressbar, $total_variants) = VcfhacksUtils::getProgressBar(
input => $vcf,
name => "Filtering",
factor => 3,
);
}else{
$time = strftime( "%H:%M:%S", localtime );
print STDERR "[INFO - $time] Filtering started.\n";
}
my $kept = 0;
my $filtered = 0;
my $n = 0;
my $variants_done = 0;
my @lines_to_process = ();
my %no_fork_args = ();
if ( $forks < 2 ) {
%no_fork_args = VcfReader::getSearchArguments
(
$filter_vcf,
$filter_vcf_index
);
}
processLine($first_var);
LINE: while ( my $line = <$VCF> ) {
processLine($line);
}
process_buffer() if $forks > 1;
if ($progressbar) {
$time = strftime( "%H:%M:%S", localtime );
$progressbar->update( $total_variants * 3 )
if $total_variants * 3 >= $next_update;
$progressbar->message( "[INFO - $time] $variants_done variants processed" );
}
$time = strftime( "%H:%M:%S", localtime );
print STDERR "\n[INFO - $time] $filtered variants filtered, $kept printed ";
print STDERR "($total_variants total)" if $total_variants;
print STDERR "\n";
################################################
#####################SUBS#######################
################################################
sub processLine{
my $line = shift;
return if $line =~ /^#/;
$variants_done++;
$n++;
checkProgress(1);
if ( $forks > 1 ) {
push @lines_to_process, $line;
checkProgress();
if ( @lines_to_process >= $buffer_size ) {
process_buffer();
@lines_to_process = ();
}
}
else {
chomp $line;
my @split = split( "\t", $line );
my $l = filter_on_vcf( \@split, \%no_fork_args );
if ($l) {
print $OUT join("\t", @$l) . "\n";
$kept++;
}
else {
$filtered++;
}
$n += 2;
checkProgress();
}
}
################################################
sub checkProgress{
return if not $progress;
my $do_count_check = shift;
if ($progressbar) {
$next_update = $progressbar->update($n) if $n >= $next_update;
}elsif($do_count_check){#input from STDIN/pipe
VcfhacksUtils::simpleProgress($variants_done, 0, " variants read" );
}
}
################################################
sub process_buffer {
return if not @lines_to_process;
my @lines_to_print;
my $lines_per_slice = @lines_to_process;
if ( $forks > 0 ) {
$lines_per_slice =
int( @lines_to_process / $forks ) > 1
? int( @lines_to_process / $forks )
: 1;
}
my @batch = ();
#get a batch for each thread
for ( my $i = 0 ; $i < @lines_to_process ; $i += $lines_per_slice ) {
my $last =
( $i + $lines_per_slice - 1 ) < $#lines_to_process
? $i + $lines_per_slice - 1
: $#lines_to_process;
if ( $i + $lines_per_slice >= @lines_to_process ) {
$last = $#lines_to_process;
}
my @temp = @lines_to_process[ $i .. $last ];
push @batch, \@temp;
}
my $pm = Parallel::ForkManager->new($forks);
$pm->run_on_finish( # called BEFORE the first call to start()
sub {
my ( $pid, $exit_code, $ident, $exit_signal, $core_dump,
$data_structure_reference )
= @_;
if ( defined($data_structure_reference) ) {
my %res = %{$data_structure_reference};
if ( ref $res{keep} eq 'ARRAY' ) {
if (@{ $res{keep} }){
$lines_to_print[$res{order}] = \@{ $res{keep} } ;
}
}
if ( $res{filter} ) {
$filtered += $res{filter};
}
$n += $res{batch_size};
checkProgress();
}
else {
die "[ERROR] no message received from child process $pid!\n";
}
}
);
my $order = -1;
foreach my $b (@batch) {
$order++;
$pm->start() and next;
my %results = process_batch($b, $order);
$pm->finish( 0, \%results );
}
$pm->wait_all_children;
#print them
if (@lines_to_print) {
my $incr_per_batch = @lines_to_process / @lines_to_print;
foreach my $batch (@lines_to_print) {
if (not defined $batch){
$n += $incr_per_batch;
checkProgress();
next;
}
my $incr_per_line = $incr_per_batch / @$batch;
foreach my $l (@$batch) {
if ( ref $l eq 'ARRAY' ) {
print $OUT join( "\t", @$l ) . "\n";
}
else {
print $OUT "$l\n";
}
$kept++;
$n += $incr_per_line;
checkProgress();
}
}
}
else {
$n += @lines_to_process;
checkProgress();
}
}
################################################
sub process_batch {
#filter a set of lines
my ($batch, $order) = @_;
my %sargs = VcfReader::getSearchArguments( $filter_vcf, $filter_vcf_index );
my %results =
(
batch_size => scalar(@$batch),
order => $order,
);
foreach my $line ( @{$batch} ) {
chomp $line;
my @split = split( "\t", $line );
my $l = filter_on_vcf( \@split, \%sargs );
if ($l) {
push @{ $results{keep} }, $l;
}
else {
$results{filter}++;
}
}
return %results;
}
################################################
sub printHeader{
my $meta_head = join("\n", grep {/^##/} @$header);
print $OUT "$meta_head\n";
foreach my $pop (@pop_acs){
my $desc = $filter_info->{"AC_$pop"}->{Description} ;
$desc =~ s/\"/\'/g;
my $an_desc = $filter_info->{"AN_$pop"}->{Description} ;
$an_desc =~ s/\"/\'/g;
my %af_info =
(
ID => "FVOV_AF_$pop",
Number => "A",
Type => "Float",
Description => "Putative population allele number from $filter_vcf.".
" Description of original AC_$pop was as follows:" .
" $desc",
);
my %ac_info =
(
ID => "FVOV_AN_$pop",
Number => "A",
Type => "Integer",
Description => "Putative population allele number from $filter_vcf. ".
"Description of original AN_$pop was as follows:".
" $an_desc",
);
print $OUT VcfhacksUtils::getInfoHeader(%af_info) . "\n";
print $OUT VcfhacksUtils::getInfoHeader(%ac_info) . "\n";
}
if ($annotate_af){
$annotate_af =~ s/\s/_/g;#no white space in INFO field
my %af_info =
(
ID => "$annotate_af" ."_AF",
Number => "A",
Type => "Float",
Description => "Allele frequency calculated by ".
"filterVcfOnVcf.pl from $filter_vcf."
);
my %ac_info =
(
ID => "$annotate_af" ."_AN",
Number => "A",
Type => "Integer",
Description => "Allele number annotated by ".
"filterVcfOnVcf.pl from $filter_vcf.",
);
my $time = strftime( "%H:%M:%S", localtime );
print STDERR "[INFO - $time] Adding INFO fields $annotate_af" .
"_AF and $annotate_af" ."_AN to output for allele ".
"frequency and allele numbers calculated from ".
"$filter_vcf...\n";
print $OUT VcfhacksUtils::getInfoHeader(%af_info) . "\n";
print $OUT VcfhacksUtils::getInfoHeader(%ac_info) . "\n";
}
print $OUT VcfhacksUtils::getOptsVcfHeader(%opts) . "\n";
print $OUT "$header->[-1]\n";
}
################################################
sub filter_on_info_fields {
my ( $vcf_line, $search_args ) = @_;
my $qual = VcfReader::getVariantField( $vcf_line, 'QUAL' );
my $chrom = VcfReader::getVariantField( $vcf_line, 'CHROM' );
if ($min_qual && $qual < $min_qual ) {
return;
}
#process each allele separately in case we have MNVs/deletions
#that need to be simplified
my %min_vars = VcfReader::minimizeAlleles( $vcf_line, );
my %sample_alleles = ();
if (@samples) {
%sample_alleles = map { $_ => undef } VcfReader::getSampleCall(
line => $vcf_line,
multiple => \@samples,
return_alleles_only => 1,
minGQ => $aff_quality,
sample_to_columns => \%sample_to_col
);
deleteRefAndNoCallAlleles(\%sample_alleles, \%min_vars);
return
if not keys
%sample_alleles; #filter if we don't have any variants in our samples
}
else {
#if no samples specified use all alleles for %sample_alleles
%sample_alleles = map { $_ => undef } keys %min_vars;
deleteRefAndNoCallAlleles(\%sample_alleles, \%min_vars);
}
my %sample_matches = (); #check each allele matches in all filter_vcfs
# but don't reset after each file
my %thresh_counts = (); #count samples in all filter_vcfs
# i.e. don't reset after each file
my %af_counts; #count allele occurences and total alleles to calc
# allele frequency and don't reset after each file
my %f_genos = (); #store genotypes as keys if using $filter_homozygote
my %alleles_over_maf = (); #store alleles that have exceeded maf in here
ALLELE: foreach my $allele ( keys %sample_alleles ) {
if (
my @snp_hits = VcfReader::searchForPosition(
%{$search_args},
chrom => $min_vars{$allele}->{CHROM},
pos => $min_vars{$allele}->{POS}
)
)
{
#get genotype call codes (0, 1, 2 etc.) for filter samples
FILTER_LINE: foreach my $snp_line (@snp_hits) {
my @f_alts = ();
my %geno_counts = ();
my @snp_split = split( "\t", $snp_line );
if ($min_qual) {
my $filter_qual =
VcfReader::getVariantField( \@snp_split, 'QUAL' );
next FILTER_LINE if $filter_qual < $min_qual;
}
my %filter_min = VcfReader::minimizeAlleles( \@snp_split, );
my @alts = VcfReader::readAlleles( line => \@snp_split, );
for ( my $i = 0 ; $i < @alts ; $i++ ) {
push @f_alts, $i;
}
if ( $filter_homozygotes )
{
my @gtcs = split(
",",
VcfReader::getVariantInfoField( \@snp_split,
"GTC", )
);
my @pgts = split(
",",
VcfReader::getVariantInfoField(
\@snp_split, "PGTS",
)
);
die "Genotype count (GTC) does not have the same number ".
"of alleles as possible genotypes (PGTS) field for ".
"line:\n$snp_line\n"
if @gtcs != @pgts;
for ( my $i = 0 ; $i < @gtcs ; $i++ ) {
$geno_counts{ $pgts[$i] } = $gtcs[$i];
}
}
my $filter_match = ''; #if one of the filter's ALTs matches
# store the ALT allele code here
ALT: foreach my $alt (@f_alts) {
next ALT if $alt eq '.';
next ALT if $alt == 0;
next ALT
if $min_vars{$allele}->{POS} ne
$filter_min{$alt}->{POS};
next ALT
if $min_vars{$allele}->{REF} ne
$filter_min{$alt}->{REF};
next ALT
if $min_vars{$allele}->{ALT} ne
$filter_min{$alt}->{ALT};
$min_vars{$allele}->{CHROM} =~ s/^chr//;
$filter_min{$alt}->{CHROM} =~ s/^chr//;
next ALT
if $min_vars{$allele}->{CHROM} ne
$filter_min{$alt}->{CHROM};
#filter ALT matches input allele
$filter_match = $alt;
$sample_matches{$allele}++;
last ALT;
}
if ( not $filter_match ) {
next FILTER_LINE;
}
if(@pop_acs){
add_pop_freqs_to_allele
(
$min_vars{$allele},
\@snp_split,
$filter_match
);
}
if ($annotate_af){
add_annotated_af_to_allele
(
$min_vars{$allele},
\@snp_split,
$filter_match
);
}
if ($threshold) {
foreach my $k ( keys %geno_counts ) {
my @g_alleles = split( /[\/\|]/, $k );
if ( grep { $_ eq $filter_match } @g_alleles ) {
$thresh_counts{$allele} += $geno_counts{$k};
}
}
}
if ($maf) {
if ( exists $filter_info->{AF}) {
my @afs = split(
",",
VcfReader::getVariantInfoField(
\@snp_split, "AF",
)
);
my $freq = $afs[ $filter_match - 1 ];
if ( exists $filter_info->{AN}) {
my $an = VcfReader::getVariantInfoField
(
\@snp_split,
"AN"
);
if ($an >= $MIN_AN){
if ( $freq >= $maf ) {
$alleles_over_maf{$allele}++;
}
}
}elsif ( $freq >= $maf ) {
$alleles_over_maf{$allele}++;
}
if(@pop_acs and not $alleles_over_maf{$allele}){
#if total AF not over maf check individual populations
if ( pop_freq_over_maf($min_vars{$allele}) ) {
$alleles_over_maf{$allele}++;
}
}
}
else {
foreach my $k ( keys %geno_counts ) {
my @g_alleles = split( /[\/\|]/, $k );
foreach my $g (@g_alleles) {
if ( $g eq $filter_match ) {
$af_counts{$allele}->{counts} +=
$geno_counts{$k};
}
$af_counts{$allele}->{total} +=
$geno_counts{$k};
}
}
my $freq = 0;
eval {
$freq =
$af_counts{$allele}->{counts} /
$af_counts{$allele}->{total};
};
$alleles_over_maf{$allele}++ if $freq >= $maf;
}
}
} #read pos
} #search
} #foreach allele
#done each allele - see if we've got enough data to filter this line
if ( not $maf and not $filter_homozygotes ) {
# no filters - default is filter if we found a match
if (keys %sample_matches == keys %sample_alleles ){#found all alleles
if ($print_matching) {
$vcf_line = annotate_pop_freqs(\%min_vars, $vcf_line);
return $vcf_line;
}else {
return;
}
}else{#not all alleles found so don't filter
if (not $print_matching) {
$vcf_line = annotate_pop_freqs(\%min_vars, $vcf_line);
return $vcf_line;
}else {
return;
}
}
}
my $homozygous_alleles = 0;
if ($filter_homozygotes) {
foreach my $allele ( keys %sample_alleles ) {
if ( exists $f_genos{"$allele/$allele"}
or exists $f_genos{"$allele|$allele"} )
{
$homozygous_alleles++;
}
}
if ( $homozygous_alleles == keys %sample_alleles ) {
if ($print_matching) {
return $vcf_line;
}
else {
return;
}
}
}
#check allele frequency if given
if ($maf) {
if ( keys %alleles_over_maf == keys %sample_alleles ) {
if ($print_matching) {
$vcf_line = annotate_pop_freqs(\%min_vars, $vcf_line);
return $vcf_line;
}
else {
return;
}
}
}
if ($print_matching) {
return;
}
else {
$vcf_line = annotate_pop_freqs(\%min_vars, $vcf_line);
return $vcf_line;
}
}
#################################################
sub deleteRefAndNoCallAlleles{
my $al_hash = shift;
my $min = shift;
my @del = ();
foreach my $k (keys %$al_hash){
if ($k eq '0' or $k eq '.'){
push @del, $k;
}elsif($min->{$k}->{ORIGINAL_ALT} eq '*'){
push @del, $k;
}
}
foreach my $d (@del){
delete $al_hash->{$d};
}
}
#################################################
sub pop_freq_over_maf{
my $min_allele = shift;
foreach my $pop (@pop_acs){
next if not $min_allele->{"FVOV_AN_$pop"} ;
next if $min_allele->{"FVOV_AN_$pop"} eq '.';
next if $min_allele->{"FVOV_AN_$pop"} < $MIN_AN;
if ( $min_allele->{"FVOV_AF_$pop"} >= $maf){
return 1;
}
}
return 0;
}
#################################################
sub annotate_pop_freqs{
my $m_var = shift;