diff --git a/eForge/eForge.pm b/eForge/eForge.pm index 501d7f7..fb16d05 100644 --- a/eForge/eForge.pm +++ b/eForge/eForge.pm @@ -49,7 +49,7 @@ our $VERSION = '0.01'; our (@ISA, @EXPORT); use Exporter; @ISA = qw(Exporter); -@EXPORT = qw(get_all_datasets get_all_arrays get_all_proximity_filters process_file get_random_matching_picks process_overlaps get_probe_annotations_and_overlap_for_dataset get_samples_from_dataset assign bkgrdstat proximity_filter); +@EXPORT = qw(get_all_datasets get_all_arrays get_all_proximity_filters process_file get_random_matching_picks process_overlaps get_probe_annotations_and_overlap_for_dataset get_samples_from_dataset assign save_probe_annotation_stats proximity_filter); =head1 SYNOPSIS @@ -66,36 +66,47 @@ process_overlaps get_probe_annotations_and_overlap_for_dataset get_samples_from_dataset assign -bkgrdstat +save_probe_annotation_stats proximity_filter =head1 SUBROUTINES/METHODS -=head2 bkgrdstat - -Gives the background statistics. - -=cut - -sub bkgrdstat{ - my ($test, $lab, $flag) = @_; - my $bfh; - my $file = "$lab.bkgrd.stats"; - if ($flag eq "test") { - open $bfh, ">", $file or die "cannot open $file"; - } - else{ - open $bfh, ">>", $file or die "cannot open $file"; - } - my (@feature, @cpg_island_relationship); - foreach my $probeid (keys %{$$test{'MVPS'}}){ - my ($feature, $cpg_island_relationship) = split "\t", $$test{'MVPS'}{$probeid}{'PARAMS'}; - push @feature, $feature; - push @cpg_island_relationship, $cpg_island_relationship; - } - say $bfh join("\t", $flag, "feature", @feature); - say $bfh join("\t", $flag, "cpg_island_relationship", @cpg_island_relationship); - } + +=head2 save_probe_annotation_stats + + Arg[1] : hashref $overlaps (see get_overlaps) + Arg[2] : string $outdir + Arg[3] : string $label + Arg[4] : string $set_id + Returns : + Example : save_probe_annotation_stats($overlaps, ".", "Unnamed", "test"); + Example : save_probe_annotation_stats($overlaps, ".", "Unnamed", 23); + Description : Save stats about the probe annotations on a text file. The $set_id is either + "test" (which relates to the input probe_ids, the ones to be tested for + enrichment) or the random pick number. + The file will contain for each set the whole list of gene features and CpG islands + relationships for the probes in that set. + Exceptions : Dies if file cannot be opened + +=cut + +sub save_probe_annotation_stats { + my ($overlaps, $out_dir, $lab, $flag) = @_; + + my $fh; + my $file = "$out_dir/$lab.overlaps.stats.txt"; + open(STATS, ">>$file") or die "cannot open $file"; + my (@gene_features, @cpg_island_relationships); + foreach my $probeid (keys %{$overlaps->{'MVPS'}}){ + my ($this_gene_feature, $this_cpg_island_relationship) = + split("\t", $overlaps->{'MVPS'}->{$probeid}->{'PARAMS'}); + push @gene_features, $this_gene_feature; + push @cpg_island_relationships, $this_cpg_island_relationship; + } + say STATS join("\t", $flag, "gene_features", @gene_features); + say STATS join("\t", $flag, "cpg_island_relationships", @cpg_island_relationships); + close(STATS); +} =head2 process_file diff --git a/eforge.pl b/eforge.pl index f4281e1..3709497 100644 --- a/eforge.pl +++ b/eforge.pl @@ -94,9 +94,9 @@ =head1 OPTIONS Give a value as the lower threshold and only MVPs with -log10 pvalues >= to the threshold will be analysed. Default is no filtering. -=item B<--bkgrd> +=item B<--save_stats> -Output background stats for investigation. +Output annotation stats for the original and the random picks. =item B<--reps INT> @@ -108,7 +108,9 @@ =head1 OPTIONS eForge will report MVPs removed due to proximity with another MVP in the list and will randomly pick one of the probes among the set of probes that are in proximity (within 1 kb of each other). -To turn off proximity filtering specify -noproxy +At the moment, this is a dummy flag as only one proximity filter is available for each array. It +will become useful if the database and code support more than one. At the moment to turn off +proximity filtering, simply specify -noproxy =item B<--noproxy> @@ -202,13 +204,13 @@ =head1 ACKNOWLEDGEMENTS my $min_num_probes = 5; # the minimum number of probes allowed for test. Set to 5 as we have binomial p -my ($dataset, $filename, $bkgrdstat, $noplot, +my ($dataset, $filename, $save_probe_annotation_stats, $noplot, $help, $man, $proxy, $noproxy, $depletion, $filter, $out_dir, $probe_list, $web, $autoopen); GetOptions ( 'dataset=s' => \$dataset, - 'bkgrd' => \$bkgrdstat, + 'save_stats|bkgrd' => \$save_probe_annotation_stats, 'array|bkgd=s' => \$array, 'label=s' => \$label, 'f=s' => \$filename, @@ -238,6 +240,8 @@ =head1 ACKNOWLEDGEMENTS my $ug = new Data::UUID; $out_dir = $ug->to_hexstring($ug->create()); } +mkdir $out_dir; + # Define the thresholds to use. if ($thresh) { @@ -390,8 +394,8 @@ =head1 ACKNOWLEDGEMENTS my $overlaps = process_overlaps($annotated_probes, $cells, $dataset); # generate stats on the background selection -if (defined $bkgrdstat) { - bkgrdstat($overlaps, $lab, "test"); +if (defined $save_probe_annotation_stats) { + save_probe_annotation_stats($overlaps, $out_dir, $lab, "test"); } @@ -409,21 +413,21 @@ =head1 ACKNOWLEDGEMENTS # for bkgrd set need to get distribution of counts instead # make a hash of data -> cell -> bkgrd-Set -> overlap counts -my %overlaps_per_cell; #this hash is going to store the bkgrd overlaps +my %overlaps_per_cell; #this hash is going to store the overlaps for the random picks, per cell # Get the bits for the background sets and process my $total_num_probes_in_random_picks; warn "[".scalar(localtime())."] Running the analysis with $num_of_valid_probes MVPs...\n"; -my $num = 0; +my $count = 0; foreach my $this_random_pick (@$random_picks) { - warn "[".scalar(localtime())."] Repetition $num out of ".$reps."\n" if (++$num%100 == 0); + warn "[".scalar(localtime())."] Repetition $count out of ".$reps."\n" if (++$count%100 == 0); $annotated_probes = get_probe_annotations_and_overlap_for_dataset($dbh, $dataset, $array, $this_random_pick); $total_num_probes_in_random_picks += scalar @$annotated_probes; unless (scalar @$annotated_probes == $num_of_valid_probes) { - warn "Random pick #$num only has " . scalar @$annotated_probes . " probes compared to $num_of_valid_probes in the input set.\n"; + warn "Random pick #$count only has " . scalar @$annotated_probes . " probes compared to $num_of_valid_probes in the input set.\n"; } my $this_pick_overlaps = process_overlaps($annotated_probes, $cells, $dataset); @@ -433,8 +437,8 @@ =head1 ACKNOWLEDGEMENTS push @{$overlaps_per_cell{$cell}}, $this_pick_overlaps->{'CELLS'}->{$cell}->{'COUNT'}; } - if (defined $bkgrdstat) { - bkgrdstat($this_pick_overlaps, $lab, $this_random_pick); + if (defined $save_probe_annotation_stats) { + save_probe_annotation_stats($this_pick_overlaps, $out_dir, $lab, $count); } } @@ -446,8 +450,6 @@ =head1 ACKNOWLEDGEMENTS #the table to be read into R for plotting. -mkdir $out_dir; - if (!$web) { open(BACKGROUND, "| gzip -9 > $out_dir/background.tsv.gz") or die "Cannot open background.tsv"; }