From 9368afc269384d3dc31e8ee96d1501327cc57db1 Mon Sep 17 00:00:00 2001 From: Javier Herrero Date: Wed, 11 May 2016 10:48:58 +0100 Subject: [PATCH] Even more code cleanup --- eForge/eForge.pm | 136 ++++++++++++++++++++----------------------------------- eforge.pl | 63 ++++++++++++++------------ 2 files changed, 82 insertions(+), 117 deletions(-) diff --git a/eForge/eForge.pm b/eForge/eForge.pm index 51fd76e..501d7f7 100644 --- a/eForge/eForge.pm +++ b/eForge/eForge.pm @@ -49,7 +49,7 @@ our $VERSION = '0.01'; our (@ISA, @EXPORT); use Exporter; @ISA = qw(Exporter); -@EXPORT = qw(get_all_datasets get_all_arrays get_all_proximity_filters process_file match process_overlaps get_probe_annotations_and_overlap_for_dataset get_samples_from_dataset assign bkgrdstat proximity_filter); +@EXPORT = qw(get_all_datasets get_all_arrays get_all_proximity_filters process_file get_random_matching_picks process_overlaps get_probe_annotations_and_overlap_for_dataset get_samples_from_dataset assign bkgrdstat proximity_filter); =head1 SYNOPSIS @@ -61,7 +61,7 @@ get_all_datasets get_all_arrays get_all_proximity_filters process_file -match +get_random_matching_picks process_overlaps get_probe_annotations_and_overlap_for_dataset get_samples_from_dataset @@ -164,95 +164,54 @@ sub process_file { return $probe_ids; } -=head2 match -Identifies the bins that each of the probes in a probe hash lies in, and then picks matching probes for the number of reps specified. +=head2 get_random_matching_picks -=cut + Arg[1] : hashref $overlaps + Arg[2] : string $array_tag + Arg[3] : string $data_dir + Arg[4] : int $num_random_picks + Returns : arrayref of arrays of $probe_ids (string) + Example : my $random_picks = get_random_matching_picks($overlaps, "450k", ".", 1000); + Description : Get several random picks of probes matching the criteria defined in the $overlaps + hash. The random picks are selected from a pre-built hash stored in the $data_dir + called mvp_450k_bins (or so). + Exceptions : Cannot find the bins file for the selected array. -#it is not enough to change match, we also have to change bkgrdstat and process_overlaps and get_bits -#(we don't use "assign" as we do not use percentile bins so no point in changing "assign") -sub match{ - #we take out the percentile bins ($per): - #my ($mvps, $array, $datadir, $per, $reps) = @_; - my ($mvps, $array, $datadir, $reps) = @_; - #my ($bins, $params, %bins, %params); - my ($bins, %bins); - # load up the stored hashes that contain the bins of mvps by feature and cpg island relationship. - # These are precalculated according to the parameters that are hard coded above. - # the hash to load is defined by the bkgd option - defaults to '450k' - #$bins = $datadir . "/mvp_bins"; - #$bins = $datadir . "snp_bins.$per"; - #$params = $datadir . "snp_params.$per"; - - if ($array =~ "27k"){ - $bins = $datadir . "/mvp_27k_bins"; - } - else{ - $bins = $datadir . "/mvp_450k_bins"; +=cut - } +sub get_random_matching_picks { + my ($overlaps, $array, $datadir, $num_random_picks) = @_; + my $picks = []; - #took params out, do not need params - #if (-e $bins && -e $params){ - if (-e $bins){ - %bins = %{ retrieve($bins) }; - #took params out, do not need params - #%params = %{ retrieve($params)}; - } - else{ - die "Cannot retrieve the file $bins\n"; + # load up the stored hashes that contain the bins of mvps by feature and cpg island relationship. + my %bins; + my $bins_file = $datadir . "/mvp_${array}_bins"; + if (-e $bins_file) { + %bins = %{ retrieve($bins_file) }; + } else { + die "Cannot retrieve the file $bins_file\n"; } - my (%picks); - - - - foreach my $cg (keys %{$$mvps{'MVPS'}}){ - srand; - my ($feature, $cpg_island_relationship) = split("\t", join("\t", $$mvps{'MVPS'}{$cg}{'PARAMS'})); - #$cg is the test mvp, $cgid is the matched mvp. + foreach my $probe_id (keys %{$overlaps->{'MVPS'}}) { + my ($feature, $cpg_island_relationship) = split("\t", join("\t", $overlaps->{'MVPS'}->{$probe_id}->{'PARAMS'})); #range has to be the number of probes to choose from in that hash subclass - my $range=scalar @{$bins{$feature}{$cpg_island_relationship}}; + my $range = scalar @{$bins{$feature}{$cpg_island_relationship}}; - for (my $n = 1; $n <= $reps; $n++) { - my ($mvp_string, $cgid); - while (1){ + for (my $n = 0; $n < $num_random_picks; $n++) { + my $picked_probe_id; + while (1) { my $pick = int(rand($range)); - $mvp_string = ${$bins{$feature}{$cpg_island_relationship}}[$pick]; #pick the $pick'th element in the array as the chosen mvp - #(undef, undef, undef, $cgid) = split /\t/, $mvp_string; #did not set the splitting on, check whether this can be done - $cgid=$mvp_string; - last unless $cgid eq $cg; # must not pick the test mvp itself. - } - push @{$picks{$n}}, $cgid; # each $n array is a set of probes matching the test set/ it is allowed to pick the same probe more than once in this background selection - } - } - return \%picks; - } - -#commented previous foreach with three parameters(maf, tss, gc content) can be found below: -#foreach my $cg (keys %{$$mvps{'SNPS'}}){ -# srand; -# my ($maf, $tss, $gc) = split("\t", join("\t", $$mvps{'SNPS'}{$cg}{'PARAMS'})); -# #$cg is the test mvp, $cgid is the matched mvp. -# my ($i, $j, $k) = assign ($gc, $tss, $maf, \%params); -# -# my $range = scalar @{$bins{$i}{$j}{$k}}; -# for (my $n = 1; $n <= $reps; $n++) { -# my ($mvp_string, $cgid); -# while (1){ -# my $pick = int(rand($range)); -# $mvp_string = ${$bins{$i}{$j}{$k}}[$pick]; #pick the $pick'th element in the array as the chosen mvp " -# (undef, undef, undef, $cgid) = split /\t/, $mvp_string; -# last unless $cgid eq $cg; # must not pick the test mvp itself. -# } -# push @{$picks{$n}}, $cgid; # each $n array is a set of probes matching the test set/ it is allowed to pick the same probe more than once in this background selection -# } -# } -# return \%picks; - + $picked_probe_id = ${$bins{$feature}{$cpg_island_relationship}}[$pick]; #pick the $pick'th element in the array as the chosen mvp + last unless $picked_probe_id eq $probe_id; # must not pick the test mvp itself. + } + push(@{$picks->[$n]}, $picked_probe_id); + } + } + return $picks; +} =head2 process_overlaps @@ -271,30 +230,31 @@ sub match{ sub process_overlaps { my ($rows, $cells, $data) = @_; - my %test; - my @test_cells; + my $overlaps; + my @overlapping_probes_per_cell; my @indexes = 0..(@$cells-1); foreach my $row (@{$rows}){ my ($probeid, $sum, $bit_string, $feature, $cpg_island_relationship) = @$row; - $test{'MVPS'}{$probeid}{'SUM'} = $sum; - $test{'MVPS'}{$probeid}{'PARAMS'} = join("\t", $feature, $cpg_island_relationship); + $overlaps->{'MVPS'}->{$probeid}->{'SUM'} = $sum; + $overlaps->{'MVPS'}->{$probeid}->{'PARAMS'} = join("\t", $feature, $cpg_island_relationship); die "For $data, found ".scalar(@$cells)." cells for ".length($bit_string)." bits\n" if (scalar(@$cells) ne length($bit_string)); foreach my $index (@indexes) { ## $bit_string is a string made of 0s and 1s. If it is a 1 for this position, count and push if (substr($bit_string, $index, 1)) { - $test_cells[$index][0]++; - push @{$test_cells[$index][1]}, $probeid; + push @{$overlapping_probes_per_cell[$index]}, $probeid; } } } my $index = 0; foreach my $cell (@$cells){ - $test{'CELLS'}{$cell}{'COUNT'} = $test_cells[$index][0] if ($test_cells[$index][0]); - $test{'CELLS'}{$cell}{'MVPS'} = $test_cells[$index][1] if ($test_cells[$index][1]); + if ($overlapping_probes_per_cell[$index] and @{$overlapping_probes_per_cell[$index]}) { + $overlaps->{'CELLS'}->{$cell}->{'COUNT'} = scalar(@{$overlapping_probes_per_cell[$index]}); + $overlaps->{'CELLS'}->{$cell}->{'MVPS'} = $overlapping_probes_per_cell[$index]; + } $index++; } - return \%test; + return $overlaps; } diff --git a/eforge.pl b/eforge.pl index 5a6d09a..f4281e1 100644 --- a/eforge.pl +++ b/eforge.pl @@ -382,16 +382,16 @@ =head1 ACKNOWLEDGEMENTS my ($cells, $samples) = get_samples_from_dataset($dbh, $dataset); # unpack the bitstrings and store the overlaps by cell. -# $test is a complex hash like: -# $test{'MVPS'}{$probe_id}{'SUM'} (total number of overlaps of this probe with DHS/etc in this dataset) -# $test{'MVPS'}{$probe_id}{'PARAMS'} (gene and CGI annotations for this probe) -# $test{'CELLS'}{$cell}{'COUNT'} (number of input MVPs that overlap with the signal on this sample) -# $test{'CELLS'}{$cell}{'MVPS'} (list of input MVPs that overlap with the signal on this sample) -my $test = process_overlaps($annotated_probes, $cells, $dataset); +# $overlaps is a complex hash like: +# $overlaps->{'MVPS'}->{$probe_id}->{'SUM'} (total number of overlaps of this probe with features in this dataset) +# $overlaps->{'MVPS'}->{$probe_id}->{'PARAMS'} (gene and CGI annotations for this probe) +# $overlaps->{'CELLS'}->{$cell}->{'COUNT'} (number of input MVPs that overlap with the signal on this sample) +# $overlaps->{'CELLS'}->{$cell}->{'MVPS'} (list of input MVPs that overlap with the signal on this sample) +my $overlaps = process_overlaps($annotated_probes, $cells, $dataset); # generate stats on the background selection if (defined $bkgrdstat) { - bkgrdstat($test, $lab, "test"); + bkgrdstat($overlaps, $lab, "test"); } @@ -401,36 +401,40 @@ =head1 ACKNOWLEDGEMENTS #(in subroutines process_overlaps, etc) -# identify the feature and cpg island relationship, and then make bkgrd picks (old version just below) -#my $picks = match(\%$test, $array, $datadir, $per, $reps); +# Identify the feature and cpg island relationship, and then make random picks warn "[".scalar(localtime())."] Loading the $array background...\n"; -my $picks = match(\%$test, $array, $datadir, $reps); +my $random_picks = get_random_matching_picks($overlaps, $array, $datadir, $reps); ########check below lines: # for bkgrd set need to get distribution of counts instead # make a hash of data -> cell -> bkgrd-Set -> overlap counts -my %bkgrd; #this hash is going to store the bkgrd overlaps +my %overlaps_per_cell; #this hash is going to store the bkgrd overlaps # Get the bits for the background sets and process -my $backmvps; +my $total_num_probes_in_random_picks; warn "[".scalar(localtime())."] Running the analysis with $num_of_valid_probes MVPs...\n"; my $num = 0; -foreach my $bkgrd (keys %{$picks}) { +foreach my $this_random_pick (@$random_picks) { warn "[".scalar(localtime())."] Repetition $num out of ".$reps."\n" if (++$num%100 == 0); - #$annotated_probes = get_probe_annotations_and_overlap_for_dataset(\@{$$picks{$bkgrd}}, $sth); - $annotated_probes = get_probe_annotations_and_overlap_for_dataset($dbh, $dataset, $array, \@{$$picks{$bkgrd}}); - $backmvps += scalar @$annotated_probes; #$backmvps is the total number of background probes analysed + $annotated_probes = get_probe_annotations_and_overlap_for_dataset($dbh, $dataset, $array, $this_random_pick); + + $total_num_probes_in_random_picks += scalar @$annotated_probes; + unless (scalar @$annotated_probes == $num_of_valid_probes) { - warn "Background " . $bkgrd . " only " . scalar @$annotated_probes . " probes out of $num_of_valid_probes\n"; + warn "Random pick #$num only has " . scalar @$annotated_probes . " probes compared to $num_of_valid_probes in the input set.\n"; } - my $result = process_overlaps($annotated_probes, $cells, $dataset); - foreach my $cell (keys %{$$result{'CELLS'}}) { - push @{$bkgrd{$cell}}, $$result{'CELLS'}{$cell}{'COUNT'}; # accumulate the overlap counts by cell + + my $this_pick_overlaps = process_overlaps($annotated_probes, $cells, $dataset); + + # accumulate the overlap counts by cell + foreach my $cell (keys %{$this_pick_overlaps->{'CELLS'}}) { + push @{$overlaps_per_cell{$cell}}, $this_pick_overlaps->{'CELLS'}->{$cell}->{'COUNT'}; } + if (defined $bkgrdstat) { - bkgrdstat($result, $lab, $bkgrd); + bkgrdstat($this_pick_overlaps, $lab, $this_random_pick); } } @@ -458,18 +462,18 @@ =head1 ACKNOWLEDGEMENTS # ultimately want a data frame of names(results)<-c("Zscore", "Cell", "Tissue", "File", "MVPs") if (!$web) { - print BACKGROUND join("\t", @{$bkgrd{$cell}}), "\n"; + print BACKGROUND join("\t", @{$overlaps_per_cell{$cell}}), "\n"; } - my $teststat = ($test->{'CELLS'}->{$cell}->{'COUNT'} or 0); #number of overlaps for the test MVPs + my $teststat = ($overlaps->{'CELLS'}->{$cell}->{'COUNT'} or 0); #number of overlaps for the test MVPs # binomial pvalue, probability of success is derived from the background overlaps over the tests for this cell # $backmvps is the total number of background mvps analysed # $tests is the number of overlaps found over all the background tests - my $tests; - foreach (@{$bkgrd{$cell}}) { - $tests += $_; + my $total_num_overlaps_in_random_picks; + foreach (@{$overlaps_per_cell{$cell}}) { + $total_num_overlaps_in_random_picks += $_; } - my $p = sprintf("%.6f", $tests/$backmvps); + my $p = sprintf("%.6f", $total_num_overlaps_in_random_picks / $total_num_probes_in_random_picks); # binomial probability for $teststat or more hits out of $mvpcount mvps # sum the binomial for each k out of n above $teststat @@ -491,10 +495,11 @@ =head1 ACKNOWLEDGEMENTS $pbinom = sprintf("%.2e", $pbinom); # Z score calculation (note: this is here only for legacy reasons. Z-scores assume normal distribution) - my $zscore = zscore($teststat, $bkgrd{$cell}); + my $zscore = zscore($teststat, $overlaps_per_cell{$cell}); my $mvp_string = ""; - $mvp_string = join(",", @{$$test{'CELLS'}{$cell}{'MVPS'}}) if defined $$test{'CELLS'}{$cell}{'MVPS'}; + $mvp_string = join(",", @{$overlaps->{'CELLS'}->{$cell}->{'MVPS'}}) + if defined $overlaps->{'CELLS'}->{$cell}->{'MVPS'}; # This gives the list of overlapping MVPs for use in the tooltips. If there are a lot of them this can be a little useless my ($shortcell, undef) = split('\|', $cell); # undo the concatenation from earlier to deal with identical cell names.