Permalink
Browse files

Even more code cleanup

  • Loading branch information...
1 parent a97934e commit 9368afc269384d3dc31e8ee96d1501327cc57db1 @jherrero jherrero committed May 11, 2016
Showing with 82 additions and 117 deletions.
  1. +48 −88 eForge/eForge.pm
  2. +34 −29 eforge.pl
View
@@ -49,7 +49,7 @@ our $VERSION = '0.01';
our (@ISA, @EXPORT);
use Exporter;
@ISA = qw(Exporter);
-@EXPORT = qw(get_all_datasets get_all_arrays get_all_proximity_filters process_file match process_overlaps get_probe_annotations_and_overlap_for_dataset get_samples_from_dataset assign bkgrdstat proximity_filter);
+@EXPORT = qw(get_all_datasets get_all_arrays get_all_proximity_filters process_file get_random_matching_picks process_overlaps get_probe_annotations_and_overlap_for_dataset get_samples_from_dataset assign bkgrdstat proximity_filter);
=head1 SYNOPSIS
@@ -61,7 +61,7 @@ get_all_datasets
get_all_arrays
get_all_proximity_filters
process_file
-match
+get_random_matching_picks
process_overlaps
get_probe_annotations_and_overlap_for_dataset
get_samples_from_dataset
@@ -164,95 +164,54 @@ sub process_file {
return $probe_ids;
}
-=head2 match
-Identifies the bins that each of the probes in a probe hash lies in, and then picks matching probes for the number of reps specified.
+=head2 get_random_matching_picks
-=cut
+ Arg[1] : hashref $overlaps
+ Arg[2] : string $array_tag
+ Arg[3] : string $data_dir
+ Arg[4] : int $num_random_picks
+ Returns : arrayref of arrays of $probe_ids (string)
+ Example : my $random_picks = get_random_matching_picks($overlaps, "450k", ".", 1000);
+ Description : Get several random picks of probes matching the criteria defined in the $overlaps
+ hash. The random picks are selected from a pre-built hash stored in the $data_dir
+ called mvp_450k_bins (or so).
+ Exceptions : Cannot find the bins file for the selected array.
-#it is not enough to change match, we also have to change bkgrdstat and process_overlaps and get_bits
-#(we don't use "assign" as we do not use percentile bins so no point in changing "assign")
-sub match{
- #we take out the percentile bins ($per):
- #my ($mvps, $array, $datadir, $per, $reps) = @_;
- my ($mvps, $array, $datadir, $reps) = @_;
- #my ($bins, $params, %bins, %params);
- my ($bins, %bins);
- # load up the stored hashes that contain the bins of mvps by feature and cpg island relationship.
- # These are precalculated according to the parameters that are hard coded above.
- # the hash to load is defined by the bkgd option - defaults to '450k'
- #$bins = $datadir . "/mvp_bins";
- #$bins = $datadir . "snp_bins.$per";
- #$params = $datadir . "snp_params.$per";
-
- if ($array =~ "27k"){
- $bins = $datadir . "/mvp_27k_bins";
- }
- else{
- $bins = $datadir . "/mvp_450k_bins";
+=cut
- }
+sub get_random_matching_picks {
+ my ($overlaps, $array, $datadir, $num_random_picks) = @_;
+ my $picks = [];
- #took params out, do not need params
- #if (-e $bins && -e $params){
- if (-e $bins){
- %bins = %{ retrieve($bins) };
- #took params out, do not need params
- #%params = %{ retrieve($params)};
- }
- else{
- die "Cannot retrieve the file $bins\n";
+ # load up the stored hashes that contain the bins of mvps by feature and cpg island relationship.
+ my %bins;
+ my $bins_file = $datadir . "/mvp_${array}_bins";
+ if (-e $bins_file) {
+ %bins = %{ retrieve($bins_file) };
+ } else {
+ die "Cannot retrieve the file $bins_file\n";
}
- my (%picks);
-
-
-
- foreach my $cg (keys %{$$mvps{'MVPS'}}){
- srand;
- my ($feature, $cpg_island_relationship) = split("\t", join("\t", $$mvps{'MVPS'}{$cg}{'PARAMS'}));
- #$cg is the test mvp, $cgid is the matched mvp.
+ foreach my $probe_id (keys %{$overlaps->{'MVPS'}}) {
+ my ($feature, $cpg_island_relationship) = split("\t", join("\t", $overlaps->{'MVPS'}->{$probe_id}->{'PARAMS'}));
#range has to be the number of probes to choose from in that hash subclass
- my $range=scalar @{$bins{$feature}{$cpg_island_relationship}};
+ my $range = scalar @{$bins{$feature}{$cpg_island_relationship}};
- for (my $n = 1; $n <= $reps; $n++) {
- my ($mvp_string, $cgid);
- while (1){
+ for (my $n = 0; $n < $num_random_picks; $n++) {
+ my $picked_probe_id;
+ while (1) {
my $pick = int(rand($range));
- $mvp_string = ${$bins{$feature}{$cpg_island_relationship}}[$pick]; #pick the $pick'th element in the array as the chosen mvp
- #(undef, undef, undef, $cgid) = split /\t/, $mvp_string; #did not set the splitting on, check whether this can be done
- $cgid=$mvp_string;
- last unless $cgid eq $cg; # must not pick the test mvp itself.
- }
- push @{$picks{$n}}, $cgid; # each $n array is a set of probes matching the test set/ it is allowed to pick the same probe more than once in this background selection
- }
- }
- return \%picks;
- }
-
-#commented previous foreach with three parameters(maf, tss, gc content) can be found below:
-#foreach my $cg (keys %{$$mvps{'SNPS'}}){
-# srand;
-# my ($maf, $tss, $gc) = split("\t", join("\t", $$mvps{'SNPS'}{$cg}{'PARAMS'}));
-# #$cg is the test mvp, $cgid is the matched mvp.
-# my ($i, $j, $k) = assign ($gc, $tss, $maf, \%params);
-#
-# my $range = scalar @{$bins{$i}{$j}{$k}};
-# for (my $n = 1; $n <= $reps; $n++) {
-# my ($mvp_string, $cgid);
-# while (1){
-# my $pick = int(rand($range));
-# $mvp_string = ${$bins{$i}{$j}{$k}}[$pick]; #pick the $pick'th element in the array as the chosen mvp "
-# (undef, undef, undef, $cgid) = split /\t/, $mvp_string;
-# last unless $cgid eq $cg; # must not pick the test mvp itself.
-# }
-# push @{$picks{$n}}, $cgid; # each $n array is a set of probes matching the test set/ it is allowed to pick the same probe more than once in this background selection
-# }
-# }
-# return \%picks;
-
+ $picked_probe_id = ${$bins{$feature}{$cpg_island_relationship}}[$pick]; #pick the $pick'th element in the array as the chosen mvp
+ last unless $picked_probe_id eq $probe_id; # must not pick the test mvp itself.
+ }
+ push(@{$picks->[$n]}, $picked_probe_id);
+ }
+ }
+ return $picks;
+}
=head2 process_overlaps
@@ -271,30 +230,31 @@ sub match{
sub process_overlaps {
my ($rows, $cells, $data) = @_;
- my %test;
- my @test_cells;
+ my $overlaps;
+ my @overlapping_probes_per_cell;
my @indexes = 0..(@$cells-1);
foreach my $row (@{$rows}){
my ($probeid, $sum, $bit_string, $feature, $cpg_island_relationship) = @$row;
- $test{'MVPS'}{$probeid}{'SUM'} = $sum;
- $test{'MVPS'}{$probeid}{'PARAMS'} = join("\t", $feature, $cpg_island_relationship);
+ $overlaps->{'MVPS'}->{$probeid}->{'SUM'} = $sum;
+ $overlaps->{'MVPS'}->{$probeid}->{'PARAMS'} = join("\t", $feature, $cpg_island_relationship);
die "For $data, found ".scalar(@$cells)." cells for ".length($bit_string)." bits\n" if (scalar(@$cells) ne length($bit_string));
foreach my $index (@indexes) {
## $bit_string is a string made of 0s and 1s. If it is a 1 for this position, count and push
if (substr($bit_string, $index, 1)) {
- $test_cells[$index][0]++;
- push @{$test_cells[$index][1]}, $probeid;
+ push @{$overlapping_probes_per_cell[$index]}, $probeid;
}
}
}
my $index = 0;
foreach my $cell (@$cells){
- $test{'CELLS'}{$cell}{'COUNT'} = $test_cells[$index][0] if ($test_cells[$index][0]);
- $test{'CELLS'}{$cell}{'MVPS'} = $test_cells[$index][1] if ($test_cells[$index][1]);
+ if ($overlapping_probes_per_cell[$index] and @{$overlapping_probes_per_cell[$index]}) {
+ $overlaps->{'CELLS'}->{$cell}->{'COUNT'} = scalar(@{$overlapping_probes_per_cell[$index]});
+ $overlaps->{'CELLS'}->{$cell}->{'MVPS'} = $overlapping_probes_per_cell[$index];
+ }
$index++;
}
- return \%test;
+ return $overlaps;
}
View
@@ -382,16 +382,16 @@ =head1 ACKNOWLEDGEMENTS
my ($cells, $samples) = get_samples_from_dataset($dbh, $dataset);
# unpack the bitstrings and store the overlaps by cell.
-# $test is a complex hash like:
-# $test{'MVPS'}{$probe_id}{'SUM'} (total number of overlaps of this probe with DHS/etc in this dataset)
-# $test{'MVPS'}{$probe_id}{'PARAMS'} (gene and CGI annotations for this probe)
-# $test{'CELLS'}{$cell}{'COUNT'} (number of input MVPs that overlap with the signal on this sample)
-# $test{'CELLS'}{$cell}{'MVPS'} (list of input MVPs that overlap with the signal on this sample)
-my $test = process_overlaps($annotated_probes, $cells, $dataset);
+# $overlaps is a complex hash like:
+# $overlaps->{'MVPS'}->{$probe_id}->{'SUM'} (total number of overlaps of this probe with features in this dataset)
+# $overlaps->{'MVPS'}->{$probe_id}->{'PARAMS'} (gene and CGI annotations for this probe)
+# $overlaps->{'CELLS'}->{$cell}->{'COUNT'} (number of input MVPs that overlap with the signal on this sample)
+# $overlaps->{'CELLS'}->{$cell}->{'MVPS'} (list of input MVPs that overlap with the signal on this sample)
+my $overlaps = process_overlaps($annotated_probes, $cells, $dataset);
# generate stats on the background selection
if (defined $bkgrdstat) {
- bkgrdstat($test, $lab, "test");
+ bkgrdstat($overlaps, $lab, "test");
}
@@ -401,36 +401,40 @@ =head1 ACKNOWLEDGEMENTS
#(in subroutines process_overlaps, etc)
-# identify the feature and cpg island relationship, and then make bkgrd picks (old version just below)
-#my $picks = match(\%$test, $array, $datadir, $per, $reps);
+# Identify the feature and cpg island relationship, and then make random picks
warn "[".scalar(localtime())."] Loading the $array background...\n";
-my $picks = match(\%$test, $array, $datadir, $reps);
+my $random_picks = get_random_matching_picks($overlaps, $array, $datadir, $reps);
########check below lines:
# for bkgrd set need to get distribution of counts instead
# make a hash of data -> cell -> bkgrd-Set -> overlap counts
-my %bkgrd; #this hash is going to store the bkgrd overlaps
+my %overlaps_per_cell; #this hash is going to store the bkgrd overlaps
# Get the bits for the background sets and process
-my $backmvps;
+my $total_num_probes_in_random_picks;
warn "[".scalar(localtime())."] Running the analysis with $num_of_valid_probes MVPs...\n";
my $num = 0;
-foreach my $bkgrd (keys %{$picks}) {
+foreach my $this_random_pick (@$random_picks) {
warn "[".scalar(localtime())."] Repetition $num out of ".$reps."\n" if (++$num%100 == 0);
- #$annotated_probes = get_probe_annotations_and_overlap_for_dataset(\@{$$picks{$bkgrd}}, $sth);
- $annotated_probes = get_probe_annotations_and_overlap_for_dataset($dbh, $dataset, $array, \@{$$picks{$bkgrd}});
- $backmvps += scalar @$annotated_probes; #$backmvps is the total number of background probes analysed
+ $annotated_probes = get_probe_annotations_and_overlap_for_dataset($dbh, $dataset, $array, $this_random_pick);
+
+ $total_num_probes_in_random_picks += scalar @$annotated_probes;
+
unless (scalar @$annotated_probes == $num_of_valid_probes) {
- warn "Background " . $bkgrd . " only " . scalar @$annotated_probes . " probes out of $num_of_valid_probes\n";
+ warn "Random pick #$num only has " . scalar @$annotated_probes . " probes compared to $num_of_valid_probes in the input set.\n";
}
- my $result = process_overlaps($annotated_probes, $cells, $dataset);
- foreach my $cell (keys %{$$result{'CELLS'}}) {
- push @{$bkgrd{$cell}}, $$result{'CELLS'}{$cell}{'COUNT'}; # accumulate the overlap counts by cell
+
+ my $this_pick_overlaps = process_overlaps($annotated_probes, $cells, $dataset);
+
+ # accumulate the overlap counts by cell
+ foreach my $cell (keys %{$this_pick_overlaps->{'CELLS'}}) {
+ push @{$overlaps_per_cell{$cell}}, $this_pick_overlaps->{'CELLS'}->{$cell}->{'COUNT'};
}
+
if (defined $bkgrdstat) {
- bkgrdstat($result, $lab, $bkgrd);
+ bkgrdstat($this_pick_overlaps, $lab, $this_random_pick);
}
}
@@ -458,18 +462,18 @@ =head1 ACKNOWLEDGEMENTS
# ultimately want a data frame of names(results)<-c("Zscore", "Cell", "Tissue", "File", "MVPs")
if (!$web) {
- print BACKGROUND join("\t", @{$bkgrd{$cell}}), "\n";
+ print BACKGROUND join("\t", @{$overlaps_per_cell{$cell}}), "\n";
}
- my $teststat = ($test->{'CELLS'}->{$cell}->{'COUNT'} or 0); #number of overlaps for the test MVPs
+ my $teststat = ($overlaps->{'CELLS'}->{$cell}->{'COUNT'} or 0); #number of overlaps for the test MVPs
# binomial pvalue, probability of success is derived from the background overlaps over the tests for this cell
# $backmvps is the total number of background mvps analysed
# $tests is the number of overlaps found over all the background tests
- my $tests;
- foreach (@{$bkgrd{$cell}}) {
- $tests += $_;
+ my $total_num_overlaps_in_random_picks;
+ foreach (@{$overlaps_per_cell{$cell}}) {
+ $total_num_overlaps_in_random_picks += $_;
}
- my $p = sprintf("%.6f", $tests/$backmvps);
+ my $p = sprintf("%.6f", $total_num_overlaps_in_random_picks / $total_num_probes_in_random_picks);
# binomial probability for $teststat or more hits out of $mvpcount mvps
# sum the binomial for each k out of n above $teststat
@@ -491,10 +495,11 @@ =head1 ACKNOWLEDGEMENTS
$pbinom = sprintf("%.2e", $pbinom);
# Z score calculation (note: this is here only for legacy reasons. Z-scores assume normal distribution)
- my $zscore = zscore($teststat, $bkgrd{$cell});
+ my $zscore = zscore($teststat, $overlaps_per_cell{$cell});
my $mvp_string = "";
- $mvp_string = join(",", @{$$test{'CELLS'}{$cell}{'MVPS'}}) if defined $$test{'CELLS'}{$cell}{'MVPS'};
+ $mvp_string = join(",", @{$overlaps->{'CELLS'}->{$cell}->{'MVPS'}})
+ if defined $overlaps->{'CELLS'}->{$cell}->{'MVPS'};
# This gives the list of overlapping MVPs for use in the tooltips. If there are a lot of them this can be a little useless
my ($shortcell, undef) = split('\|', $cell); # undo the concatenation from earlier to deal with identical cell names.

0 comments on commit 9368afc

Please sign in to comment.