From a97934e9c409a997a8f0dc32e711d1a4c3e7d4e1 Mon Sep 17 00:00:00 2001 From: Javier Herrero Date: Wed, 11 May 2016 09:28:56 +0100 Subject: [PATCH] More code cleanup --- eForge/eForge.pm | 156 +++++++++++++++++++---------------- eforge.pl | 242 ++++++++++++++++++++++++++++++++----------------------- 2 files changed, 231 insertions(+), 167 deletions(-) diff --git a/eForge/eForge.pm b/eForge/eForge.pm index 7b3df83..51fd76e 100644 --- a/eForge/eForge.pm +++ b/eForge/eForge.pm @@ -49,7 +49,7 @@ our $VERSION = '0.01'; our (@ISA, @EXPORT); use Exporter; @ISA = qw(Exporter); -@EXPORT = qw(get_all_datasets get_all_arrays get_all_proxy_filters process_file match process_bits get_bits get_cells assign bkgrdstat prox_filter); +@EXPORT = qw(get_all_datasets get_all_arrays get_all_proximity_filters process_file match process_overlaps get_probe_annotations_and_overlap_for_dataset get_samples_from_dataset assign bkgrdstat proximity_filter); =head1 SYNOPSIS @@ -59,15 +59,15 @@ Provide functional modules for eForge get_all_datasets get_all_arrays -get_all_proxy_filters +get_all_proximity_filters process_file match -process_bits -get_bits -get_cells +process_overlaps +get_probe_annotations_and_overlap_for_dataset +get_samples_from_dataset assign bkgrdstat -prox_filter +proximity_filter =head1 SUBROUTINES/METHODS @@ -170,7 +170,7 @@ Identifies the bins that each of the probes in a probe hash lies in, and then pi =cut -#it is not enough to change match, we also have to change bkgrdstat and process_bits and get_bits +#it is not enough to change match, we also have to change bkgrdstat and process_overlaps and get_bits #(we don't use "assign" as we do not use percentile bins so no point in changing "assign") sub match{ #we take out the percentile bins ($per): @@ -255,13 +255,13 @@ sub match{ -=head2 process_bits +=head2 process_overlaps - Arg[1] : arrayref of arrays $rows ($probe_id, $sum, $bit_string, $feature, $CGI_context) + Arg[1] : arrayref of arrays $annotated_probes ($probe_id, $sum, $bit_string, $feature, $CGI_context) Arg[2] : arrayref of strings $cells (shortname for the cells in the dataset) Arg[3] : string $dataset Returns : hashref $stats - Example : my $result = process_bits($rows, $cells, 'erc'); + Example : my $result = process_overlaps($rows, $cells, 'erc'); Description : Returns a reference to a complex hash with stats from the rows. These are split into 'MVPS' and 'CELLS'. The former contains 'SUM' and 'PARAMS' for each probe ID while the latter contains 'COUNT' and 'MVPS' for each cell. @@ -269,7 +269,7 @@ sub match{ =cut -sub process_bits { +sub process_overlaps { my ($rows, $cells, $data) = @_; my %test; my @test_cells; @@ -315,7 +315,7 @@ sub get_all_datasets { my ($dbh, $species_name) = @_; my $datasets; - my $sql = "SELECT dataset_tag, dataset_name FROM dataset ORDER BY dataset_id"; + my $sql = "SELECT dataset_tag, dataset_name FROM dataset ORDER BY dataset_id DESC"; my @bind_params = (); if ($species_name) { $sql .= " WHERE species_name = ?"; @@ -350,7 +350,7 @@ sub get_all_arrays { my ($dbh, $species_name) = @_; my $arrays; - my $sql = "SELECT array_tag, array_name FROM array order by array_id"; + my $sql = "SELECT array_tag, array_name FROM array order by array_id DESC"; my @bind_params = (); if ($species_name) { $sql .= " WHERE species_name = ?"; @@ -368,12 +368,12 @@ sub get_all_arrays { } -=head2 get_all_proxy_filters +=head2 get_all_proximity_filters Arg[1] : DB-handle $dbh Arg[2] : (optional) string $array Returns : hashref of proxy-filters (string) - Example : my $proxy_filters = get_all_proxy_filters($dbh, '450k'); + Example : my $proximity_filters = get_all_proximity_filters($dbh, '450k'); Description : Returns the list of proxy filters available in the DB. It returns a hashref whose keys are the bkgd names (i.e. array tags) and values are the name of the proxy filter. At the moment the schema supports just one proxy filter per array. @@ -384,9 +384,9 @@ sub get_all_arrays { =cut -sub get_all_proxy_filters { +sub get_all_proximity_filters { my ($dbh, $array_tag) = @_; - my $proxy_filters; + my $proximity_filters; my $sql = "SELECT array_tag, description FROM proxy_filter_info JOIN array USING (array_id)"; my @bind_params = (); @@ -399,26 +399,37 @@ sub get_all_proxy_filters { $sth->execute(); $sth->bind_columns(\$this_array_tag, \$description); while ($sth->fetch) { - $proxy_filters->{$this_array_tag} = $description; + $proximity_filters->{$this_array_tag} = $description; } - return($proxy_filters); + return($proximity_filters); } -=head2 get_bits -Get the bitstrings for an array of mvps from the sqlite db +=head2 get_probe_annotations_and_overlap_for_dataset + + Arg[1] : DB-handle $dbh + Arg[2] : string $dataset_tag + Arg[3] : string $array + Arg[4] : arrayref of strings $probe_ids + Returns : arrayref of arrays containing the probe_id, sum, bitstring, gene-group and + cgi-group + Example : my $annotated_probes = get_probe_annotations_and_overlap_for_dataset($dbh, + 'erc', '450k', $probe_list); + Description : Fetches the gene and CGI related annotation for the set of probes as well as the + overlaps with the features defined in the selected dataset + Exceptions : =cut -sub get_bits{ - my ($dbh, $dataset_tag, $array_tag, $mvps) = @_; - my @results; - - for (my $loop = 0; $loop * $MAX_SQL_VARIABLES < @$mvps; $loop++) { +sub get_probe_annotations_and_overlap_for_dataset { + my ($dbh, $dataset_tag, $array_tag, $probe_ids) = @_; + my $results = []; + + for (my $loop = 0; $loop * $MAX_SQL_VARIABLES < @$probe_ids; $loop++) { my $start = $loop * $MAX_SQL_VARIABLES; my $end = ($loop + 1) * $MAX_SQL_VARIABLES - 1; - $end = @$mvps -1 if ($end >= @$mvps); + $end = @$probe_ids - 1 if ($end >= @$probe_ids); my $sql = "SELECT probe_id, sum, bit, gene_group, cgi_group FROM probe_annotation @@ -427,16 +438,17 @@ sub get_bits{ JOIN array ON (array.array_id = probe_annotation.array_id) WHERE array_tag = ? and dataset_tag = ? AND probe_id IN (?". (",?" x ($end - $start)).")"; - my $sth = $dbh->prepare_cached($sql); #get the blocks form the ld table - $sth->execute($array_tag, $dataset_tag, @$mvps[$start..$end]); + + my $sth = $dbh->prepare_cached($sql); + $sth->execute($array_tag, $dataset_tag, @$probe_ids[$start..$end]); while (my $row = $sth->fetchrow_arrayref()) { - push @results, [@$row]; + push @$results, [@$row]; } $sth->finish(); } - return \@results;# return the bitstring line from the database + return $results; } @@ -454,7 +466,6 @@ sub get_bits{ =cut sub fetch_all_probe_ids { - #gets the rsid for a SNP where a location is given my ($dbh, $array, $locations) = @_; my $sth = $dbh->prepare("SELECT probe_id @@ -475,74 +486,83 @@ sub fetch_all_probe_ids { return $probe_ids; } -=head2 prox_filter +=head2 proximity_filter Filter MVPs from the MVP list if they are within 1 kb of each other. The rationale is that the first MVP to be identified in a block is chosen, and others are removed. =cut -sub prox_filter{ - my ($mvps, $dbh) = @_; - my %prox_excluded; # a hash to store MVPs found in proximity (1 kb) with an MVP in the list - my @mvps_filtered; # The list of MVPs filtered - my %mvps; - foreach my $mvp (@$mvps){ - $mvps{$mvp} = 1; - } - for (my $loop = 0; $loop * $MAX_SQL_VARIABLES < @$mvps; $loop++) { +sub proximity_filter { + my ($dbh, $array_tag, $probe_ids, $filter) = @_; + my %prox_excluded_probes; # a hash to store MVPs found in proximity (1 kb) with an MVP in the list + my %filtered_probes; # The list of MVPs filtered (i.e. after filtering, the ones to keep) + my %missing_probes; + + # Get the full list of probes as a hash (also removes redundancy) + my %probe_id_hash; + foreach my $probe_id (@$probe_ids){ + $probe_id_hash{$probe_id} = 1; + } + $probe_ids = [keys %probe_id_hash]; + + for (my $loop = 0; $loop * $MAX_SQL_VARIABLES < @$probe_ids; $loop++) { my $start = $loop * $MAX_SQL_VARIABLES; my $end = ($loop + 1) * $MAX_SQL_VARIABLES - 1; - $end = @$mvps -1 if ($end >= @$mvps); + $end = @$probe_ids - 1 if ($end >= @$probe_ids); - my $sql = "SELECT probe_id,proxy_probes FROM proxy_filter WHERE probe_id IN (?". (",?" x ($end - $start)).")"; + my $sql = "SELECT probe_id, proxy_probes FROM proxy_filter JOIN array USING (array_id) WHERE". + " array_tag = ? AND probe_id IN (?". (",?" x ($end - $start)).")"; my $sth = $dbh->prepare($sql); #get the blocks form the ld table - $sth->execute(@$mvps[$start..$end]); + $sth->execute($array_tag, @$probe_ids[$start..$end]); my $result = $sth->fetchall_arrayref(); $sth->finish(); + foreach my $row (@{$result}){ - my ($mvp, $block) = @$row; - next if exists $prox_excluded{$mvp}; # if the mvp is in the proximity filtered set already ignore it - push @mvps_filtered, $mvp; # if this is the first time it is seen, add it to the filtered mvps, and remove anything in proximity with it - next if $block =~ /NONE/; # nothing is in proximity - my (@block) = split (/\|/, $block); - foreach my $proxmvp (@block){ - if (exists $mvps{$proxmvp}) { - $prox_excluded{$proxmvp} = $mvp; #Add to the excluded mvps, if it is in proximity with the current mvp, and it its one of the test mvps. - } - } - } - } + my ($probe_id, $probe_id_list) = @$row; + # if the probe is in the proximity filtered set already ignore it + next if exists $prox_excluded_probes{$probe_id}; + # if this is the first time it is seen, add it to the filtered mvps, and remove anything in proximity with it + $filtered_probes{$probe_id} = 1; + next if $probe_id_list =~ /NONE/; # nothing is in proximity + my (@other_probe_ids) = split (/\|/, $probe_id_list); + foreach my $other_probe_id (@other_probe_ids) { + if (exists $probe_id_hash{$other_probe_id}) { + $prox_excluded_probes{$other_probe_id} = $probe_id; #Add to the excluded mvps, if it is in proximity with the current mvp, and it its one of the test mvps. + } + } + } + } + #note that if an MVP doesn't exist in the proximity file it will be rejected regardless, may need to add these back - return (\%prox_excluded, @mvps_filtered); + return (\%prox_excluded_probes, [keys %filtered_probes]); } -=head2 get_cells +=head2 get_samples_from_dataset Read the correct cell list based on data (erc, erc2, blueprint or encode). Also gets the tissue names for the cells. =cut -sub get_cells { - my ($dataset_tag, $dbh) = @_; - my $samples; +sub get_samples_from_dataset { + my ($dbh, $dataset_tag) = @_; + my ($cells, $samples); my $sth = $dbh->prepare("SELECT shortcell, tissue, datatype, file, acc FROM dataset JOIN sample USING (dataset_id) WHERE dataset_tag = ? ORDER BY sample_order"); $sth->execute($dataset_tag); my ($shortcell, $tissue, $datatype, $file, $acc); $sth->bind_columns(\$shortcell, \$tissue, \$datatype, \$file, \$acc); - my ($cells, $tissues); while ($sth->fetch) { $shortcell = "$shortcell|$file"; # Sometimes the same cell is used twice, with a differnt file so need to record separately (e.g. WI-38). push @$cells, $shortcell; - $$tissues{$shortcell}{'tissue'} = $tissue; # this is the hash that is used to connect cells and tissues and ultimately provide the sorting order - $$tissues{$shortcell}{'datatype'} = $datatype; - $$tissues{$shortcell}{'file'} = $file; - $$tissues{$shortcell}{'acc'} = $acc; + $$samples{$shortcell}{'tissue'} = $tissue; # this is the hash that is used to connect cells and tissues and ultimately provide the sorting order + $$samples{$shortcell}{'datatype'} = $datatype; + $$samples{$shortcell}{'file'} = $file; + $$samples{$shortcell}{'acc'} = $acc; } # use Data::Dumper; # print Dumper %$tissues; - return ($cells, $tissues); # return + return ($cells, $samples); # return } 1; diff --git a/eforge.pl b/eforge.pl index 5774f00..5a6d09a 100644 --- a/eforge.pl +++ b/eforge.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl =head1 NAME @@ -200,10 +200,10 @@ =head1 ACKNOWLEDGEMENTS my $t_marginal = 0.05; # default marginal p-value threshold my $t_strict = 0.01; # default strict p-value threshold -my $min_mvps = 5; # the minimum number of mvps allowed for test. Set to 5 as we have binomial p +my $min_num_probes = 5; # the minimum number of probes allowed for test. Set to 5 as we have binomial p my ($dataset, $filename, $bkgrdstat, $noplot, - $help, $man, $proxy, $noproxy, $depletion, $filter, $out_dir, $mvp_list, + $help, $man, $proxy, $noproxy, $depletion, $filter, $out_dir, $probe_list, $web, $autoopen); GetOptions ( @@ -213,8 +213,8 @@ =head1 ACKNOWLEDGEMENTS 'label=s' => \$label, 'f=s' => \$filename, 'format=s' => \$format, - 'mvps=s@' => \$mvp_list, - 'min_mvps=i' => \$min_mvps, + 'probes|mvps=s@' => \$probe_list, + 'min_num_probes|min_mvps=i' => \$min_num_probes, 'noplot' => \$noplot, 'reps=i' => \$reps, 'thresh=s' => \$thresh, @@ -309,8 +309,10 @@ =head1 ACKNOWLEDGEMENTS ## Check the proxy_filter (A.K.A. filter) against DB ## ============================================================================ # Set proximity filter -unless (defined $noproxy) { - my $all_proxy_filters = get_all_proxy_filters($dbh); +if (defined $noproxy) { + $proxy = undef; +} else { + my $all_proxy_filters = get_all_proximity_filters($dbh); if ($all_proxy_filters->{$array}) { $proxy = $all_proxy_filters->{$array}; } @@ -318,50 +320,74 @@ =head1 ACKNOWLEDGEMENTS ## ============================================================================ +## ============================================================================ +## Append main options (depletion on/off; array; dataset) to $label +## ============================================================================ if (defined $depletion) { $label = "$label.depletion"; } - -#regexp puts underscores where labels before -(my $lab = $label) =~ s/\s/_/g; +(my $lab = $label) =~ s/\s/_/g; # Avoid whitespaces on the label $lab = "$lab.$array.$dataset"; +## ============================================================================ ## ============================================================================ ## Read and process the input MVPs ## ============================================================================ warn "[".scalar(localtime())."] Processing input...\n"; -my $mvps = get_input_mvps($filename, $mvp_list); +# This will read the probes from the file if provided, from the probe list otherwise or use the +# example data set as a last resort. +my $mvps = get_input_probes($filename, $probe_list); +my $original_mvps = [@$mvps]; +my $num_of_input_mvps = scalar(@$mvps); + +# Apply the proximity filter if requested +my ($proximity_excluded); +if(defined $proxy) { + ($proximity_excluded, $mvps) = proximity_filter($dbh, $array, $mvps); + while (my ($excluded_mvp, $mvp) = each %$proximity_excluded) { + warn "$excluded_mvp excluded for $proxy proximity filter with $mvp\n"; + } +} -my @origmvps = @$mvps; +# $annotated_probes is an arrayref with probe_id, sum, bit, gene_group, cgi_group for each input probe +my $annotated_probes = get_probe_annotations_and_overlap_for_dataset($dbh, $dataset, $array, $mvps); +my $existing_probes = {map {$_->[0] => 1} @$annotated_probes}; +$mvps = [keys %$existing_probes]; -#######!!!!!### proximity filtering starts below: -my ($prox_excluded, $output, $input); -unless(defined $noproxy) { - $input = scalar @$mvps; - ($prox_excluded, @$mvps) = prox_filter($mvps, $dbh); - while (my ($excluded_mvp, $mvp) = each %$prox_excluded) { - warn "$excluded_mvp excluded for proximity (1 kb) with $mvp\n"; - } +## Detect and remove the missing probes. +my $num_missing_probes = find_missing_probes($original_mvps, $existing_probes); - $output = scalar @$mvps; +# Print summary of filtering and checks: +my $msg = "For $label, $num_of_input_mvps MVPs provided, ". scalar @$mvps. + " retained: $num_missing_probes were not found"; +if (defined $proxy) { + $msg .= " and " . scalar(keys %$proximity_excluded) . " excluded using $proxy proximity filter"; } +warn $msg, ".\n"; -# Check we have enough MVPs -if (scalar @$mvps < $min_mvps) { - die "Fewer than $min_mvps MVPs. Analysis not run\n"; +# Check we have enough MVPs left +my $num_of_valid_probes = scalar @$mvps; +if ($num_of_valid_probes < $min_num_probes) { + die "Fewer than $min_num_probes MVPs. Analysis not run\n"; } ## ============================================================================ # get the cell list array and the hash that connects the cells and tissues -my ($cells, $tissues) = get_cells($dataset, $dbh); - -# get the bit strings for the test mvps from the database file -my $rows = get_bits($dbh, $dataset, $array, $mvps); +# $samples is a hash whose keys are the $cells (short name for the cell type/lines) and value is +# another hash with 'tissue', 'datatype', 'file' and 'acc' keys. +# IMPORTANT: $cells contains the list of cells in the order defined in the DB. This is critical +# to correctly assign each bit to the right sample. +my ($cells, $samples) = get_samples_from_dataset($dbh, $dataset); # unpack the bitstrings and store the overlaps by cell. -my $test = process_bits($rows, $cells, $dataset); +# $test is a complex hash like: +# $test{'MVPS'}{$probe_id}{'SUM'} (total number of overlaps of this probe with DHS/etc in this dataset) +# $test{'MVPS'}{$probe_id}{'PARAMS'} (gene and CGI annotations for this probe) +# $test{'CELLS'}{$cell}{'COUNT'} (number of input MVPs that overlap with the signal on this sample) +# $test{'CELLS'}{$cell}{'MVPS'} (list of input MVPs that overlap with the signal on this sample) +my $test = process_overlaps($annotated_probes, $cells, $dataset); # generate stats on the background selection if (defined $bkgrdstat) { @@ -370,36 +396,9 @@ =head1 ACKNOWLEDGEMENTS -# Identify probeids that weren't found and warn about them. -#the reference to hash key 'MVPS' is because of use of perl module eforge.pm from the eForge tool -#(in subroutines process_bits etc) - -my @missing; -foreach my $probeid (@origmvps) { - if (defined $proxy) { - next if exists $$prox_excluded{$probeid}; - } - unless (exists $$test{'MVPS'}{$probeid}) { - push @missing, $probeid; - } -} - -if (scalar @missing > 0) { - warn "The following " . scalar @missing . " MVPs have not been analysed because they were not found on the ".$array_label."\n"; - warn join("\n", @missing) . "\n"; -} -if (defined $proxy) { - if ($output < $input) { - warn "For $label, $input MVPs provided, " . scalar @$mvps . " retained, " . scalar @missing . " not analysed, " . scalar(keys %$prox_excluded) . " proximity filtered at 1 kb\n"; - } -} - # only pick background mvps matching mvps that had bitstrings originally. #reference to hash key 'MVPS' is due to use of eforge.pm module from eForge tool -#(in subroutines process_bits, etc) - -my @foundmvps = keys %{$$test{'MVPS'}}; -my $mvpcount = scalar @foundmvps; +#(in subroutines process_overlaps, etc) # identify the feature and cpg island relationship, and then make bkgrd picks (old version just below) @@ -416,17 +415,17 @@ =head1 ACKNOWLEDGEMENTS # Get the bits for the background sets and process my $backmvps; -warn "[".scalar(localtime())."] Running the analysis with $mvpcount MVPs...\n"; +warn "[".scalar(localtime())."] Running the analysis with $num_of_valid_probes MVPs...\n"; my $num = 0; foreach my $bkgrd (keys %{$picks}) { warn "[".scalar(localtime())."] Repetition $num out of ".$reps."\n" if (++$num%100 == 0); - #$rows = get_bits(\@{$$picks{$bkgrd}}, $sth); - $rows = get_bits($dbh, $dataset, $array, \@{$$picks{$bkgrd}}); - $backmvps += scalar @$rows; #$backmvps is the total number of background probes analysed - unless (scalar @$rows == scalar @foundmvps) { - warn "Background " . $bkgrd . " only " . scalar @$rows . " probes out of " . scalar @foundmvps . "\n"; + #$annotated_probes = get_probe_annotations_and_overlap_for_dataset(\@{$$picks{$bkgrd}}, $sth); + $annotated_probes = get_probe_annotations_and_overlap_for_dataset($dbh, $dataset, $array, \@{$$picks{$bkgrd}}); + $backmvps += scalar @$annotated_probes; #$backmvps is the total number of background probes analysed + unless (scalar @$annotated_probes == $num_of_valid_probes) { + warn "Background " . $bkgrd . " only " . scalar @$annotated_probes . " probes out of $num_of_valid_probes\n"; } - my $result = process_bits($rows, $cells, $dataset); + my $result = process_overlaps($annotated_probes, $cells, $dataset); foreach my $cell (keys %{$$result{'CELLS'}}) { push @{$bkgrd{$cell}}, $$result{'CELLS'}{$cell}{'COUNT'}; # accumulate the overlap counts by cell } @@ -454,14 +453,14 @@ =head1 ACKNOWLEDGEMENTS my @results; my @pvalues; ###ncmp is a function from Sort::Naturally -foreach my $cell (sort {ncmp($$tissues{$a}{'tissue'},$$tissues{$b}{'tissue'}) || ncmp($a,$b)} @$cells){ - # above line sorts by the tissues alphabetically (from $tissues hash values) +foreach my $cell (sort {ncmp($$samples{$a}{'tissue'},$$samples{$b}{'tissue'}) || ncmp($a,$b)} @$cells){ + # above line sorts by the tissues alphabetically (from $samples hash values) # ultimately want a data frame of names(results)<-c("Zscore", "Cell", "Tissue", "File", "MVPs") if (!$web) { print BACKGROUND join("\t", @{$bkgrd{$cell}}), "\n"; } - my $teststat = ($$test{'CELLS'}{$cell}{'COUNT'} or 0); #number of overlaps for the test MVPs + my $teststat = ($test->{'CELLS'}->{$cell}->{'COUNT'} or 0); #number of overlaps for the test MVPs # binomial pvalue, probability of success is derived from the background overlaps over the tests for this cell # $backmvps is the total number of background mvps analysed @@ -477,11 +476,11 @@ =head1 ACKNOWLEDGEMENTS my $pbinom; if (defined $depletion) { foreach my $k (0 .. $teststat) { - $pbinom += binomial($k, $mvpcount, $p); + $pbinom += binomial($k, $num_of_valid_probes, $p); } } else { - foreach my $k ($teststat .. $mvpcount) { - $pbinom += binomial($k, $mvpcount, $p); + foreach my $k ($teststat .. $num_of_valid_probes) { + $pbinom += binomial($k, $num_of_valid_probes, $p); } } if ($pbinom >1) { @@ -499,33 +498,44 @@ =head1 ACKNOWLEDGEMENTS # This gives the list of overlapping MVPs for use in the tooltips. If there are a lot of them this can be a little useless my ($shortcell, undef) = split('\|', $cell); # undo the concatenation from earlier to deal with identical cell names. - push(@results, [$zscore, $pbinom, $shortcell, $$tissues{$cell}{'tissue'}, $$tissues{$cell}{'datatype'}, $$tissues{$cell}{'file'}, $mvp_string, $$tissues{$cell}{'acc'}]); + push(@results, [$zscore, $pbinom, $shortcell, $$samples{$cell}{'tissue'}, $$samples{$cell}{'datatype'}, $$samples{$cell}{'file'}, $mvp_string, $$samples{$cell}{'acc'}]); } if (!$web) { close(BACKGROUND); } -# Correct the p-values for multiple testing using the Benjamini-Yekutieli FDR control method +## ============================================================================ +## Correct the p-values for multiple testing using the Benjamini-Yekutieli FDR control method +## ============================================================================ my $qvalues = BY(\@pvalues); $qvalues = [map {sprintf("%.2e", $_)} @$qvalues]; +## ============================================================================ + -# Write the results to a tab-separated file -my $filename = "$lab.chart.tsv.gz"; -open(TSV, "| gzip -9 > $out_dir/$filename") or die "Cannot open $out_dir/$filename: $!"; +## ============================================================================ +## Write the results to a tab-separated file +## ============================================================================ +my $results_filename = "$lab.chart.tsv.gz"; +open(TSV, "| gzip -9 > $out_dir/$results_filename") or die "Cannot open $out_dir/$results_filename: $!"; print TSV join("\t", "Zscore", "Pvalue", "Cell", "Tissue", "Datatype", "File", "Probe", "Accession", "Qvalue"), "\n"; for (my $i = 0; $i < @results; $i++) { print TSV join("\t", @{$results[$i]}, $qvalues->[$i]), "\n"; } close(TSV); +## ============================================================================ +## ============================================================================ +## Generate plots +## ============================================================================ warn "[".scalar(localtime())."] Generating plots...\n"; unless (defined $noplot){ #Plotting and table routines - Chart($filename, $lab, $out_dir, $tissues, $cells, $label, $t_marginal, $t_strict, $dataset); # basic pdf plot - dChart($filename, $lab, $out_dir, $dataset, $label, $t_marginal, $t_strict, $web); # rCharts Dimple chart - table($filename, $lab, $out_dir, $web); # Datatables chart + Chart($results_filename, $lab, $out_dir, $samples, $cells, $label, $t_marginal, $t_strict, $dataset); # basic pdf plot + dChart($results_filename, $lab, $out_dir, $dataset, $label, $t_marginal, $t_strict, $web); # rCharts Dimple chart + table($results_filename, $lab, $out_dir, $web); # Datatables chart } +## ============================================================================ warn "[".scalar(localtime())."] Done.\n"; @@ -535,6 +545,7 @@ =head1 ACKNOWLEDGEMENTS system("open $out_dir/$lab.chart.pdf"); } + #################################################################################################### #################################################################################################### ## @@ -576,16 +587,16 @@ sub parse_pvalue_thresholds { } -=head2 get_input_mvps +=head2 get_input_probes Arg[1] : string $filename - Arg[2] : arrayref $mvp_list + Arg[2] : arrayref $probe_list Returns : arrayref of probe IDs (string) - Example : $mvps = get_input_mvps("input.txt", undef); - Example : $mvps = get_input_mvps(undef, ["cg13430807", "cg10480329,cg06297318,cg19301114"]); - Example : $mvps = get_input_mvps(undef, undef); + Example : $mvps = get_input_probes("input.txt", undef); + Example : $mvps = get_input_probes(undef, ["cg13430807", "cg10480329,cg06297318,cg19301114"]); + Example : $mvps = get_input_probes(undef, undef); Description : This function returns the list of input probe IDs. This can come from either - $filename if defined or from $mvp_list otherwise. Each element in $mvp_list is a + $filename if defined or from $probe_list otherwise. Each element in $probe_list is a string which contains one or more probe IDs separated by commas (see Examples). Falls back to the default data set from Jaffe and Irizarry. The set of probe IDs is checked to remove redundant entries. @@ -593,9 +604,9 @@ =head2 get_input_mvps =cut -sub get_input_mvps { - my ($filename, $mvp_list) = @_; - my $mvps; +sub get_input_probes { + my ($filename, $probe_list) = @_; + my $probes; if (defined $filename) { my $fh; @@ -606,31 +617,64 @@ sub get_input_mvps { } else { open($fh, "$filename") or die "cannot open file $filename : $!"; } - $mvps = process_file($fh, $format, $dbh, $array, $filter); + $probes = process_file($fh, $format, $dbh, $array, $filter); - } elsif ($mvp_list and @$mvp_list) { - @$mvps = split(/,/, join(',', @$mvp_list)); + } elsif ($probe_list and @$probe_list) { + @$probes = split(/,/, join(',', @$probe_list)); } else{ # Test MVPs from Liu Y et al. Nat Biotechnol 2013 Pulmonary_function.snps.bed (*put EWAS bedfile here) # If no options are given it will run on the default set of MVPs warn "No probe input given, so running on default set of probes, a set of monocyte tDMPs from Jaffe AE and Irizarry RA, Genome Biol 2014."; - @$mvps = qw(cg13430807 cg10480329 cg06297318 cg19301114 cg23244761 cg26872907 cg18066690 cg04468741 cg16636767 cg10624395 cg20918393); + @$probes = qw(cg13430807 cg10480329 cg06297318 cg19301114 cg23244761 cg26872907 cg18066690 cg04468741 cg16636767 cg10624395 cg20918393); } # Remove redundancy in the input - my %nonredundant; - foreach my $mvp (@$mvps) { - $nonredundant{$mvp}++; + my %probes_hash; + foreach my $probe (@$probes) { + $probes_hash{$probe}++; + } + + while (my ($probe, $num) = each %probes_hash) { + if ($num > 1) { + say "$probe is present $num times in the input. Analysing only once." + } } - foreach my $mvp (keys %nonredundant) { - if ($nonredundant{$mvp} > 1) { - say "$mvp is present " . $nonredundant{$mvp} . " times in the input. Analysing only once." + @$probes = keys %probes_hash; + + return($probes); +} + + +=head2 find_missing_probes + + Arg[1] : arrayref of strings $original_probe_ids + Arg[2] : hashref $existing_probe_ids (keys are probe_ids, values are ignored) + Returns : int $num__missing_probes + Example : my $num_missing_probes = find_missing_probes(['cg001', 'cg002', 'cg003', 'cg004'], + {'cg001' => 1, 'cg003 => 1}); + Description : Detects and prints the list of missing probes if any. + Exceptions : + +=cut + +sub find_missing_probes { + my ($original_probes, $existing_probes_hash) = @_; + my $num_missing_probes = 0; + + my $missing_probes = []; + foreach my $probe_id (@$original_probes) { + unless ($existing_probes_hash->{$probe_id}) { + push @$missing_probes, $probe_id; } } + $num_missing_probes = scalar @$missing_probes; - @$mvps = keys %nonredundant; + if ($num_missing_probes > 0) { + warn "The following $num_missing_probes MVPs have not been analysed because they were not found on the selected array\n"; + warn join("\n", @$missing_probes) . "\n"; + } - return($mvps); + return $num_missing_probes; }