From fa557f9863c3b6aa3fd0bcb5614b6c1b616e0fb3 Mon Sep 17 00:00:00 2001 From: Javier Herrero Date: Mon, 18 Apr 2016 11:00:29 +0100 Subject: [PATCH] Use compressed files to save space on the server --- eforge.pl | 31 ++++++++++++++++++++++--------- webserver/cgi-bin/index.pl | 22 +++++++++++----------- 2 files changed, 33 insertions(+), 20 deletions(-) diff --git a/eforge.pl b/eforge.pl index 6a9dd51..59914f1 100644 --- a/eforge.pl +++ b/eforge.pl @@ -366,7 +366,14 @@ =head1 ACKNOWLEDGEMENTS warn "You have specified p value filtering, but this isn't implemented for files of format $format. No filtering will happen." } } - open my $fh, "<", $file or die "cannot open file $file : $!"; + my $fh; + if ($file =~ /\.gz$/) { + open($fh, "gunzip -c $file |") or die "cannot open file $file : $!"; + } elsif ($file =~ /\.bz2$/) { + open($fh, "bunzip2 -c $file |") or die "cannot open file $file : $!"; + } else { + open($fh, "<$file") or die "cannot open file $file : $!"; + } $mvps = process_file($fh, $format, $dbh, $bkgd, $filter); } elsif (@mvplist) { @@ -508,7 +515,9 @@ =head1 ACKNOWLEDGEMENTS mkdir $out_dir; -open my $bfh, ">", "$out_dir/background.tsv" or die "Cannot open background.tsv"; +if (!$web) { + open(BACKGROUND, "| gzip -9 > $out_dir/background.tsv.gz") or die "Cannot open background.tsv"; +} @@ -519,7 +528,9 @@ =head1 ACKNOWLEDGEMENTS # above line sorts by the tissues alphabetically (from $tissues hash values) # ultimately want a data frame of names(results)<-c("Zscore", "Cell", "Tissue", "File", "MVPs") - say $bfh join("\t", @{$bkgrd{$cell}}); + if (!$web) { + print BACKGROUND join("\t", @{$bkgrd{$cell}}), "\n"; + } my $teststat = ($$test{'CELLS'}{$cell}{'COUNT'} or 0); #number of overlaps for the test MVPs # binomial pvalue, probability of success is derived from the background overlaps over the tests for this cell @@ -560,20 +571,22 @@ =head1 ACKNOWLEDGEMENTS push(@results, [$zscore, $pbinom, $shortcell, $$tissues{$cell}{'tissue'}, $$tissues{$cell}{'datatype'}, $$tissues{$cell}{'file'}, $mvp_string, $$tissues{$cell}{'acc'}]); } -close($bfh); +if (!$web) { + close(BACKGROUND); +} # Correct the p-values for multiple testing using the Benjamini-Yekutieli FDR control method my $qvalues = BY(\@pvalues); $qvalues = [map {sprintf("%.2e", $_)} @$qvalues]; # Write the results to a tab-separated file -my $filename = "$lab.chart.tsv"; -open my $ofh, ">", "$out_dir/$filename" or die "Cannot open $out_dir/$filename: $!"; -print $ofh join("\t", "Zscore", "Pvalue", "Cell", "Tissue", "Datatype", "File", "Probe", "Accession", "Qvalue"), "\n"; +my $filename = "$lab.chart.tsv.gz"; +open(TSV, "| gzip -9 > $out_dir/$filename") or die "Cannot open $out_dir/$filename: $!"; +print TSV join("\t", "Zscore", "Pvalue", "Cell", "Tissue", "Datatype", "File", "Probe", "Accession", "Qvalue"), "\n"; for (my $i = 0; $i < @results; $i++) { - print $ofh join("\t", @{$results[$i]}, $qvalues->[$i]), "\n"; + print TSV join("\t", @{$results[$i]}, $qvalues->[$i]), "\n"; } -close($ofh); +close(TSV); warn "[".scalar(localtime())."] Generating plots...\n"; diff --git a/webserver/cgi-bin/index.pl b/webserver/cgi-bin/index.pl index 6f0ba40..21e7863 100755 --- a/webserver/cgi-bin/index.pl +++ b/webserver/cgi-bin/index.pl @@ -40,7 +40,7 @@ # The location of the bin dir w.r.t. the cgi-bin dir (DO NOT CHANGE) my $BIN_DIR = "../bin"; # The name of the input data file -my $INPUT_DATAFILE = "input.txt"; +my $INPUT_DATAFILE = "input.txt.gz"; # The name of the output data file my $STDOUT_FILE = "output.txt"; @@ -432,7 +432,7 @@ sub validate_form { ## It seems like all the options are valid, so we can now store the input data in the output ## directory my $absolute_outdir = get_absolute_outdir(); - open(INPUT, ">$absolute_outdir/$INPUT_DATAFILE") or + open(INPUT, "| gzip -9 > $absolute_outdir/$INPUT_DATAFILE") or die "Cannot open $absolute_outdir/$INPUT_DATAFILE"; foreach my $this_line (@lines) { print INPUT $this_line, "\n"; @@ -562,19 +562,19 @@ sub print_result { my $web_outdir = get_web_outdir(); opendir(DIR, $absolute_outdir); - my @files = grep {/(.pdf|.html|.tsv|.R)$/} readdir(DIR); + my @files = grep {/(.pdf|.html|.tsv|.R|.gz|.bz2)$/} readdir(DIR); closedir(DIR); - my $table_file = (grep {/.table.html$/} @files)[0]; - my $table_R = (grep {/.table.R$/i} @files)[0]; - my $dchart_file = (grep {/.dchart.html$/} @files)[0]; - my $dchart_R = (grep {/.dchart.R$/i} @files)[0]; - my $tsv_file = (grep {/.chart.tsv$/} @files)[0]; - my $pdf_file = (grep {/.chart.pdf$/} @files)[0]; + my $table_file = (grep {/\.table\.html$/} @files)[0]; + my $table_R = (grep {/\.table\.R$/i} @files)[0]; + my $dchart_file = (grep {/\.dchart\.html$/} @files)[0]; + my $dchart_R = (grep {/\.dchart\.R$/i} @files)[0]; + my $tsv_file = (grep {/\.chart\.tsv(\.gz|\.bz2)?$/} @files)[0]; + my $pdf_file = (grep {/\.chart\.pdf$/} @files)[0]; my $pdf_R = (grep {/.chart.R$/i} @files)[0]; print $fh Template::content_box_1("Results", - "Input data (txt)", - "Raw data (tsv)", + "Input data (txt.gz)", + "Raw data (tsv.gz)", "Static chart (PDF)", "Interactive chart (HTML)", "Interactive table (HTML)",