Oct 23, 2014
eFORGE first commit
2
3
=head1 NAME
4
5
eforge.pl - Experimentally derived Functional element Overlap analysis of ReGions from EWAS.
6
7
=head1 SYNOPSIS
8
9
eforge.pl options (-f file) (-mvp mvplist)
10
11
=head1 DESCRIPTION
12
13
Analyse a set of MVPs for their overlap with DNase I hotspots compared to matched background MVPs.
14
Identifies enrichment in DHS by tissue and plots graphs and a table to display the results. Arbitrarily, a minimum of 5* MVPs is required.
15
Note that if no MVPs are given the script will run on A DEFAULT EWAS* as an example output.
16
17
Several outputs are made.
18
19
A straight base R graphics pdf chart of the data.
20
21
A polychart (https://github.com/Polychart/polychart2) interactive javascript graphic using rCharts (http://ramnathv.github.io/rCharts/).
22
23
A dimple (http://dimplejs.org) d3 interactive graphic using rCharts.
24
25
A table using the Datatables (https://datatables.net) plug-in for the jQuery Javascript library, again accessed through rCharts.
26
27
In each of the graphics the colouring should be consistent. Blue (p value > 0.05), light red or pink (0.05 >= p value > 0.01), red or dark red (p value <= 0.01) for the 95% and 99% CIs.
28
Or whatever other thresholds are specified.
29
30
eForge functions, plotting options and stats are provided by eForge::eForge, eForge::ePlot and eForge::eStats modules.
31
32
=head1 OPTIONS
33
34
=over
35
May 10, 2016
Clean up the code
195
# Defaults for the command line options. The string defaults must not carry
# padding: they are used verbatim as the input format name and the plot label.
my $array;                  # No default is assigned at declaration
my $array_label;
my $format = 'probeid';     # Input format
my $label  = 'Unnamed';     # Label for plots
my $reps   = 1000;
# set binomial p values, multiple test correction is used
my $thresh;                 # string for command line option
my $t_marginal = 0.05;      # default marginal p-value threshold
my $t_strict   = 0.01;      # default strict p-value threshold
204
Sep 30, 2015
New DB schema - both code and DB are data-agnostic
252
# # ============================================================================
253
# # Connect to the DB
254
# # ============================================================================
255
# Read eforge.ini from the directory holding this script and take the data
# directory from the 'datadir' entry of its [Files] section.
my $dirname = dirname(__FILE__);
my $cfg_file = "$dirname/eforge.ini";

# Config::IniFiles->new returns undef when the file cannot be read or parsed;
# fail here with its own diagnostics instead of on the method call below.
my $cfg = Config::IniFiles->new(-file => $cfg_file)
    or die "Cannot read config file $cfg_file: @Config::IniFiles::errors\n";

my $datadir = $cfg->val('Files', 'datadir');
if (!defined $datadir) {
    die "No 'datadir' entry in the [Files] section of $cfg_file\n";
}

# -s is false for both a missing and a zero-length file
unless (-s "$datadir/$dbname") {
    die "Database $dbname not found or empty";
}
my $dsn = "dbi:SQLite:dbname=$datadir/$dbname";
my $dbh = DBI->connect($dsn, "", "") or die $DBI::errstr;
265
# # ============================================================================
266
267
268
# # ============================================================================
269
# # Check the dataset against the info on the DB
270
# # ============================================================================
271
my $all_datasets = get_all_datasets($dbh );
272
if (!defined ($all_datasets )) {
273
die " Empty database: no dataset loaded!\n " ;
May 11, 2016
More code cleanup
342
# This will read the probes from the file if provided, from the probe list otherwise or use the
# example data set as a last resort.
my $mvps = get_input_probes($filename, $probe_list);

# Shallow copy of the input list: $mvps itself is reassigned further down.
my $original_mvps = [@$mvps];
my $num_of_input_mvps = scalar(@$mvps);

# Apply the proximity filter if requested
my $proximity_excluded;
if (defined $proxy) {
    ($proximity_excluded, $mvps) = proximity_filter($dbh, $array, $mvps);

    # Report the excluded probes in sorted order so that the log is reproducible between runs.
    for my $excluded_mvp (sort keys %$proximity_excluded) {
        my $mvp = $proximity_excluded->{$excluded_mvp};
        warn "$excluded_mvp excluded for $proxy proximity filter with $mvp\n";
    }
}
May 11, 2016
More code cleanup
357
# $annotated_probes is an arrayref with probe_id, sum, bit, gene_group, cgi_group for each input probe
my $annotated_probes = get_probe_annotations_and_overlap_for_dataset($dbh, $dataset, $array, $mvps);

# Keep only the probe ids (first column) that came back with an annotation. The leading '+' and the
# parenthesised pair make the hash constructor and the map block unambiguous to the parser.
my $existing_probes = +{ map { ($_->[0] => 1) } @$annotated_probes };
$mvps = [keys %$existing_probes];
May 11, 2016
More code cleanup
382
# $samples is a hash whose keys are the $cells (short name for the cell type/lines) and value is
# another hash with 'tissue', 'datatype', 'file' and 'acc' keys.
# IMPORTANT: $cells contains the list of cells in the order defined in the DB. This is critical
# to correctly assign each bit to the right sample.
my ($cells, $samples) = get_samples_from_dataset($dbh, $dataset);
May 11, 2016
Even more code cleanup
389
# $overlaps is a complex hash like:
# $overlaps->{'MVPS'}->{$probe_id}->{'SUM'} (total number of overlaps of this probe with features in this dataset)
# $overlaps->{'MVPS'}->{$probe_id}->{'PARAMS'} (gene and CGI annotations for this probe)
# $overlaps->{'CELLS'}->{$cell}->{'COUNT'} (number of input MVPs that overlap with the signal on this sample)
# $overlaps->{'CELLS'}->{$cell}->{'MVPS'} (list of input MVPs that overlap with the signal on this sample)
my $overlaps = process_overlaps($annotated_probes, $cells, $dataset);
May 11, 2016
More code cleanup
541
# Produce the three outputs from the same results file.
Chart($results_filename, $lab, $out_dir, $samples, $cells, $label, $t_marginal, $t_strict, $dataset); # basic pdf plot
dChart($results_filename, $lab, $out_dir, $dataset, $label, $t_marginal, $t_strict, $web); # rCharts Dimple chart
table($results_filename, $lab, $out_dir, $web); # Datatables table
May 10, 2016
Clean up the code
556
# ###################################################################################################
557
# ###################################################################################################
558
# #
559
# # Sub-functions
560
# #
561
# ###################################################################################################
562
# ###################################################################################################
563
564
565
=head2 parse_pvalue_thresholds

  Arg[1]        : string $thresholds
  Returns       : list with the marginal and the strict thresholds, in that order
  Example       : ($t_marginal, $t_strict) = parse_pvalue_thresholds("0.05,0.01");
  Description   : This function returns both the marginal and the strict p-value thresholds as
                  read from the command line option. The input string should contain both
                  numbers separated by a comma. Whitespace around the numbers is ignored and
                  any value after the second one is discarded.
  Exceptions    : Dies if $thresholds is undefined or empty, if either value is missing or not
                  a number, or unless 1 >= marginal >= strict >= 0.

=cut

sub parse_pvalue_thresholds {
    my ($thresh) = @_;

    # Test for definedness and content rather than truth: the string "0" is
    # not empty, it is an incomplete list and is reported as such below.
    if (!defined $thresh || $thresh !~ /\S/) {
        die "Cannot read p-value thresholds from an empty string\n";
    }

    # Work on the local copy only; @_ is not modified.
    $thresh =~ s/\A\s+|\s+\z//g;

    # Split on the comma itself, tolerating blanks on either side of it.
    my ($t_marginal, $t_strict) = split /\s*,\s*/, $thresh;

    # A missing second value is undef, which looks_like_number rejects.
    unless (looks_like_number($t_marginal) && looks_like_number($t_strict)) {
        die "You must specify numerical p-value thresholds in a comma separated list\n";
    }

    # NaN passes looks_like_number but fails every comparison, so it is
    # rejected here together with out-of-range and mis-ordered values.
    unless ((1 >= $t_marginal) && ($t_marginal >= $t_strict) && ($t_strict >= 0)) {
        die "The p-value thresholds should be 1 >= T.marginal >= T.strict >= 0\n";
    }

    return ($t_marginal, $t_strict);
}
595
596