cf

#!/usr/bin/perl
use warnings;
use strict;
use Getopt::Long;
use POSIX qw(strftime);
use FindBin qw($Bin);
use lib "$FindBin::Bin/source";
use CF::Constants;
use CF::Helpers;
use CF::Headnodehelpers;
# use Data::Dumper;

no warnings qw(once);

##########################################################################
# Copyright 2014, Philip Ewels (phil.ewels@babraham.ac.uk)               #
#                                                                        #
# This file is part of Cluster Flow.                                     #
#                                                                        #
# Cluster Flow is free software: you can redistribute it and/or modify   #
# it under the terms of the GNU General Public License as published by   #
# the Free Software Foundation, either version 3 of the License, or      #
# (at your option) any later version.                                    #
#                                                                        #
# Cluster Flow is distributed in the hope that it will be useful,        #
# but WITHOUT ANY WARRANTY; without even the implied warranty of         #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          #
# GNU General Public License for more details.                           #
#                                                                        #
# You should have received a copy of the GNU General Public License      #
# along with Cluster Flow.  If not, see <http://www.gnu.org/licenses/>.  #
##########################################################################

# Version, home directory and raw config hash as provided by CF::Constants
my $CF_VERSION = $CF::Constants::CF_VERSION;
my $homedir    = $ENV{"HOME"};
my %config     = %CF::Constants::config;

# Folders searched for pipelines and modules: the working directory,
# the user's home directory and the Cluster Flow installation directory
my @pipeline_folders = ('./', "$homedir/clusterflow/pipelines/", "$Bin/pipelines/");
my @module_folders   = ('./', "$homedir/clusterflow/modules/", "$Bin/modules/");
my $pipeline;

# Variables filled in from the command line.
# $SPLIT_FILES is the only one with a default value (one file per run).
my $SPLIT_FILES = 1;
my ($GENOME, $file_list);
my ($cl_genome_path, $cl_bowtie_path, $cl_gtf_path);
my ($cl_max_runs, $cl_paired_end, $cl_single_end, $cl_no_fn_check);
my ($cl_params, $cl_email, $cl_priority, $cl_total_cores, $cl_total_mem, $cl_notifications);
my ($cl_list_pipelines, $cl_list_modules, $cl_list_genomes);
my ($cl_dryrun, $cl_qstat, $cl_qstatall, $cl_qstatcolours, $cl_qdel);
my ($cl_make_config, $cl_add_genome, $cl_version, $cl_check_updates, $cl_help);

my $config_result = GetOptions(
	# Reference data
	"genome=s"        => \$GENOME,
	"genome_path=s"   => \$cl_genome_path,
	"bowtie_path=s"   => \$cl_bowtie_path,
	"gtf_path=s"      => \$cl_gtf_path,
	# Input file handling
	"split_files=i"   => \$SPLIT_FILES,
	"max_runs=i"      => \$cl_max_runs,
	"paired"          => \$cl_paired_end,
	"single"          => \$cl_single_end,
	"no_fn_check"     => \$cl_no_fn_check,
	"file_list=s"     => \$file_list,
	# Job settings
	"params=s"        => \$cl_params,
	"email=s"         => \$cl_email,
	"priority=i"      => \$cl_priority,
	"cores=i"         => \$cl_total_cores,
	"mem=s"           => \$cl_total_mem,
	"notifications=s" => \$cl_notifications,
	# Informational and utility flags
	"list_pipelines"  => \$cl_list_pipelines,
	"list_modules"    => \$cl_list_modules,
	"list_genomes"    => \$cl_list_genomes,
	"dry_run"         => \$cl_dryrun,
	"qstat"           => \$cl_qstat,
	"qstatall"        => \$cl_qstatall,
	"qstatcols"       => \$cl_qstatcolours,
	"qdel=s"          => \$cl_qdel,
	"make_config"     => \$cl_make_config,
	"add_genome"      => \$cl_add_genome,
	"version"         => \$cl_version,
	"check_updates"   => \$cl_check_updates,
	"help"            => \$cl_help
);

# GetOptions returns false when it met an option it could not parse
die "Error! could not parse command line options.. For help, run cf --help\n" unless $config_result;

# The first positional argument is the pipeline (or module) name,
# everything after it is treated as an input file
$pipeline = shift(@ARGV);
my @files = @ARGV;

# Working copies of the settings held by CF::Constants
my $EMAIL                     = $CF::Constants::EMAIL;
my $CHECK_UPDATES             = $CF::Constants::CHECK_UPDATES;
my $AVAILABLE_VERSION         = $CF::Constants::AVAILABLE_VERSION;
my $UPDATES_LAST_CHECKED      = $CF::Constants::UPDATES_LAST_CHECKED;
my @NOTIFICATIONS             = @CF::Constants::NOTIFICATIONS;
my $PRIORITY                  = $CF::Constants::PRIORITY;
my $MAX_RUNS                  = $CF::Constants::MAX_RUNS;
my $TOTAL_CORES               = $CF::Constants::TOTAL_CORES;
my $TOTAL_MEM                 = $CF::Constants::TOTAL_MEM;
my $CLUSTER_ENVIRONMENT       = $CF::Constants::CLUSTER_ENVIRONMENT;
my $CUSTOM_JOB_SUBMIT_COMMAND = $CF::Constants::CUSTOM_JOB_SUBMIT_COMMAND;
my $ENV_MODULES_PATH          = $CF::Constants::ENV_MODULES_PATH;
my $PROJECT_ID                = $CF::Constants::PROJECT_ID;
my $JOB_TIMELIMIT             = $CF::Constants::JOB_TIMELIMIT;
my $CF_MODULES                = $CF::Constants::CF_MODULES;

# Working copies of the genome, bowtie and GTF reference tables
my %GENOME_PATH_CONFIGS = %CF::Constants::GENOME_PATH_CONFIGS;
my %BOWTIE_PATH_CONFIGS = %CF::Constants::BOWTIE_PATH_CONFIGS;
my %GTF_PATH_CONFIGS    = %CF::Constants::GTF_PATH_CONFIGS;
my %GENOME_PATHS        = %CF::Constants::GENOME_PATHS;
my %BOWTIE_PATHS        = %CF::Constants::BOWTIE_PATHS;
my %GTF_PATHS           = %CF::Constants::GTF_PATHS;
my %GENOME_SPECIES      = %CF::Constants::GENOME_SPECIES;
my %BOWTIE_SPECIES      = %CF::Constants::BOWTIE_SPECIES;
my %GTF_SPECIES         = %CF::Constants::GTF_SPECIES;
my %GENOME_ASSEMBLIES   = %CF::Constants::GENOME_ASSEMBLIES;
my %BOWTIE_ASSEMBLIES   = %CF::Constants::BOWTIE_ASSEMBLIES;
my %GTF_ASSEMBLIES      = %CF::Constants::GTF_ASSEMBLIES;

# Job name => numeric job ID, filled in as SLURM jobs are submitted
my %JOB_NUM_IDS;

# Command line values take precedence over the configured defaults
$EMAIL = $cl_email if($cl_email);

if($cl_notifications){
	# --notifications is a string of single-letter codes; unknown letters are ignored
	my %notification_for = (
		'c' => 'complete',
		'r' => 'run',
		's' => 'success',
		'e' => 'error',
		'a' => 'abort',
	);
	@NOTIFICATIONS = ();
	foreach my $letter (split(//, $cl_notifications)){
		push (@NOTIFICATIONS, $notification_for{$letter}) if(exists $notification_for{$letter});
	}
}

# A reference path given on the command line is stored under the
# pseudo-genome "command_line", which then becomes the selected genome
foreach my $override ([\%GENOME_PATHS, $cl_genome_path], [\%BOWTIE_PATHS, $cl_bowtie_path], [\%GTF_PATHS, $cl_gtf_path]){
	my ($paths, $cl_path) = @{$override};
	next unless($cl_path);
	$paths->{"command_line"} = $cl_path;
	$GENOME = "command_line";
}

# Priority is tested with defined() so that a priority of 0 can be requested
$PRIORITY    = $cl_priority    if(defined($cl_priority));
$TOTAL_CORES = $cl_total_cores if($cl_total_cores);
$TOTAL_MEM   = $cl_total_mem   if($cl_total_mem);

# Printing command line options
#
# --list_pipelines: print every *.config file (other than clusterflow.config)
# found in the pipeline folders, then exit.
if($cl_list_pipelines){
	print "".("=" x 32)."\nCluster Flow - available pipelines\n".("=" x 32)."\n";
	print "Installed pipelines:\n";
	foreach my $folder (@pipeline_folders){
		if(-e $folder){
			print "    Directory $folder\n";
			# Lexical directory handle, so nothing leaks into the global namespace
			opendir (my $dir_handle, $folder) or die "Can't read directory $folder: $!";
			# Iterate with foreach: a while/shift loop would stop early at
			# an entry whose name is false (for example a file called "0")
			foreach my $file (sort readdir($dir_handle)){
				if(substr($file, -7) eq ".config" && substr($file, 0, -7) ne "clusterflow"){
					print "\t- ".substr($file, 0, -7)."\n";
				}
			}
			closedir($dir_handle);
		} else {
			print "    Directory $folder (not found)\n";
		}
	}
	print "\n";
	exit;
}

# --list_modules: print every *.cfmod file found in the module folders, then exit.
if($cl_list_modules){
	print "".("=" x 32)."\nCluster Flow - available modules\n".("=" x 32)."\n";
	print "Available modules:\n";
	foreach my $folder (@module_folders){
		if(-e $folder){
			print "    Directory $folder\n";
			opendir (my $dir_handle, $folder) or die "Can't read directory $folder: $!";
			foreach my $file (sort readdir($dir_handle)){
				if($file =~ /\.cfmod$/){
					# Drop the six character ".cfmod" suffix
					print "\t- ".substr($file, 0, -6)."\n";
				}
			}
			closedir($dir_handle);
		} else {
			print "    Directory $folder (not found)\n";
		}
	}
	print "\n";
	exit;
}

# --list_genomes: print the genome listing produced by CF::Constants, then exit.
if($cl_list_genomes){
	print "".("=" x 32)."\nCluster Flow - available genomes\n".("=" x 32)."\n";
	print CF::Constants::list_clusterflow_genomes();
	exit;
}
# --dry_run: announce the mode and pause briefly so the notice can be read
if($cl_dryrun){
	print "\n### Cluster Flow is running in Dry Run mode. No cluster jobs will be set off, only printed. Run files will be created. ###\n\n";
	sleep(1);
}

# --qstat: show queue status using the helper for this cluster environment.
# The second helper argument switches coloured output on (1) or off (0).
if($cl_qstat){
	my $use_colours = $cl_qstatcolours ? 1 : 0;
	if($CLUSTER_ENVIRONMENT eq 'GRIDEngine'){
		print CF::Headnodehelpers::parse_qstat(0, $use_colours);
	} elsif($CLUSTER_ENVIRONMENT eq 'SLURM'){
		print CF::Headnodehelpers::parse_squeue(0, $use_colours);
	} else {
		print "Apologies, this function is not yet supported for systems other than GRIDEngine.\n";
	}
	exit;
}

# --qstatall: as --qstat but with the first helper argument set to 1 (GRIDEngine only)
if($cl_qstatall){
	if($CLUSTER_ENVIRONMENT ne 'GRIDEngine'){
		print "Apologies, this function is not yet supported for systems other than GRIDEngine.\n";
		exit;
	}
	my $use_colours = $cl_qstatcolours ? 1 : 0;
	print CF::Headnodehelpers::parse_qstat(1, $use_colours);
	exit;
}

# --make_config: run the interactive config generator, then exit
if($cl_make_config){
	CF::Constants::clusterflow_make_config();
	exit;
}

# Warn user if we don't have a home directory config file
# (probably means no notifications)
if(!-e "$homedir/clusterflow/clusterflow.config"){
	warn "\nYou don't seem to have a config file set up in your
home directory! It's a good idea to set one up, so that
you can get e-mailed notifications from Cluster Flow.
To create one, run cf --make_config\n\n\n";
}

# --add_genome: run the genome wizard, then exit
if($cl_add_genome){
	CF::Constants::clusterflow_add_genome();
	exit;
}

# --qdel: delete all jobs belonging to the given pipeline id
if($cl_qdel){
	my $is_gridengine = ($CLUSTER_ENVIRONMENT eq 'GRIDEngine');
	if($is_gridengine || $CLUSTER_ENVIRONMENT eq 'SLURM'){
		print "\nDeleting jobs from Pipeline id $cl_qdel\n\n";
		if($is_gridengine){
			print CF::Headnodehelpers::cf_pipeline_qdel($cl_qdel);
		} else {
			print CF::Headnodehelpers::cf_pipeline_scancel($cl_qdel);
		}
	} else {
		print "Apologies, this function is not yet supported for systems other than GRIDEngine.\n";
	}
	exit;
}

# --version: print the version number and exit
if($cl_version){
	print "Cluster Flow v$CF_VERSION\n\n";
	exit;
}

# Check for updates - either by command line or config file
if($cl_check_updates){
	print "\nCurrent Cluster Flow version: v$CF_VERSION\n\n";
	print CF::Headnodehelpers::cf_check_updates($CF_VERSION)."\n";
	exit;
}
if($CHECK_UPDATES){
	# The config value is a string of number+unit tokens (d, w, m, y).
	# Convert each unit that is present to seconds and add them up.
	my @unit_seconds = (['d', 86400], ['w', 604800], ['m', 2628000], ['y', 31536000]);
	my $check_interval = 0;
	foreach my $unit_pair (@unit_seconds){
		my ($unit, $seconds) = @{$unit_pair};
		$check_interval += $1 * $seconds if ($CHECK_UPDATES =~ /(\d+)$unit/i);
	}

	# Run the check if we have never checked, or if the interval has elapsed
	my $check_due = (!$UPDATES_LAST_CHECKED || (time - $UPDATES_LAST_CHECKED) > $check_interval);
	if($check_interval && $check_due){
		# The returned message is not displayed here
		my $message = CF::Headnodehelpers::cf_check_updates($CF_VERSION);
	}

	# Print warning if we're running a version of Cluster Flow that is out of date
	if(CF::Headnodehelpers::cf_compare_version_numbers($CF_VERSION, $AVAILABLE_VERSION)){
		print "\n".("="x45)."\n A new version of Cluster Flow is available!\n Running v$CF_VERSION, v$AVAILABLE_VERSION available.\n".("="x45)."\n
You can download the latest version of Cluster Flow from\nhttp://www.bioinformatics.babraham.ac.uk/projects/cluster_flow/\n\n";
	}
}


# --help: pipeline-specific help when a pipeline was named, general help otherwise
if($cl_help){
	if($pipeline){
		print "\n".CF::Constants::clusterflow_pipeline_help($pipeline)."\n";
	} else {
		print CF::Constants::clusterflow_help();
	}
	exit;
}

# From here on we are launching a pipeline, so a name and some input are required
unless($pipeline){
	die("Error - no pipeline specified. Use --help for instructions.\nSyntax: cf [flags] pipeline_name file_1 file_2..\n\n");
}
unless($file_list || scalar(@files) != 0){
	die("Error - no input files specified. Use --help for instructions.\nSyntax: cf [flags] pipeline_name file_1 file_2..\n\n");
}

# Check whether this cluster has environment modules.
# If the configured binary is not executable, switch module loading off
# for the rest of this run and tell the user.
if($CF_MODULES){
	unless(-x $ENV_MODULES_PATH){
		$CF_MODULES = 0;
		warn "\nWarning - Environment module path $ENV_MODULES_PATH not found.\nIf your cluster doesn't use environment modules you are responsible for adding programs to the PATH..\nSee the Cluster Flow documentation for more information.\n\n";
	}
}


# Load in the pipeline config file.
# The first folder containing <pipeline>.config wins; the file is left
# open on the CONFIG handle for the parser further down.
my $config_found = 0;
foreach my $folder (@pipeline_folders){
	my $config_path = $folder."$pipeline.config";
	if(-e $config_path){

		# Three-argument open: the path is built from a user supplied name,
		# so it must never be interpreted as an open mode or a pipe
		open (CONFIG, '<', $config_path) or die "Can't read $config_path: $!";

		$config_found = 1;
		last;
	}
}

# No pipeline found - was a single module specified instead?
if(!$config_found){
	foreach my $folder (@module_folders){
		if(-e $folder."$pipeline.cfmod"){

			# Make a pseudo pipeline config file handle: an in-memory file
			# whose only content is a one-module pipeline
			my $pipeline_config = "\n#$pipeline\n\n";
			open(CONFIG, "<", \$pipeline_config) or die "Can't open variable file handle for single module pipeline: $!\n\n";

			$config_found = 1;
			last;
		}
	}
}
# No pipeline or module found - die with error
if(!$config_found){
	die "Can't find pipeline or module called $pipeline";
}


# If we have a file list input file, take input from there.
# Each non-blank line holds a filename or URL, optionally followed by
# whitespace and the filename that a download should be saved as.
# @download_fns is kept parallel to @files (0 = no download name given).
my @download_fns;
if($file_list){
	unless(-e $file_list){
		print "Error - file list not found: $file_list\nExiting...\n\n";
		exit;
	}
	@files = ();
	open (my $file_list_fh, '<', $file_list) or die "Can't read $file_list: $!";
	while (my $line = <$file_list_fh>) {
		# Strip any mixture of trailing CR / LF characters
		$line =~ s/[\r\n]+$//;
		# Skip blank lines, including lines that only contain whitespace
		# (these previously added an undefined filename to the list)
		next if ($line =~ /^\s*$/);
		# split ' ' ignores leading whitespace, so an indented line
		# does not produce an empty first field
		my @sections = split(' ', $line);
		push @files, $sections[0];
		if(defined($sections[1])){
			push @download_fns, $sections[1];
		} else {
			push @download_fns, 0;
		}
	}
	close($file_list_fh);
}
my $num_files = scalar @files;


# Check file types
# Count FastQ inputs and, unless --no_fn_check was given, insist that all
# inputs share one file extension (a trailing .gz is not part of it).
my $file_ext;
my $fastq_count = 0;
foreach my $input_fn (@files){
	$fastq_count++ if ($input_fn =~ /f(ast)?q(\.gz)?$/i);
	next if($cl_no_fn_check);
	if ($input_fn =~ /\.(\w+)(\.gz)?$/i){
		my $this_ext = $1;
		if($file_ext && $file_ext ne $this_ext){
			die "\nERROR - found a mixture of input file types! (.$file_ext and .$this_ext) Exiting..\nUse --no_fn_check to disable this.\n\n";
		}
		$file_ext = $this_ext;
	}
}

# Auto-detect paired end files for fastq input
if(!$cl_paired_end && !$cl_single_end && $fastq_count > 1){
	my ($paired_count, $single_count) = (0, 0);

	# Compare each file with its successor. The last file has no successor,
	# so the walk stops one short of the end of the list.
	my $idx = 0;
	while($idx < $#files){
		# Make stripped copies of the fns for comparison
		(my $this_stripped = $files[$idx]) =~ s/_R?[1-4]//g;
		(my $next_stripped = $files[$idx+1]) =~ s/_R?[1-4]//g;
		if($this_stripped eq $next_stripped){
			# Both files belong to this pair, so step over the partner
			$paired_count++;
			$idx += 2;
		} else {
			$single_count++;
			$idx++;
		}
	}

	my $all_paired = ($paired_count > 0 && $single_count == 0 && $num_files % 2 == 0);
	my $all_single = ($paired_count == 0 && $single_count > 0);
	if($all_paired){
		$cl_paired_end = 1;
		warn "\nFilenames look like paired-end input. Setting --paired\n";
		warn "Specify --single to prevent this behaviour.\n\n";
	} elsif($all_single){
		$cl_single_end = 1;
		warn "\nFilenames look like single-end input. Setting --single\n";
		warn "Specify --paired to prevent this behaviour.\n\n";
	} else {
		die "\n\nERROR - found a mixture of single end and paired end files! Exiting..\nSpecify --single or --paired to override.\n\n";
	}
}


#
# MAX RUNS and SPLIT FILES
#
# Works out how many input files go into each run ($SPLIT_FILES) and how
# many runs that gives ($num_runs).
#
# A run must contain at least one file. Guarding here stops a zero or
# negative --split_files from causing a division by zero below.
if($SPLIT_FILES < 1){
	$SPLIT_FILES = 1;
}
# Is max_runs set on the command line? Override config defaults if so
if(defined($cl_max_runs)){
	$cl_max_runs =~ s/\D//g; # remove any non-numbers
	$MAX_RUNS = $cl_max_runs;
}
# Is split_files set on the command line? Disable max_runs if so
if($SPLIT_FILES > 2 && !defined($cl_max_runs)){
	$MAX_RUNS = 0;
}
# Forcing paired end or single end
if ($cl_paired_end){
	if($SPLIT_FILES == 1){
		$SPLIT_FILES = 2;
	}
}
# Set split_files now that we know how many files we have
if($MAX_RUNS && $MAX_RUNS > 0){
	# True ceiling division. The previous "+ 0.99 then int()" trick rounded
	# down whenever the fractional part was below 0.01 (e.g. 1001 files in
	# 1000 runs), which produced more runs than max_runs allows.
	$SPLIT_FILES = POSIX::ceil($num_files / $MAX_RUNS);
	# An empty file list gives zero here; keep the divisor below valid
	if($SPLIT_FILES < 1){
		$SPLIT_FILES = 1;
	}
	# Paired end runs need an even number of files so pairs stay together
	if ($cl_paired_end && ($SPLIT_FILES % 2 == 1)){
		$SPLIT_FILES++;
	}
	warn "Processing $num_files files in $MAX_RUNS runs. Submitting $SPLIT_FILES files per run.\n\n";
}
my $num_runs = POSIX::ceil($num_files / $SPLIT_FILES);


## Write the run file, parse the pipeline
# $runfile collects the text shared by every run file.
# %module_tree will hold the pipeline as nested hashes, and @indents is the
# stack of hash references used while building it (it starts at the root).
my $runfile;
my %module_tree;
my @indents = (\%module_tree);
my $comment_block = 0;

# Write config variables to runfile header
$runfile = <<"EOT";
\@split_files	$SPLIT_FILES
\@total_cores	$TOTAL_CORES
\@total_mem	$TOTAL_MEM
EOT
$runfile .= "\@email	$EMAIL\n" if($EMAIL);
$runfile .= "\@priority	$PRIORITY\n" if($PRIORITY);
foreach my $notification (@NOTIFICATIONS){
	$runfile .= "\@notification\t$notification\n";
}

# Reference paths are only written when a genome is selected and has that path
if(defined($GENOME)){
	$runfile .= "\@genome_path\t$GENOME_PATHS{$GENOME}\n" if(defined($GENOME_PATHS{$GENOME}));
	$runfile .= "\@bowtie_path\t$BOWTIE_PATHS{$GENOME}\n" if(defined($BOWTIE_PATHS{$GENOME}));
	$runfile .= "\@gtf_path\t$GTF_PATHS{$GENOME}\n" if(defined($GTF_PATHS{$GENOME}));
}

# Paired end takes precedence if both flags are set
if ($cl_paired_end){
	$runfile .= "\@force_paired_end\n";
} elsif ($cl_single_end){
	$runfile .= "\@force_single_end\n";
}

# Parse pipeline config file
#
# Reads the CONFIG handle line by line. Every line is copied into $runfile;
# in addition:
#   - lines starting with /* or */ switch comment-block mode on and off
#   - @require_genome / @require_bowtie / @require_gtf lines stop the script
#     (warn + exit) when the selected genome lacks that path
#   - lines of the form <tabs>#<module> are added to %module_tree, with the
#     number of leading tabs giving the depth in the tree
# $pipeline_string collects just the module lines for the summary printed
# after the loop.

my $pipeline_string = "";
while (<CONFIG>){
	# Remove line endings, including a Windows style carriage return
	chomp;
	s/\n//;
	s/\r//;
	
	# Ignore comment blocks
	if($_ =~ /^\/\*/){
		$runfile .= "$_\n";
		$comment_block = 1;
		next;
	}
	if($_ =~ /^\*\//){
		$runfile .= "$_\n";
		$comment_block = 0;
		next;
	}
	
	# Required variables
	if($_ =~ /^\@require_genome/ && !$comment_block){
		if(!defined $GENOME){
			warn "\n### Error - The pipeline $pipeline requires a genome to be set with --genome or --genome_path. Exiting... ### \n\n";
			exit;
		} elsif(!defined $GENOME_PATHS{$GENOME}){
			# List what is available to help the user pick a valid genome
			warn "\n### Error ###\nNo genome path found for $GENOME.\n\nAvailable genome paths:\n";
			while ( (my $genome, my $path) = each %GENOME_PATHS ) {
				warn "\tGenome: $genome, Path: $GENOME_PATHS{$genome}\n";
			}
			warn "\n\nExiting...\n\n";
			exit;
		}
	}
	if($_ =~ /^\@require_bowtie/ && !$comment_block){
		if(!defined $GENOME){
			warn "\n### Error - The pipeline $pipeline requires a bowtie index to be set with --genome or --bowtie_path. Exiting... ### \n\n";
			exit;
		} elsif(!defined $BOWTIE_PATHS{$GENOME}){
			warn "\n### Error ###\nNo bowtie path found for $GENOME.\n\nAvailable bowtie paths:\n";
			while ( (my $genome, my $path) = each %BOWTIE_PATHS ) {
			  warn "\tGenome: $genome, Path: $BOWTIE_PATHS{$genome}\n";
			}
			warn "\n\nExiting...\n\n";
			exit;
		}
	}
	if($_ =~ /^\@require_gtf/ && !$comment_block){
		if(!defined $GENOME){
			warn "\n### Error - The pipeline $pipeline requires a GTF path to be set with --genome or --gtf_path. Exiting... ### \n\n";
			exit;
		} elsif(!defined $GTF_PATHS{$GENOME}){
			warn "\n### Error ###\nNo GTF path found for $GENOME.\n\nAvailable GTF paths:\n";
			while ( (my $genome, my $path) = each %GTF_PATHS ) {
			  warn "\tGenome: $genome, Path: $GTF_PATHS{$genome}\n";
			}
			warn "\n\nExiting...\n\n";
			exit;
		}
	}
	
	# Read the pipeline tree
	if($_ =~ /^(\t*)#/ && !$comment_block){
	
		# Add in any command line parameters to each module
		if($cl_params){
			$pipeline_string .= "$_ $cl_params\n";
			$runfile .= "$_ $cl_params\n";
			# Also append to $_ so that the parameters become part of the tree key
			$_ .= " $cl_params";
		} else {
			$pipeline_string .= "$_\n";
			$runfile .= "$_\n";
		}
		
		# After this substitution $1 holds the leading tabs and $_ holds
		# the module name plus any parameters
		$_ =~ s/^(\t*)#//; # Remove hash and count tabs
		# Cut the stack back to this module's parent (root + one entry per tab)
		splice @indents, length($1)+1;
		# Create an empty hash for this module under its parent and make it
		# the new top of the stack
		push @indents, $indents[$#indents]->{$_} = {};
	
	# Add everything else to runfile string
	} else {
		$runfile .= "$_\n";
	}
	
}

close(CONFIG);

warn "Pipeline to be used:\n\n$pipeline_string\n\n";


# Work out quota of cores and memory allowed per job
# The totals are shared out equally between every job that could be running
# at the same time: one per terminal module of the pipeline, per run.
$TOTAL_MEM = CF::Helpers::human_readable_to_bytes($TOTAL_MEM);
# Count terminal leaves in pipeline and number of files
my ($num_leaves, $num_jobs) = count_leaves(\%module_tree, 0, 0);
my $num_parallel = $num_leaves * $num_runs;
# A pipeline without modules, or an empty list of input files, leaves nothing
# to share the resources between. Stop with a clear message instead of the
# "Illegal division by zero" error that the sums below would otherwise raise.
if($num_parallel < 1){
	die "\nError - pipeline $pipeline has no modules or there are no input files to process, so there is nothing to submit. Exiting..\n\n";
}
# Calculate per-job resources
my $cores_allocation = int( $TOTAL_CORES / $num_parallel);
my $memory_allocation = int( $TOTAL_MEM / $num_parallel);
# Sanity checks
if($cores_allocation < 1){
	$cores_allocation = 1; # minimum one core
}
if($memory_allocation < 104857600){
	$memory_allocation = 104857600; # minimum of 100 MB (100 * 1024 * 1024 bytes)
}
my $memory_allocation_hr = CF::Helpers::bytes_to_human_readable($memory_allocation);


# Recursively walks a pipeline tree (nested hash references) and counts it.
#
# Arguments:
#   $mod_tree   - hash reference for the branch to examine
#   $num_leaves - running total of nodes that have no children
#   $num_jobs   - running total of calls made to this function, i.e. one
#                 for this branch plus one for every node that has children
# Returns the updated ($num_leaves, $num_jobs) pair.
sub count_leaves {
	my ($mod_tree, $num_leaves, $num_jobs) = @_;

	# This call itself counts once
	$num_jobs += 1;

	for my $child (values %{$mod_tree}) {
		# A node without children is a terminal leaf
		unless (scalar keys(%{$child}) > 0) {
			$num_leaves += 1;
			next;
		}

		# Otherwise descend, carrying both totals along
		($num_leaves, $num_jobs) = count_leaves($child, $num_leaves, $num_jobs);
	}

	return ($num_leaves, $num_jobs);
}


# Go through the supplied starting files
#
# The input files are handled in groups of $SPLIT_FILES. For each group a
# run file is written, a download job is queued for every URL, the pipeline
# modules are queued with make_qsubs() and a "run finished" job is added.
# After the last group a single "all runs finished" job is queued.
# Submission commands accumulate in @qsubs.
my @qsubs;
my @job_ids;
my $jid_base = 'cf_'.$pipeline.'_'.time.'_';
my %run_job_ids;
my $job_id;
my $prev_dl_id = "";
my @outfns;
my @finished_run_ids;
for (my $i = 0; $i <= $#files; $i++){
	my $fn = $files[$i];
	my $first_fn = $fn;
	my $outfn = $fn."_".$pipeline."_clusterFlow.txt";
	
	# Make the run file
	my $runfn = $fn."_$pipeline.run";
	my $date = strftime "%H:%M, %d-%m-%Y", localtime;
	my $this_runfile = "/*\nCluster Flow Run File\nPipeline: $pipeline\nCreated at $date\n*/\n\n".$runfile."\n\n";
	
	# Add files to run file
	my $max_i = $i + $SPLIT_FILES;
	my $counter = 0;
	for (; $i < $max_i; $i++){
		$counter++;
		if($i > $#files) {
			last;
		}
		
		$fn = $files[$i];
		
		# Write status update
		if($CLUSTER_ENVIRONMENT =~ /SLURM/i){ warn "\n"; }
		warn "Processing file ".($i+1)." - $fn\n";
		
		# Is this filename a URL?
		if ($fn =~ /^(((ht|f)tp(s?))\:\/\/)/){
			
			# Find download filename if it exists
			my $dl_fn;
			if($download_fns[$i]){
				$dl_fn = $download_fns[$i];
			} else {
				my @parts = split("/", $fn);
				$dl_fn = pop(@parts);
			}
			
			# Only do this the first time, else we create a new file for each download when using --split_files
			if ($runfn =~ /^(((ht|f)tp(s?))\:\/\/)/){
				$runfn = $dl_fn."_$pipeline.run";
				$outfn = $dl_fn."_".$pipeline."_clusterFlow.txt";
			}
			
			push @outfns, $outfn;
			
			# URL - set up download module qsub job, dependent on previous download job
			$job_id = $jid_base.'download_'.sprintf("%03d", rand(999));
			my $dlcmd = "$Bin/modules/cf_download.cfmod $runfn $job_id $fn $dl_fn";
			# Hold on the previous download, if there was one. This list used to
			# be re-declared with "my" inside an if block, so the value was lost
			# when that block ended and the dependency was never passed on.
			my @dlhjid = length($prev_dl_id) > 0 ? ($prev_dl_id) : ();
			my $qsub = make_submit_cmd($dlcmd, $job_id, \@dlhjid, $prev_dl_id, '1', '1G', $outfn);
			push @qsubs, $qsub;
			push @job_ids, $job_id;
			$prev_dl_id = $job_id;
			
			# Done with the download URL. Set $fn for later stuff
			$fn = $dl_fn;
			if($counter == 1){
				$first_fn = $dl_fn;
			}
		
		} elsif(-e $fn) { # Not a URL. Does this file exist?
			# Add the starting filename to the run file
			$job_id = 'start_000';
			$this_runfile .= "$job_id\t$fn\n";
			push @outfns, $outfn;
		
		} else { # Not a URL, file doesn't exist. Something is wrong!
			die "\nFile $fn doesn't exist.. Something is wrong! Exiting...\n\n";
		}
	
	}
	$i--; # Loop increments $i when it finishes. Revert this increment before next loop
	$fn = $first_fn;
	
	# Deduplicate output fn array (the order of the names is not preserved)
	@outfns = do { my %h; @h{@outfns} = @outfns; values %h };
	
	# Write out the run file
	open (RUNOUT,'>',$runfn) or die "Can't write to $runfn: $!";
	print RUNOUT $this_runfile;
	close(RUNOUT);
	
	# Make up qsub jobs
	make_qsubs (\%module_tree, $job_id, $runfn, $outfn);
	
	# Qsub job to execute on completion of this run
	my $run_finish_id = $jid_base."email_run_complete_".sprintf("%03d", rand(999));
	my $run_finish_cmd = "$Bin/modules/cf_run_finished.cfmod $runfn $run_finish_id null 1 1G $pipeline $outfn";
	# my $run_finish_holdjid = join(",", @{$run_job_ids{$runfn}});
	my $run_finish_qsub = make_submit_cmd($run_finish_cmd, $run_finish_id, \@{$run_job_ids{$runfn}}, 0, '1', '1G', $outfn);
	
	push @qsubs, $run_finish_qsub;
	push @finished_run_ids, $run_finish_id;
	
	# Final qsub job to execute on completion of ALL JOBS
	if($i == $#files){
		my $all_runs_finish_id = $jid_base."email_pipeline_complete_".sprintf("%03d", rand(999));
		# NOTE(review): the module is handed $run_finish_id rather than
		# $all_runs_finish_id as its job id - confirm that this is intended
		my $all_runs_finish_cmd = "$Bin/modules/cf_runs_all_finished.cfmod $runfn $run_finish_id null 1 1G $pipeline ".join(" ", @outfns);
		# my $all_runs_finish_holdjid = join(",", @finished_run_ids);
		my $all_runs_finish_qsub = make_submit_cmd($all_runs_finish_cmd, $all_runs_finish_id, \@finished_run_ids, 0, '1', '1G', $outfn);
		
		push @qsubs, $all_runs_finish_qsub;
	}
}

# Queues a cluster job for every module on one branch of the pipeline tree,
# then recurses into each module's children.
#
# Arguments:
#   $mod_tree - hash reference for the branch; keys are "module parameters"
#   $prev_job - ID of the job that these modules must wait for
#   $runfn    - run file name, passed to every module
#   $outfn    - log file name, passed to the job submission command
#
# Side effects: appends to @qsubs and @job_ids, records each job ID under
# $run_job_ids{$runfn}, and copies environment-module settings into %ENV.
# Dies if a module file cannot be found or is not executable.
sub make_qsubs {
	# Set up parameters for this branch
	my ($mod_tree, $prev_job, $runfn, $outfn) = @_;
	my %mod_load_failures;
	
	# Strip number from download job ID so that these files are all read in the next module.
	# This is done on a copy: $prev_job itself must keep the real job ID,
	# because every module on this branch has to hold on that exact job.
	(my $prev_job_name = $prev_job) =~ s/download_[\d]{3}$/download/;
	
	# Run through leaves
	foreach my $mod_key ( keys %{$mod_tree} ){
	
		# Set up parameters for these leaves
		my ($module, $parameters) = split(/\s/, $mod_key, 2);
		
		# Skip empty entries before anything is built from the module name
		unless(defined($module) && length($module) > 0){
			next;
		}
		unless($parameters) {
			$parameters = '';
		}
		my $job_id = $jid_base.$module.'_'.sprintf("%03d", rand(999));
		
		# Find the module file
		my $module_fn;
		foreach my $folder (@module_folders){
			if((-e $folder."$module.cfmod") && (! -d $folder."$module.cfmod")){
				$module_fn = $folder."$module.cfmod";
				last;
			}
		}
		if(!$module_fn){
			die "Can't find module file $module.cfmod";
		}
		unless(-x $module_fn){
			die "Can't execute module file $module.cfmod\n";
		}
		
		# Get options for module
		my $cores = `$module_fn --cores $TOTAL_CORES`;
		my $mem = `$module_fn --mem $TOTAL_MEM`;
		my $required_modules = `$module_fn --modules`;
		
		# Remove trailing newlines / whitespace from the module output. Without
		# this a newline makes the core count look non-numeric (so it falls
		# back to 1) and ends up inside the submission command via $mem.
		foreach my $module_output ($cores, $mem, $required_modules){
			$module_output =~ s/\s+$// if(defined($module_output));
		}
		
		if(!$cores || length($cores) == 0 || $cores =~ /\D/){
			$cores = 1;
		}
		if(!$mem || length($mem) == 0 || length($mem) > 20){
			$mem = '1G';
		}
		if($CF_MODULES){
			# Check that we have the path
			unless (-e $ENV_MODULES_PATH){
				warn "WARNING - could not find environment module path: $ENV_MODULES_PATH\nEnvironment modules have not been loaded.\n\n";
			} else {
				if($required_modules && length($required_modules) > 0){
					# grep drops the empty field that leading whitespace would produce
					my @req_modules = grep { length($_) > 0 } split(/[\s,]+/, $required_modules);
					foreach my $mod (@req_modules) {
						# Redirect STDERR to dev/null but record failures
						my $bash_string = `$ENV_MODULES_PATH bash load $mod 2>/dev/null`;
						if($bash_string && length($bash_string) > 0){
							# The output is a list of NAME=value statements separated by
							# semicolons; copy each one into our own environment
							my @bashes = split(';', $bash_string);
							foreach my $bash (@bashes){
								my @envs = split('=', $bash);
								if($envs[0] ne 'export' && defined($envs[1])){
									$envs[1] =~ s/\\//g; # Hacky fix to get rid of weird backslash problem
									$ENV{$envs[0]} = $envs[1];
								}
							}
						} elsif($?) {
							$mod_load_failures{$mod} = 1;
						}
					}
				}
			}
		}
		
		# Create cluster job submission command
		my $cmd = "$module_fn $runfn $job_id $prev_job_name $cores $mem $parameters";
		my @holdids = ($prev_job);
		my $qsub = make_submit_cmd($cmd, $job_id, \@holdids, $prev_job_name, $cores, $mem, $outfn);
		
		push @qsubs, $qsub;
		push @job_ids, $job_id;
		# The array for this run file is created automatically on first push
		push @{$run_job_ids{$runfn}}, $job_id;
		
		# Recursively call this function if we're not at the end of a branch
		if ( ref($mod_tree->{$mod_key}) eq 'HASH') {
			make_qsubs ( $mod_tree->{$mod_key}, $job_id, $runfn, $outfn ) ;
		}
	}
	
	# Warn about modules which failed to load..
	if(scalar keys(%mod_load_failures) > 0){
		warn "\nWarning - the following environment modules could not be loaded: ".join(", ", keys(%mod_load_failures))."\n";
		warn "To disable this warning, set \@ignore_modules to true in your config file.\n";
	}
}

# Replaces {{name}} placeholders in a job submission template.
#
# Arguments: the template, then a flat list of name => value pairs.
# Pairs are applied in the order given and every occurrence of a placeholder
# is replaced. Undefined values are treated as empty strings.
# Returns the filled-in string.
sub _fill_submit_template {
	my ($template, @pairs) = @_;
	while(my ($name, $value) = splice(@pairs, 0, 2)){
		$value = '' unless(defined($value));
		# Braces are escaped so they are always matched literally
		$template =~ s/\{\{\Q$name\E\}\}/$value/g;
	}
	return $template;
}

# Builds the job submission command for the configured cluster environment.
#
# Arguments:
#   $cmd      - command line the job should run
#   $job_id   - name given to the job
#   $holdjid  - array reference of job names that this job must wait for
#   $prev_job - previous job name; no dependency is added when it is 'start_000'
#   $cores    - number of cores to request
#   $mem      - memory to request
#   $outfn    - file that job output is written to
#
# Returns the submission command as a string.
# On SLURM the job is also submitted here (unless in dry run mode) and its
# numeric ID is stored in %JOB_NUM_IDS, because later jobs need that number
# to declare their dependencies.
# Dies if the cluster environment is not LSF, SLURM or GRIDEngine.
sub make_submit_cmd {
	
	# Get the required variables to assemble this job submission
	# $holdjid should be an array reference
	my ($cmd, $job_id, $holdjid, $prev_job, $cores, $mem, $outfn) = @_;
	
	my $qsub = "";
	my $has_dependencies = (scalar @{$holdjid} > 0 && $prev_job ne 'start_000');
	
	#########################
	# Build command for LSF #
	#########################
	if($CLUSTER_ENVIRONMENT =~ /LSF/i){
		# bsub treats everything after the job command as arguments to that
		# command, so with the default template the command is added last,
		# after the priority and dependency options.
		my $append_command = 0;
		
		# Build command string
		if($CUSTOM_JOB_SUBMIT_COMMAND && length($CUSTOM_JOB_SUBMIT_COMMAND) > 0){
			$qsub = $CUSTOM_JOB_SUBMIT_COMMAND;
		} else {
			$qsub = 'bsub -n {{cores}} -M {{mem}} -R "rusage[mem={{mem}}]" -o {{outfn}} -J {{job_id}} -N';
			# Do we have a priority?
			if($PRIORITY && $PRIORITY >= 0){
				$qsub .= ' -sp {{priority}}';
			}
			$append_command = 1;
		}
		
		# Memory has to be specified in megabytes for LSF
		$mem = CF::Helpers::mem_return_mbs($mem);
		
		# Swap in real values ({{mem}} appears twice in the default template)
		$qsub = _fill_submit_template($qsub,
			'command'  => $cmd,
			'job_id'   => $job_id,
			'outfn'    => $outfn,
			'cores'    => $cores,
			'mem'      => $mem,
			'priority' => $PRIORITY,
			'email'    => $EMAIL,
		);
		
		# Job dependencies
		if($has_dependencies){
			$qsub .= " -w 'done(\"" . join('") && done("', @{$holdjid}) . "\")'";
		}
		
		if($append_command){
			$qsub .= ' "'.$cmd.'"';
		}
	
	###########################
	# Build command for SLURM #
	###########################
	} elsif($CLUSTER_ENVIRONMENT =~ /SLURM/i){
		
		# Build command string
		if($CUSTOM_JOB_SUBMIT_COMMAND && length($CUSTOM_JOB_SUBMIT_COMMAND) > 0){
			$qsub = $CUSTOM_JOB_SUBMIT_COMMAND;
		} else {
			$qsub = 'sbatch -p core -n {{cores}} --open-mode=append -o {{outfn}} -J {{job_id}} {{notifications}} --wrap="{{command}}"';
			# Do we have a priority?
			if($PRIORITY && $PRIORITY >= 0){
				$qsub .= " --priority {{priority}}";
			}
		}
		
		# Work out qsub notification settings
		my $notification_string = "";
		if(defined($EMAIL) && length($EMAIL) > 0 && @NOTIFICATIONS){
			$notification_string .= ' --mail-user={{email}} ';
			# Collect the mail types into one comma separated --mail-type
			# option rather than repeating the option once per type
			my @mail_types;
			if(grep { $_ eq 'suspend' || $_ eq 'abort' } @NOTIFICATIONS){
				push @mail_types, 'FAIL';
			}
			if(grep { $_ eq 'end' } @NOTIFICATIONS){
				push @mail_types, 'END';
			}
			if(@mail_types){
				$notification_string .= '--mail-type='.join(',', @mail_types).' ';
			}
		}
		
		# Swap in real values. {{notifications}} must come before {{email}},
		# because the notification string itself contains {{email}}.
		$qsub = _fill_submit_template($qsub,
			'command'       => $cmd,
			'job_id'        => $job_id,
			'outfn'         => $outfn,
			'cores'         => $cores,
			'mem'           => $mem,
			'priority'      => $PRIORITY,
			'notifications' => $notification_string,
			'email'         => $EMAIL,
		);
		
		# Job dependencies
		# This is horrible on SLURM (depends on capturing job submission STDOUT)
		if($has_dependencies){
			my @jidarray = ();
			foreach my $jname (@{$holdjid}){
				if(exists $JOB_NUM_IDS{$jname}){
					push(@jidarray, $JOB_NUM_IDS{$jname});
				} elsif(!$cl_dryrun) {
					warn "\nCouldn't find numeric job ID for $jname\n";
				}
			}
			if(scalar @jidarray > 0){
				$qsub .= " --dependency=afterany:" . join(':', @jidarray);
			}
		}
		
		# Submit Job and capture ID
		unless($cl_dryrun) {
			my $job_submit = `$qsub`;
			chomp($job_submit);
			if($job_submit =~ /Submitted batch job (\d+)/){
				$JOB_NUM_IDS{$job_id} = $1;
				warn "$job_submit ($job_id)\n";
			} else {
				warn "\nERROR! Couldn't find job id for $job_id: $job_submit\n\n";
			}
		}
	
	
	#################################
	# Build command for GRID Engine #
	#################################
	} elsif($CLUSTER_ENVIRONMENT =~ /GRIDEngine/i){
		
		# Build command string
		if($CUSTOM_JOB_SUBMIT_COMMAND && length($CUSTOM_JOB_SUBMIT_COMMAND) > 0){
			$qsub = $CUSTOM_JOB_SUBMIT_COMMAND;
		} else {
			$qsub = 'echo "{{command}}" | qsub -cwd -V -pe orte {{cores}} -l vf={{mem}} -o {{outfn}} -j y -N {{job_id}} {{notifications}}';
			# Do we have a priority?
			if($PRIORITY && $PRIORITY <= 0){
				$qsub .= " -p {{priority}}";
			}
		}
		
		# Work out qsub notification settings
		my $notification_string = "";
		if(defined($EMAIL) && length($EMAIL) > 0 && @NOTIFICATIONS){
			my $mail_flags = '';
			foreach my $not (@NOTIFICATIONS){
				if($not eq 'suspend'){
					$mail_flags .= 's';
				}
				if($not eq 'end'){
					$mail_flags .= 'e';
				}
				if($not eq 'abort'){
					$mail_flags .= 'a';
				}
			}
			$notification_string = "-M {{email}}";
			# Only add -m when there is a flag to give it. A bare -m would
			# take the next option on the command line as its argument.
			if(length($mail_flags) > 0){
				$notification_string .= " -m $mail_flags";
			}
		}
		
		# Swap in real values. {{notifications}} must come before {{email}},
		# because the notification string itself contains {{email}}.
		$qsub = _fill_submit_template($qsub,
			'command'       => $cmd,
			'job_id'        => $job_id,
			'outfn'         => $outfn,
			'cores'         => $cores,
			'mem'           => $mem,
			'priority'      => $PRIORITY,
			'notifications' => $notification_string,
			'email'         => $EMAIL,
		);
		
		# Job dependencies
		if($has_dependencies){
			$qsub .= " -hold_jid ". join(',', @{$holdjid});
		}
	
	###########################################
	# Unrecognised Cluster Environment String #
	###########################################
	} else {
		die ("ERROR - Cluster Environment config value not recognised: $CLUSTER_ENVIRONMENT\n\n");
	}
	
	return $qsub;
	
}

# Print qsub jobs to the terminal or submit to cluster
if($cl_dryrun){
	# Dry run: show the commands instead of running them
	print "\n\n".('-' x 46)."\n Jobs that would be submitted to the cluster\n".('-' x 46)."\n\n";
	foreach my $submit_cmd (@qsubs){
		print "$submit_cmd\n\n";
	}
	if($CLUSTER_ENVIRONMENT =~ /SLURM/i){
		print "\n\n".('-' x 86)."\n Warning - Jobs do not have dependencies (SLURM jobs have to be launched to do this)\n".('-' x 86)."\n\n";
	}
} elsif ($CLUSTER_ENVIRONMENT !~ /SLURM/i){
	# SLURM jobs were already submitted while the commands were being built,
	# so only the other environments are submitted here
	foreach my $submit_cmd (@qsubs){
		system $submit_cmd;
	}
}