Skip to content

Commit

Permalink
WIP
Browse files Browse the repository at this point in the history
  • Loading branch information
AngryMaciek committed Mar 20, 2024
1 parent 9c015c3 commit 0056f6a
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 83 deletions.
27 changes: 25 additions & 2 deletions modules/CREATE_SITECOUNT_MATRICES/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,19 @@ def CSM_get_abspath(p):
"""
return(p if p[0] == os.sep else os.path.abspath(p))





def CSM_helper(wildcards):
if wildcards.CSM_region_id in config["CSM_regions_files"]:
return config["CSM_regions_files"][wildcards.CSM_region_id]
else:
return config["CSM_additional_regions_files"][wildcards.CSM_region_id]




##############################################################################
### Target rule with final output of the pipeline
##############################################################################
Expand All @@ -117,6 +130,17 @@ rule CSM_all:
CSM_outdir = CSM_generate_all_matrices_wildcards()["output_dir"],
CSM_region_id = CSM_generate_all_matrices_wildcards()["region_IDs"],
CSM_window_id = CSM_generate_all_matrices_wildcards()["matrix_IDs"]
),
LIST_additional_sitecount_matrices_links = expand(
os.path.join(
"{CSM_outdir}",
"{CSM_region_id}",
"{CSM_window_id}",
"matrix.tsv"
),
CSM_outdir = config["CSM_outdir"],
CSM_region_id = config["CSM_additional_regions_files"].keys(),
CSM_window_id = "FULL"
)

##############################################################################
Expand Down Expand Up @@ -177,8 +201,7 @@ rule CSM_extract_window_coord_and_sequence:
"{CSM_outdir}",
"CSM_outdir"
),
BED_region = lambda wildcards: \
config["CSM_regions_files"][wildcards.CSM_region_id],
BED_region = lambda wildcards: CSM_helper(wildcards),
FASTA_genome = config["CSM_genomic_sequence"],
SCRIPT_ = os.path.join(
config["CSM_scripts_dir"],
Expand Down
22 changes: 14 additions & 8 deletions modules/CREATE_SITECOUNT_MATRICES/configs/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ CSM_scripts_dir: "../scripts"

# path for the output directory
# (relative to the 'execution' directory)
CSM_outdir: "../output"
CSM_outdir: "../3pUTR_kmers_3to7mers"

# path to the genomic sequence in fasta format
CSM_genomic_sequence: ""
CSM_genomic_sequence: "/data/scc2/ag-gruber-ext/wsciekly.maciek/resources_ENSEMBL_hsa/Homo_sapiens.GRCh38.dna.primary_assembly.fa"

# relative ranges around respective sites
CSM_regions_ranges:
Expand All @@ -31,25 +31,31 @@ CSM_regions_ranges:

# paths to the bed-formatted sites
CSM_regions_files:
"polyA": ""
"3ss": ""
"5ss": ""
"polyA": "/data/scc2/ag-gruber-ext/wsciekly.maciek/mapp-precomputed-ase-tpa-csm/tpa_outdir_unstranded/tandem_pas.terminal_exons.representative_sites.bed"
"3ss": "/data/scc2/ag-gruber-ext/wsciekly.maciek/mapp-precomputed-ase-tpa-csm/ase_outdir_minlen0/3ss.bed"
"5ss": "/data/scc2/ag-gruber-ext/wsciekly.maciek/mapp-precomputed-ase-tpa-csm/ase_outdir_minlen0/5ss.bed"

# paths to the bed-formatted additional whole regions
CSM_additional_regions_files:
"3pUTR_minlen150_prefix": "/data/scc2/ag-gruber-ext/wsciekly.maciek/mapp-stability-analysis/singular-sites-margin25-minutrlen150/clean-sites-prefix-UTR.bed"
"3pUTR_minlen150_total": "/data/scc2/ag-gruber-ext/wsciekly.maciek/mapp-stability-analysis/singular-sites-margin25-minutrlen150/clean-sites-total-UTR.bed"
"3pUTR_minlen150_suffix": "/data/scc2/ag-gruber-ext/wsciekly.maciek/mapp-stability-analysis/singular-sites-margin25-minutrlen150/clean-sites-suffix-UTR.bed"

# sliding window configuration: window size and slide step
CSM_window_size: "50"
CSM_window_step: "25"

# sitecount matrix type: "kmers" or "pwms" options available
CSM_matrix_type: "pwms"
CSM_matrix_type: "kmers"

# options for "kmers" sitecount matrices:
# inclusive ends specification for the range of k-mers sizes = [kmer_min,kmer_max]
CSM_kmer_min: "3"
CSM_kmer_max: "6"
CSM_kmer_max: "3"

# options for "pwms" sitecount matrices:
# path to the directory with TRANSFAC-formatted PWM files
CSM_pwm_directory: ""
CSM_pwm_directory: "/data/scc2/ag-gruber-ext/wsciekly.maciek/ATtRACT_hsa_clean/motifs"

# MotEvo parameter: prior probability for the background binding
CSM_MotEvo_bg_binding_prior: 0.99
Expand Down
15 changes: 14 additions & 1 deletion modules/CREATE_SITECOUNT_MATRICES/execution/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,20 @@ case "$ENV$TECH" in
sgeconda)
snakemake \
--configfile="$CONFIGFILE" \
--profile="../profiles/sge-conda"
--snakefile ../Snakefile \
--use-conda \
--conda-prefix="$CONDAPREFIX" \
--cluster "qsub -r y -q scc -pe smp {threads} -l h_rt=168:00:00 -l h_vmem={resources.mem_mb}M -e {params.LOG_cluster_log}" \
--jobs 4096 \
--cores all \
--retries 1 \
--default-resources mem_mb=32000*attempt disk_mb=32000*attempt \
--printshellcmds \
--rerun-incomplete \
--jobscript "../../../execution/sge-jobscript.sh" \
--conda-frontend mamba \
--keep-going \
--latency-wait 600 \
;;
sgesingularity)
echo "Singularity technology is not supported for Grid Engine yet."
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
snakefile: "../Snakefile"
cores: 1
cores: 64
printshellcmds: true
rerun-incomplete: true
use-conda: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,82 +69,87 @@ def parse_arguments():
def main():
"""Main body of the script."""

# test that the input BED-file indeed contains coordinates for single sites
outdir = options.outdir

with open(options.bed) as bed_file:
bed_lines = bed_file.read().splitlines()
for line in bed_lines:
line_parsed = line.split("\t")
assert int(line_parsed[2]) - int(line_parsed[1]) == 1

outdir = options.outdir

# parse the relative coordinates from the encoded window name
coords = [i[1:].split("to") for i in options.window.split(".")]
coords = [coords[0][0], coords[0][1], coords[1][0], coords[1][1]]

# Construct bash commands to extract region coordinates into bed
if int(coords[0]) and not int(coords[3]): # whole window upstream the site
assert not int(coords[2])
beg = coords[0]
end = coords[1]
command = (
"cat "
+ options.bed
+ ' | awk -F "\\t" \
\'{{ if ($6 == "+") print $1"\\t"$2-'
+ beg
+ '"\\t"$2-'
+ end
+ '"\\t"$4"\\t"$5"\\t"$6; else if ($6 == "-") print $1"\\t"$3+'
+ end
+ '"\\t"$3+'
+ beg
+ '"\\t"$4"\\t"$5"\\t"$6}}\' \
1> '
+ os.path.join(outdir, "coordinates.bed")
)
if options.window == "FULL":
command = "cp " + options.bed + " " + os.path.join(outdir, "coordinates.bed")

else:
# test that the input BED-file indeed contains coordinates for single sites
for line in bed_lines:
line_parsed = line.split("\t")
assert int(line_parsed[2]) - int(line_parsed[1]) == 1

# parse the relative coordinates from the encoded window name
coords = [i[1:].split("to") for i in options.window.split(".")]
coords = [coords[0][0], coords[0][1], coords[1][0], coords[1][1]]

# Construct bash commands to extract region coordinates into bed
if int(coords[0]) and not int(coords[3]): # whole window upstream the site
assert not int(coords[2])
beg = coords[0]
end = coords[1]
command = (
"cat "
+ options.bed
+ ' | awk -F "\\t" \
\'{{ if ($6 == "+") print $1"\\t"$2-'
+ beg
+ '"\\t"$2-'
+ end
+ '"\\t"$4"\\t"$5"\\t"$6; else if ($6 == "-") print $1"\\t"$3+'
+ end
+ '"\\t"$3+'
+ beg
+ '"\\t"$4"\\t"$5"\\t"$6}}\' \
1> '
+ os.path.join(outdir, "coordinates.bed")
)

elif int(coords[3]) and not int(coords[0]): # whole window downstream the site
assert not int(coords[1])
beg = coords[2]
end = coords[3]
command = (
"cat "
+ options.bed
+ ' | awk -F "\\t" \
\'{{ if ($6 == "+") print $1"\\t"$2+'
+ beg
+ '"\\t"$2+'
+ end
+ '"\\t"$4"\\t"$5"\\t"$6; else if ($6 == "-") print $1"\\t"$3-'
+ end
+ '"\\t"$3-'
+ beg
+ '"\\t"$4"\\t"$5"\\t"$6}}\' \
1> '
+ os.path.join(outdir, "coordinates.bed")
)
elif int(coords[3]) and not int(coords[0]): # whole window downstream the site
assert not int(coords[1])
beg = coords[2]
end = coords[3]
command = (
"cat "
+ options.bed
+ ' | awk -F "\\t" \
\'{{ if ($6 == "+") print $1"\\t"$2+'
+ beg
+ '"\\t"$2+'
+ end
+ '"\\t"$4"\\t"$5"\\t"$6; else if ($6 == "-") print $1"\\t"$3-'
+ end
+ '"\\t"$3-'
+ beg
+ '"\\t"$4"\\t"$5"\\t"$6}}\' \
1> '
+ os.path.join(outdir, "coordinates.bed")
)

else: # window goes through the site
assert not int(coords[1]) and not int(coords[2])
beg = coords[0]
end = coords[3]
command = (
"cat "
+ options.bed
+ ' | awk -F "\\t" \
\'{{ if ($6 == "+") print $1"\\t"$2-'
+ beg
+ '"\\t"$2+'
+ end
+ '"\\t"$4"\\t"$5"\\t"$6; else if ($6 == "-") print $1"\\t"$3-'
+ end
+ '"\\t"$3+'
+ beg
+ '"\\t"$4"\\t"$5"\\t"$6}}\' \
1> '
+ os.path.join(outdir, "coordinates.bed")
)
else: # window goes through the site
assert not int(coords[1]) and not int(coords[2])
beg = coords[0]
end = coords[3]
command = (
"cat "
+ options.bed
+ ' | awk -F "\\t" \
\'{{ if ($6 == "+") print $1"\\t"$2-'
+ beg
+ '"\\t"$2+'
+ end
+ '"\\t"$4"\\t"$5"\\t"$6; else if ($6 == "-") print $1"\\t"$3-'
+ end
+ '"\\t"$3+'
+ beg
+ '"\\t"$4"\\t"$5"\\t"$6}}\' \
1> '
+ os.path.join(outdir, "coordinates.bed")
)

# call bash cat->awk command to extract absolute coordinates of the window
os.system(command)
Expand Down

0 comments on commit 0056f6a

Please sign in to comment.