Skip to content

Commit

Permalink
Various minor bugfixes and improvemnts; updated hg38.fa.mappability_1…
Browse files Browse the repository at this point in the history
…00bp.subsetted_for_testdata.bw
  • Loading branch information
tomazou-lab committed Sep 17, 2021
1 parent b45b13f commit 1595c0e
Show file tree
Hide file tree
Showing 9 changed files with 32 additions and 17 deletions.
3 changes: 2 additions & 1 deletion docs/source/liquorice_commandline_tool.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,7 @@ using GNU parallel (http://dx.doi.org/10.5281/zenodo.16303), which is automatica
Note that the memory usage will increase with the number of parallel jobs (set by the `-j` parameter of parallel).
We usually allow for 3GB of RAM for each job executed in parallel and set the `-j` parameter accordingly ( `j` = <Total available Memory on the Computer/Server>/3 GB) when running LIQUORICE with default settings on a region-set of 6000 regions.
Note that memory usage also depends on `extend_to`, `binsize`, `speed_mode`, and scales linearly with the number of regions in your region-sets.
Finally: LIQUORICE's results will slightly differ based on whether or not you use `--n_cores 1` or `--n_cores <anything larger than 1>`. This is due to differences in the sampling of fragment lengths and nothing to worry about - both results are equally valid.

Sources for input files
***********************
Expand Down Expand Up @@ -211,7 +212,7 @@ Just follow the example below:
LIQUORICE --bamfile Ctrl_17_testdata.bam --refgenome_fasta "hg38.p12.fa" \
--mappability_bigwig "hg38.fa.mappability_100bp.subsetted_for_testdata.bw" \
--bedpathlist "universal_DHSs.bed" \
--blacklist "hg38" --n_cpus "${N_CPUS}"
--blacklist "hg38" --n_cpus "${N_CPUS}" --extend_to 15000
Example usage of LIQUORICE and the summary tool
Expand Down
8 changes: 7 additions & 1 deletion liquorice/LIQUORICE.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,9 +352,15 @@ def main():
logging.info(f"Did not detect an existing biasmodel under "
f"{os.path.realpath('biasmodel/trained_biasmodel.joblib')}.")
elif not args.detect_existing_biasmodel and pathlib.Path("biasmodel/trained_biasmodel.joblib").exists():
logging.warning(f"Found an existing biasmodel under "
if not args.train_on_rois:
logging.warning(f"Found an existing biasmodel under "
f"{os.path.realpath('biasmodel/trained_biasmodel.joblib')}"
f". This model will be overwritten as --detect_existing_biasmodel was not specified.")
else:
logging.warning(f"Found an existing biasmodel under "
f"{os.path.realpath('biasmodel/trained_biasmodel.joblib')}"
f". This model will be ignored as --detect_existing_biasmodel was not specified."
f"Instead, a seperate model will be trained on each region-set in --bedpathlist.")

# Version in which a single bias-model is trained for the sample, based on a set of seperate training regions
if not args.train_on_rois:
Expand Down
2 changes: 1 addition & 1 deletion liquorice/LIQUORICE_summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def plot_overlay(dirname: str, summary_df: pd.DataFrame, control_name_list: typi
Line2D([0], [1], lw=1.5, color="darkblue",label='Case samples with significantly weaker coverage drop, '
f'n={nr_samples["Significantly weaker coverage drop"]}', markersize=1),
Line2D([0], [1], lw=1.5, color="silver", label='Case samples not sign. different, '
f'n={nr_samples["n.s."]}', markersize=1),
f'n={nr_samples["n.s."]+nr_samples["N/A"]}', markersize=1),
Line2D([0], [1], lw=1.2, color="darkseagreen", label=f'Control samples, n={nr_samples["ctrl"]}',
markersize=1)]

Expand Down
14 changes: 7 additions & 7 deletions liquorice/create_mappability_bigwigs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,17 @@ NCORES="$3"

FASTA_NAME="$(basename ${GENOME_FASTA})"
# dowload the GEM library binary file:
#wget https://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%203/GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tbz2
#bzip2 -d GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tbz2
#tar -xvf GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tar
#rm GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tar
wget https://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%203/GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tbz2
bzip2 -d GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tbz2
tar -xvf GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tar
rm GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tar
export PATH=$(realpath GEM-binaries-Linux-x86_64-core_i3-20130406-045632/bin):$PATH
#
#wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/wigToBigWig
#chmod 744 wigToBigWig
wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/wigToBigWig
chmod 744 wigToBigWig
#
## Create GEM index and mappability tracks
#gem-indexer -T $NCORES -i $GENOME_FASTA -o ${FASTA_NAME}.gem_index
gem-indexer -T $NCORES -i $GENOME_FASTA -o ${FASTA_NAME}.gem_index
gem-mappability -T $NCORES -I "${FASTA_NAME}.gem_index.gem" -l $READLENGTH -o "${FASTA_NAME}.mappability_${READLENGTH}bp.gem"

# Convert the mappability track to bigwig format
Expand Down
Binary file not shown.
2 changes: 1 addition & 1 deletion liquorice/tests/test_LIQUORICE_with_simple_testdata.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,5 @@ wget https://github.com/epigen/LIQUORICE/raw/master/liquorice/data/universal_DHS
LIQUORICE --bamfile Ctrl_17_testdata.bam --refgenome_fasta "hg38.p12.fa" \
--mappability_bigwig "hg38.fa.mappability_100bp.subsetted_for_testdata.bw" \
--bedpathlist "universal_DHSs.bed" \
--blacklist "hg38" --n_cpus "${N_CPUS}"
--blacklist "hg38" --n_cpus "${N_CPUS}" --extend_to 15000
LIQUORICE_summary
3 changes: 2 additions & 1 deletion liquorice/utils/BiasModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,8 @@ def train_biasmodel_2fold_CV_and_predict(self,exclude_these_bin_nrs:List[int] =[
y_pred=np.append(y_pred,y_pred_second_half)

y_corr = self.df_to_correct[self.y] - y_pred
self.df_to_correct["corrected coverage"]=y_corr
# use .assign to avoid settingWithCopyWarning
self.df_to_correct=self.df_to_correct.assign(**{"corrected coverage":y_corr.values})

r2=r2_score(self.df_to_correct[self.y],y_pred)
logging.info(f"Cross-validated R^2 overall: {r2}")
Expand Down
13 changes: 10 additions & 3 deletions liquorice/utils/CoverageAndSequenceTablePreparation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,16 @@
import pandas as pd
import os
import pybedtools
if os.getenv("TMP") is not None:
pybedtools.set_tempdir(os.getenv("TMP"))
if os.getenv("TMPDIR") is not None:
pybedtools.set_tempdir(os.getenv("TMPDIR"))
import pyBigWig
import logging
import os
import typing
from multiprocessing import Pool
import pysam
import numpy as np
import sys
os.environ["PYTHONPATH"] = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) # required by Ray, which is
# used by modin
import modin.pandas as modinpd
Expand Down Expand Up @@ -341,7 +342,13 @@ def get_complete_table(self) -> pd.DataFrame:

if "sequence" not in self.skip_these_steps:
logging.info("Retrieving genomic sequences for every bin ... ")
self.df["sequence"]=self.get_sequence_per_bin()
try:
self.df["sequence"]=self.get_sequence_per_bin()
except ValueError:
sys.exit("Could not retrieve all required genomic sequences. This can happen if --tmpdir does not have"
"enough space left to store the temporary files produces by pybedtools. Try making some space"
"or specify a different tmpdir with the --tmpdir flag "
"(or change the value of the $TMP environment variable.")

if "mappability" not in self.skip_these_steps:
logging.info("Retrieving mappability information for every bin ... ")
Expand Down
4 changes: 2 additions & 2 deletions liquorice/utils/RegionFilteringAndBinning.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import pandas as pd
import pybedtools
import os
if os.getenv("TMP") is not None:
pybedtools.set_tempdir(os.getenv("TMP"))
if os.getenv("TMPDIR") is not None:
pybedtools.set_tempdir(os.getenv("TMPDIR"))
import logging
import sys
import typing
Expand Down

0 comments on commit 1595c0e

Please sign in to comment.