Various minor bugfixes and improvemnts; updated hg38.fa.mappability_1…

…00bp.subsetted_for_testdata.bw
epigen · Sep 17, 2021 · 1595c0e · 1595c0e
1 parent b45b13f
commit 1595c0e
Show file tree

Hide file tree

Showing 9 changed files with 32 additions and 17 deletions.
diff --git a/docs/source/liquorice_commandline_tool.rst b/docs/source/liquorice_commandline_tool.rst
@@ -159,6 +159,7 @@ using GNU parallel (http://dx.doi.org/10.5281/zenodo.16303), which is automatica
 Note that the memory usage will increase with the number of parallel jobs (set by the `-j` parameter of parallel).
 We usually allow for 3GB of RAM for each job executed in parallel and set the `-j` parameter accordingly ( `j` = <Total available Memory on the Computer/Server>/3 GB) when running LIQUORICE with default settings on a region-set of 6000 regions.
 Note that memory usage also depends on `extend_to`, `binsize`, `speed_mode`, and scales linearly with the number of regions in your region-sets.
+Finally: LIQUORICE's results will slightly differ based on whether or not you use `--n_cores 1` or `--n_cores <anything larger than 1>`. This is due to differences in the sampling of fragment lengths and nothing to worry about - both results are equally valid.
 
 Sources for input files
 ***********************
@@ -211,7 +212,7 @@ Just follow the example below:
     LIQUORICE --bamfile Ctrl_17_testdata.bam --refgenome_fasta "hg38.p12.fa" \
             --mappability_bigwig "hg38.fa.mappability_100bp.subsetted_for_testdata.bw" \
             --bedpathlist "universal_DHSs.bed" \
-            --blacklist "hg38" --n_cpus "${N_CPUS}"
+            --blacklist "hg38" --n_cpus "${N_CPUS}" --extend_to 15000
 
 
 Example usage of LIQUORICE and the summary tool

diff --git a/liquorice/LIQUORICE.py b/liquorice/LIQUORICE.py
@@ -352,9 +352,15 @@ def main():
             logging.info(f"Did not detect an existing biasmodel under "
                          f"{os.path.realpath('biasmodel/trained_biasmodel.joblib')}.")
     elif not args.detect_existing_biasmodel and pathlib.Path("biasmodel/trained_biasmodel.joblib").exists():
-        logging.warning(f"Found an existing biasmodel under "
+        if not args.train_on_rois:
+            logging.warning(f"Found an existing biasmodel under "
                         f"{os.path.realpath('biasmodel/trained_biasmodel.joblib')}"
                         f". This model will be overwritten as --detect_existing_biasmodel was not specified.")
+        else:
+            logging.warning(f"Found an existing biasmodel under "
+                            f"{os.path.realpath('biasmodel/trained_biasmodel.joblib')}"
+                            f". This model will be ignored as --detect_existing_biasmodel was not specified."
+                            f"Instead, a seperate model will be trained on each region-set in --bedpathlist.")
 
     # Version in which a single bias-model is trained for the sample, based on a set of seperate training regions
     if not args.train_on_rois:

diff --git a/liquorice/LIQUORICE_summary.py b/liquorice/LIQUORICE_summary.py
@@ -303,7 +303,7 @@ def plot_overlay(dirname: str, summary_df: pd.DataFrame, control_name_list: typi
             Line2D([0], [1], lw=1.5, color="darkblue",label='Case samples with significantly weaker coverage drop, '
                                                             f'n={nr_samples["Significantly weaker coverage drop"]}', markersize=1),
             Line2D([0], [1], lw=1.5, color="silver", label='Case samples not sign. different, '
-                                                           f'n={nr_samples["n.s."]}', markersize=1),
+                                                           f'n={nr_samples["n.s."]+nr_samples["N/A"]}', markersize=1),
             Line2D([0], [1], lw=1.2, color="darkseagreen", label=f'Control samples, n={nr_samples["ctrl"]}',
                    markersize=1)]
 

diff --git a/liquorice/create_mappability_bigwigs.sh b/liquorice/create_mappability_bigwigs.sh
@@ -8,17 +8,17 @@ NCORES="$3"
 
 FASTA_NAME="$(basename ${GENOME_FASTA})"
 # dowload the GEM library binary file:
-#wget https://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%203/GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tbz2
-#bzip2 -d GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tbz2
-#tar -xvf GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tar
-#rm GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tar
+wget https://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%203/GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tbz2
+bzip2 -d GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tbz2
+tar -xvf GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tar
+rm GEM-binaries-Linux-x86_64-core_i3-20130406-045632.tar
 export PATH=$(realpath GEM-binaries-Linux-x86_64-core_i3-20130406-045632/bin):$PATH
 #
-#wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/wigToBigWig
-#chmod 744 wigToBigWig
+wget http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/wigToBigWig
+chmod 744 wigToBigWig
 #
 ## Create GEM index and mappability tracks
-#gem-indexer -T $NCORES -i $GENOME_FASTA -o ${FASTA_NAME}.gem_index
+gem-indexer -T $NCORES -i $GENOME_FASTA -o ${FASTA_NAME}.gem_index
 gem-mappability -T $NCORES -I "${FASTA_NAME}.gem_index.gem" -l $READLENGTH -o "${FASTA_NAME}.mappability_${READLENGTH}bp.gem"
 
 # Convert the mappability track to bigwig format

diff --git a/liquorice/data/hg38.fa.mappability_100bp.subsetted_for_testdata.bw b/liquorice/data/hg38.fa.mappability_100bp.subsetted_for_testdata.bw
diff --git a/liquorice/tests/test_LIQUORICE_with_simple_testdata.sh b/liquorice/tests/test_LIQUORICE_with_simple_testdata.sh
@@ -18,5 +18,5 @@ wget https://github.com/epigen/LIQUORICE/raw/master/liquorice/data/universal_DHS
 LIQUORICE --bamfile Ctrl_17_testdata.bam --refgenome_fasta "hg38.p12.fa" \
         --mappability_bigwig "hg38.fa.mappability_100bp.subsetted_for_testdata.bw" \
         --bedpathlist "universal_DHSs.bed" \
-        --blacklist "hg38" --n_cpus "${N_CPUS}"
+        --blacklist "hg38" --n_cpus "${N_CPUS}" --extend_to 15000
 LIQUORICE_summary
diff --git a/liquorice/utils/BiasModel.py b/liquorice/utils/BiasModel.py
@@ -274,7 +274,8 @@ def train_biasmodel_2fold_CV_and_predict(self,exclude_these_bin_nrs:List[int] =[
         y_pred=np.append(y_pred,y_pred_second_half)
 
         y_corr = self.df_to_correct[self.y] - y_pred
-        self.df_to_correct["corrected coverage"]=y_corr
+        # use .assign to avoid settingWithCopyWarning
+        self.df_to_correct=self.df_to_correct.assign(**{"corrected coverage":y_corr.values})
 
         r2=r2_score(self.df_to_correct[self.y],y_pred)
         logging.info(f"Cross-validated R^2 overall: {r2}")

diff --git a/liquorice/utils/CoverageAndSequenceTablePreparation.py b/liquorice/utils/CoverageAndSequenceTablePreparation.py
@@ -2,15 +2,16 @@
 import pandas as pd
 import os
 import pybedtools
-if os.getenv("TMP") is not None:
-    pybedtools.set_tempdir(os.getenv("TMP"))
+if os.getenv("TMPDIR") is not None:
+    pybedtools.set_tempdir(os.getenv("TMPDIR"))
 import pyBigWig
 import logging
 import os
 import typing
 from multiprocessing import Pool
 import pysam
 import numpy as np
+import sys
 os.environ["PYTHONPATH"] = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) # required by Ray, which is
 # used by modin
 import modin.pandas as modinpd
@@ -341,7 +342,13 @@ def get_complete_table(self) -> pd.DataFrame:
 
         if "sequence" not in self.skip_these_steps:
             logging.info("Retrieving genomic sequences for every bin ... ")
-            self.df["sequence"]=self.get_sequence_per_bin()
+            try:
+                self.df["sequence"]=self.get_sequence_per_bin()
+            except ValueError:
+                sys.exit("Could not retrieve all required genomic sequences. This can happen if --tmpdir does not have"
+                         "enough space left to store the temporary files produces by pybedtools. Try making some space"
+                         "or specify a different tmpdir with the --tmpdir flag "
+                         "(or change the value of the $TMP environment variable.")
 
         if "mappability" not in self.skip_these_steps:
             logging.info("Retrieving mappability information for every bin ... ")

diff --git a/liquorice/utils/RegionFilteringAndBinning.py b/liquorice/utils/RegionFilteringAndBinning.py
@@ -2,8 +2,8 @@
 import pandas as pd
 import pybedtools
 import os
-if os.getenv("TMP") is not None:
-    pybedtools.set_tempdir(os.getenv("TMP"))
+if os.getenv("TMPDIR") is not None:
+    pybedtools.set_tempdir(os.getenv("TMPDIR"))
 import logging
 import sys
 import typing