gbouras13 · gbouras13 · Mar 3, 2024 · Feb 28, 2024 · Feb 29, 2024 · Feb 29, 2024
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
@@ -26,7 +26,12 @@ jobs:
         shell: bash -l {0}
         run: |
           python -m pip install -U pip
-          just install
+          pip install -e .
+          pip install black
+          pip install isort
+          pip install pytest
+          pip install pytest-cov
+          pip install git+https://github.com/rrwick/Unicycler.git
       - name: Build a binary wheel and a source tarball
         shell: bash -l {0}
         run: just build

diff --git a/HISTORY.md b/HISTORY.md
@@ -1,9 +1,14 @@
 # History
 
+1.6.1 (2024-03-02)
+------------------
+
+* Bug fixes and added tests for `--depth_filter` which would crash in some scenarios.
+
 1.6.0 (2024-03-01)
 ------------------
 
-* Addes `--depth_filter`. This will filter out all contigs that have long- (and short-read for `plassembler run`) read copy numbers that are less than the specified depth filter. Defaults to 0.25x.
+* Adds `--depth_filter`. This will filter out all contigs that have long- (and short-read for `plassembler run`) read copy numbers that are less than the specified depth filter. Defaults to 0.25x.
 * Adds `--unicycler_options`  and `--spades_options` which allows passing extra Unicycler options (#46)
 
 1.5.1 (2024-02-01)

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "plassembler"
-version = "1.6.0" # change VERSION too
+version = "1.6.1" # change VERSION too
 description = "Quickly and accurately assemble plasmids in hybrid sequenced bacterial isolates"
 authors = ["George Bouras <george.bouras@adelaide.edu.au>"]
 license = "MIT"

diff --git a/src/plassembler/utils/VERSION b/src/plassembler/utils/VERSION
@@ -1 +1 @@
-1.6.0
+1.6.1
diff --git a/src/plassembler/utils/plass_class.py b/src/plassembler/utils/plass_class.py
@@ -705,62 +705,51 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
             combined_depth_mash_df["contig"].str.contains("chromosome"), "PLSDB_hit"
         ] = ""
 
-        # filter the dataframe by depth filter
-        all_contig_ids = combined_depth_mash_df["contig"].astype(str).tolist()
+        # get chroms and plasmids
 
-        if "mean_depth_short" in combined_depth_mash_df.columns:
-            combined_depth_mash_df = combined_depth_mash_df[
+        combined_chrom_df = combined_depth_mash_df[
+            combined_depth_mash_df["contig"].str.contains("chromosome")
+        ]
+        combined_plasmid_df = combined_depth_mash_df[
+            ~combined_depth_mash_df["contig"].str.contains("chromosome")
+        ]
+
+        # get all plasmid contig ids and then filter
+        all_plasmid_contig_ids = combined_plasmid_df["contig"].astype(str).tolist()
+
+        if "mean_depth_short" in combined_plasmid_df.columns:
+            combined_plasmid_df = combined_plasmid_df[
                 (
-                    combined_depth_mash_df["plasmid_copy_number_long"].astype(float)
+                    combined_plasmid_df["plasmid_copy_number_long"].astype(float)
                     > depth_filter
                 )
                 | (
-                    combined_depth_mash_df["plasmid_copy_number_short"].astype(float)
+                    combined_plasmid_df["plasmid_copy_number_short"].astype(float)
                     > depth_filter
                 )
-                | (
-                    combined_depth_mash_df["contig"].str.contains(
-                        "chromosome", case=False
-                    )
-                )
             ]
         else:  # plassembler long (long only)
-            combined_depth_mash_df = combined_depth_mash_df[
+            combined_plasmid_df = combined_plasmid_df[
                 (
-                    combined_depth_mash_df["plasmid_copy_number_long"].astype(float)
+                    combined_plasmid_df["plasmid_copy_number_long"].astype(float)
                     > depth_filter
                 )
-                | (
-                    combined_depth_mash_df["contig"].str.contains(
-                        "chromosome", case=False
-                    )
-                )
             ]
 
-        # get list of all ids that were kept above threshold
-        kept_contig_ids = combined_depth_mash_df["contig"].astype(str).tolist()
+        # get list of all ids that were kept above depth threshold
+        kept_plasmid_contig_ids = combined_plasmid_df["contig"].astype(str).tolist()
 
         # List of 'contig_ids' that were filtered out
         filtered_out_contig_ids = [
             contig_id
-            for contig_id in all_contig_ids
-            if contig_id not in kept_contig_ids
-        ]
-
-        # Identify the index of the first row that doesn't contain 'chromosome' in 'contig_id' - first plasmid index
-        # None otherwise
-        filtered_indices = combined_depth_mash_df.index[
-            ~combined_depth_mash_df["contig"].str.contains("chromosome")
+            for contig_id in all_plasmid_contig_ids
+            if contig_id not in kept_plasmid_contig_ids
         ]
 
-        if not filtered_indices.empty:
-            p1_idx = filtered_indices.min()
-        else:
-            p1_idx = None
-
+        # logging
         # needs to be at least 1 filtered out id if the filtering did anything
         logger.info(f"Filtering contigs below depth filter: {depth_filter}.")
-        if "mean_depth_short" in combined_depth_mash_df.columns:
+        if "mean_depth_short" in combined_plasmid_df.columns:
             logger.info(
                 f"All plasmids whose short and long read copy numbers are both below {depth_filter} will be removed."
             )
@@ -770,24 +759,24 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
             )
 
         if len(filtered_out_contig_ids) > 0:
-            if p1_idx is None:
+            if len(kept_plasmid_contig_ids) == 0:
                 logger.warning(f"There are 0 plasmids left after depth filtering.")
             else:
                 logger.info(
-                    f"{len(filtered_indices)} plasmids were filtered as they were below the depth filter."
+                    f"{len(filtered_out_contig_ids)} plasmids were filtered as they were below the depth filter."
                 )
-
-            # Updating 'contig_id' names starting from 1 from the identified index
-            # if it is None then there are only chromosome contigs left so no need for this
-            if p1_idx is not None:
-                combined_depth_mash_df.loc[p1_idx:, "contig"] = range(
-                    1, len(combined_depth_mash_df) - p1_idx + 2
-                )
-                # Reset index after renaming
-                combined_depth_mash_df.reset_index(drop=True, inplace=True)
         else:
             logger.info(f"No plasmids were filtered due to low depth.")
 
+        # concat dfs back
+        # there is 1+ plasmid
+        if len(kept_plasmid_contig_ids) > 0:
+            combined_depth_mash_df = pd.concat(
+                [combined_chrom_df, combined_plasmid_df], axis=0
+            )
+        else:  # only chroms
+            combined_depth_mash_df = combined_chrom_df
+
         combined_depth_mash_df.to_csv(
             os.path.join(outdir, prefix + "_summary.tsv"), sep="\t", index=False
         )
@@ -796,7 +785,7 @@ def combine_depth_mash_tsvs(self, prefix, depth_filter):
 
     def finalise_contigs(self, prefix):
         """
-        Renames the contigs of unicycler with the new plasmid copy numbers and outputs finalised file
+        Filters contigs plassembler run
         """
         outdir = self.outdir
 
@@ -808,23 +797,42 @@ def finalise_contigs(self, prefix):
         ].reset_index(drop=True)
         # get contigs only
         plasmid_fasta = os.path.join(outdir, "unicycler_output", "assembly.fasta")
-        i = 0
         with open(os.path.join(outdir, prefix + "_plasmids.fasta"), "w") as dna_fa:
             for dna_record in SeqIO.parse(plasmid_fasta, "fasta"):
                 split_desc = dna_record.description.split(" ")
+                contig_id = str(split_desc[0])
                 # only keep the contigs that passed the depth threshold
-                if str(split_desc[0]) not in filtered_out_contig_ids:
+                if contig_id not in filtered_out_contig_ids:
+                    sr_copy_number_list = combined_depth_mash_df.loc[
+                        combined_depth_mash_df["contig"] == contig_id,
+                        "plasmid_copy_number_short",
+                    ].values
+                    lr_copy_number_list = combined_depth_mash_df.loc[
+                        combined_depth_mash_df["contig"] == contig_id,
+                        "plasmid_copy_number_long",
+                    ].values
+                    if len(sr_copy_number_list) > 0:
+                        sr_copy_number = sr_copy_number_list[0]
+                    else:
+                        logger.error("Plassembler failed")
+
+                    if len(lr_copy_number_list) > 0:
+                        lr_copy_number = lr_copy_number_list[0]
+                    else:
+                        logger.error("Plassembler failed")
+
+                    # will be ordered so new_contig_count will be the index of the df
+                    # but 1 index for the output
                     if "circular" in dna_record.description:  # circular contigs
-                        id_updated = f"{i} {split_desc[1]} plasmid_copy_number_short={combined_depth_mash_df.plasmid_copy_number_short[i]}x plasmid_copy_number_long={combined_depth_mash_df.plasmid_copy_number_long[i]}x circular=true"
+                        id_updated = f"{contig_id} {split_desc[1]} plasmid_copy_number_short={sr_copy_number}x plasmid_copy_number_long={lr_copy_number}x circular=true"
                     else:  # non circular contigs
-                        id_updated = f"{i} {split_desc[1]} plasmid_copy_number_short={combined_depth_mash_df.plasmid_copy_number_short[i]}x plasmid_copy_number_long={combined_depth_mash_df.plasmid_copy_number_long[i]}x"
-                    i += 1
+                        id_updated = f"{contig_id} {split_desc[1]} plasmid_copy_number_short={sr_copy_number}x plasmid_copy_number_long={lr_copy_number}x"
                     record = SeqRecord(dna_record.seq, id=id_updated, description="")
                     SeqIO.write(record, dna_fa, "fasta")
 
     def finalise_contigs_long(self, prefix):
         """
-        Renames the contigs of assembly with new ones
+        Filters contigs plassembler long
         """
         outdir = self.outdir
 
@@ -836,21 +844,30 @@ def finalise_contigs_long(self, prefix):
         ].reset_index(drop=True)
         # get contigs only
         plasmid_fasta = os.path.join(outdir, "plasmids.fasta")
-        i = 0
         with open(os.path.join(outdir, prefix + "_plasmids.fasta"), "w") as dna_fa:
             for dna_record in SeqIO.parse(plasmid_fasta, "fasta"):
                 # only keep the contigs that passed the depth threshold
-                if str(dna_record.id) not in filtered_out_contig_ids:
+                contig_id = str(dna_record.id)
+                if contig_id not in filtered_out_contig_ids:
                     length = len(dna_record.seq)
-                    copy_number = combined_depth_mash_df.plasmid_copy_number_long[i]
+                    lr_copy_number_list = combined_depth_mash_df.loc[
+                        combined_depth_mash_df["contig"] == contig_id,
+                        "plasmid_copy_number_long",
+                    ].values
+                    if len(lr_copy_number_list) > 0:
+                        lr_copy_number = lr_copy_number_list[0]
+                    else:
+                        logger.error("Plassembler failed")
+
                     if (
                         "circular" in dna_record.description
                     ):  # circular contigs from canu
-                        desc = f"len={length} plasmid_copy_number_long={copy_number}x circular=True"
+                        desc = f"len={length} plasmid_copy_number_long={lr_copy_number}x circular=True"
                     else:
-                        desc = f"len={length} plasmid_copy_number_long={copy_number}x"
-                    i += 1
-                    record = SeqRecord(dna_record.seq, id=str(i), description=desc)
+                        desc = (
+                            f"len={length} plasmid_copy_number_long={lr_copy_number}x"
+                        )
+                    record = SeqRecord(dna_record.seq, id=contig_id, description=desc)
                     SeqIO.write(record, dna_fa, "fasta")
 
 

diff --git a/tests/test_data/end_to_end/input_R1.fastq.gz b/tests/test_data/end_to_end/input_R1.fastq.gz
diff --git a/tests/test_data/end_to_end/input_R2.fastq.gz b/tests/test_data/end_to_end/input_R2.fastq.gz
diff --git a/tests/test_data/end_to_end/input_depth_filter.fastq.gz b/tests/test_data/end_to_end/input_depth_filter.fastq.gz
diff --git a/tests/test_data/end_to_end/input_half.fastq.gz b/tests/test_data/end_to_end/input_half.fastq.gz
diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py
@@ -124,17 +124,6 @@ def test_plassembler_case_3(self):
         exec_command(cmd)
         remove_directory(outdir)
 
-    def test_plassembler_case_3(self):
-        """test plassembler run - chromosome and plasmids assembled with Flye"""
-        longreads: Path = f"{end_to_end}/input_fastq.gz"
-        s1: Path = f"{end_to_end}/input_R1.fastq.gz"
-        s2: Path = f"{end_to_end}/input_R2.fastq.gz"
-        chromosome = 50000
-        outdir: Path = f"{end_to_end}/test_out"
-        cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir}  -t 8 -f"
-        exec_command(cmd)
-        remove_directory(outdir)
-
     def test_plassembler_case_4(self):
         """test plassembler run case 4. Only chromosome assembled with flye, no plasmid in recovery."""
         longreads: Path = f"{end_to_end}/abaumanii_plasmid.fastq.gz"
@@ -159,7 +148,7 @@ def test_plassembler_multiple_chromosomes_no_plasmids(self):
 
     # copy number
 
-    def test_plassembler_case_depth_filter(self):
+    def test_plassembler_case_depth_filter_all(self):
         """test plassembler run depth_filter 1.2 - will have no plasmids left"""
         longreads: Path = f"{end_to_end}/input_fastq.gz"
         s1: Path = f"{end_to_end}/input_R1.fastq.gz"
@@ -170,6 +159,17 @@ def test_plassembler_case_depth_filter(self):
         exec_command(cmd)
         remove_directory(outdir)
 
+    def test_plassembler_case_depth_filter_some(self):
+        """test plassembler run depth_filter 0.6 with input_half lr - will have only 1 plasmid"""
+        longreads: Path = f"{end_to_end}/input_half.fastq.gz"
+        s1: Path = f"{end_to_end}/input_R1.fastq.gz"
+        s2: Path = f"{end_to_end}/input_R2.fastq.gz"
+        chromosome = 50000
+        outdir: Path = f"{end_to_end}/test_out"
+        cmd = f"plassembler run -l {longreads} -c {chromosome} -1 {s1} -2 {s2} -d {plassembler_db_dir} -o {outdir}  -t 8 -f --depth_filter 0.6"
+        exec_command(cmd)
+        remove_directory(outdir)
+
     def test_plassembler_case_extra_unicycler_spades_opts(self):
         """test plassembler with extra unicycler and spades opts"""
         longreads: Path = f"{end_to_end}/input_fastq.gz"
@@ -263,6 +263,24 @@ def test_plassembler_long(self):
         exec_command(cmd)
         remove_directory(outdir)
 
+    def test_plassembler_depth_all(self):
+        """test plassembler long depth filter will all plasmids filtered"""
+        longreads: Path = f"{end_to_end}/input_fastq.gz"
+        chromosome = 50000
+        outdir: Path = f"{end_to_end}/test_out"
+        cmd = f"plassembler long -l {longreads} -c {chromosome} -d {plassembler_db_dir} -o {outdir} --depth_filter 10 -t 8 -f"
+        exec_command(cmd)
+        remove_directory(outdir)
+
+    def test_plassembler_depth_some(self):
+        """test plassembler long depth filter will some plasmids filtered"""
+        longreads: Path = f"{end_to_end}/input_depth_filter.fastq.gz"
+        chromosome = 50000
+        outdir: Path = f"{end_to_end}/test_out"
+        cmd = f"plassembler long -l {longreads} -c {chromosome} -d {plassembler_db_dir} -o {outdir} --depth_filter 2  -t 8 -f"
+        exec_command(cmd)
+        remove_directory(outdir)
+
     # def test_plassembler_long_canu(self):
     #     """test plassembler long canu"""
     #     longreads: Path = f"{end_to_end}/input_fastq.gz"