Rene hamburger1993 patch 1 (#113)

* Fix #112: drop columns that contain only None before grouping. This speeds the function up roughly 5x when called as `fp.to_dataframe(return_atoms=True, drop_empty=True)`. --------- Co-authored-by: Cédric Bouysset <bouysset.cedric@gmail.com>
chemosim-lab · Apr 5, 2023 · e2497c1 · e2497c1
1 parent 6046761
commit e2497c1
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,7 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-## [1.1.0] - 2022-11-XX
+### Changed
+- Converting the IFP to a dataframe with atom indices has been optimized and now runs
+  about 5 times faster (Issue #112, PR #113 by @ReneHamburger1993).
+
+
+## [1.1.0] - 2022-11-18
 
 ### Added
 - `Fingerprint.run` now has a `converter_kwargs` parameter that can pass kwargs to the

diff --git a/prolif/utils.py b/prolif/utils.py
@@ -303,15 +303,25 @@ def to_dataframe(
         )
     df = pd.DataFrame(values, columns=columns, index=index)
     if has_atom_indices and return_atoms:
+        if drop_empty:
+            # remove empty columns before grouping
+            df = df[df.columns[~df.applymap(lambda x: x is None).all()]]
+            # ensure columns are still in pairs of two after groupby
+            assert (
+                df.groupby(axis=1, level=["ligand", "protein", "interaction"])
+                .apply(lambda g: len(g.columns))
+                .eq(2)
+                .all()
+            )
+        # aggregate each interaction for a pair of residues as a tuple of ligand and
+        # protein atom indices
         df = df.groupby(axis=1, level=["ligand", "protein", "interaction"]).agg(tuple)
-    if dtype:
-        df = df.astype(dtype)
-    if drop_empty:
-        if has_atom_indices and return_atoms:
-            mask = df.apply(lambda s: ~(s.isin([(None, None)]).all()), axis=0)
-        else:
+    else:
+        if dtype:
+            df = df.astype(dtype)
+        if drop_empty:
             mask = (df != empty_value).any(axis=0)
-        df = df.loc[:, mask]
+            df = df.loc[:, mask]
     return df