From e2497c19d144853e43175f62321925b9367e59bc Mon Sep 17 00:00:00 2001
From: ReneH <48010896+ReneHamburger1993@users.noreply.github.com>
Date: Wed, 5 Apr 2023 21:40:50 +0200
Subject: [PATCH] Rene hamburger1993 patch 1 (#113)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Fix #112

Drop columns that contain only None before grouping. This speeds the function up
by about 5x when called as `fp.to_dataframe(return_atoms=True, drop_empty=True)`.

---------

Co-authored-by: Cédric Bouysset <bouysset.cedric@gmail.com>
---
 CHANGELOG.md    |  7 ++++++-
 prolif/utils.py | 24 +++++++++++++++++-------
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5716596..5ee40d5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-## [1.1.0] - 2022-11-XX
+### Changed
+- Converting the IFP to a dataframe with atom indices has been optimized and now runs
+  about 5 times faster (Issue #112, PR #113 by @ReneHamburger1993).
+
+
+## [1.1.0] - 2022-11-18
 
 ### Added
 - `Fingerprint.run` now has a `converter_kwargs` parameter that can pass kwargs to the
diff --git a/prolif/utils.py b/prolif/utils.py
index 4f21fee..bf681b0 100644
--- a/prolif/utils.py
+++ b/prolif/utils.py
@@ -303,15 +303,25 @@ def to_dataframe(
         )
     df = pd.DataFrame(values, columns=columns, index=index)
     if has_atom_indices and return_atoms:
+        if drop_empty:
+            # remove empty columns before grouping
+            df = df[df.columns[~df.applymap(lambda x: x is None).all()]]
+            # ensure columns are still in pairs of two after groupby
+            assert (
+                df.groupby(axis=1, level=["ligand", "protein", "interaction"])
+                .apply(lambda g: len(g.columns))
+                .eq(2)
+                .all()
+            )
+        # aggregate each interaction for a pair of residues as a tuple of ligand and
+        # protein atom indices
         df = df.groupby(axis=1, level=["ligand", "protein", "interaction"]).agg(tuple)
-    if dtype:
-        df = df.astype(dtype)
-    if drop_empty:
-        if has_atom_indices and return_atoms:
-            mask = df.apply(lambda s: ~(s.isin([(None, None)]).all()), axis=0)
-        else:
+    else:
+        if dtype:
+            df = df.astype(dtype)
+        if drop_empty:
             mask = (df != empty_value).any(axis=0)
-        df = df.loc[:, mask]
+            df = df.loc[:, mask]
     return df