From e2497c19d144853e43175f62321925b9367e59bc Mon Sep 17 00:00:00 2001 From: ReneH <48010896+ReneHamburger1993@users.noreply.github.com> Date: Wed, 5 Apr 2023 21:40:50 +0200 Subject: [PATCH] Rene hamburger1993 patch 1 (#113) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix #112 drop cols containing None before grouping. Speeds the function up: ~x5 when called as `fp.to_dataframe(return_atoms=True, drop_empty=True)` --------- Co-authored-by: Cédric Bouysset --- CHANGELOG.md | 7 ++++++- prolif/utils.py | 24 +++++++++++++++++------- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5716596..5ee40d5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## [1.1.0] - 2022-11-XX +### Changed +- Converting the IFP to a dataframe with atom indices has been optimized and now runs + about 5 times faster (Issue #112, PR #113 by @ReneHamburger1993). + + +## [1.1.0] - 2022-11-18 ### Added - `Fingerprint.run` now has a `converter_kwargs` parameter that can pass kwargs to the diff --git a/prolif/utils.py b/prolif/utils.py index 4f21fee..bf681b0 100644 --- a/prolif/utils.py +++ b/prolif/utils.py @@ -303,15 +303,25 @@ def to_dataframe( ) df = pd.DataFrame(values, columns=columns, index=index) if has_atom_indices and return_atoms: + if drop_empty: + # remove empty columns before grouping + df = df[df.columns[~df.applymap(lambda x: x is None).all()]] + # ensure columns are still in pairs of two after groupby + assert ( + df.groupby(axis=1, level=["ligand", "protein", "interaction"]) + .apply(lambda g: len(g.columns)) + .eq(2) + .all() + ) + # aggregate each interaction for a pair of residues as a tuple of ligand and + # protein atom indices df = df.groupby(axis=1, level=["ligand", "protein", "interaction"]).agg(tuple) - if dtype: - df = df.astype(dtype) - if drop_empty: - if has_atom_indices and return_atoms: - mask = df.apply(lambda s: ~(s.isin([(None, None)]).all()), axis=0) - else: + else: + if dtype: + df = df.astype(dtype) + if drop_empty: mask = (df != empty_value).any(axis=0) - df = df.loc[:, mask] + df = df.loc[:, mask] return df