OPT: group datasets and perform patch.object for always_commit once per dataset

I bet it does not take much time to patch.object, but it still does take time,
so why waste it?
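
For a rough sense of the overhead in question, a standalone sketch that times one patch/unpatch cycle (Repo here is a hypothetical stand-in, not datalad's AnnexRepo; absolute numbers vary by machine):

import timeit
from unittest.mock import patch

class Repo:
    always_commit = True

repo = Repo()

def one_patch_cycle():
    # One context-manager enter/exit: set the attribute, then restore it.
    with patch.object(repo, "always_commit", False):
        pass

# Likely on the order of microseconds per cycle; multiplied by thousands
# of files per dataset it becomes measurable, hence patching once per
# dataset instead of once per file.
print(timeit.timeit(one_patch_cycle, number=100_000))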
yarikoptic committed Sep 3, 2020
1 parent cd315f9 commit 6c7b4a5
Showing 1 changed file with 16 additions and 8 deletions.
datalad/plugin/addurls.py (24 changes: 16 additions & 8 deletions)
@@ -556,16 +556,24 @@ def add_meta(rows):
     """
     from unittest.mock import patch
 
+    # OPT: group by dataset first so to not patch/unpatch always_commit
+    # per each file of which we could have thousands
+    from collections import defaultdict
+    dss_rows = defaultdict(list)
     for row in rows:
-        ds, filename = row["ds"], row["ds_filename"]
+        dss_rows[row["ds"]].append(row)
+
+    for ds, ds_rows in dss_rows.items():
         with patch.object(ds.repo, "always_commit", False):
-            lgr.debug("Adding metadata to %s in %s", filename, ds.path)
-            for a in ds.repo.set_metadata_(filename, add=row["meta_args"]):
-                res = annexjson2result(a, ds, type="file", logger=lgr)
-                # Don't show all added metadata for the file because that
-                # could quickly flood the output.
-                del res["message"]
-                yield res
+            for row in ds_rows:
+                filename = row["ds_filename"]
+                lgr.debug("Adding metadata to %s in %s", filename, ds.path)
+                for a in ds.repo.set_metadata_(filename, add=row["meta_args"]):
+                    res = annexjson2result(a, ds, type="file", logger=lgr)
+                    # Don't show all added metadata for the file because that
+                    # could quickly flood the output.
+                    del res["message"]
+                    yield res
 
 
 @build_doc

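Hoisting the with patch.object(...) block out of the per-row loop preserves behavior because the context manager sets the attribute on entry and restores the original on exit, so every row processed inside the block still sees always_commit as False. A minimal sketch of that restore semantics (again with a stand-in Repo class, not datalad code):

from unittest.mock import patch

class Repo:
    always_commit = True

repo = Repo()
with patch.object(repo, "always_commit", False):
    assert repo.always_commit is False  # patched for all work in the block
assert repo.always_commit is True       # original value restored on exit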