replaces various pandas apply functions with list comprehension to speed up the code
fritshermans · Jan 2, 2022 · e038eb3 · e038eb3
1 parent d90c78e
commit e038eb3
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 8 deletions.
diff --git a/deduplipy/blocking/blocking.py b/deduplipy/blocking/blocking.py
@@ -59,10 +59,8 @@ def apply_rule(rule, x, j):
         for j, rule_spec in enumerate(self._rules_specs):
             col_name = rule_spec['col_name']
             function = rule_spec['function']
-            col_1 = df_training.apply(
-                lambda row: apply_rule(function, row[f'{col_name}_1'], j), axis=1)
-            col_2 = df_training.apply(
-                lambda row: apply_rule(function, row[f'{col_name}_2'], j), axis=1)
+            col_1 = pd.Series([apply_rule(function, x, j) for x in df_training[f'{col_name}_1']])
+            col_2 = pd.Series([apply_rule(function, x, j) for x in df_training[f'{col_name}_2']])
             match_result = (((col_1 != None) & (col_2 != None)) & (col_1 == col_2)).astype(int)
             self._rules_specs[j].update({'rule_set': set((match_result[match_result == 1]).index.tolist())})
 
@@ -95,7 +93,7 @@ def _fingerprint(self, X: pd.DataFrame) -> pd.DataFrame:
             col_name = rule_selected['col_name']
             func_name = rule_selected['function_name']
             function = rule_selected['function']
-            df[f'{col_name}_{func_name}'] = df[col_name].apply(lambda x: function(x))
+            df[f'{col_name}_{func_name}'] = pd.Series([function(x) for x in df[col_name]])
             df.loc[df[f'{col_name}_{func_name}'].notnull(), f'{col_name}_{func_name}'] = \
                 df[df[f'{col_name}_{func_name}'].notnull()][f'{col_name}_{func_name}'] + f":{j}"
         df_melted = df.melt(id_vars=self.col_names + [ROW_ID], value_name='fingerprint').drop(columns=['variable'])

diff --git a/deduplipy/deduplicator/deduplicator.py b/deduplipy/deduplicator/deduplicator.py
@@ -110,14 +110,14 @@ def _create_pairs_table(self, X: pd.DataFrame, n_samples: int) -> pd.DataFrame:
         return pairs.drop_duplicates()
 
     def _calculate_string_similarities(self, X: pd.DataFrame) -> pd.DataFrame:
+        X.reset_index(drop=True, inplace=True)  # need to reset the index because of the list comprehension below
         metrics_col_names = []
         for field in self.field_info.keys():
             for metric in self.field_info[field]:
                 metrics_col_name = f'{field}_{metric.__name__}'
-                X[metrics_col_name] = X.apply(lambda row: metric(row[f'{field}_1'], row[f'{field}_2']), axis=1)
+                X[metrics_col_name] = pd.Series([metric(x, y) for x, y in zip(X[f'{field}_1'], X[f'{field}_2'])])
                 metrics_col_names.append(metrics_col_name)
-        X['similarities'] = X.apply(lambda row: [row[metrics_col_name] for metrics_col_name in metrics_col_names],
-                                    axis=1)
+        X['similarities'] = X[metrics_col_names].values.tolist()
         X.drop(columns=metrics_col_names, inplace=True)
         return X