Skip to content

Commit

Permalink
replaces various pandas apply functions with list comprehension to speed up the code
Browse files Browse the repository at this point in the history
  • Loading branch information
fritshermans committed Jan 2, 2022
1 parent d90c78e commit e038eb3
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 8 deletions.
8 changes: 3 additions & 5 deletions deduplipy/blocking/blocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,8 @@ def apply_rule(rule, x, j):
for j, rule_spec in enumerate(self._rules_specs):
col_name = rule_spec['col_name']
function = rule_spec['function']
col_1 = df_training.apply(
lambda row: apply_rule(function, row[f'{col_name}_1'], j), axis=1)
col_2 = df_training.apply(
lambda row: apply_rule(function, row[f'{col_name}_2'], j), axis=1)
col_1 = pd.Series([apply_rule(function, x, j) for x in df_training[f'{col_name}_1']])
col_2 = pd.Series([apply_rule(function, x, j) for x in df_training[f'{col_name}_2']])
match_result = (((col_1 != None) & (col_2 != None)) & (col_1 == col_2)).astype(int)
self._rules_specs[j].update({'rule_set': set((match_result[match_result == 1]).index.tolist())})

Expand Down Expand Up @@ -95,7 +93,7 @@ def _fingerprint(self, X: pd.DataFrame) -> pd.DataFrame:
col_name = rule_selected['col_name']
func_name = rule_selected['function_name']
function = rule_selected['function']
df[f'{col_name}_{func_name}'] = df[col_name].apply(lambda x: function(x))
df[f'{col_name}_{func_name}'] = pd.Series([function(x) for x in df[col_name]])
df.loc[df[f'{col_name}_{func_name}'].notnull(), f'{col_name}_{func_name}'] = \
df[df[f'{col_name}_{func_name}'].notnull()][f'{col_name}_{func_name}'] + f":{j}"
df_melted = df.melt(id_vars=self.col_names + [ROW_ID], value_name='fingerprint').drop(columns=['variable'])
Expand Down
6 changes: 3 additions & 3 deletions deduplipy/deduplicator/deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,14 +110,14 @@ def _create_pairs_table(self, X: pd.DataFrame, n_samples: int) -> pd.DataFrame:
return pairs.drop_duplicates()

def _calculate_string_similarities(self, X: pd.DataFrame) -> pd.DataFrame:
X.reset_index(drop=True, inplace=True) # need to reset the index because of the list comprehension below
metrics_col_names = []
for field in self.field_info.keys():
for metric in self.field_info[field]:
metrics_col_name = f'{field}_{metric.__name__}'
X[metrics_col_name] = X.apply(lambda row: metric(row[f'{field}_1'], row[f'{field}_2']), axis=1)
X[metrics_col_name] = pd.Series([metric(x, y) for x, y in zip(X[f'{field}_1'], X[f'{field}_2'])])
metrics_col_names.append(metrics_col_name)
X['similarities'] = X.apply(lambda row: [row[metrics_col_name] for metrics_col_name in metrics_col_names],
axis=1)
X['similarities'] = X[metrics_col_names].values.tolist()
X.drop(columns=metrics_col_names, inplace=True)
return X

Expand Down

0 comments on commit e038eb3

Please sign in to comment.