Skip to content

Commit

Permalink
Merge branch 'main' into smart_sampling
Browse files Browse the repository at this point in the history
  • Loading branch information
fritshermans committed Jan 2, 2022
2 parents 4922488 + 21222f5 commit 74527a4
Showing 1 changed file with 4 additions and 10 deletions.
14 changes: 4 additions & 10 deletions deduplipy/blocking/blocking.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,19 +58,13 @@ def apply_rule(rule, x, j):

 for j, rule_spec in enumerate(self._rules_specs):
     col_name = rule_spec['col_name']
-    func_name = rule_spec['function_name']
     function = rule_spec['function']
-    df_training[f'{col_name}_1_{func_name}'] = df_training.apply(
+    col_1 = df_training.apply(
         lambda row: apply_rule(function, row[f'{col_name}_1'], j), axis=1)
-    df_training[f'{col_name}_2_{func_name}'] = df_training.apply(
+    col_2 = df_training.apply(
         lambda row: apply_rule(function, row[f'{col_name}_2'], j), axis=1)
-    df_training[f'{col_name}_{func_name}'] = df_training.apply(
-        lambda row: int(((row[f'{col_name}_1_{func_name}'] != None) &
-                         (row[f'{col_name}_2_{func_name}'] != None)) &
-                        (row[f'{col_name}_1_{func_name}'] == row[f'{col_name}_2_{func_name}'])),
-        axis=1)
-    self._rules_specs[j].update({'rule_set': set(
-        df_training[df_training[f'{col_name}_{func_name}'] == 1][f'{col_name}_{func_name}'].index.tolist())})
+    match_result = (((col_1 != None) & (col_2 != None)) & (col_1 == col_2)).astype(int)
+    self._rules_specs[j].update({'rule_set': set((match_result[match_result == 1]).index.tolist())})
 
 self.subsets = [x['rule_set'] for x in self._rules_specs]

Expand Down

0 comments on commit 74527a4

Please sign in to comment.