Skip to content

Commit

Permalink
Merge 6cd7b6f into 55c08a2
Browse files Browse the repository at this point in the history
  • Loading branch information
valeriocos committed Jun 22, 2018
2 parents 55c08a2 + 6cd7b6f commit 7cabeb2
Show file tree
Hide file tree
Showing 3 changed files with 345 additions and 14 deletions.
141 changes: 127 additions & 14 deletions sortinghat/cmd/unify.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,17 @@
#

import argparse
import json
import os

from .. import api
from ..command import Command, CMD_SUCCESS, HELP_LIST
from ..exceptions import MatcherNotSupportedError
from ..matcher import create_identity_matcher, match
from ..matching import SORTINGHAT_IDENTITIES_MATCHERS

RECOVERY_FILE_PATH = '~/.sortinghat.d/unify.log'


class Unify(Command):
"""Merge unique identities using a matching algorithm.
Expand Down Expand Up @@ -55,6 +59,8 @@ def __init__(self, **kwargs):
help="do not rigorous check of values (i.e, well formed email addresses)")
self.parser.add_argument('-i', '--interactive', action='store_true',
help="run interactive mode while unifying")
self.parser.add_argument('-r', '--recovery', dest='recovery', action='store_true',
help="Enable recovery mode")

# Exit early if help is requested
if 'cmd_args' in kwargs and [i for i in kwargs['cmd_args'] if i in HELP_LIST]:
Expand All @@ -63,6 +69,7 @@ def __init__(self, **kwargs):
self._set_database(**kwargs)
self.total = 0
self.matched = 0
self.recovery = False

@property
def description(self):
Expand All @@ -72,7 +79,7 @@ def description(self):
def usage(self):
usg = "%(prog)s unify"
usg += " [--matching <matcher>] [--sources <srcs>]"
usg += " [--fast-matching] [--no-strict-matching] [--interactive]"
usg += " [--fast-matching] [--no-strict-matching] [--interactive] [--recovery]"
return usg

def run(self, *args):
Expand All @@ -82,13 +89,13 @@ def run(self, *args):

code = self.unify(params.matching, params.sources,
params.fast_matching, params.no_strict,
params.interactive)
params.interactive, params.recovery)

return code

def unify(self, matching=None, sources=None,
fast_matching=False, no_strict_matching=False,
interactive=False):
interactive=False, recovery=False):
"""Merge unique identities using a matching algorithm.
This method looks for sets of similar identities, merging those
Expand Down Expand Up @@ -117,13 +124,16 @@ def unify(self, matching=None, sources=None,
:param fast_matching: use the fast mode
:param no_strict_matching: disable strict matching (i.e, well-formed email addresses)
:param interactive: interactive mode for merging identities
:param recovery: if enabled, the unify will read the matching identities stored in
recovery file (RECOVERY_FILE_PATH) and process them
"""
matcher = None

if not matching:
matching = 'default'

strict = not no_strict_matching
self.recovery = recovery

try:
blacklist = api.blacklist(self.db)
Expand Down Expand Up @@ -155,22 +165,40 @@ def __unify_unique_identities(self, uidentities, matcher,
self.total = len(uidentities)
self.matched = 0

matched = match(uidentities, matcher, fastmode=fast_matching)
if self.recovery and RecoveryFile.exists():
print("Loading matches from recovery file: %s" % RecoveryFile.location())
matched = RecoveryFile.load_matches()
else:
matched = match(uidentities, matcher, fastmode=fast_matching)
# convert the matched identities to a common JSON format to ease resuming operations
matched = RecoveryFile.jsonize(matched)

self.__merge(matched, interactive)

if self.recovery:
RecoveryFile.delete()

def __merge(self, matched, interactive):
"""Merge a lists of matched unique identities"""

for m in matched:
u = m[0]
identities = m['identities']
uuid = identities[0]

try:
for c in identities[1:]:
if self.__merge_unique_identities(c, uuid, interactive):
self.matched += 1

for c in m[1:]:
if self.__merge_unique_identities(c, u, interactive):
self.matched += 1
# Retrieve unique identity to show updated info
if interactive:
uuid = api.unique_identities(self.db, uuid=uuid)[0]
except Exception as e:
if self.recovery:
RecoveryFile.save_matches(matched)
raise e

# Retrieve unique identity to show updated info
if interactive:
u = api.unique_identities(self.db, uuid=u.uuid)[0]
m['processed'] = True

def __merge_unique_identities(self, from_uid, to_uid, interactive):
# By default, always merge
Expand All @@ -183,10 +211,10 @@ def __merge_unique_identities(self, from_uid, to_uid, interactive):
if not merge:
return False

api.merge_unique_identities(self.db, from_uid.uuid, to_uid.uuid)
api.merge_unique_identities(self.db, from_uid, to_uid)

self.display('merge.tmpl', from_uuid=from_uid.uuid,
to_uuid=to_uid.uuid)
self.display('merge.tmpl', from_uuid=from_uid,
to_uuid=to_uid)

return True

Expand All @@ -210,3 +238,88 @@ def __display_stats(self):
self.display('unify.tmpl', processed=self.total,
matched=self.matched,
unified=self.total - self.matched)


class RecoveryFile:
    """Operations on the unify recovery file.

    The recovery file stores, one JSON object per line, the identity
    matches of a failed `unify` execution so that a later run can resume
    where the previous one stopped. The class offers static helpers to
    check whether the file exists, load the pending matches, convert
    in-memory matches to the JSON format, save matches and delete the
    file.
    """
    @staticmethod
    def location():
        """Return the recovery file path, with `~` expanded."""

        return os.path.expanduser(RECOVERY_FILE_PATH)

    @staticmethod
    def exists():
        """Check whether a recovery file exists"""

        return os.path.exists(RecoveryFile.location())

    @staticmethod
    def load_matches():
        """Load matches of the previous failed execution from the recovery file.

        Matches already flagged as processed are skipped.

        :returns matches: a list of matches in JSON format
        """
        if not RecoveryFile.exists():
            return []

        matches = []
        with open(RecoveryFile.location(), 'r') as f:
            # One JSON object per line; iterate lazily instead of readlines()
            for line in f:
                match_obj = json.loads(line.strip("\n"))
                if match_obj['processed']:
                    continue

                matches.append(match_obj)

        return matches

    @staticmethod
    def jsonize(matched):
        """Convert matches to JSON format.

        Matches holding a single identity are dropped, since there is
        nothing to merge for them.

        :param matched: a list of matched identities
        :returns json_matches: a list of matches in JSON format
        """
        json_matches = []
        for m in matched:
            identities = [i.uuid for i in m]

            # A single identity has nothing to be merged with
            if len(identities) == 1:
                continue

            json_match = {
                'identities': identities,
                'processed': False
            }
            json_matches.append(json_match)

        return json_matches

    @staticmethod
    def save_matches(matches):
        """Save unprocessed matches of a failed execution to the log.

        :param matches: a list of matches in JSON format
        """
        # Bug fix: the original code tested the truthiness of the dirname
        # *string* (always true for this path), so the directory was never
        # created and open() failed when ~/.sortinghat.d did not exist.
        os.makedirs(os.path.dirname(RecoveryFile.location()), exist_ok=True)

        with open(RecoveryFile.location(), "w+") as f:
            pending = [m for m in matches if not m['processed']]
            for m in pending:
                f.write(json.dumps(m) + "\n")

    @staticmethod
    def delete():
        """Delete the recovery file, if any."""

        if RecoveryFile.exists():
            os.remove(RecoveryFile.location())
1 change: 1 addition & 0 deletions tests/data/unify_matches.log
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"identities": ["178315df7941fc76a6ffb06fd5b00f6932ad9c41", "880b3dfcb3a08712e5831bddc3dfe81fc5d7b331"], "processed": false}
Loading

0 comments on commit 7cabeb2

Please sign in to comment.