Skip to content

Commit

Permalink
Merge 33114d5 into 641dabd
Browse files Browse the repository at this point in the history
  • Loading branch information
sduenas committed Nov 16, 2017
2 parents 641dabd + 33114d5 commit 7e6ea25
Show file tree
Hide file tree
Showing 13 changed files with 380 additions and 51 deletions.
36 changes: 25 additions & 11 deletions sortinghat/cmd/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ def __init__(self, **kwargs):
help="match and merge using this type of matching")
group.add_argument('-n', '--match-new', dest='match_new', action='store_true',
help="match and merge only new unique identities")
group.add_argument('--no-strict-matching', dest='no_strict', action='store_true',
help="do not rigorous check of values (i.e, well formed email addresses)")
group.add_argument('-v', '--verbose', dest='verbose', action='store_true',
help="run verbose mode while matching and merging")

Expand All @@ -108,7 +110,10 @@ def description(self):

@property
def usage(self):
return "%(prog)s load [-v] [--reset] [--identities | --orgs] [-m matching] [-n] [--overwrite] [file]"
usg = "%(prog)s load"
usg += " [-v] [--reset] [--identities | --orgs]"
usg += " [-m matching] [-n] [--no-strict-matching] [--overwrite] [file]"
return usg

def log(self, msg, debug=True):
if debug:
Expand Down Expand Up @@ -143,20 +148,23 @@ def run(self, *args):

if params.identities:
self.import_blacklist(parser)
code = self.import_identities(parser, params.matching,
params.match_new,
params.reset,
params.verbose)
code = self.import_identities(parser,
matching=params.matching,
match_new=params.match_new,
no_strict_matching=params.no_strict,
reset=params.reset,
verbose=params.verbose)
elif params.orgs:
self.import_organizations(parser, params.overwrite)
code = CMD_SUCCESS
else:
self.import_organizations(parser, params.overwrite)
self.import_blacklist(parser)
code = self.import_identities(parser, params.matching,
params.match_new,
params.reset,
params.verbose)
code = self.import_identities(parser, matching=params.matching,
match_new=params.match_new,
no_strict_matching=params.no_strict,
reset=params.reset,
verbose=params.verbose)

return code

Expand Down Expand Up @@ -221,6 +229,7 @@ def import_organizations(self, parser, overwrite=False):
self.warning(msg)

def import_identities(self, parser, matching=None, match_new=False,
no_strict_matching=False,
reset=False, verbose=False):
"""Import identities information on the registry.
Expand All @@ -231,23 +240,28 @@ def import_identities(self, parser, matching=None, match_new=False,
the new one to insert using 'matching' method. If a match is found,
that means both identities are likely the same. Therefore, both identities
would be merged into one. The 'match_new' parameter can be set to match
and merge only new loaded identities.
and merge only new loaded identities. Rigorous validation of matching
values (i.e, well formed email addresses) will be disabled when
<no_strict_matching> is set to `True`.
When `reset` is set, relationships and enrollments will be removed
before loading any data.
:param parser: sorting hat parser
:param matching: type of matching used to merge existing identities
:param match_new: match and merge only the new loaded identities
:param no_strict_matching: disable strict matching (i.e, well-formed email addresses)
:param reset: remove relationships and enrollments before loading data
:param verbose: run in verbose mode when matching is set
"""
matcher = None

if matching:
strict = not no_strict_matching

try:
blacklist = api.blacklist(self.db)
matcher = create_identity_matcher(matching, blacklist)
matcher = create_identity_matcher(matching, blacklist, strict=strict)
except MatcherNotSupportedError as e:
self.error(str(e))
return e.code
Expand Down
26 changes: 21 additions & 5 deletions sortinghat/cmd/unify.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ def __init__(self, **kwargs):
help="unify the unique identities from these sources only")
self.parser.add_argument('--fast-matching', dest='fast_matching', action='store_true',
help="run fast matching")
self.parser.add_argument('--no-strict-matching', dest='no_strict', action='store_true',
help="do not rigorous check of values (i.e, well formed email addresses)")
self.parser.add_argument('-i', '--interactive', action='store_true',
help="run interactive mode while unifying")

Expand All @@ -71,27 +73,34 @@ def description(self):

@property
def usage(self):
return """%(prog)s unify [--matching <matcher>] [--sources <srcs>] [--fast-matching] [--interactive]"""
usg = "%(prog)s unify"
usg += " [--matching <matcher>] [--sources <srcs>]"
usg += " [--fast-matching] [--no-strict-matching] [--interactive]"
return usg

def run(self, *args):
"""Merge unique identities using a matching algorithm."""

params = self.parser.parse_args(args)

code = self.unify(params.matching, params.sources,
params.fast_matching, params.interactive)
params.fast_matching, params.no_strict,
params.interactive)

return code

def unify(self, matching=None, sources=None,
fast_matching=False, interactive=False):
fast_matching=False, no_strict_matching=False,
interactive=False):
"""Merge unique identities using a matching algorithm.
This method looks for sets of similar identities, merging those
identities into one unique identity. To determine when two unique
identities are likely the same, a matching algorithm will be given
using the parameter <matching>. When this parameter is not given,
the default algorithm will be used.
the default algorithm will be used. Rigorous validation of matching
values (i.e, well formed email addresses) will be disabled when
<no_strict_matching> is set to `True`.
When <fast_matching> is set, it runs a fast algorithm to find matches
between identities. This mode will consume more resources (i.e,
Expand All @@ -109,16 +118,20 @@ def unify(self, matching=None, sources=None,
:param matching: type of matching used to merge existing identities
:param sources: unify the unique identities from these sources only
:param fast_matching: use the fast mode
:param no_strict_matching: disable strict matching (i.e, well-formed email addresses)
:param interactive: interactive mode for merging identities
"""
matcher = None

if not matching:
matching = 'default'

strict = not no_strict_matching

try:
blacklist = api.blacklist(self.db)
matcher = create_identity_matcher(matching, blacklist, sources)
matcher = create_identity_matcher(matching, blacklist,
sources, strict)
except MatcherNotSupportedError as e:
self.error(str(e))
return e.code
Expand Down Expand Up @@ -152,6 +165,9 @@ def __merge(self, matched, interactive):
"""Merge a list of matched unique identities"""

for m in matched:



u = m[0]

for c in m[1:]:
Expand Down
18 changes: 14 additions & 4 deletions sortinghat/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@ class IdentityMatcher(object):
- 'blacklist' : list of entries to ignore during the matching process
- 'sources' : only match the identities from these sources
- 'strict' : strict matching (i.e, well-formed email addresses);
`True` by default
"""
def __init__(self, **kwargs):

self._kwargs = kwargs
blacklist = self._kwargs.get('blacklist', None)
sources = self._kwargs.get('sources', None)
Expand All @@ -53,6 +54,8 @@ def __init__(self, **kwargs):
else:
self.sources = None

self.strict = self._kwargs.get('strict', True)

def match(self, a, b):
"""Abstract method used to determine if both unique identities are the same.
Expand Down Expand Up @@ -117,7 +120,8 @@ def to_dict(self):
}


def create_identity_matcher(matcher='default', blacklist=None, sources=None):
def create_identity_matcher(matcher='default', blacklist=None, sources=None,
strict=True):
"""Create an identity matcher of the given type.
Factory function that creates an identity matcher object of the type
Expand All @@ -127,6 +131,7 @@ def create_identity_matcher(matcher='default', blacklist=None, sources=None):
:param matcher: type of the matcher
:param blacklist: list of entries to ignore while matching
:param sources: only match the identities from these sources
:param strict: strict matching (i.e, well-formed email addresses)
:returns: a identity matcher object of the given type
Expand All @@ -140,7 +145,7 @@ def create_identity_matcher(matcher='default', blacklist=None, sources=None):

klass = matching.SORTINGHAT_IDENTITIES_MATCHERS[matcher]

return klass(blacklist=blacklist, sources=sources)
return klass(blacklist=blacklist, sources=sources, strict=strict)


def match(uidentities, matcher, fastmode=False):
Expand Down Expand Up @@ -306,7 +311,12 @@ def _build_matches(matches, uuids, no_filtered, fastmode=False):
result += no_filtered
result.sort(key=len, reverse=True)

return result
sresult = []
for r in result:
r.sort(key=lambda id_: id_.uuid)
sresult.append(r)

return sresult


def _calculate_matches_closures(groups):
Expand Down
18 changes: 12 additions & 6 deletions sortinghat/matching/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,18 @@ class EmailMatcher(IdentityMatcher):
Simple unique identities matcher.
This matcher only produces a positive result when two identities
from each unique identity share the same email address. It also
returns a positive match when the uuid on both unique identities is equal.
from each unique identity share the same email address. When `strict`
is set, the email must be well-formed. It also returns a positive
match when the uuid on both unique identities is equal.
:param blacklist: list of entries to ignore during the matching process
:param sources: only match the identities from these sources
:param strict: strict matching with well-formed email addresses
"""
def __init__(self, blacklist=None, sources=None):
def __init__(self, blacklist=None, sources=None, strict=True):
super(EmailMatcher, self).__init__(blacklist=blacklist,
sources=sources)
sources=sources,
strict=strict)
self.email_pattern = re.compile(EMAIL_ADDRESS_REGEX)

def match(self, a, b):
Expand Down Expand Up @@ -157,8 +160,11 @@ def filter(self, u):
if self.sources and id_.source.lower() not in self.sources:
continue

if self._check_email(id_.email):
email = id_.email.lower()
if self.strict:
if self._check_email(id_.email):
email = id_.email.lower()
else:
email = id_.email.lower() if id_.email else None

if email:
fid = EmailIdentity(id_.id, id_.uuid, email)
Expand Down
21 changes: 14 additions & 7 deletions sortinghat/matching/email_name.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,17 @@ class EmailNameMatcher(IdentityMatcher):
- identities share the same email address
- name field is composed by "firstname lastname" and both are
equal; i.e: "John Smith" and "J Smith Rae" are valid name fields;
"jonhsmith" are "j.smith" not valid
"jonhsmith" and "j.smith" are not valid. This rigorous validation is
only done when `strict` mode is set to `True`.
:param blacklist: list of entries to ignore during the matching process
:param sources: only match the identities from these sources
:param strict: strict matching with well-formed email addresses and names
"""
def __init__(self, blacklist=None, sources=None):
def __init__(self, blacklist=None, sources=None, strict=True):
super(EmailNameMatcher, self).__init__(blacklist=blacklist,
sources=sources)
sources=sources,
strict=strict)
self.email_pattern = re.compile(EMAIL_ADDRESS_REGEX)
self.name_pattern = re.compile(NAME_REGEX)

Expand Down Expand Up @@ -174,10 +177,14 @@ def filter(self, u):
if self._check_blacklist(id_):
continue

if self._check_pattern(self.email_pattern, id_.email):
email = id_.email.lower()
if self._check_pattern(self.name_pattern, id_.name):
name = id_.name.lower()
if self.strict:
if self._check_pattern(self.email_pattern, id_.email):
email = id_.email.lower()
if self._check_pattern(self.name_pattern, id_.name):
name = id_.name.lower()
else:
email = id_.email.lower() if id_.email else None
name = id_.name.lower() if id_.name else None

if email or name:
fid = EmailNameIdentity(id_.id, id_.uuid,
Expand Down
6 changes: 4 additions & 2 deletions sortinghat/matching/github.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,12 @@ class GitHubMatcher(IdentityMatcher):
:param blacklist: list of entries to ignore during the matching process
:param sources: only match the identities from these sources
:param strict: not used by this matcher
"""
def __init__(self, blacklist=None, sources=None):
def __init__(self, blacklist=None, sources=None, strict=True):
super(GitHubMatcher, self).__init__(blacklist=blacklist,
sources=sources)
sources=sources,
strict=strict)

def match(self, a, b):
"""Determine if two unique identities are the same.
Expand Down
6 changes: 4 additions & 2 deletions sortinghat/matching/username.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,12 @@ class UsernameMatcher(IdentityMatcher):
:param blacklist: list of entries to ignore during the matching process
:param sources: only match the identities from these sources
:param strict: not used by this matcher
"""
def __init__(self, blacklist=None, sources=None):
def __init__(self, blacklist=None, sources=None, strict=True):
super(UsernameMatcher, self).__init__(blacklist=blacklist,
sources=sources)
sources=sources,
strict=strict)

def match(self, a, b):
"""Determine if two unique identities are the same.
Expand Down
32 changes: 32 additions & 0 deletions tests/data/sortinghat_no_strict_valid.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"source": null,
"time": "2017-11-16 17:13:00",
"blacklist": [
],
"organizations": {
},
"uidentities": {
"e8284285566fdc1f41c8a22bb84a295fc3c4cbb3": {
"enrollments": [
],
"identities": [
{
"email": "jsmith@example",
"id": "e8284285566fdc1f41c8a22bb84a295fc3c4cbb3",
"name": null,
"source": "scm",
"username": null,
"uuid": "e8284285566fdc1f41c8a22bb84a295fc3c4cbb3"
}
],
"profile": {
"country": null,
"email": "jsmith@example.com",
"name": null,
"is_bot": true,
"uuid": "e8284285566fdc1f41c8a22bb84a295fc3c4cbb3"
},
"uuid": "e8284285566fdc1f41c8a22bb84a295fc3c4cbb3"
}
}
}
Loading

0 comments on commit 7e6ea25

Please sign in to comment.