Merge 33114d5 into 641dabd

chaoss · Nov 16, 2017 · 7e6ea25 · 7e6ea25
2 parents 641dabd + 33114d5
commit 7e6ea25
Show file tree

Hide file tree

Showing 13 changed files with 380 additions and 51 deletions.
diff --git a/sortinghat/cmd/load.py b/sortinghat/cmd/load.py
@@ -87,6 +87,8 @@ def __init__(self, **kwargs):
                            help="match and merge using this type of matching")
         group.add_argument('-n', '--match-new', dest='match_new', action='store_true',
                            help="match and merge only new unique identities")
+        group.add_argument('--no-strict-matching', dest='no_strict', action='store_true',
+                           help="do not rigorous check of values (i.e, well formed email addresses)")
         group.add_argument('-v', '--verbose', dest='verbose', action='store_true',
                            help="run verbose mode while matching and merging")
 
@@ -108,7 +110,10 @@ def description(self):
 
     @property
     def usage(self):
-        return "%(prog)s load [-v] [--reset] [--identities | --orgs] [-m matching] [-n] [--overwrite] [file]"
+        usg = "%(prog)s load"
+        usg += " [-v] [--reset] [--identities | --orgs]"
+        usg += " [-m matching] [-n] [--no-strict-matching] [--overwrite] [file]"
+        return usg
 
     def log(self, msg, debug=True):
         if debug:
@@ -143,20 +148,23 @@ def run(self, *args):
 
         if params.identities:
             self.import_blacklist(parser)
-            code = self.import_identities(parser, params.matching,
-                                          params.match_new,
-                                          params.reset,
-                                          params.verbose)
+            code = self.import_identities(parser,
+                                          matching=params.matching,
+                                          match_new=params.match_new,
+                                          no_strict_matching=params.no_strict,
+                                          reset=params.reset,
+                                          verbose=params.verbose)
         elif params.orgs:
             self.import_organizations(parser, params.overwrite)
             code = CMD_SUCCESS
         else:
             self.import_organizations(parser, params.overwrite)
             self.import_blacklist(parser)
-            code = self.import_identities(parser, params.matching,
-                                          params.match_new,
-                                          params.reset,
-                                          params.verbose)
+            code = self.import_identities(parser, matching=params.matching,
+                                          match_new=params.match_new,
+                                          no_strict_matching=params.no_strict,
+                                          reset=params.reset,
+                                          verbose=params.verbose)
 
         return code
 
@@ -221,6 +229,7 @@ def import_organizations(self, parser, overwrite=False):
                     self.warning(msg)
 
     def import_identities(self, parser, matching=None, match_new=False,
+                          no_strict_matching=False,
                           reset=False, verbose=False):
         """Import identities information on the registry.
 
@@ -231,23 +240,28 @@ def import_identities(self, parser, matching=None, match_new=False,
         the new one to insert using 'matching' method. If a match is found,
         that means both identities are likely the same. Therefore, both identities
         would be merged into one. The 'match_new' parameter can be set to match
-        and merge only new loaded identities.
+        and merge only new loaded identities. Rigorous validation of matching
+        values (i.e, well formed email addresses) will be disabled when
+        <no_strict_matching> is set to `True`.
 
         When `reset` is set, relationships and enrollments will be removed
         before loading any data.
 
         :param parser: sorting hat parser
         :param matching: type of matching used to merge existing identities
         :param match_new: match and merge only the new loaded identities
+        :param no_strict_matching: disable strict matching (i.e, well-formed email addresses)
         :param reset: remove relationships and enrollments before loading data
         :param verbose: run in verbose mode when matching is set
         """
         matcher = None
 
         if matching:
+            strict = not no_strict_matching
+
             try:
                 blacklist = api.blacklist(self.db)
-                matcher = create_identity_matcher(matching, blacklist)
+                matcher = create_identity_matcher(matching, blacklist, strict=strict)
             except MatcherNotSupportedError as e:
                 self.error(str(e))
                 return e.code

diff --git a/sortinghat/cmd/unify.py b/sortinghat/cmd/unify.py
@@ -54,6 +54,8 @@ def __init__(self, **kwargs):
                                  help="unify the unique identities from these sources only")
         self.parser.add_argument('--fast-matching', dest='fast_matching', action='store_true',
                                  help="run fast matching")
+        self.parser.add_argument('--no-strict-matching', dest='no_strict', action='store_true',
+                                 help="do not rigorous check of values (i.e, well formed email addresses)")
         self.parser.add_argument('-i', '--interactive', action='store_true',
                                  help="run interactive mode while unifying")
 
@@ -71,27 +73,34 @@ def description(self):
 
     @property
     def usage(self):
-        return """%(prog)s unify [--matching <matcher>] [--sources <srcs>] [--fast-matching] [--interactive]"""
+        usg = "%(prog)s unify"
+        usg += " [--matching <matcher>] [--sources <srcs>]"
+        usg += " [--fast-matching] [--no-strict-matching] [--interactive]"
+        return usg
 
     def run(self, *args):
         """Merge unique identities using a matching algorithm."""
 
         params = self.parser.parse_args(args)
 
         code = self.unify(params.matching, params.sources,
-                          params.fast_matching, params.interactive)
+                          params.fast_matching, params.no_strict,
+                          params.interactive)
 
         return code
 
     def unify(self, matching=None, sources=None,
-              fast_matching=False, interactive=False):
+              fast_matching=False, no_strict_matching=False,
+              interactive=False):
         """Merge unique identities using a matching algorithm.
 
         This method looks for sets of similar identities, merging those
         identities into one unique identity. To determine when two unique
         identities are likely the same, a matching algorithm will be given
         using the parameter <matching>. When this parameter is not given,
-        the default algorithm will be used.
+        the default algorithm will be used. Rigorous validation of matching
+        values (i.e, well formed email addresses) will be disabled when
+        <no_strict_matching> is set to `True`.
 
         When <fast_matching> is set, it runs a fast algorithm to find matches
         between identities. This mode will consume more resources (i.e,
@@ -109,16 +118,20 @@ def unify(self, matching=None, sources=None,
         :param matching: type of matching used to merge existing identities
         :param sources: unify the unique identities from these sources only
         :param fast_matching: use the fast mode
+        :param no_strict_matching: disable strict matching (i.e, well-formed email addresses)
         :param interactive: interactive mode for merging identities
         """
         matcher = None
 
         if not matching:
             matching = 'default'
 
+        strict = not no_strict_matching
+
         try:
             blacklist = api.blacklist(self.db)
-            matcher = create_identity_matcher(matching, blacklist, sources)
+            matcher = create_identity_matcher(matching, blacklist,
+                                              sources, strict)
         except MatcherNotSupportedError as e:
             self.error(str(e))
             return e.code
@@ -152,6 +165,9 @@ def __merge(self, matched, interactive):
         """Merge a lists of matched unique identities"""
 
         for m in matched:
+
+
+
             u = m[0]
 
             for c in m[1:]:

diff --git a/sortinghat/matcher.py b/sortinghat/matcher.py
@@ -33,9 +33,10 @@ class IdentityMatcher(object):
 
        - 'blacklist' : list of entries to ignore during the matching process
        - 'sources' : only match the identities from these sources
+       - 'strict' : strict matching (i.e, well-formed email addresses);
+          `True` by default
     """
     def __init__(self, **kwargs):
-
         self._kwargs = kwargs
         blacklist = self._kwargs.get('blacklist', None)
         sources = self._kwargs.get('sources', None)
@@ -53,6 +54,8 @@ def __init__(self, **kwargs):
         else:
             self.sources = None
 
+        self.strict = self._kwargs.get('strict', True)
+
     def match(self, a, b):
         """Abstract method used to determine if both unique identities are the same.
 
@@ -117,7 +120,8 @@ def to_dict(self):
                }
 
 
-def create_identity_matcher(matcher='default', blacklist=None, sources=None):
+def create_identity_matcher(matcher='default', blacklist=None, sources=None,
+                            strict=True):
     """Create an identity matcher of the given type.
 
     Factory function that creates an identity matcher object of the type
@@ -127,6 +131,7 @@ def create_identity_matcher(matcher='default', blacklist=None, sources=None):
     :param matcher: type of the matcher
     :param blacklist: list of entries to ignore while matching
     :param sources: only match the identities from these sources
+    :param strict: strict matching (i.e, well-formed email addresses)
 
     :returns: a identity matcher object of the given type
 
@@ -140,7 +145,7 @@ def create_identity_matcher(matcher='default', blacklist=None, sources=None):
 
     klass = matching.SORTINGHAT_IDENTITIES_MATCHERS[matcher]
 
-    return klass(blacklist=blacklist, sources=sources)
+    return klass(blacklist=blacklist, sources=sources, strict=strict)
 
 
 def match(uidentities, matcher, fastmode=False):
@@ -306,7 +311,12 @@ def _build_matches(matches, uuids, no_filtered, fastmode=False):
     result += no_filtered
     result.sort(key=len, reverse=True)
 
-    return result
+    sresult = []
+    for r in result:
+        r.sort(key=lambda id_: id_.uuid)
+        sresult.append(r)
+
+    return sresult
 
 
 def _calculate_matches_closures(groups):

diff --git a/sortinghat/matching/email.py b/sortinghat/matching/email.py
@@ -51,15 +51,18 @@ class EmailMatcher(IdentityMatcher):
     Simple unique identities matcher.
 
     This matcher only produces a positive result when two identities
-    from each unique identity share the same email address. It also
-    returns a positive match when the uuid on both unique identities is equal.
+    from each unique identity share the same email address. When `strict`
+    is set, the email must be well-formed. It also returns a positive
+    match when the uuid on both unique identities is equal.
 
     :param blacklist: list of entries to ignore during the matching process
     :param sources: only match the identities from these sources
+    :param strict: strict matching with well-formed email addresses
     """
-    def __init__(self, blacklist=None, sources=None):
+    def __init__(self, blacklist=None, sources=None, strict=True):
         super(EmailMatcher, self).__init__(blacklist=blacklist,
-                                           sources=sources)
+                                           sources=sources,
+                                           strict=strict)
         self.email_pattern = re.compile(EMAIL_ADDRESS_REGEX)
 
     def match(self, a, b):
@@ -157,8 +160,11 @@ def filter(self, u):
             if self.sources and id_.source.lower() not in self.sources:
                 continue
 
-            if self._check_email(id_.email):
-                email = id_.email.lower()
+            if self.strict:
+                if self._check_email(id_.email):
+                    email = id_.email.lower()
+            else:
+                email = id_.email.lower() if id_.email else None
 
             if email:
                 fid = EmailIdentity(id_.id, id_.uuid, email)

diff --git a/sortinghat/matching/email_name.py b/sortinghat/matching/email_name.py
@@ -60,14 +60,17 @@ class EmailNameMatcher(IdentityMatcher):
        - identities share the same email address
        - name field is composed by "firstname lastname" and both are
          equal; i.e: "John Smith" and "J Smith Rae" are valid name fields;
-         "jonhsmith" are "j.smith" not valid
+         "jonhsmith" and "j.smith" are not valid. This rigorous validation is
+         only done when `strict` mode is set to `True`.
 
     :param blacklist: list of entries to ignore during the matching process
     :param sources: only match the identities from these sources
+    :param strict: strict matching with well-formed email addresses and names
     """
-    def __init__(self, blacklist=None, sources=None):
+    def __init__(self, blacklist=None, sources=None, strict=True):
         super(EmailNameMatcher, self).__init__(blacklist=blacklist,
-                                               sources=sources)
+                                               sources=sources,
+                                               strict=strict)
         self.email_pattern = re.compile(EMAIL_ADDRESS_REGEX)
         self.name_pattern = re.compile(NAME_REGEX)
 
@@ -174,10 +177,14 @@ def filter(self, u):
             if self._check_blacklist(id_):
                 continue
 
-            if self._check_pattern(self.email_pattern, id_.email):
-                email = id_.email.lower()
-            if self._check_pattern(self.name_pattern, id_.name):
-                name = id_.name.lower()
+            if self.strict:
+                if self._check_pattern(self.email_pattern, id_.email):
+                    email = id_.email.lower()
+                if self._check_pattern(self.name_pattern, id_.name):
+                    name = id_.name.lower()
+            else:
+                email = id_.email.lower() if id_.email else None
+                name = id_.name.lower() if id_.name else None
 
             if email or name:
                 fid = EmailNameIdentity(id_.id, id_.uuid,

diff --git a/sortinghat/matching/github.py b/sortinghat/matching/github.py
@@ -57,10 +57,12 @@ class GitHubMatcher(IdentityMatcher):
 
     :param blacklist: list of entries to ignore during the matching process
     :param sources: only match the identities from these sources
+    :param strict: not used by this matcher
     """
-    def __init__(self, blacklist=None, sources=None):
+    def __init__(self, blacklist=None, sources=None, strict=True):
         super(GitHubMatcher, self).__init__(blacklist=blacklist,
-                                            sources=sources)
+                                            sources=sources,
+                                            strict=strict)
 
     def match(self, a, b):
         """Determine if two unique identities are the same.

diff --git a/sortinghat/matching/username.py b/sortinghat/matching/username.py
@@ -54,10 +54,12 @@ class UsernameMatcher(IdentityMatcher):
 
     :param blacklist: list of entries to ignore during the matching process
     :param sources: only match the identities from these sources
+    :param strict: not used by this matcher
     """
-    def __init__(self, blacklist=None, sources=None):
+    def __init__(self, blacklist=None, sources=None, strict=True):
         super(UsernameMatcher, self).__init__(blacklist=blacklist,
-                                              sources=sources)
+                                              sources=sources,
+                                              strict=strict)
 
     def match(self, a, b):
         """Determine if two unique identities are the same.

diff --git a/tests/data/sortinghat_no_strict_valid.json b/tests/data/sortinghat_no_strict_valid.json
@@ -0,0 +1,32 @@
+{
+    "source": null,
+    "time": "2017-11-16 17:13:00",
+    "blacklist": [
+    ],
+    "organizations": {
+    },
+    "uidentities": {
+        "e8284285566fdc1f41c8a22bb84a295fc3c4cbb3": {
+            "enrollments": [
+            ],
+            "identities": [
+                {
+                    "email": "jsmith@example",
+                    "id": "e8284285566fdc1f41c8a22bb84a295fc3c4cbb3",
+                    "name": null,
+                    "source": "scm",
+                    "username": null,
+                    "uuid": "e8284285566fdc1f41c8a22bb84a295fc3c4cbb3"
+                }
+            ],
+            "profile": {
+                "country": null,
+                "email": "jsmith@example.com",
+                "name": null,
+                "is_bot": true,
+                "uuid": "e8284285566fdc1f41c8a22bb84a295fc3c4cbb3"
+            },
+            "uuid": "e8284285566fdc1f41c8a22bb84a295fc3c4cbb3"
+        }
+    }
+}