Skip to content

Commit

Permalink
[model] Force 'utf8_unicode_ci' collation on MySQL tables
Browse files Browse the repository at this point in the history
MySQL considers chars like 'β' and 'b', or 'ı' and 'i', to be the same
when certain collations are set (e.g. utf8_general_ci). This can
raise integrity errors when Sorting Hat tries to add similar
identities that differ only by these pairs of characters.

For instance, if the identity:

    ('scm', 'βart', 'bart@example.com', 'bart')

is stored in the database, the insertion of:

    ('scm', 'bart', 'bart@example.com', 'bart')

will raise an error, even when these identities have different
UUIDs.

Forcing MySQL to use 'utf8_unicode_ci' fixes this error, allowing
both identities to be inserted.
  • Loading branch information
sduenas committed Apr 24, 2017
1 parent eedd68e commit 1ee8800
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 9 deletions.
2 changes: 1 addition & 1 deletion sortinghat/_version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Versions compliant with PEP 440 https://www.python.org/dev/peps/pep-0440
__version__ = "0.3.1.dev1"
__version__ = "0.3.1.dev2"
22 changes: 14 additions & 8 deletions sortinghat/db/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,12 @@
MIN_PERIOD_DATE = datetime.datetime(1900, 1, 1, 0, 0, 0)
MAX_PERIOD_DATE = datetime.datetime(2100, 1, 1, 0, 0, 0)

# Default charset and collation applied to the MySQL tables through
# '__table_args__'. The collation is forced to 'utf8_unicode_ci' because,
# with some other collations (e.g. utf8_general_ci), MySQL considers chars
# like 'β' and 'b' or 'ı' and 'i' the same, which raises integrity errors
# when similar values are inserted.
MYSQL_CHARSET = {
'mysql_charset': 'utf8',
'mysql_collate': 'utf8_unicode_ci'
}


ModelBase = declarative_base()

Expand All @@ -53,7 +59,7 @@ class Organization(ModelBase):
enrollments = association_proxy('enrollments', 'uidentities')

__table_args__ = (UniqueConstraint('name', name='_name_unique'),
{'mysql_charset': 'utf8'})
MYSQL_CHARSET)

def to_dict(self):
return {
Expand All @@ -78,7 +84,7 @@ class Domain(ModelBase):
lazy='joined')

__table_args__ = (UniqueConstraint('domain', name='_domain_unique'),
{'mysql_charset': 'utf8'})
MYSQL_CHARSET)

def to_dict(self):
return {
Expand All @@ -99,7 +105,7 @@ class Country(ModelBase):
alpha3 = Column(String(3), nullable=False)

__table_args__ = (UniqueConstraint('alpha3', name='_alpha_unique'),
{'mysql_charset': 'utf8'})
MYSQL_CHARSET)

def to_dict(self):
return {
Expand Down Expand Up @@ -128,7 +134,7 @@ class UniqueIdentity(ModelBase):
# Many-to-many association proxy
organizations = association_proxy('enrollments', 'organizations')

__table_args__ = ({'mysql_charset': 'utf8'})
__table_args__ = (MYSQL_CHARSET)

def __init__(self, uuid=None):
self.uuid = uuid
Expand Down Expand Up @@ -163,7 +169,7 @@ class Identity(ModelBase):

__table_args__ = (UniqueConstraint('name', 'email', 'username', 'source',
name='_identity_unique'),
{'mysql_charset': 'utf8'})
MYSQL_CHARSET)

def to_dict(self):
return {
Expand Down Expand Up @@ -191,7 +197,7 @@ class Profile(ModelBase):
country = relationship('Country', backref='profile_country',
lazy='joined')

__table_args__ = ({'mysql_charset': 'utf8'})
__table_args__ = (MYSQL_CHARSET)

def to_dict(self):
return {
Expand Down Expand Up @@ -235,7 +241,7 @@ class Enrollment(ModelBase):
__table_args__ = (UniqueConstraint('uuid', 'organization_id',
'start', 'end',
name='_period_unique'),
{'mysql_charset': 'utf8'})
MYSQL_CHARSET)

def to_dict(self):
return {
Expand All @@ -251,7 +257,7 @@ class MatchingBlacklist(ModelBase):

excluded = Column(String(128), primary_key=True)

__table_args__ = ({'mysql_charset': 'utf8'})
__table_args__ = (MYSQL_CHARSET)


class MappedTable(object):
Expand Down
14 changes: 14 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,20 @@ def test_unaccent_identities(self):
self.assertEqual(context.exception.uuid,
'a16659ea83d28c839ffae76ceebb3ca9fb8e8894')

def test_charset(self):
"""Check if it adds two identities whose names differ only by 'ı' and 'i'"""

# With the wrong collation the second identity could not be inserted:
# in MySQL, chars 'ı' and 'i' are considered the same when the
# collation is other than <charset>_unicode_ci
uuid1 = api.add_identity(self.db, 'scm', 'jsmith@example.com',
'John Smıth', 'jsmith')
uuid2 = api.add_identity(self.db, 'scm', 'jsmith@example.com',
'John Smith', 'jsmith')

# Each identity must get its own, distinct UUID
self.assertEqual(uuid1, 'cf79edf008b7b2960a0be3972b256c65af449dc1')
self.assertEqual(uuid2, 'a9b403e150dd4af8953a52a4bb841051e4b705d9')

def test_none_source(self):
"""Check whether new identities cannot be added when giving a None source"""

Expand Down
30 changes: 30 additions & 0 deletions tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,19 @@ def test_none_name_organizations(self):
self.session.add(org1)
self.session.commit()

def test_charset(self):
"""Check that organizations whose names differ only by 'ı' and 'i' can both be stored"""

# With the wrong collation the second name could not be inserted:
# in MySQL, chars 'ı' and 'i' are considered the same when the
# collation is other than <charset>_unicode_ci
org1 = Organization(name='ıCompany'.encode('utf-8'))
org2 = Organization(name='iCompany')

# The test passes when the commit does not raise an error
self.session.add(org1)
self.session.add(org2)
self.session.commit()

def test_to_dict(self):
"""Test output of to_dict() method"""

Expand Down Expand Up @@ -424,6 +437,23 @@ def test_unique_identities(self):

self.assertNotEqual(id1.id, id2.id)

def test_charset(self):
"""Check that identities whose names differ only by 'ı' and 'i' can both be stored"""

# With the wrong collation the second identity could not be inserted:
# in MySQL, chars 'ı' and 'i' are considered the same when the
# collation is other than <charset>_unicode_ci
id1 = Identity(id='A', name='John Smıth'.encode('utf-8'),
email='jsmith@example.com',
username='jsmith', source='scm')
id2 = Identity(id='B', name='John Smith',
email='jsmith@example.com',
username='jsmith', source='scm')

# The test passes when the commit does not raise an error
self.session.add(id1)
self.session.add(id2)
self.session.commit()

def test_to_dict(self):
"""Test output of to_dict() method"""

Expand Down

0 comments on commit 1ee8800

Please sign in to comment.