Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion data_diff/sqeleton/databases/mysql.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from ..abcs.database_types import (
Datetime,
Timestamp,
Expand All @@ -21,6 +22,8 @@
from .base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS, TIMESTAMP_PRECISION_POS, Mixin_Schema, Mixin_RandomSample
from ..queries.ast_classes import BinBoolOp

# Module-level logger, registered under the fixed name "mysql" (not __name__).
logger = logging.getLogger("mysql")


@import_helper("mysql")
def import_mysql():
Expand Down Expand Up @@ -127,17 +130,31 @@ def __init__(self, *, thread_count, **kw):
self._args = kw

super().__init__(thread_count=thread_count)
self.check_charset()

# In MySQL schema and database are synonymous
try:
self.default_schema = kw["database"]
except KeyError:
raise ValueError("MySQL URL must specify a database")

def check_charset(self) -> None:
    """Log the connection's client character set and collation at DEBUG level.

    This is a best-effort diagnostic. The two ``SELECT`` queries are only
    issued when the module logger would actually emit DEBUG records, and any
    exception raised while querying is reported as a warning instead of being
    propagated to the caller.
    """
    # Ask the module logger for its effective level instead of comparing the
    # root logger's own level for equality with DEBUG: that comparison misses
    # per-logger configuration and any root level lower than DEBUG.
    if not logger.isEnabledFor(logging.DEBUG):
        return

    try:
        char_set_result = self.query("SELECT @@character_set_client;")
        logger.debug("charset: %s", char_set_result.rows)
        collation_result = self.query("SELECT @@collation_connection;")
        logger.debug("collation: %s", collation_result.rows)
    except Exception as ex:
        # Deliberately broad: a failed diagnostic must never break construction.
        logger.warning("Failed to check the charset: %s", ex)

def create_connection(self):
mysql = import_mysql()
try:
return mysql.connect(charset="utf8", use_unicode=True, **self._args)
conn = mysql.connect(charset="utf8mb4", use_unicode=True, **self._args)
conn.set_charset_collation(charset="utf8mb4", collation="utf8mb4_0900_ai_ci")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This collation was added in MySQL 8.0.1, released sometime around 2018. We might expect much older versions for the typical migration/replication use cases for which data-diff is used (usually for the purpose of leaving those older versions). Should we make sure that the absence of this collation does not raise an error a few lines below?

And a second question: "ci" means "case insensitive". Does it make bisection by textual PKs, as well as where-filtering, case-insensitive by default, which might come as a surprise to users?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nolar I think I need to look into it more deeply, but from what I can tell, all of the default collations are _ci

return conn

except mysql.Error as e:
if e.errno == mysql.errorcode.ER_ACCESS_DENIED_ERROR:
raise ConnectError("Bad user name or password") from e
Expand Down