Skip to content

Commit

Permalink
Add support for offline readiness state (#5730)
Browse files Browse the repository at this point in the history
Unlike `not_ready`, the `offline` state will drop existing connections
and refuse to accept new ones.  This is useful whenever a switchover
in an HA or upgrade scenario requires the old instance to stop
serving.
  • Loading branch information
elprans committed Jul 3, 2023
1 parent a49668c commit 926f8fe
Show file tree
Hide file tree
Showing 8 changed files with 98 additions and 5 deletions.
2 changes: 1 addition & 1 deletion edb/api/errors.txt
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@
0x_08_00_00_00 AvailabilityError

0x_08_00_00_01 BackendUnavailableError #SHOULD_RETRY

0x_08_00_00_02 ServerOfflineError

####

Expand Down
5 changes: 5 additions & 0 deletions edb/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@
'AuthenticationError',
'AvailabilityError',
'BackendUnavailableError',
'ServerOfflineError',
'BackendError',
'UnsupportedBackendFeatureError',
'LogMessage',
Expand Down Expand Up @@ -416,6 +417,10 @@ class BackendUnavailableError(AvailabilityError):
_code = 0x_08_00_00_01


# Availability error (code 0x_08_00_00_02) raised by the
# DatabaseConnectionView capability check when `server.is_online()` is
# false.  The binary protocol handlers close the connection after writing
# this error to the client.
class ServerOfflineError(AvailabilityError):
_code = 0x_08_00_00_02


class BackendError(EdgeDBError):
_code = 0x_09_00_00_00

Expand Down
9 changes: 9 additions & 0 deletions edb/server/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,17 @@ class JOSEKeyMode(enum.StrEnum):
# Readiness states of the server.  The string values are what
# `reload_readiness_state` parses from the readiness state file via
# `ReadinessState(state)`; an unknown value raises ValueError there.
class ReadinessState(enum.StrEnum):

Default = "default"
"""Default state: serving normally"""

NotReady = "not_ready"
"""/server/status/ready returns an error, but clients can still connect."""

ReadOnly = "read_only"
"""Only read-only queries are allowed."""

Offline = "offline"
"""Any existing connections are gracefully terminated and no new
connections are allowed."""


class ServerAuthMethod(enum.StrEnum):
Expand Down
15 changes: 14 additions & 1 deletion edb/server/dbview/dbview.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1099,25 +1099,38 @@ cdef class DatabaseConnectionView:
error_constructor,
reason,
):
if not self.server.is_online():
readiness_reason = self.server.get_readiness_reason()
msg = "the server is going offline"
if readiness_reason:
msg = f"{msg}: {readiness_reason}"
raise errors.ServerOfflineError(msg)

if query_capabilities & ~self._capability_mask:
# _capability_mask is currently only used for system database
raise query_capabilities.make_error(
self._capability_mask,
errors.UnsupportedCapabilityError,
"system database is read-only",
)

if query_capabilities & ~allowed_capabilities:
raise query_capabilities.make_error(
allowed_capabilities,
error_constructor,
reason,
)

if self.server.is_readonly():
if query_capabilities & enums.Capability.WRITE:
readiness_reason = self.server.get_readiness_reason()
msg = "the server is currently in read-only mode"
if readiness_reason:
msg = f"{msg}: {readiness_reason}"
raise query_capabilities.make_error(
~enums.Capability.WRITE,
errors.DisabledCapabilityError,
"the server is currently in read-only mode",
msg,
)


Expand Down
6 changes: 6 additions & 0 deletions edb/server/protocol/binary.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1093,6 +1093,12 @@ cdef class EdgeConnection(frontend.FrontendConnection):
self.write_error(ex)
self.flush()

if isinstance(ex, errors.ServerOfflineError):
# This server is going into "offline" mode,
# close the connection.
self.close()
return

# The connection was aborted while we were
# interpreting the error (via compiler/errmech.py).
if self._con_status == EDGECON_BAD:
Expand Down
6 changes: 6 additions & 0 deletions edb/server/protocol/binary_v0.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,12 @@ cdef class EdgeConnectionBackwardsCompatible(EdgeConnection):
self.write_error(ex)
self.flush()

if isinstance(ex, errors.ServerOfflineError):
# This server is going into "offline" mode,
# close the connection.
self.close()
return

# The connection was aborted while we were
# interpreting the error (via compiler/errmech.py).
if self._con_status == EDGECON_BAD:
Expand Down
21 changes: 18 additions & 3 deletions edb/server/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ def __init__(
self._admin_ui = admin_ui

self._readiness = srvargs.ReadinessState.Default
self._readiness_reason = ""

# A set of databases that should not accept new connections.
self._block_new_connections: set[str] = set()
Expand Down Expand Up @@ -326,12 +327,21 @@ def in_test_mode(self):
def is_admin_ui_enabled(self):
return self._admin_ui

# Return True unless the readiness state is Offline.
# `reload_readiness_state` assigns this result to
# `_accepting_connections`, and the dbview capability check raises
# ServerOfflineError when it is False.
def is_online(self) -> bool:
return self._readiness is not srvargs.ReadinessState.Offline

def is_ready(self) -> bool:
return self._readiness is srvargs.ReadinessState.Default
return (
self._readiness is srvargs.ReadinessState.Default
or self._readiness is srvargs.ReadinessState.ReadOnly
)

# Return True only while the readiness state is exactly ReadOnly.
# The dbview capability check uses this to reject queries carrying the
# WRITE capability.
def is_readonly(self) -> bool:
return self._readiness is srvargs.ReadinessState.ReadOnly

# Return the reason string stored by `reload_readiness_state`: the text
# after the first ":" on the first line of the state file.  It is
# initialised to "" in `__init__`, and callers treat an empty string as
# "no reason given".
def get_readiness_reason(self) -> str:
return self._readiness_reason

def get_pg_dbname(self, dbname: str) -> str:
return self._cluster.get_db_name(dbname)

Expand Down Expand Up @@ -1960,13 +1970,16 @@ def reload_state_file(_file_modified, _event):
def reload_readiness_state(self, state_file):
try:
with open(state_file, 'rt') as rt:
state = rt.readline().strip()
line = rt.readline().strip()
try:
state, _, reason = line.partition(":")
self._readiness = srvargs.ReadinessState(state)
self._readiness_reason = reason
logger.info(
"readiness state file changed, "
"setting server readiness to %r",
"setting server readiness to %r%s",
state,
f" ({reason})" if reason else "",
)
except ValueError:
logger.warning(
Expand Down Expand Up @@ -1994,6 +2007,8 @@ def reload_readiness_state(self, state_file):
)
self._readiness = srvargs.ReadinessState.Default

self._accepting_connections = self.is_online()

def reload_tls(self, tls_cert_file, tls_key_file):
logger.info("loading TLS certificates")
tls_password_needed = False
Expand Down
39 changes: 39 additions & 0 deletions tests/test_server_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -839,6 +839,45 @@ async def test_server_ops_readonly(self):
rf.close()
os.unlink(rf_name)

# Exercise the "offline" readiness state end to end: start a server bound
# to a readiness state file, switch the file from "default" to "offline",
# check that queries on the existing connection start failing, then remove
# the file and check that queries succeed again.
async def test_server_ops_offline(self):
rf_no, rf_name = tempfile.mkstemp(text=True)
rf = open(rf_no, "wt")

try:
print("default", file=rf, flush=True)

async with tb.start_edgedb_server(
readiness_state_file=rf_name,
) as sd:
conn = await sd.connect()
await conn.execute("select 1")

# Go offline
# "offline" has the same length as "default", so rewriting from
# offset 0 leaves no stale bytes behind even without truncating.
rf.seek(0)
print("offline", file=rf, flush=True)
# Presumably gives the server time to notice the file change --
# TODO confirm this short delay is sufficient on slow machines.
await asyncio.sleep(0.01)

# Either the server reports an availability error or the
# connection has already been dropped; both are accepted.
with self.assertRaises(
(edgedb.AvailabilityError, edgedb.ClientConnectionError),
):
await conn.execute("select 1")

# Clear the offline state by removing the readiness state file
# (the queries below are expected to succeed again afterwards).
rf.close()
os.unlink(rf_name)
await asyncio.sleep(0.05)
# Retry while connection errors are still being raised.
async for tr in self.try_until_succeeds(
ignore=(errors.ClientConnectionError,),
):
async with tr:
await conn.execute("select 1")

await conn.aclose()
finally:
# The file still exists here only if the test failed before the
# unlink above; in that case `rf` has not been closed yet either.
if os.path.exists(rf_name):
rf.close()
os.unlink(rf_name)

async def test_server_ops_restore_with_schema_signal(self):
async def test(pgdata_path):
backend_dsn = f'postgres:///?user=postgres&host={pgdata_path}'
Expand Down

0 comments on commit 926f8fe

Please sign in to comment.