From 8edeb4d1ed648d6d53ac5f535983fb4e5065a85a Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 24 Oct 2025 12:40:46 +0500 Subject: [PATCH 1/2] Set transaction_per_migration=True --- src/dstack/_internal/server/migrations/env.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/dstack/_internal/server/migrations/env.py b/src/dstack/_internal/server/migrations/env.py index 4259ec9fa..0b2f73a19 100644 --- a/src/dstack/_internal/server/migrations/env.py +++ b/src/dstack/_internal/server/migrations/env.py @@ -36,7 +36,6 @@ def run_migrations_offline(): literal_binds=True, dialect_opts={"paramstyle": "named"}, ) - with context.begin_transaction(): context.run_migrations() @@ -61,12 +60,21 @@ def run_migrations(connection: Connection): # https://alembic.sqlalchemy.org/en/latest/batch.html#dealing-with-referencing-foreign-keys if connection.dialect.name == "sqlite": connection.execute(text("PRAGMA foreign_keys=OFF;")) + elif connection.dialect.name == "postgresql": + # lock_timeout is needed so that migrations that acquire locks + # do not wait for locks forever, blocking live queries. + # Better to fail and retry a deployment. + connection.execute(text("SET lock_timeout='10s';")) connection.commit() context.configure( connection=connection, target_metadata=target_metadata, compare_type=True, render_as_batch=True, + # Running each migration in a separate transaction. + # Running all migrations in one transaction may lead to deadlocks in HA deployments + # because lock ordering is not respected across all migrations. 
+ transaction_per_migration=True, ) with context.begin_transaction(): context.run_migrations() From d9a4c6fbdc6b6cd79b6289160471e77647f6ff5e Mon Sep 17 00:00:00 2001 From: Victor Skvortsov Date: Fri, 24 Oct 2025 14:24:53 +0500 Subject: [PATCH 2/2] Document Server upgrades --- docs/docs/guides/server-deployment.md | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/docs/docs/guides/server-deployment.md b/docs/docs/guides/server-deployment.md index 80f34a335..4976bf060 100644 --- a/docs/docs/guides/server-deployment.md +++ b/docs/docs/guides/server-deployment.md @@ -400,6 +400,28 @@ export DSTACK_DB_MAX_OVERFLOW=80 You have to ensure your Postgres installation supports that many connections by configuring [`max_connections`](https://www.postgresql.org/docs/current/runtime-config-connection.html#GUC-MAX-CONNECTIONS) and/or using connection pooler. +## Server upgrades + +When upgrading the `dstack` server, follow these guidelines to ensure a smooth transition and minimize downtime. + +### Before upgrading + +1. **Check the changelog**: Review the [release notes :material-arrow-top-right-thin:{ .external }](https://github.com/dstackai/dstack/releases){:target="_blank"} for breaking changes, new features, and migration notes. +2. **Review backward compatibility**: Understand the [backward compatibility](#backward-compatibility) policy. +3. **Back up your data**: Ensure you always create a backup before upgrading. + +### Best practices + +- **Test in staging**: Always test upgrades in a non-production environment first. +- **Monitor logs**: Watch server logs during and after the upgrade for any errors or warnings. +- **Keep backups**: Retain backups for at least a few days after a successful upgrade. + +### Troubleshooting + +**Deadlock when upgrading a multi-replica PostgreSQL deployment** + +If a deployment is stuck due to a deadlock when applying DB migrations, try scaling server replicas to 1 and retry the deployment multiple times. 
Some releases may not support rolling deployments; this is always noted in the release notes. If you think there is a bug, please [file an issue](https://github.com/dstackai/dstack/issues). + ## FAQs ??? info "Can I run multiple replicas of dstack server?"