Skip to content

Commit

Permalink
Change the order in which the locks are acquired (#7542)
Browse files Browse the repository at this point in the history
This PR changes the order in which the locks are acquired (for the
target and reference tables), when a modify request is initiated from a
worker node that is not the "FirstWorkerNode".

To prevent concurrent writes, locks are acquired on the first worker
node for the replicated tables. When the update statement originates
from the first worker node, it acquires the lock on the reference
table(s) first, followed by the target table(s). However, if the update
statement is initiated in another worker node, the lock requests are
sent to the first worker in a different order. This PR unifies the
modification order on the first worker node. With the third commit,
independent of the node that received the request, the locks are
acquired for the modified table and then the reference tables on the
first node.

The first commit shows a sample output for the test prior to the fix.

Fixes #7477

---------

Co-authored-by: Jelte Fennema-Nio <jelte.fennema@microsoft.com>
(cherry picked from commit 8afa2d0)
  • Loading branch information
eaydingol authored and JelteF committed Apr 16, 2024
1 parent 0b2e0cd commit 0ac208b
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 8 deletions.
26 changes: 19 additions & 7 deletions src/backend/distributed/utils/resource_lock.c
Original file line number Diff line number Diff line change
Expand Up @@ -707,13 +707,27 @@ SerializeNonCommutativeWrites(List *shardIntervalList, LOCKMODE lockMode)
}

List *replicatedShardList = NIL;
if (AnyTableReplicated(shardIntervalList, &replicatedShardList))
bool anyTableReplicated = AnyTableReplicated(shardIntervalList, &replicatedShardList);

/*
* Acquire locks on the modified table.
* If the table is replicated, the locks are first acquired on the first worker node then locally.
* But if we're already on the first worker, acquiring on the first worker node and locally are the same operation.
* So we only acquire locally in that case.
*/
if (anyTableReplicated && ClusterHasKnownMetadataWorkers() && !IsFirstWorkerNode())
{
if (ClusterHasKnownMetadataWorkers() && !IsFirstWorkerNode())
{
LockShardListResourcesOnFirstWorker(lockMode, replicatedShardList);
}
LockShardListResourcesOnFirstWorker(lockMode, replicatedShardList);
}
LockShardListResources(shardIntervalList, lockMode);

/*
* Next, acquire locks on the reference tables that are referenced by a foreign key if there are any.
* Note that LockReferencedReferenceShardResources() first acquires locks on the first worker,
* then locally.
*/
if (anyTableReplicated)
{
ShardInterval *firstShardInterval =
(ShardInterval *) linitial(replicatedShardList);
if (ReferenceTableShardId(firstShardInterval->shardId))
Expand All @@ -728,8 +742,6 @@ SerializeNonCommutativeWrites(List *shardIntervalList, LOCKMODE lockMode)
LockReferencedReferenceShardResources(firstShardInterval->shardId, lockMode);
}
}

LockShardListResources(shardIntervalList, lockMode);
}


Expand Down
62 changes: 62 additions & 0 deletions src/test/regress/expected/issue_7477.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
--- Test for updating a table that has a foreign key reference to another reference table.
--- Issue #7477: Distributed deadlock after issuing a simple UPDATE statement
--- https://github.com/citusdata/citus/issues/7477
CREATE TABLE table1 (id INT PRIMARY KEY);
SELECT create_reference_table('table1');
create_reference_table
---------------------------------------------------------------------

(1 row)

INSERT INTO table1 VALUES (1);
CREATE TABLE table2 (
id INT,
info TEXT,
CONSTRAINT table1_id_fk FOREIGN KEY (id) REFERENCES table1 (id)
);
SELECT create_reference_table('table2');
create_reference_table
---------------------------------------------------------------------

(1 row)

INSERT INTO table2 VALUES (1, 'test');
--- Runs the update command in parallel on workers.
--- Due to bug #7477, before the fix, the result is non-deterministic
--- and have several rows of the form:
--- localhost | 57638 | f | ERROR: deadlock detected
--- localhost | 57637 | f | ERROR: deadlock detected
--- localhost | 57637 | f | ERROR: canceling the transaction since it was involved in a distributed deadlock
SELECT * FROM master_run_on_worker(
ARRAY['localhost', 'localhost','localhost', 'localhost','localhost',
'localhost','localhost', 'localhost','localhost', 'localhost']::text[],
ARRAY[57638, 57637, 57637, 57638, 57637, 57638, 57637, 57638, 57638, 57637]::int[],
ARRAY['UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1'
]::text[],
true);
node_name | node_port | success | result
---------------------------------------------------------------------
localhost | 57638 | t | UPDATE 1
localhost | 57637 | t | UPDATE 1
localhost | 57637 | t | UPDATE 1
localhost | 57638 | t | UPDATE 1
localhost | 57637 | t | UPDATE 1
localhost | 57638 | t | UPDATE 1
localhost | 57637 | t | UPDATE 1
localhost | 57638 | t | UPDATE 1
localhost | 57638 | t | UPDATE 1
localhost | 57637 | t | UPDATE 1
(10 rows)

--- cleanup
DROP TABLE table2;
DROP TABLE table1;
2 changes: 1 addition & 1 deletion src/test/regress/multi_schedule
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ test: multi_dropped_column_aliases foreign_key_restriction_enforcement
test: binary_protocol
test: alter_table_set_access_method
test: alter_distributed_table
test: issue_5248 issue_5099 issue_5763 issue_6543 issue_6758
test: issue_5248 issue_5099 issue_5763 issue_6543 issue_6758 issue_7477
test: object_propagation_debug
test: undistribute_table
test: run_command_on_all_nodes
Expand Down
44 changes: 44 additions & 0 deletions src/test/regress/sql/issue_7477.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@

--- Test for updating a table that has a foreign key reference to another reference table.
--- Issue #7477: Distributed deadlock after issuing a simple UPDATE statement
--- https://github.com/citusdata/citus/issues/7477

CREATE TABLE table1 (id INT PRIMARY KEY);
SELECT create_reference_table('table1');
INSERT INTO table1 VALUES (1);

CREATE TABLE table2 (
id INT,
info TEXT,
CONSTRAINT table1_id_fk FOREIGN KEY (id) REFERENCES table1 (id)
);
SELECT create_reference_table('table2');
INSERT INTO table2 VALUES (1, 'test');

--- Runs the update command in parallel on workers.
--- Due to bug #7477, before the fix, the result is non-deterministic
--- and have several rows of the form:
--- localhost | 57638 | f | ERROR: deadlock detected
--- localhost | 57637 | f | ERROR: deadlock detected
--- localhost | 57637 | f | ERROR: canceling the transaction since it was involved in a distributed deadlock

SELECT * FROM master_run_on_worker(
ARRAY['localhost', 'localhost','localhost', 'localhost','localhost',
'localhost','localhost', 'localhost','localhost', 'localhost']::text[],
ARRAY[57638, 57637, 57637, 57638, 57637, 57638, 57637, 57638, 57638, 57637]::int[],
ARRAY['UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1',
'UPDATE table2 SET info = ''test_update'' WHERE id = 1'
]::text[],
true);

--- cleanup
DROP TABLE table2;
DROP TABLE table1;

0 comments on commit 0ac208b

Please sign in to comment.