From cc83515d0c837c8b1596a97b6f09d519a0f75f72 Mon Sep 17 00:00:00 2001 From: Sri Harsha CH <57220027+harshachinta@users.noreply.github.com> Date: Wed, 8 Nov 2023 10:43:29 +0530 Subject: [PATCH] feat(spanner): update code for session leaks cleanup (#8978) * feat(spanner): fix nil pointer and move to a function * feat(spanner): remove mutex * feat(spanner): move variable parameter to sessionpool struct * feat(spanner): fix test flakiness by force running clean up task * feat(spanner): fix flakiness --- spanner/client.go | 11 +++------- spanner/client_test.go | 22 +++++++++----------- spanner/pdml.go | 8 +++---- spanner/session.go | 16 +++++++------- spanner/session_test.go | 16 ++------------ spanner/transaction.go | 46 +++++++++++++++++++++++++++-------------- 6 files changed, 58 insertions(+), 61 deletions(-) diff --git a/spanner/client.go b/spanner/client.go index 1c81ccefba9..52cc320a74f 100644 --- a/spanner/client.go +++ b/spanner/client.go @@ -569,18 +569,13 @@ func (c *Client) rwTransaction(ctx context.Context, f func(context.Context, *Rea if sh == nil || sh.getID() == "" || sh.getClient() == nil { // Session handle hasn't been allocated or has been destroyed. sh, err = c.idleSessions.take(ctx) - if t != nil { - // Some operations (for ex BatchUpdate) can be long-running. For such operations set the isLongRunningTransaction flag to be true - sh.mu.Lock() - t.mu.Lock() - sh.eligibleForLongRunning = t.isLongRunningTransaction - t.mu.Unlock() - sh.mu.Unlock() - } if err != nil { // If session retrieval fails, just fail the transaction. return err } + + // Some operations (for ex BatchUpdate) can be long-running. For such operations set the isLongRunningTransaction flag to be true + t.setSessionEligibilityForLongRunning(sh) } if t.shouldExplicitBegin(attempt) { // Make sure we set the current session handle before calling BeginTransaction. diff --git a/spanner/client_test.go b/spanner/client_test.go index b0bd2bd2caf..f35b8ff715b 100644 --- a/spanner/client_test.go +++ b/spanner/client_test.go @@ -269,8 +269,6 @@ func TestClient_Single_WhenInactiveTransactionsAndSessionIsNotFoundOnBackend_Rem if g, w := sh.eligibleForLongRunning, false; g != w { t.Fatalf("isLongRunningTransaction mismatch\nGot: %v\nWant: %v\n", g, w) } - p.InactiveTransactionRemovalOptions.mu.Lock() - defer p.InactiveTransactionRemovalOptions.mu.Unlock() if g, w := p.numOfLeakedSessionsRemoved, uint64(1); g != w { t.Fatalf("Number of leaked sessions removed mismatch\nGot: %d\nWant: %d\n", g, w) } @@ -1563,14 +1561,14 @@ func TestClient_ReadWriteTransaction_WhenMultipleOperations_SessionLastUseTimeSh MaxOpened: 1, InactiveTransactionRemovalOptions: InactiveTransactionRemovalOptions{ actionOnInactiveTransaction: WarnAndClose, - idleTimeThreshold: 30 * time.Millisecond, + idleTimeThreshold: 300 * time.Millisecond, }, }, }) defer teardown() server.TestSpanner.PutExecutionTime(MethodExecuteSql, SimulatedExecutionTime{ - MinimumExecutionTime: 20 * time.Millisecond, + MinimumExecutionTime: 200 * time.Millisecond, }) ctx := context.Background() p := client.idleSessions @@ -1603,11 +1601,11 @@ func TestClient_ReadWriteTransaction_WhenMultipleOperations_SessionLastUseTimeSh t.Fatalf("Session lastUseTime times should not be equal") } - if (time.Now().Sub(sessionPrevLastUseTime)).Milliseconds() < 40 { - t.Fatalf("Expected session to be checkedout for more than 40 milliseconds") + if (time.Now().Sub(sessionPrevLastUseTime)).Milliseconds() < 400 { + t.Fatalf("Expected session to be checkedout for more than 400 milliseconds") } - if (time.Now().Sub(sessionCheckoutTime)).Milliseconds() < 40 { - t.Fatalf("Expected session to be checkedout for more than 40 milliseconds") + if (time.Now().Sub(sessionCheckoutTime)).Milliseconds() < 400 { + t.Fatalf("Expected session to be checkedout for more than 400 milliseconds") } // force run task to clean up unexpected long-running sessions whose lastUseTime >= 3sec. // The session should not be cleaned since the latest operation on the transaction has updated the lastUseTime. @@ -1707,8 +1705,8 @@ func TestClient_ReadWriteTransaction_WhenLongRunningExecuteBatchUpdate_TakeNoAct if g, w := attempts, 2; g != w { t.Fatalf("number of attempts mismatch:\nGot%d\nWant:%d", g, w) } - p.InactiveTransactionRemovalOptions.mu.Lock() - defer p.InactiveTransactionRemovalOptions.mu.Unlock() + p.mu.Lock() + defer p.mu.Unlock() if g, w := p.numOfLeakedSessionsRemoved, uint64(0); g != w { t.Fatalf("Number of leaked sessions removed mismatch\nGot: %d\nWant: %d\n", g, w) } @@ -4256,8 +4254,8 @@ func TestClient_WhenLongRunningPartitionedUpdateRequest_TakeNoAction(t *testing. t.Errorf("Row count mismatch\nGot: %v\nWant: %v", g, w) } p := client.idleSessions - p.InactiveTransactionRemovalOptions.mu.Lock() - defer p.InactiveTransactionRemovalOptions.mu.Unlock() + p.mu.Lock() + defer p.mu.Unlock() if g, w := p.numOfLeakedSessionsRemoved, uint64(0); g != w { t.Fatalf("Number of leaked sessions removed mismatch\nGot: %d\nWant: %d\n", g, w) } diff --git a/spanner/pdml.go b/spanner/pdml.go index e23b7c27da7..2bdfd72897d 100644 --- a/spanner/pdml.go +++ b/spanner/pdml.go @@ -53,16 +53,16 @@ func (c *Client) partitionedUpdate(ctx context.Context, statement Statement, opt } sh, err := c.idleSessions.take(ctx) - // Mark isLongRunningTransaction to true, as the session in case of partitioned dml can be long-running - sh.mu.Lock() - sh.eligibleForLongRunning = true - sh.mu.Unlock() if err != nil { return 0, ToSpannerError(err) } if sh != nil { defer sh.recycle() } + // Mark isLongRunningTransaction to true, as the session in case of partitioned dml can be long-running + sh.mu.Lock() + sh.eligibleForLongRunning = true + sh.mu.Unlock() // Create the parameters and the SQL request, but without a transaction. // The transaction reference will be added by the executePdml method. diff --git a/spanner/session.go b/spanner/session.go index 884036be405..c23f66541a8 100644 --- a/spanner/session.go +++ b/spanner/session.go @@ -59,7 +59,6 @@ const ( // InactiveTransactionRemovalOptions has configurations for action on long-running transactions. type InactiveTransactionRemovalOptions struct { - mu sync.Mutex // actionOnInactiveTransaction is the configuration to choose action for inactive transactions. // It can be one of Warn, Close, WarnAndClose. actionOnInactiveTransaction ActionOnInactiveTransactionKind @@ -74,9 +73,6 @@ type InactiveTransactionRemovalOptions struct { // variable that keeps track of the last execution time when inactive transactions // were removed by the maintainer task. lastExecutionTime time.Time - // indicates the number of leaked sessions removed from the session pool. - // This is valid only when ActionOnInactiveTransaction is WarnAndClose or ActionOnInactiveTransaction is Close. - numOfLeakedSessionsRemoved uint64 } // sessionHandle is an interface for transactions to access Cloud Spanner @@ -613,6 +609,10 @@ type sessionPool struct { // tagMap is a map of all tags that are associated with the emitted metrics. tagMap *tag.Map + + // indicates the number of leaked sessions removed from the session pool. + // This is valid only when ActionOnInactiveTransaction is WarnAndClose or ActionOnInactiveTransaction is Close in InactiveTransactionRemovalOptions. + numOfLeakedSessionsRemoved uint64 } // newSessionPool creates a new session pool. @@ -761,14 +761,16 @@ func (p *sessionPool) removeLongRunningSessions() { // destroy long-running sessions if p.actionOnInactiveTransaction == WarnAndClose || p.actionOnInactiveTransaction == Close { + var leakedSessionsRemovedCount uint64 for _, sh := range longRunningSessions { // removes inner session out of the pool to reduce the probability of two processes trying // to use the same session at the same time. sh.destroy() - p.InactiveTransactionRemovalOptions.mu.Lock() - p.InactiveTransactionRemovalOptions.numOfLeakedSessionsRemoved++ - p.InactiveTransactionRemovalOptions.mu.Unlock() + leakedSessionsRemovedCount++ } + p.mu.Lock() + p.numOfLeakedSessionsRemoved += leakedSessionsRemovedCount + p.mu.Unlock() } } diff --git a/spanner/session_test.go b/spanner/session_test.go index 4ec24afd400..a90f46f0d34 100644 --- a/spanner/session_test.go +++ b/spanner/session_test.go @@ -474,8 +474,6 @@ func TestSessionLeak_WhenInactiveTransactions_RemoveSessionsFromPool(t *testing. if g, w := p.numOpened, uint64(0); g != w { t.Fatalf("Session pool size mismatch\nGot: %d\nWant: %d\n", g, w) } - p.InactiveTransactionRemovalOptions.mu.Lock() - defer p.InactiveTransactionRemovalOptions.mu.Unlock() if g, w := p.numOfLeakedSessionsRemoved, uint64(1); g != w { t.Fatalf("Number of leaked sessions removed mismatch\nGot: %d\nWant: %d\n", g, w) } @@ -541,11 +539,11 @@ func TestMaintainer_LongRunningTransactionsCleanup_IfClose_VerifyInactiveSession // Sleep for maintainer to run long-running cleanup task time.Sleep(30 * time.Millisecond) + // force run task to clean up unexpected long-running sessions + sp.removeLongRunningSessions() sp.mu.Lock() defer sp.mu.Unlock() - sp.InactiveTransactionRemovalOptions.mu.Lock() - defer sp.InactiveTransactionRemovalOptions.mu.Unlock() if g, w := sp.numOfLeakedSessionsRemoved, uint64(2); g != w { t.Fatalf("No of leaked sessions removed mismatch\nGot: %d\nWant: %d\n", g, w) } @@ -614,8 +612,6 @@ func TestLongRunningTransactionsCleanup_IfClose_VerifyInactiveSessionsClosed(t * sp.mu.Lock() defer sp.mu.Unlock() - sp.InactiveTransactionRemovalOptions.mu.Lock() - defer sp.InactiveTransactionRemovalOptions.mu.Unlock() if g, w := sp.numOfLeakedSessionsRemoved, uint64(2); g != w { t.Fatalf("No of leaked sessions removed mismatch\nGot: %d\nWant: %d\n", g, w) } @@ -702,8 +698,6 @@ func TestLongRunningTransactionsCleanup_IfLog_VerifyInactiveSessionsOpen(t *test sp.mu.Lock() defer sp.mu.Unlock() - sp.InactiveTransactionRemovalOptions.mu.Lock() - defer sp.InactiveTransactionRemovalOptions.mu.Unlock() if g, w := sp.numOfLeakedSessionsRemoved, uint64(0); g != w { t.Fatalf("No of leaked sessions removed mismatch\nGot: %d\nWant: %d\n", g, w) } @@ -763,8 +757,6 @@ func TestLongRunningTransactionsCleanup_UtilisationBelowThreshold_VerifyInactive sp.mu.Lock() defer sp.mu.Unlock() - sp.InactiveTransactionRemovalOptions.mu.Lock() - defer sp.InactiveTransactionRemovalOptions.mu.Unlock() // 2/3 sessions are used. Hence utilisation < 95%. if g, w := sp.numOfLeakedSessionsRemoved, uint64(0); g != w { t.Fatalf("No of leaked sessions removed mismatch\nGot: %d\nWant: %d\n", g, w) @@ -834,8 +826,6 @@ func TestLongRunningTransactions_WhenAllExpectedlyLongRunning_VerifyInactiveSess sp.mu.Lock() defer sp.mu.Unlock() - sp.InactiveTransactionRemovalOptions.mu.Lock() - defer sp.InactiveTransactionRemovalOptions.mu.Unlock() if g, w := sp.numOfLeakedSessionsRemoved, uint64(0); g != w { t.Fatalf("No of leaked sessions removed mismatch\nGot: %d\nWant: %d\n", g, w) } @@ -916,8 +906,6 @@ func TestLongRunningTransactions_WhenDurationBelowThreshold_VerifyInactiveSessio sp.mu.Lock() defer sp.mu.Unlock() - sp.InactiveTransactionRemovalOptions.mu.Lock() - defer sp.InactiveTransactionRemovalOptions.mu.Unlock() if g, w := sp.numOfLeakedSessionsRemoved, uint64(0); g != w { t.Fatalf("No of leaked sessions removed mismatch\nGot: %d\nWant: %d\n", g, w) } diff --git a/spanner/transaction.go b/spanner/transaction.go index 941ac3eab49..875162e4e45 100644 --- a/spanner/transaction.go +++ b/spanner/transaction.go @@ -54,6 +54,7 @@ type txReadEnv interface { // release should be called at the end of every transactional read to deal // with session recycling. release(error) + setSessionEligibilityForLongRunning(sh *sessionHandle) } // txReadOnly contains methods for doing transactional reads. @@ -676,10 +677,10 @@ func (t *ReadOnlyTransaction) begin(ctx context.Context) error { // Retry the BeginTransaction call if a 'Session not found' is returned. for { sh, err = t.sp.take(ctx) - sh.eligibleForLongRunning = t.isLongRunningTransaction if err != nil { return err } + t.setSessionEligibilityForLongRunning(sh) sh.updateLastUseTime() var md metadata.MD res, err = sh.getClient().BeginTransaction(contextWithOutgoingMetadata(ctx, sh.getMetadata(), t.disableRouteToLeader), &sppb.BeginTransactionRequest{ @@ -937,6 +938,16 @@ func (t *ReadOnlyTransaction) WithTimestampBound(tb TimestampBound) *ReadOnlyTra return t } +func (t *ReadOnlyTransaction) setSessionEligibilityForLongRunning(sh *sessionHandle) { + if t != nil && sh != nil { + sh.mu.Lock() + t.mu.Lock() + sh.eligibleForLongRunning = t.isLongRunningTransaction + t.mu.Unlock() + sh.mu.Unlock() + } +} + // ReadWriteTransaction provides a locking read-write transaction. // // This type of transaction is the only way to write data into Cloud Spanner; @@ -1136,13 +1147,6 @@ func (t *ReadWriteTransaction) batchUpdateWithOptions(ctx context.Context, stmts if err != nil { return nil, err } - // mark transaction and session to be eligible for long-running - t.mu.Lock() - t.isLongRunningTransaction = true - t.mu.Unlock() - t.sh.mu.Lock() - t.sh.eligibleForLongRunning = true - t.sh.mu.Unlock() // Cloud Spanner will return "Session not found" on bad sessions. sid := sh.getID() @@ -1151,6 +1155,12 @@ func (t *ReadWriteTransaction) batchUpdateWithOptions(ctx context.Context, stmts return nil, errSessionClosed(sh) } + // mark transaction and session to be eligible for long-running + t.mu.Lock() + t.isLongRunningTransaction = true + t.mu.Unlock() + t.setSessionEligibilityForLongRunning(sh) + var sppbStmts []*sppb.ExecuteBatchDmlRequest_Statement for _, st := range stmts { params, paramTypes, err := st.convertParams() @@ -1339,6 +1349,16 @@ func (t *ReadWriteTransaction) release(err error) { } } +func (t *ReadWriteTransaction) setSessionEligibilityForLongRunning(sh *sessionHandle) { + if t != nil && sh != nil { + sh.mu.Lock() + t.mu.Lock() + sh.eligibleForLongRunning = t.isLongRunningTransaction + t.mu.Unlock() + sh.mu.Unlock() + } +} + func beginTransaction(ctx context.Context, sid string, client *vkit.Client, opts TransactionOptions) (transactionID, error) { res, err := client.BeginTransaction(ctx, &sppb.BeginTransactionRequest{ Session: sid, @@ -1411,14 +1431,8 @@ func (t *ReadWriteTransaction) begin(ctx context.Context) error { if err != nil { return err } - - sh.mu.Lock() - t.mu.Lock() - // for batch update operations, isLongRunningTransaction will be true - sh.eligibleForLongRunning = t.isLongRunningTransaction - t.mu.Unlock() - sh.mu.Unlock() - + // Some operations (for ex BatchUpdate) can be long-running. For such operations set the isLongRunningTransaction flag to be true + t.setSessionEligibilityForLongRunning(sh) continue } else { err = ToSpannerError(err)