// Copyright 2014 The Cockroach Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
// implied. See the License for the specific language governing
// permissions and limitations under the License.

syntax = "proto3";
package cockroach.roachpb;
option go_package = "roachpb";

import "roachpb/metadata.proto";
import "storage/engine/enginepb/mvcc.proto";
import "storage/engine/enginepb/mvcc3.proto";
import "util/hlc/timestamp.proto";
import "gogoproto/gogo.proto";

// Span is a key range with an inclusive start Key and an exclusive end Key.
message Span {
  option (gogoproto.equal) = true;

  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.populate) = true;

  // Field numbers 1 and 2 are reserved and cannot be assigned to fields of
  // this message.
  reserved 1, 2;
  // The start key of the key range.
  bytes key = 3 [(gogoproto.casttype) = "Key"];
  // The end key of the key range. The value is empty if the key range
  // contains only a single key. Otherwise, it must order strictly after Key.
  // In such a case, the Span encompasses the key range from Key to EndKey,
  // including Key and excluding EndKey.
  bytes end_key = 4 [(gogoproto.casttype) = "Key"];
}

// ValueType defines a set of type constants placed in the "tag" field of Value
// messages. These are defined as a protocol buffer enumeration so that they
// can be used portably between our Go and C code. The tags are used by the
// RocksDB Merge Operator to perform specialized merges.
enum ValueType {
  // This is a subset of the SQL column type values, representing the underlying
  // storage for various types. The DELIMITED_foo entries each represent a foo
  // variant that self-delimits length.
  UNKNOWN = 0;
  // Value 7 is reserved and cannot be assigned to an entry of this enum.
  reserved 7;
  INT = 1;
  FLOAT = 2;
  BYTES = 3;
  DELIMITED_BYTES = 8;
  TIME = 4;
  // NOTE(review): values 5 and 6 are neither defined nor reserved here.
  // Confirm against the canonical definition that no entries were dropped
  // before assigning either number.
  DELIMITED_DECIMAL = 9;

  // TUPLE represents a DTuple, encoded as repeated pairs of varint field number
  // followed by a value encoded Datum.
  TUPLE = 10;

  BITARRAY = 11;

  // TIMESERIES is applied to values which contain InternalTimeSeriesData.
  TIMESERIES = 100;
}

// Value specifies the value at a key. Multiple values at the same key are
// supported based on timestamp. The data stored within a value is typed
// (ValueType) and custom encoded into the raw_bytes field. A custom encoding
// is used instead of separate proto fields to avoid proto overhead and to
// avoid unnecessary encoding and decoding as the value gets read from disk and
// passed through the network. The format is:
//
//   <4-byte-checksum><1-byte-tag><encoded-data>
//
// A CRC-32-IEEE checksum is computed from the associated key, tag and encoded
// data, in that order.
//
// TODO(peter): Is a 4-byte checksum overkill when most (all?) values
// will be less than 64KB?
message Value {
  option (gogoproto.equal) = true;

  // raw_bytes contains the encoded value and checksum.
  bytes raw_bytes = 1;
  // Timestamp of value.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
}

// KeyValue is a pair of Key and Value for returned Key/Value pairs
// from ScanRequest/ScanResponse. It embeds a Key and a Value.
message KeyValue {
  // The key of the pair.
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  // The value stored at the key.
  Value value = 2 [(gogoproto.nullable) = false];
}

// A StoreIdent uniquely identifies a store in the cluster. The
// StoreIdent is written to the underlying storage engine at a
// store-reserved system key (KeyLocalIdent).
message StoreIdent {
  // The cluster identifier, surfaced in generated Go code as a uuid.UUID
  // named ClusterID.
  bytes cluster_id = 1 [(gogoproto.nullable) = false,
      (gogoproto.customname) = "ClusterID",
      (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/uuid.UUID"];
  // The node identifier (Go type NodeID).
  int32 node_id = 2 [(gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
  // The store identifier (Go type StoreID).
  int32 store_id = 3 [(gogoproto.customname) = "StoreID", (gogoproto.casttype) = "StoreID"];
}

// A SplitTrigger is run after a successful commit of an AdminSplit
// command. It provides the updated left hand side of the split's
// range descriptor (left_desc) and the new range descriptor covering
// the right hand side of the split (right_desc). This information
// allows the final bookkeeping for the split to be completed and the
// new range put into operation.
message SplitTrigger {
  option (gogoproto.equal) = true;

  // The updated descriptor of the left hand side of the split.
  RangeDescriptor left_desc = 1 [(gogoproto.nullable) = false];
  // The descriptor of the new range covering the right hand side of the split.
  RangeDescriptor right_desc = 2 [(gogoproto.nullable) = false];
  // Field number 3 is reserved and cannot be assigned to a field of this
  // message.
  reserved 3;
}

// A MergeTrigger is run after a successful commit of an AdminMerge
// command. It provides the updated left hand side of the merge's
// range descriptor (left_desc) that now encompasses what was
// originally both ranges and the soon-to-be-invalid range descriptor
// that used to cover the subsumed, right hand side of the merge
// (right_desc). This information allows the final bookkeeping for the
// merge to be completed and put into operation.
message MergeTrigger {
  option (gogoproto.equal) = true;

  // The updated descriptor of the left hand side, which now spans what was
  // originally both ranges.
  RangeDescriptor left_desc = 1 [(gogoproto.nullable) = false];
  // The soon-to-be-invalid descriptor of the subsumed right hand side.
  RangeDescriptor right_desc = 2 [(gogoproto.nullable) = false];

  // Field number 3 is reserved and cannot be assigned to a field of this
  // message.
  reserved 3;

  // NOTE(review): undocumented here; presumably the MVCC statistics of the
  // subsumed right hand side range -- confirm.
  storage.engine.enginepb.MVCCStats right_mvcc_stats = 4 [
    (gogoproto.customname) = "RightMVCCStats",
    (gogoproto.nullable) = false
  ];

  // FreezeStart is a timestamp that is guaranteed to be greater than the
  // timestamps at which any requests were serviced by the responding replica
  // before it stopped responding to requests altogether (in anticipation of
  // being subsumed). It is suitable for use as the timestamp cache's low water
  // mark for the keys previously owned by the subsumed range.
  util.hlc.Timestamp freeze_start = 5 [(gogoproto.nullable) = false];
}

// ReplicaChangeType is a parameter of ChangeReplicasTrigger.
enum ReplicaChangeType {
  // Generated Go constants are emitted without the enum type name as a prefix.
  option (gogoproto.goproto_enum_prefix) = false;

  ADD_REPLICA = 0;
  REMOVE_REPLICA = 1;
}

// ChangeReplicasTrigger describes a single replica change: the kind of change
// (see ReplicaChangeType), the replica it applies to, and the replica list
// that results from applying it.
message ChangeReplicasTrigger {
  option (gogoproto.equal) = true;

  option (gogoproto.goproto_stringer) = false;

  // TODO(benesch): this trigger should just specify the updated descriptor,
  // like the split and merge triggers, so that the receiver doesn't need to
  // reconstruct the range descriptor update.

  // Whether the replica is being added or removed.
  ReplicaChangeType change_type = 1;
  // The replica being modified.
  ReplicaDescriptor replica = 2 [(gogoproto.nullable) = false];
  // The new replica list with this change applied.
  repeated ReplicaDescriptor updated_replicas = 3 [(gogoproto.nullable) = false];
  // NOTE(review): undocumented here; presumably the replica ID to hand out to
  // the next replica added to the range -- confirm.
  int32 next_replica_id = 4 [(gogoproto.customname) = "NextReplicaID", (gogoproto.casttype) = "ReplicaID"];
}

// ModifiedSpanTrigger indicates that a specific span has been modified.
// This can be used to trigger scan-and-gossip for the given span.
message ModifiedSpanTrigger {
  option (gogoproto.equal) = true;

  // NOTE(review): undocumented here; presumably set when the modified span is
  // the system config span -- confirm.
  bool system_config_span = 1;
  // node_liveness_span is set to indicate that node liveness records
  // need re-gossiping after modification or range lease updates. The
  // span is set to a single key when nodes update their liveness records
  // with heartbeats to extend the expiration timestamp. Changes to the
  // range lease for the range containing node liveness trigger re-gossip
  // of the entire node liveness key range.
  Span node_liveness_span = 2;
}

// InternalCommitTrigger encapsulates all of the internal-only commit triggers.
// Only one may be set.
message InternalCommitTrigger {
  option (gogoproto.equal) = true;

  // InternalCommitTrigger is always nullable, and these getters are
  // nil-safe, which is often convenient.
  option (gogoproto.goproto_getters) = true;

  SplitTrigger split_trigger = 1;
  MergeTrigger merge_trigger = 2;
  ChangeReplicasTrigger change_replicas_trigger = 3;
  ModifiedSpanTrigger modified_span_trigger = 4;
}

// TransactionStatus specifies possible states for a transaction.
enum TransactionStatus {
  // Generated Go constants are emitted without the enum type name as a prefix.
  option (gogoproto.goproto_enum_prefix) = false;

  // PENDING is the default state for a new transaction. Transactions
  // move from PENDING to one of COMMITTED or ABORTED. Mutations made
  // as part of a PENDING transaction are recorded as "intents" in
  // the underlying MVCC model.
  PENDING = 0;
  // COMMITTED is the state for a transaction which has been
  // committed. Mutations made as part of a transaction which is moved
  // into COMMITTED state become durable and visible to other
  // transactions, moving from "intents" to permanent versioned
  // values.
  COMMITTED = 1;
  // ABORTED is the state for a transaction which has been aborted.
  // Mutations made as part of a transaction which is moved into
  // ABORTED state are deleted and are never made visible to other
  // transactions.
  ABORTED = 2;
}

// ObservedTimestamp pairs a node ID with a timestamp. Transaction keeps a
// list of these pairs in its observed_timestamps field.
message ObservedTimestamp {
  option (gogoproto.equal) = true;

  option (gogoproto.populate) = true;

  // The node the timestamp belongs to (Go type NodeID).
  int32 node_id = 1 [(gogoproto.customname) = "NodeID", (gogoproto.casttype) = "NodeID"];
  // The timestamp recorded for that node.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
}

// A Transaction is a unit of work performed on the database.
// Cockroach transactions support two isolation levels: snapshot
// isolation and serializable snapshot isolation. Each Cockroach
// transaction is assigned a random priority. This priority will be
// used to decide whether a transaction will be aborted during
// contention.
//
// If you add fields to Transaction you'll need to update
// Transaction.Clone. Failure to do so will result in test failures.
message Transaction {
  option (gogoproto.equal) = true;

  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.populate) = true;

  // The transaction metadata. These are persisted with every intent.
  storage.engine.enginepb.TxnMeta meta = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // A free-text identifier for debug purposes.
  string name = 2;
  // The current state of the transaction; see TransactionStatus.
  TransactionStatus status = 4;
  // NOTE(review): undocumented here; presumably the time of the most recent
  // heartbeat of the transaction -- confirm.
  util.hlc.Timestamp last_heartbeat = 5 [(gogoproto.nullable) = false];
  // The original timestamp at which the transaction started. For serializable
  // transactions, if the timestamp drifts from the original timestamp, the
  // transaction will retry unless we manage to "refresh the reads" - see
  // refreshed_timestamp.
  //
  // This timestamp is the one at which all transactions will read, unless
  // refreshed_timestamp is set. It is also, surprisingly, the timestamp at
  // which transactions will provisionally _write_ (i.e. intents are written at
  // this orig_timestamp and, after commit, when the intents are resolved,
  // their timestamps are bumped to the commit timestamp), if
  // refreshed_timestamp isn't set.
  // This is ultimately because of correctness concerns around SNAPSHOT
  // transactions.
  //
  // Intuitively, one could think that the timestamp at which intents should be
  // written should be the provisional commit timestamp, and while this is
  // morally true, consider the following scenario, where txn1 is a SNAPSHOT
  // txn:
  //
  // - txn1 at orig_timestamp=5 reads key1: (value) 1.
  // - txn1 writes elsewhere, has its commit timestamp increased to 20.
  // - txn2 at orig_timestamp=10 reads key1: 1
  // - txn2 increases the value by 5: key1: 6 and commits
  // - txn1 increases the value by 1: key1: 2, attempts commit
  //
  // If txn1 uses its orig_timestamp for updating key1 (as it does), it
  // conflicts with txn2's committed value (which is at timestamp 10, in the
  // future of 5), and restarts.
  // Using instead its candidate commit timestamp, it wouldn't see a conflict
  // and commit, but this is not the expected outcome (the expected outcome is
  // {key1: 6} (since txn1 is not expected to commit)) and we would be
  // experiencing the Lost Update Anomaly.
  //
  // Note that in practice, before restarting, txn1 would still lay down an
  // intent (just above the committed value) not with the intent to commit it,
  // but to avoid being starved by short-lived transactions on that key which
  // would otherwise not have to go through conflict resolution with txn1.
  //
  // Again, keep in mind that, when the transaction commits, all the intents are
  // bumped to the commit timestamp (otherwise, pushing a transaction wouldn't
  // achieve anything).
  util.hlc.Timestamp orig_timestamp = 6 [(gogoproto.nullable) = false];
  // Initial Timestamp + clock skew. Reads which encounter values with
  // timestamps between timestamp and max_timestamp trigger a txn
  // retry error, unless the node being read is listed in observed_timestamps
  // (in which case no more read uncertainty can occur).
  // The case max_timestamp < timestamp is possible for transactions which have
  // been pushed; in this case, max_timestamp should be ignored.
  util.hlc.Timestamp max_timestamp = 7 [(gogoproto.nullable) = false];
  // The refreshed timestamp is the timestamp at which the transaction
  // can commit without necessitating a serializable restart. This
  // value is forwarded to the transaction's current timestamp (meta.timestamp)
  // if the transaction coordinator is able to refresh all refreshable spans
  // encountered during the course of the txn. If set, this takes precedence
  // over orig_timestamp and is the timestamp at which the transaction both
  // reads and writes going forward.
  // We need to keep track of both refreshed_timestamp and orig_timestamp
  // (instead of simply overwriting the orig_timestamp after refreshes) because
  // the orig_timestamp needs to be used as a lower bound timestamp for the
  // time-bound iterator used to resolve intents - i.e. there can be intents to
  // resolve up to the timestamp that the txn started with.
  util.hlc.Timestamp refreshed_timestamp = 15 [(gogoproto.nullable) = false];
  // A list of <NodeID, timestamp> pairs. The list maps NodeIDs to timestamps
  // as observed from their local clock during this transaction. The purpose of
  // this map is to avoid uncertainty related restarts which normally occur
  // when reading a value in the near future as per the max_timestamp field.
  //
  // Morally speaking, having an entry for a node in this map means that this
  // node has been visited before, and that no more uncertainty restarts are
  // expected for operations served from it. However, this is not entirely
  // accurate. For example, say a txn starts with orig_timestamp=1 (and some
  // large max_timestamp). It then reads key "a" from node A, registering an
  // entry `A -> 5` in the process (`5` happens to be a timestamp taken off
  // that node's clock at the end of the read).
  // Now assume that some other transaction writes and commits a value at key "b"
  // and timestamp 4 (again, served by node A), and our transaction attempts to
  // read that key. Since there is an entry in its observed_timestamps for A,
  // our uncertainty window is `[orig_timestamp, 5) = [1, 5)` but the value at
  // key "b" is in that window, and so we will restart. However, we will restart
  // with a timestamp that is at least as high as our entry in the map for node
  // A, so no future operation on node A will be uncertain.
  //
  // Thus, expressed properly, you could say that when a node has been read from
  // successfully before, uncertainty on that node is restricted to values with
  // timestamps in the interval [orig_timestamp, first_visit_timestamp), and
  // that no node will trigger restarts more than once (and in fact, usually
  // the first restart also bumps the txn timestamp enough to clear all other
  // nodes).
  //
  // When this list holds a corresponding entry for the node the current
  // request is executing on, we can run the command with the map's timestamp
  // as the top boundary of our uncertainty interval, limiting (and often
  // avoiding) uncertainty restarts.
  //
  // When a transaction is first initialized on a node, it may use a timestamp
  // from the local hybrid logical clock to initialize the corresponding entry
  // in the map. In particular, if `orig_timestamp` is taken from that node's
  // clock, we may add that to the map, which eliminates read uncertainty for
  // reads on that node.
  //
  // The list of observed timestamps is kept sorted by NodeID. Use
  // Transaction.UpdateObservedTimestamp to maintain the sorted order.
  repeated ObservedTimestamp observed_timestamps = 8 [(gogoproto.nullable) = false];
  // Writing is true if the transaction has previously sent a Begin transaction
  // (i.e. if it ever attempted to perform a write, so if it ever attempted to
  // leave intents (across retries)). The flag will be set even if the BeginTxn
  // batch failed.
  // When set, the AbortCache must be checked by reads so that they don't fail
  // to see the txn's previous writes.
  //
  // NOTE(review): no field declaration accompanies the description above in
  // this view of the file, and field numbers 3, 9 and 10 are neither assigned
  // nor reserved in this message. Confirm against the canonical definition
  // whether a declaration was lost here before reusing any of those numbers.

  // If this is true, the transaction must retry. Relevant only for
  // SNAPSHOT transactions: a SERIALIZABLE transaction would have to
  // retry anyway due to its commit timestamp having moved forward (whenever
  // write_too_old is set, meta.Timestamp has been pushed above orig_timestamp).
  // This bool is set instead of immediately returning a txn retry
  // error so that intents can continue to be laid down, minimizing
  // work required on txn restart.
  bool write_too_old = 12;
  // If retry_on_push is true, the transaction must retry in the event
  // that the commit timestamp is pushed forward. This flag is set if
  // the transaction contains any calls to DeleteRange, in order to
  // prevent the LostDeleteRange anomaly. This flag is relevant only
  // for SNAPSHOT transactions.
  bool retry_on_push = 13;
  // NOTE(review): undocumented here; presumably the key spans on which this
  // transaction has laid down intents -- confirm.
  repeated Span intents = 11 [(gogoproto.nullable) = false];
  // Epoch zero timestamp is used to keep track of the earliest timestamp
  // that any epoch of the transaction used. This is set only if the
  // transaction is restarted and the epoch is bumped. It is used during
  // intent resolution to more efficiently scan for intents.
  util.hlc.Timestamp epoch_zero_timestamp = 14 [(gogoproto.nullable) = false];
  // This flag is set if the transaction's original timestamp was
  // "leaked" beyond the transaction (i.e. if returned via NOW() or
  // transaction_timestamp()). If true, this prevents optimizations
  // which commit at a higher timestamp without resorting to a
  // client-side retry.
  bool orig_timestamp_was_observed = 16;
}

// An Intent is a Span together with a Transaction metadata and its status.
message Intent {
  option (gogoproto.equal) = true;

  // The key span covered by the intent; embedded in the generated Go struct.
  Span span = 1 [(gogoproto.nullable) = false, (gogoproto.embed) = true];
  // The metadata of the transaction the intent belongs to.
  storage.engine.enginepb.TxnMeta txn = 2 [(gogoproto.nullable) = false];
  // The status of that transaction; see TransactionStatus.
  TransactionStatus status = 3;
}

// A SequencedWrite is a point write to a key with a certain sequence number.
//
// TODO(nvanbenschoten/tschottdorf): This message type can be used as the
// PromisedWrites repeated field in EndTransaction in the parallel commits
// proposal (#24194).
message SequencedWrite {
  // The key that the write was made at.
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  // The sequence number of the request that created the write.
  int32 sequence = 2;
}

// Lease contains information about range leases including the
// expiration and lease holder.
message Lease {
  option (gogoproto.goproto_stringer) = false;
  option (gogoproto.populate) = true;

  // The start is a timestamp at which the lease begins. This value
  // must be greater than the last lease expiration or the lease request
  // is considered invalid.
  util.hlc.Timestamp start = 1 [(gogoproto.nullable) = false];

  // The expiration is a timestamp at which the lease expires. This means that
  // a new lease can be granted for a later timestamp.
  util.hlc.Timestamp expiration = 2 [(gogoproto.moretags) = "cockroachdb:\"randnullable\""];

  // The address of the would-be lease holder.
  ReplicaDescriptor replica = 3 [(gogoproto.nullable) = false];

  // The start of the lease stasis period. This field is deprecated.
  util.hlc.Timestamp deprecated_start_stasis = 4 [(gogoproto.moretags) = "cockroachdb:\"randnullable\""];

  // The current timestamp when this lease has been proposed. Used after a
  // transfer and after a node restart to enforce that a node only uses leases
  // proposed after the time of the said transfer or restart. This is nullable
  // to help with the rollout (such that a lease applied by some nodes before
  // the rollout and some nodes after the rollout is serialized the same).
  // TODO(andrei): Make this non-nullable after the rollout.
  util.hlc.Timestamp proposed_ts = 5 [(gogoproto.customname) = "ProposedTS"];

  // The epoch of the lease holder's node liveness entry. If this value
  // is non-zero, the start and expiration values are ignored.
  //
  // NOTE(review): no field declaration accompanies the description above in
  // this view of the file, and field number 6 is neither assigned nor
  // reserved in this message. Confirm against the canonical definition
  // whether a declaration was lost here before reusing that number.

  // A zero-indexed sequence number which is incremented during the acquisition
  // of each new range lease that is not equivalent to the previous range lease
  // (i.e. an acquisition that implies a leaseholder change). The sequence
  // number is used to detect lease changes between command proposal and
  // application without requiring that we send the entire lease through Raft.
  // Lease sequence numbers are a reflection of the "lease equivalency" property
  // (see Lease.Equivalent). Two adjacent leases that are equivalent will have
  // the same sequence number and two adjacent leases that are not equivalent
  // will have different sequence numbers.
  int64 sequence = 7 [(gogoproto.casttype) = "LeaseSequence"];
}

// AbortSpanEntry contains information about a transaction which has
// been aborted. It's written to a range's AbortSpan if the range
// may have contained intents of the aborted txn. In the event that
// the same transaction attempts to read keys it may have written
// previously, this entry informs the transaction that it has aborted
// and must start fresh with an updated priority.
message AbortSpanEntry {
  option (gogoproto.equal) = true;
  option (gogoproto.populate) = true;

  // The key of the associated transaction.
  bytes key = 1 [(gogoproto.casttype) = "Key"];
  // The candidate commit timestamp the transaction record held at the time
  // it was aborted.
  util.hlc.Timestamp timestamp = 2 [(gogoproto.nullable) = false];
  // The priority of the transaction.
  int32 priority = 3;
}

// TxnCoordMeta is metadata held by a transaction coordinator. This
485
// message is defined here because it is used in several layers of the
486
// system (internal/client, sql/distsqlrun, kv).
487
message TxnCoordMeta {
488
// txn is a copy of the transaction record, updated with each request.
489
Transaction txn = 1 [(gogoproto.nullable) = false];
490
// intents stores key spans affected by this transaction through
491
// this coordinator. These spans allow the coordinator to set the
492
// list of intent spans in the EndTransactionRequest when the
493
// transaction is finalized.
494
repeated Span intents = 2 [(gogoproto.nullable) = false];
495
// command_count indicates how many requests have been sent through
496
// this transaction. Reset on retryable txn errors.
497
int32 command_count = 3;
498
// refresh_reads and refresh_writes store key spans which were read
499
// or, less frequently, written during a transaction. These fields
500
// are utilized for SERIALIZABLE transactions in the event a
501
// transaction experiences a retry error. In that case, the
502
// coordinator uses the Refresh and RefreshRange RPCs to verify that
503
// no write has occurred to the spans more recently than the txn's
504
// original timestamp, and updates the affected timestamp caches to
505
// the transaction's refreshed timestamp. On failure, the retry
506
// error is propagated. On success, the transaction's original and
507
// current timestamps are forwarded to the refresh timestamp, and
508
// the transaction can continue.
509
repeated Span refresh_reads = 4 [(gogoproto.nullable) = false];
510
repeated Span refresh_writes = 5 [(gogoproto.nullable) = false];
511
// refresh_invalid indicates that spans were discarded or not collected
512
// (i.e. because of a dist SQL processor running a version before refreshing
513
// was introduced). This is false if all spans encountered during the
514
// transaction which need refreshing have been collected to the refresh_reads
515
// and refresh_writes span slices.
516
bool refresh_invalid = 7;
517
// deprecated_refresh_valid is the inverse of refresh_invalid. It was
518
// deprecated in favor of refresh_invalid in order to give the struct a useful
519
// zero value.
520
// TODO(nvanbenschoten): Can be removed in 2.2.
521
bool deprecated_refresh_valid = 6;
522
// outstanding_writes stores all writes that are outstanding and have
523
// not yet been resolved. Any client wishing to send a request that
524
// overlaps with them must chain on to their success using a QueryIntent
525
// request.
526
repeated SequencedWrite outstanding_writes = 8 [(gogoproto.nullable) = false];