DAT-331: Add option to automatically preserve TTL and timestamp (#384)
adutra committed Mar 11, 2021
1 parent b65a9e6 commit 8bb97fa
Showing 20 changed files with 2,610 additions and 1,886 deletions.
3 changes: 2 additions & 1 deletion changelog/README.md
@@ -1,6 +1,6 @@
## Changelog

## 1.7.1 (in progress)
## 1.8.0 (in progress)

- [improvement] Upgrade driver to 4.10.0.
- [bug] Fix incorrect error message when read concurrency is < 1.
@@ -9,6 +9,7 @@
- [improvement] Accept Well-known Binary (WKB) input formats for Geometry types.
- [improvement] Make Json connector sensitive to the configured binary format.
- [improvement] Make Geometry formats configurable.
- [new feature] DAT-331: Add option to automatically preserve TTL and timestamp.

## 1.7.0

46 changes: 46 additions & 0 deletions manual/application.template.conf
@@ -715,6 +715,52 @@ dsbulk {
# Default value: true
#schema.nullToUnset = true

# Whether to preserve cell timestamps when loading and unloading. Ignored when `schema.query` is
# provided, or when the target table is a counter table. If true, the following rules will be
# applied to generated queries:
#
# - When loading, instead of a single INSERT statement, the generated query will be a BATCH
# query; this is required in order to preserve individual column timestamps for each row.
# - When unloading, the generated SELECT statement will export each column along with its
# individual timestamp.
#
# For both loading and unloading, DSBulk will import and export timestamps using field names
# such as `"writetime(<column>)"`, where `<column>` is the column's internal CQL name; for
# example, if the table has a column named `"MyCol"`, its corresponding timestamp would be
# exported as `"writetime(MyCol)"` in the generated query and in the resulting connector record.
# If you intend to use this feature to export and import tables, letting DSBulk generate the
# appropriate queries, these names are fine and need not be changed. If, however, you would like
# to export or import data to or from external sources that use different field names, you can
# do so by using the `writetime` function in a schema.mapping entry; for example, the following
# mapping would map `col1` along with its timestamp to two distinct fields, `field1` and
# `field1_writetime`: `field1 = col1, field1_writetime = writetime(col1)`.
# Type: boolean
# Default value: false
#schema.preserveTimestamp = false

# Whether to preserve cell TTLs when loading and unloading. Ignored when `schema.query` is
# provided, or when the target table is a counter table. If true, the following rules will be
# applied to generated queries:
#
# - When loading, instead of a single INSERT statement, the generated query will be a BATCH
# query; this is required in order to preserve individual column TTLs for each row.
# - When unloading, the generated SELECT statement will export each column along with its
# individual TTL.
#
# For both loading and unloading, DSBulk will import and export TTLs using field names such as
# `"ttl(<column>)"`, where `<column>` is the column's internal CQL name; for example, if the
# table has a column named `"MyCol"`, its corresponding TTL would be exported as `"ttl(MyCol)"`
# in the generated query and in the resulting connector record. If you intend to use this
# feature to export and import tables, letting DSBulk generate the appropriate queries, these
# names are fine and need not be changed. If, however, you would like to export or import data
# to or from external sources that use different field names, you can do so by using the
# `ttl` function in a schema.mapping entry; for example, the following mapping would map `col1`
# along with its TTL to two distinct fields, `field1` and `field1_ttl`: `field1 = col1,
# field1_ttl = ttl(col1)`.
# Type: boolean
# Default value: false
#schema.preserveTtl = false
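
# As a sketch, a round trip that preserves both per-cell timestamps and TTLs could look like
# this (the keyspace, table and export directory below are hypothetical):
#
#   dsbulk unload -k ks1 -t table1 -url /tmp/export \
#     --schema.preserveTimestamp true --schema.preserveTtl true
#   dsbulk load -k ks1 -t table1 -url /tmp/export \
#     --schema.preserveTimestamp true --schema.preserveTtl true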

# The query to use. If not specified, then *schema.keyspace* and *schema.table* must be
# specified, and dsbulk will infer the appropriate statement based on the table's metadata,
# using all available columns. If `schema.keyspace` is provided, the query need not include the
22 changes: 22 additions & 0 deletions manual/settings.md
@@ -790,6 +790,28 @@ This setting is ignored when counting. When set to true but the protocol version

Default: **true**.

#### -timestamp,<br />--schema.preserveTimestamp<br />--dsbulk.schema.preserveTimestamp _&lt;boolean&gt;_

Whether to preserve cell timestamps when loading and unloading. Ignored when `schema.query` is provided, or when the target table is a counter table. If true, the following rules will be applied to generated queries:

- When loading, instead of a single INSERT statement, the generated query will be a BATCH query; this is required in order to preserve individual column timestamps for each row.
- When unloading, the generated SELECT statement will export each column along with its individual timestamp.

For both loading and unloading, DSBulk will import and export timestamps using field names such as `"writetime(<column>)"`, where `<column>` is the column's internal CQL name; for example, if the table has a column named `"MyCol"`, its corresponding timestamp would be exported as `"writetime(MyCol)"` in the generated query and in the resulting connector record. If you intend to use this feature to export and import tables, letting DSBulk generate the appropriate queries, these names are fine and need not be changed. If, however, you would like to export or import data to or from external sources that use different field names, you can do so by using the `writetime` function in a schema.mapping entry; for example, the following mapping would map `col1` along with its timestamp to two distinct fields, `field1` and `field1_writetime`: `field1 = col1, field1_writetime = writetime(col1)`.

Default: **false**.
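
A minimal sketch, assuming a hypothetical keyspace `ks1`, table `table1` and export directory `/tmp/export`, would unload the table together with each cell's write time:

```
dsbulk unload -k ks1 -t table1 -url /tmp/export --schema.preserveTimestamp true
```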

#### -ttl,<br />--schema.preserveTtl<br />--dsbulk.schema.preserveTtl _&lt;boolean&gt;_

Whether to preserve cell TTLs when loading and unloading. Ignored when `schema.query` is provided, or when the target table is a counter table. If true, the following rules will be applied to generated queries:

- When loading, instead of a single INSERT statement, the generated query will be a BATCH query; this is required in order to preserve individual column TTLs for each row.
- When unloading, the generated SELECT statement will export each column along with its individual TTL.

For both loading and unloading, DSBulk will import and export TTLs using field names such as `"ttl(<column>)"`, where `<column>` is the column's internal CQL name; for example, if the table has a column named `"MyCol"`, its corresponding TTL would be exported as `"ttl(MyCol)"` in the generated query and in the resulting connector record. If you intend to use this feature to export and import tables, letting DSBulk generate the appropriate queries, these names are fine and need not be changed. If, however, you would like to export or import data to or from external sources that use different field names, you can do so by using the `ttl` function in a schema.mapping entry; for example, the following mapping would map `col1` along with its TTL to two distinct fields, `field1` and `field1_ttl`: `field1 = col1, field1_ttl = ttl(col1)`.

Default: **false**.
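
As a sketch, the mapping shown above can be combined with an unload operation via the `-m` (`schema.mapping`) shortcut; the keyspace, table, field and column names here are illustrative:

```
dsbulk unload -k ks1 -t table1 -m 'field1 = col1, field1_ttl = ttl(col1)'
```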

#### -query,<br />--schema.query<br />--dsbulk.schema.query _&lt;string&gt;_

The query to use. If not specified, then *schema.keyspace* and *schema.table* must be specified, and dsbulk will infer the appropriate statement based on the table's metadata, using all available columns. If `schema.keyspace` is provided, the query need not include the keyspace to qualify the table reference.
@@ -40,9 +40,9 @@ regularMappedEntry
;

inferredMappedEntry
: '*' ( ':' | '=' ) '*'
| '*' ( ':' | '=' ) '-' variable
| '*' ( ':' | '=' ) '[' '-' variable ( ',' '-' variable )* ']'
: STAR ( ':' | '=' ) STAR
| STAR ( ':' | '=' ) '-' variable
| STAR ( ':' | '=' ) '[' '-' variable ( ',' '-' variable )* ']'
;
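
// For reference, inferred entries matched by the rule above (column names are illustrative):
// "* = *" maps every field to the same-named column; "* = -col1" does the same while
// excluding col1; "* = [-col1, -col2]" excludes both col1 and col2.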

indexedEntry
@@ -63,46 +63,69 @@ fieldOrFunction
| function
;

field
: UNQUOTED_IDENTIFIER
| QUOTED_IDENTIFIER
;

variableOrFunction
: variable
| function
;

field
: identifier
;

variable
: identifier
;

keyspaceName
: identifier
;

functionName
: identifier
;

columnName
: identifier
;

identifier
: UNQUOTED_IDENTIFIER
| QUOTED_IDENTIFIER
// also valid as identifiers:
| WRITETIME
| TTL
;

function
: WRITETIME '(' functionArg ')'
| qualifiedFunctionName '(' ')'
| qualifiedFunctionName '(' functionArgs ')'
: writetime
| ttl
| qualifiedFunctionName '(' functionArgs? ')'
;

qualifiedFunctionName
: ( keyspaceName '.' )? functionName
writetime
: WRITETIME '(' STAR ')'
| WRITETIME '(' columnName ( ',' columnName )* ')'
;

keyspaceName
: UNQUOTED_IDENTIFIER
| QUOTED_IDENTIFIER
ttl
: TTL '(' STAR ')'
| TTL '(' columnName ( ',' columnName )* ')'
;
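
// Example mapping entries matched by the writetime and ttl rules above (field and column
// names are illustrative):
//   field1 = col1, field1_writetime = writetime(col1)
//   field2 = col2, field2_ttl = ttl(col2)
//   field3 = writetime(*)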

functionName
: UNQUOTED_IDENTIFIER
| QUOTED_IDENTIFIER
qualifiedFunctionName
: ( keyspaceName '.' )? functionName
;

functionArgs
: functionArg ( ',' functionArg )*
;

functionArg
: columnName
| literal
;

literal
: INTEGER
| FLOAT
| BOOLEAN
@@ -111,8 +134,6 @@
| HEXNUMBER
| STRING_LITERAL
| ( '-' )? ( K_NAN | K_INFINITY )
| QUOTED_IDENTIFIER
| UNQUOTED_IDENTIFIER
;

// Case-insensitive alpha characters
@@ -181,6 +202,14 @@ WRITETIME
: W R I T E T I M E
;

TTL
: T T L
;

STAR
: '*'
;

BOOLEAN
: T R U E | F A L S E
;