Feature additional docs (#222)
* wip

* wip

* doc updates

* function doc changes only

* function doc changes only
ronanstokes-db committed Jul 12, 2023
1 parent 168b6f1 commit b0fe747
Showing 5 changed files with 62 additions and 26 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -152,7 +152,7 @@ For example:

To use an older DB runtime version, you can use the following code in your notebook:

```commandline
```shell
%pip install git+https://github.com/databrickslabs/dbldatagen@dbr_7_3_LTS_compat
```

16 changes: 10 additions & 6 deletions dbldatagen/data_generator.py
@@ -901,20 +901,24 @@ def withStructColumn(self, colName, fields=None, asJson=False, **kwargs):
a struct of the specified fields.
:param colName: name of column
:param fields: list of fields to compose as a struct valued column
:param fields: list of elements to compose as a struct valued column (each being a string or tuple), or a dict
outlining the structure of the struct column
:param asJson: If False, generate a struct valued column. If True, generate a JSON string column
:param kwargs: keyword arguments to pass to the underlying column generators as per `withColumn`
:return: A modified in-place instance of data generator allowing for chaining of calls
following the Builder pattern
.. note::
Additional options for the field specification may be specified as keyword arguments.
The field specification may be :
- a list of field references (strings) which will be used as both the field name and the SQL expression
- a list of tuples of the form (field_name, field_expression) where field_name is the name of the field
- a Python dict outlining the structure of the struct column. The keys of the dict are the field names
The field specification supplied via the `fields` argument may be:
When using the ``struct`` form of the field specifications, a field whose value is a list will be treated
- A list of field references (`strings`) which will be used as both the field name and the SQL expression
- A list of tuples of the form **(field_name, field_expression)** where `field_name` is the name of the
field. In that case, the `field_expression` string should be a SQL expression to generate the field value
- A Python dict outlining the structure of the struct column. The keys of the dict are the field names
When using the `dict` form of the field specifications, a field whose value is a list will be treated
as creating a SQL array literal.
"""
17 changes: 10 additions & 7 deletions dbldatagen/utils.py
@@ -9,16 +9,18 @@
"""

import functools
import warnings
from datetime import timedelta
import re
import json
import re
import time
import warnings
from datetime import timedelta

import jmespath


def deprecated(message=""):
""" Define a deprecated decorator without dependencies on 3rd party libraries
"""
Define a deprecated decorator without dependencies on 3rd party libraries
Note there is a 3rd party library called `deprecated` that provides this feature, but the goal is to have
dependencies only on packages already used in the Databricks runtime
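As a hypothetical illustration of the decorator in use (the decorated function below is invented for the example, not taken from the library):

.. code-block:: python

   @deprecated("use `new_compute` instead")
   def old_compute(x):
       return x * 2

   old_compute(21)   # issues a DeprecationWarning carrying the message, then returns 42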
@@ -275,7 +277,8 @@ def strip_margins(s, marginChar):


def split_list_matching_condition(lst, cond):
""" Split a list on elements that match a condition
"""
Split a list on elements that match a condition
This will find all matches of a specific condition in the list and split the list into sublists around the
elements that match this condition.
@@ -288,9 +291,9 @@ def split_list_matching_condition(lst, cond):
split_list_matching_condition(x, lambda el: el == 'id')
result:
Result:
`[['id'], ['city_name'], ['id'], ['city_id', 'city_pop'],
['id'], ['city_id', 'city_pop', 'city_id', 'city_pop'], ['id']]`
:arg lst: list of items to perform condition matches against
:arg cond: lambda function or function taking single argument and returning True or False
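A minimal pure-Python sketch of the documented behavior — this is an editorial reading of the docstring, not the library's implementation:

.. code-block:: python

   def split_on_condition(lst, cond):
       """Split lst around elements matching cond; matches become single-item sublists."""
       results, current = [], []
       for el in lst:
           if cond(el):
               if current:
                   results.append(current)
                   current = []
               results.append([el])
           else:
               current.append(el)
       if current:
           results.append(current)
       return results

   x = ['id', 'city_name', 'id', 'city_id', 'city_pop',
        'id', 'city_id', 'city_pop', 'city_id', 'city_pop', 'id']
   print(split_on_condition(x, lambda el: el == 'id'))
   # [['id'], ['city_name'], ['id'], ['city_id', 'city_pop'],
   #  ['id'], ['city_id', 'city_pop', 'city_id', 'city_pop'], ['id']]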
34 changes: 24 additions & 10 deletions docs/source/generating_column_data.rst
@@ -13,6 +13,7 @@ This includes:
- Whether the generated data set will be a streaming or batch data set
- How the column data should be generated and what the dependencies for each column are
- How random and pseudo-random data is generated
- The structure for structured columns and JSON valued columns

.. seealso::
See the following links for more details:
@@ -22,6 +23,7 @@ This includes:
* Controlling how existing columns are generated - :data:`~dbldatagen.data_generator.DataGenerator.withColumnSpec`
* Adding column generation specs in bulk - :data:`~dbldatagen.data_generator.DataGenerator.withColumnSpecs`
* Options for column generation - :doc:`options_and_features`
* Generating JSON and complex data - :doc:`generating_json_data`

Column data is generated for all columns whether imported from a schema or explicitly added
to a data specification. However, column data can be omitted from the final output, allowing columns to be used
@@ -35,15 +37,16 @@ These control the data generation process.

The data generation process itself is deferred until the ``build`` method of the data generation instance is executed.

So until the ``build`` method is invoked, the data generation specification is in initialization mode.
So until the :data:`~dbldatagen.data_generator.DataGenerator.build` method is invoked, the data generation
specification is in initialization mode.

Once ``build`` has been invoked, the data generation instance holds state about the data set generated.

While ``build`` can be invoked a subsequent time, making further modifications to the definition after a build and
then calling ``build`` again is not recommended. We recommend using the ``clone`` method to make a new data generation
specification similar to an existing one if further modifications are needed.

See :data:`~dbldatagen.data_generator.DataGenerator.clone` for further information.
See the method :data:`~dbldatagen.data_generator.DataGenerator.clone` for further information.
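A short sketch of the recommended pattern — the column names and options here are assumptions for illustration:

.. code-block:: python

   import dbldatagen as dg

   ds1 = (dg.DataGenerator(spark, name="base_spec", rows=1000)
          .withColumn("code", "int", minValue=1, maxValue=20))
   df1 = ds1.build()

   # derive a new spec instead of mutating ds1 after its build
   ds2 = ds1.clone().withColumn("flag", "string", values=["a", "b"])
   df2 = ds2.build()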

Adding columns to a data generation spec
----------------------------------------
@@ -55,18 +58,21 @@ specification.
When building the data generation spec, the ``withSchema`` method may be used to add columns from an existing schema.
This does *not* prevent the use of ``withColumn`` to add new columns.

Use ``withColumn`` to define a new column. This method takes a parameter to specify the data type.
See :data:`~dbldatagen.data_generator.DataGenerator.withColumn`.
| Use ``withColumn`` to define a new column. This method takes a parameter to specify the data type.
| See the method :data:`~dbldatagen.data_generator.DataGenerator.withColumn` for further details.
Use ``withColumnSpec`` to define how a column previously defined in a schema should be generated. This method does not
take a data type property, but uses the data type information defined in the schema.
See :data:`~dbldatagen.data_generator.DataGenerator.withColumnSpec`.
See the method :data:`~dbldatagen.data_generator.DataGenerator.withColumnSpec` for further details.

Use ``withColumnSpecs`` to define how multiple columns imported from a schema should be generated.
As the pattern matching may inadvertently match an unintended column, it is permitted to override the specification
added through this method by a subsequent call to ``withColumnSpec`` to change the definition of how a specific column
should be generated
See :data:`~dbldatagen.data_generator.DataGenerator.withColumnSpecs`.
| Use ``withColumnSpecs`` to define how multiple columns imported from a schema should be generated.
As the pattern matching may inadvertently match an unintended column, it is permitted to override the specification
added through this method by a subsequent call to ``withColumnSpec`` to change the definition of how a specific column
should be generated.
| See the method :data:`~dbldatagen.data_generator.DataGenerator.withColumnSpecs` for further details.
Use the method :data:`~dbldatagen.data_generator.DataGenerator.withStructColumn` for simpler creation of struct and
JSON valued columns.
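The division of labour between these methods might look as follows; the schema, pattern options, and templates below are illustrative assumptions rather than code from this commit:

.. code-block:: python

   import dbldatagen as dg
   from pyspark.sql.types import StructType, StructField, IntegerType, StringType

   schema = StructType([StructField("customer_id", IntegerType()),
                        StructField("name", StringType())])

   ds = (dg.DataGenerator(spark, name="customers", rows=1000)
         .withSchema(schema)
         # bulk spec for string columns imported from the schema
         .withColumnSpecs(matchTypes=[StringType()], template=r'\w \w')
         # overrides any bulk spec matched above for this specific column
         .withColumnSpec("customer_id", minValue=1, maxValue=100000)
         # a new column not present in the schema requires a data type
         .withColumn("segment", "string", values=["retail", "smb", "enterprise"])
         )
   df = ds.build()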

By default all columns are marked as being dependent on an internal ``id`` seed column.
Use the ``baseColumn`` attribute to mark a column as being dependent on another column or set of columns.
@@ -85,6 +91,7 @@ Use of the base column attribute has several effects:

If you need to generate a field with the same name as the seed column (by default `id`), you may override
the default seed column name in the constructor of the data generation spec through the use of the
``seedColumnName`` parameter.
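For example, a hypothetical spec that frees up the name `id` for a generated field by renaming the seed column; the `format` option and names used are assumptions for illustration:

.. code-block:: python

   import dbldatagen as dg

   ds = (dg.DataGenerator(spark, name="users", rows=1000,
                          seedColumnName="_internal_id")
         # "id" no longer collides with the renamed seed column
         .withColumn("id", "string", format="user_%05d", baseColumn="_internal_id")
         )
   df = ds.build()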


Note that Spark SQL is case insensitive with respect to column names.
@@ -127,6 +134,12 @@ For example, the following code will generate rows with varying numbers of synth
df = ds.build()
| The helper method ``withStructColumn`` of the ``DataGenerator`` class enables simpler definition of structured
and JSON valued columns.
| See the documentation for the method :data:`~dbldatagen.data_generator.DataGenerator.withStructColumn` for
further details.

The mechanics of column data generation
---------------------------------------
The data set is generated when the ``build`` method is invoked on the data generation instance.
@@ -168,3 +181,4 @@ This has several implications:
However, it does not reorder the building sequence if the SQL expression refers to a column that will be built later.
To enforce the dependency, you must use the `baseColumn` attribute to indicate it.
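A sketch of that remedy — the column names and options are invented for illustration:

.. code-block:: python

   import dbldatagen as dg

   ds = (dg.DataGenerator(spark, name="orders", rows=1000)
         .withColumn("price", "decimal(10,2)", minValue=1.0, maxValue=100.0)
         .withColumn("qty", "int", minValue=1, maxValue=10)
         # baseColumn makes the dependency explicit, so "total" is built afterwards
         .withColumn("total", "decimal(12,2)", expr="price * qty",
                     baseColumn=["price", "qty"])
         )
   df = ds.build()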

19 changes: 17 additions & 2 deletions docs/source/generating_json_data.rst
@@ -12,7 +12,7 @@ Generating JSON data
There are several methods for generating JSON data:

- Generating a dataframe and saving it as JSON will produce the full data set as JSON
- Generate JSON valued fields using SQL functions such as `named_struct` and `to_json`
- Generating JSON valued fields using SQL functions such as `named_struct` and `to_json`, as sketched below.
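A brief sketch of the second approach, reusing the `named_struct` expression shown later in this change; the surrounding column definitions and chaining context are assumed:

.. code-block:: python

   # JSON-valued field built from other columns via SQL functions
   .withColumn("event_payload", "string",
               expr="to_json(named_struct('event_type', event_type, 'event_ts', event_ts))",
               baseColumn=['event_type', 'event_ts'])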

Writing dataframe as JSON data
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -178,7 +178,8 @@ written as:
expr="named_struct('event_type', event_type, 'event_ts', event_ts)",
baseColumn=['event_type', 'event_ts'])
To simplify the specification of struct valued columns, the defined value of `INFER_DATATYPE` can be used in place of
the datatype when the `expr` attribute is specified. This will cause the datatype to be inferred from the expression.

In this case, the previous code would be written as follows:
@@ -191,6 +192,20 @@ In this case, the previous code would be written as follows:
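The rewritten code block is collapsed in this view; based on the surrounding description, it would plausibly take a form like the following (the exact chaining context is an assumption):

.. code-block:: python

   # datatype inferred from the expression instead of being declared explicitly
   .withColumn("event_info", INFER_DATATYPE,
               expr="named_struct('event_type', event_type, 'event_ts', event_ts)",
               baseColumn=['event_type', 'event_ts'])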
The helper method ``withStructColumn`` can also be used to simplify the specification of struct valued columns.

Using this method, the previous code can be written as one of the following options:

.. code-block:: python

   # Use either form to create the struct valued field
   .withStructColumn("event_info1", fields=['event_type', 'event_ts'])
   .withStructColumn("event_info2", fields={'event_type': 'event_type',
                                            'event_ts': 'event_ts'})
In the second variant, the expression associated with each struct field name can be an arbitrary SQL string. The
same definition can also produce a JSON string column by passing ``asJson=True``.
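For instance, a hypothetical JSON-producing variant of the same definition using the ``asJson`` parameter:

.. code-block:: python

   .withStructColumn("event_info3", fields={'event_type': 'event_type',
                                            'event_ts': 'event_ts'},
                     asJson=True)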

See the following documentation for more details: :data:`~dbldatagen.data_generator.DataGenerator.withStructColumn`

Generating JSON valued fields
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
