Feature standard datasets - part 1 #258

Merged
81 commits merged into master from feature_standard_datasets on Jun 7, 2024
Changes from all commits
Commits (81)
ece0726
work in progress
ronanstokes-db Mar 22, 2024
be90276
work in progress
ronanstokes-db Mar 22, 2024
38f66a3
work in progress
ronanstokes-db Mar 22, 2024
00a71da
wip
ronanstokes-db Mar 22, 2024
fdbcef2
wip
ronanstokes-db Mar 24, 2024
81b9680
added implementations for Datasets describe and listing
ronanstokes-db Mar 28, 2024
8cf15d9
bumpedBuild
ronanstokes-db Mar 28, 2024
0b12ad4
bumpedBuild
ronanstokes-db Mar 28, 2024
0d73fcc
bumpedBuild
ronanstokes-db Mar 28, 2024
471ee44
bumpedBuild
ronanstokes-db Mar 28, 2024
b79d611
fixed dataset provider imports
ronanstokes-db Mar 28, 2024
40ff743
fixed dataset provider imports
ronanstokes-db Mar 28, 2024
c262363
fixed dataset provider imports
ronanstokes-db Mar 28, 2024
76a09f0
fixed dataset provider imports
ronanstokes-db Mar 28, 2024
9cca76f
fixed dataset provider imports
ronanstokes-db Mar 28, 2024
26decec
fixed dataset provider imports
ronanstokes-db Mar 28, 2024
19b77df
wip
ronanstokes-db Mar 28, 2024
ac16b50
wip
ronanstokes-db Mar 28, 2024
3a9c5e6
initial working version
ronanstokes-db Mar 28, 2024
4d930ec
initial working version
ronanstokes-db Mar 28, 2024
dbda36e
initial working version
ronanstokes-db Mar 28, 2024
17b1e0d
initial working version
ronanstokes-db Mar 28, 2024
eaa57a3
initial working version
ronanstokes-db Mar 28, 2024
a02d150
initial working version
ronanstokes-db Mar 28, 2024
eba1724
initial working version
ronanstokes-db Mar 28, 2024
b0bc9ad
initial working version
ronanstokes-db Mar 28, 2024
a99f362
initial working version
ronanstokes-db Mar 28, 2024
c3f8ab8
added telephony plans
ronanstokes-db Mar 28, 2024
a6aa7d7
added telephony plans
ronanstokes-db Mar 28, 2024
6a7a6b8
Merge branch 'master' into feature_standard_datasets
ronanstokes-db Mar 28, 2024
32ecd13
added telephony plans
ronanstokes-db Mar 28, 2024
6167b27
initial working version
ronanstokes-db Mar 28, 2024
0c316e9
Added tokei.rs badge (#253)
nfx Feb 28, 2024
f966845
Prep for release 036 (#251)
ronanstokes-db Mar 28, 2024
4410b3d
added telephony plans
ronanstokes-db Mar 28, 2024
f702c8f
initial implementation
ronanstokes-db Mar 28, 2024
0fba14b
added basic/iot dataset
ronanstokes-db Mar 28, 2024
fc28735
wip
ronanstokes-db Mar 29, 2024
17af167
wip
ronanstokes-db Apr 1, 2024
679c473
work in progress
ronanstokes-db Apr 4, 2024
9609cda
wip
ronanstokes-db Apr 4, 2024
ecdd31b
wip
ronanstokes-db Apr 4, 2024
813ce9a
wip
ronanstokes-db Apr 6, 2024
e7abd60
wip
ronanstokes-db Apr 6, 2024
ecb888a
wip
ronanstokes-db Apr 6, 2024
035c29e
wip
ronanstokes-db Apr 8, 2024
afc0788
work in progress
ronanstokes-db Apr 12, 2024
28d9afd
wip
ronanstokes-db Apr 16, 2024
b8601d3
Merge branch 'master' into feature_standard_datasets
ronanstokes-db May 23, 2024
c12ff2f
Merge branch 'master' into feature_standard_datasets
ronanstokes-db May 23, 2024
b1e6ff4
Merge branch 'master' into feature_standard_datasets
ronanstokes-db May 23, 2024
c6352dc
wip
ronanstokes-db May 23, 2024
1bddbd9
wip
ronanstokes-db May 24, 2024
5171eff
wip
ronanstokes-db May 24, 2024
5fd9810
wip
ronanstokes-db May 25, 2024
6073049
wip
ronanstokes-db May 28, 2024
9f50b75
Merge branch 'master' into feature_standard_datasets
ronanstokes-db May 28, 2024
1ffa812
wip
ronanstokes-db May 28, 2024
2441e73
wip
ronanstokes-db May 29, 2024
f3a68ad
wip
ronanstokes-db May 29, 2024
4cfbb35
wip
ronanstokes-db May 29, 2024
a620072
wip
ronanstokes-db May 30, 2024
cdf61bd
wip
ronanstokes-db May 31, 2024
c3be5fc
wip
ronanstokes-db Jun 1, 2024
341f9c3
Merge branch 'master' into feature_standard_datasets
ronanstokes-db Jun 1, 2024
0b9ffa5
wip
ronanstokes-db Jun 1, 2024
809f7d1
wip
ronanstokes-db Jun 1, 2024
005b744
wip
ronanstokes-db Jun 1, 2024
1d0d77a
wip
ronanstokes-db Jun 1, 2024
8fd915d
wip
ronanstokes-db Jun 1, 2024
02d9634
wip
ronanstokes-db Jun 4, 2024
9fea524
wip
ronanstokes-db Jun 5, 2024
6f95d6b
wip
ronanstokes-db Jun 5, 2024
64de31d
wip
ronanstokes-db Jun 5, 2024
8dcf623
wip
ronanstokes-db Jun 5, 2024
d336fdc
additional coverage tests
ronanstokes-db Jun 5, 2024
1d2bc8e
additional coverage
ronanstokes-db Jun 5, 2024
548cb75
additional coverage
ronanstokes-db Jun 5, 2024
17a37bc
additional coverage
ronanstokes-db Jun 5, 2024
acb835c
additional coverage
ronanstokes-db Jun 5, 2024
639347b
Merge branch 'master' into feature_standard_datasets
ronanstokes-db Jun 7, 2024
8 changes: 4 additions & 4 deletions CHANGELOG.md
@@ -1,18 +1,19 @@
# Databricks Labs Data Generator Release Notes
# Databricks Labs Synthetic Data Generator Release Notes

## Change History
All notable changes to the Databricks Labs Data Generator will be documented in this file.

### Unreleased

### Changed
#### Changed
* Modified data generator to allow specification of constraints to the data generation process
* Updated documentation for generating text data.
* Modified data distributions to use abstract base classes
* Migrated data distribution tests to use `pytest`

### Added
#### Added
* Added classes for constraints on the data generation via new package `dbldatagen.constraints`
* Added support for standard data sets via the new package `dbldatagen.datasets`


### Version 0.3.6 Post 1
@@ -24,7 +25,6 @@ All notable changes to the Databricks Labs Data Generator will be documented in
#### Fixed
* Fixed scenario where `DataAnalyzer` is used on dataframe containing a column named `summary`


### Version 0.3.6

#### Changed
1,739 changes: 1,176 additions & 563 deletions Pipfile.lock

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions README.md
@@ -53,6 +53,7 @@ used in other computations
* plugin mechanism to allow use of 3rd party libraries such as Faker
* Use within a Databricks Delta Live Tables pipeline as a synthetic data generation source
* Generate synthetic data generation code from existing schema or data (experimental)
* Use of standard datasets for quick generation of synthetic data

Details of these features can be found in the
[online documentation](https://databrickslabs.github.io/dbldatagen/public_docs/index.html).
@@ -110,6 +111,17 @@ in your environment.

Once the library has been installed, you can use it to generate a data frame composed of synthetic data.

The easiest way to use the data generator is to start from one of the standard datasets, which can be further
customized for your use case.

```python
import dbldatagen as dg

df = dg.Datasets(spark, "basic/user").get(rows=1000_000).build()
num_rows = df.count()
```

You can also define fully custom data sets using the `DataGenerator` class.

For example, a fully custom specification can be built up column by column, as in the sketch below.
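The following is a minimal sketch based on the `DataGenerator` API used elsewhere in this pull request; the column names, templates, and row count are illustrative and are not taken from the original README example.

```python
import dbldatagen as dg

# Illustrative custom specification; assumes an existing SparkSession named `spark`
df_spec = (
    dg.DataGenerator(sparkSession=spark, rows=100000, partitions=4,
                     randomSeedMethod="hash_fieldname")
    .withColumn("customer_id", "long", minValue=1000000, maxValue=9999999, random=True)
    .withColumn("email", "string", template=r'\w.\w@\w.com', random=True)
)

# Materialize the specification as a Spark dataframe
df = df_spec.build()
```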
4 changes: 3 additions & 1 deletion dbldatagen/__init__.py
@@ -30,6 +30,7 @@
from .utils import ensure, topologicalSort, mkBoundsList, coalesce_values, \
deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins, \
json_value_from_path, system_time_millis

from ._version import __version__
from .column_generation_spec import ColumnGenerationSpec
from .column_spec_options import ColumnSpecOptions
@@ -43,11 +44,12 @@
from .text_generators import TemplateGenerator, ILText, TextGenerator
from .text_generator_plugins import PyfuncText, PyfuncTextFactory, FakerTextFactory, fakerText
from .html_utils import HtmlUtils
from .datasets_object import Datasets

__all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange",
"column_generation_spec", "utils", "function_builder",
"spark_singleton", "text_generators", "datarange", "datagen_constants",
"text_generator_plugins", "html_utils"
"text_generator_plugins", "html_utils", "datasets_object"
]


8 changes: 8 additions & 0 deletions dbldatagen/datasets/__init__.py
@@ -0,0 +1,8 @@
from .dataset_provider import DatasetProvider, dataset_definition
from .basic_user import BasicUserProvider
from .multi_table_telephony_provider import MultiTableTelephonyProvider

__all__ = ["dataset_provider",
"basic_user",
"multi_table_telephony_provider"
]
64 changes: 64 additions & 0 deletions dbldatagen/datasets/basic_user.py
@@ -0,0 +1,64 @@
from .dataset_provider import DatasetProvider, dataset_definition


@dataset_definition(name="basic/user", summary="Basic User Data Set", autoRegister=True, supportsStreaming=True)
class BasicUserProvider(DatasetProvider.NoAssociatedDatasetsMixin, DatasetProvider):
"""
Basic User Data Set
===================

This is a basic user data set with customer id, name, email, ip address, and phone number.

It takes the following options when retrieving the table:
- random: if True, generates random data
- dummyValues: number of additional dummy value columns to generate (to widen row size if necessary)
- rows: number of rows to generate. Default is 100000
- partitions: number of partitions to use. If -1, it will be computed based on the number of rows

As the data specification is a DataGenerator object, you can add further columns to the data set and
add constraints (when the feature is available).

Note that this dataset does not use any features that would prevent it from being used as a source for a
streaming dataframe, and so the flag `supportsStreaming` is set to True.

"""
MAX_LONG = 9223372036854775807
COLUMN_COUNT = 5

@DatasetProvider.allowed_options(options=["random", "dummyValues"])
def getTableGenerator(self, sparkSession, *, tableName=None, rows=-1, partitions=-1,
**options):
import dbldatagen as dg

generateRandom = options.get("random", False)
dummyValues = options.get("dummyValues", 0)

if rows is None or rows < 0:
rows = DatasetProvider.DEFAULT_ROWS

if partitions is None or partitions < 0:
partitions = self.autoComputePartitions(rows, self.COLUMN_COUNT + dummyValues)

assert tableName is None or tableName == DatasetProvider.DEFAULT_TABLE_NAME, "Invalid table name"
df_spec = (
dg.DataGenerator(sparkSession=sparkSession, rows=rows,
partitions=partitions,
randomSeedMethod="hash_fieldname")
.withColumn("customer_id", "long", minValue=1000000, maxValue=self.MAX_LONG, random=generateRandom)
.withColumn("name", "string",
template=r'\w \w|\w \w \w', random=generateRandom)
.withColumn("email", "string",
template=r'\w.\w@\w.com|\w@\w.co.u\k', random=generateRandom)
.withColumn("ip_addr", "string",
template=r'\n.\n.\n.\n', random=generateRandom)
.withColumn("phone", "string",
template=r'(ddd)-ddd-dddd|1(ddd) ddd-dddd|ddd ddddddd',
random=generateRandom)
)

if dummyValues > 0:
df_spec = df_spec.withColumn("dummy", "long", random=True, numColumns=dummyValues,
minValue=1, maxValue=self.MAX_LONG)

return df_spec
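
As a usage sketch, the provider above would normally be reached through the `Datasets` entry point shown in the README; forwarding the `random` and `dummyValues` options through `get()` is an assumption based on the `**options` parameter of `getTableGenerator`, not a documented call.

```python
import dbldatagen as dg

# Assumed usage: the option names ("random", "dummyValues") come from the
# allowed_options decorator above; passing them through get() is an assumption.
df_spec = dg.Datasets(spark, "basic/user").get(rows=100000, random=True, dummyValues=2)

# Build the synthetic dataframe from the returned specification
df = df_spec.build()
```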