Skip to content

Commit

Permalink
Feature html formatting (#208)
Browse files Browse the repository at this point in the history
* wip

* wip

* wip

* wip

* wip
  • Loading branch information
ronanstokes-db committed Apr 20, 2023
1 parent 9e58aaa commit 80ca02b
Show file tree
Hide file tree
Showing 12 changed files with 322 additions and 123 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,19 @@
## Change History
All notable changes to the Databricks Labs Data Generator will be documented in this file.

### Unreleased

#### Changed
* Added formatting of generated code as HTML for script methods


### Version 0.3.4 Post 2

### Fixed
* Fix for use of values in columns of type array, map and struct
* Fix for generation of arrays via `numFeatures` and `structType` attributes when numFeatures has value of 1


### Version 0.3.4 Post 1

### Fixed
Expand Down
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ wheel = "==0.38.4"
pandas = "==1.2.4"
setuptools = "==65.6.3"
pyparsing = "==2.4.7"
jmespath = "==0.10.0"

[requires]
python_version = ">=3.8.10"
6 changes: 4 additions & 2 deletions dbldatagen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
from .datagen_constants import DEFAULT_RANDOM_SEED, RANDOM_SEED_RANDOM, RANDOM_SEED_FIXED, \
RANDOM_SEED_HASH_FIELD_NAME, MIN_PYTHON_VERSION, MIN_SPARK_VERSION
from .utils import ensure, topologicalSort, mkBoundsList, coalesce_values, \
deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins
deprecated, parse_time_interval, DataGenError, split_list_matching_condition, strip_margins, \
json_value_from_path, system_time_millis
from ._version import __version__
from .column_generation_spec import ColumnGenerationSpec
from .column_spec_options import ColumnSpecOptions
Expand All @@ -40,11 +41,12 @@
from .spark_singleton import SparkSingleton
from .text_generators import TemplateGenerator, ILText, TextGenerator
from .text_generator_plugins import PyfuncText, PyfuncTextFactory, FakerTextFactory, fakerText
from .html_utils import HtmlUtils

__all__ = ["data_generator", "data_analyzer", "schema_parser", "daterange", "nrange",
"column_generation_spec", "utils", "function_builder",
"spark_singleton", "text_generators", "datarange", "datagen_constants",
"text_generator_plugins"
"text_generator_plugins", "html_utils"
]


Expand Down
23 changes: 19 additions & 4 deletions dbldatagen/data_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
OPTION_RANDOM, OPTION_RANDOM_SEED, OPTION_RANDOM_SEED_METHOD

from .utils import ensure, topologicalSort, DataGenError, deprecated, split_list_matching_condition
from .html_utils import HtmlUtils
from . _version import _get_spark_version
from .schema_parser import SchemaParser

Expand Down Expand Up @@ -1187,13 +1188,14 @@ def _mkInsertOrUpdateStatement(self, columns, srcAlias, substitutions, isUpdate=

return ", ".join(results)

def scriptTable(self, name=None, location=None, tableFormat="delta"):
def scriptTable(self, name=None, location=None, tableFormat="delta", asHtml=False):
""" generate create table script suitable for format of test data set
:param name: name of table to use in generated script
:param location: path to location of data. If specified (default is None), will generate
an external table definition.
:param tableFormat: table format for table
:param asHtml: if true, generate output suitable for use with `displayHTML` method in notebook environment
:returns: SQL string for scripted table
"""
assert name is not None, "`name` must be specified"
Expand All @@ -1219,14 +1221,21 @@ def scriptTable(self, name=None, location=None, tableFormat="delta"):
if location is not None:
results.append(f"location '{location}'")

return "\n".join(results)
results = "\n".join(results)

if asHtml:
results = HtmlUtils.formatCodeAsHtml(results)

return results

def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None, joinExpr=None, timeExpr=None,
insertExpr=None,
useExplicitNames=True,
updateColumns=None, updateColumnExprs=None,
insertColumns=None, insertColumnExprs=None,
srcAlias="src", tgtAlias="tgt"):
srcAlias="src", tgtAlias="tgt",
asHtml=False
):
""" generate merge table script suitable for format of test data set
:param tgtName: name of target table to use in generated script
Expand All @@ -1253,6 +1262,7 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None,
By default, will use src column as update value for
target table. This should have the form [ ("update_column_name", "update column expr"), ...]
:param useExplicitNames: If True, generate explicit column names in insert and update statements
:param asHtml: if true, generate output suitable for use with `displayHTML` method in notebook environment
:returns: SQL string for scripted merge statement
"""
assert tgtName is not None, "you must specify a target table"
Expand Down Expand Up @@ -1327,4 +1337,9 @@ def scriptMerge(self, tgtName=None, srcName=None, updateExpr=None, delExpr=None,

results.append(ins_clause)

return "\n".join(results)
result = "\n".join(results)

if asHtml:
result = HtmlUtils.formatCodeAsHtml(results)

return result
102 changes: 102 additions & 0 deletions dbldatagen/html_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
This file defines the `HtmlUtils` classes and utility functions
"""

from .utils import system_time_millis


class HtmlUtils:
    """ Utility class for formatting code and text as HTML for display in a notebook environment

    All functionality is exposed through class methods, so no instance is needed.
    """

    def __init__(self):
        pass

    @classmethod
    def formatCodeAsHtml(cls, codeText):
        """ Formats supplied code as HTML suitable for use with notebook ``displayHTML``

        :param codeText: Code to be wrapped in html section
        :return: Html string

        This will wrap the code with a html section using html ``pre`` and ``code`` tags.

        It adds a copy text to clipboard button to enable users to easily copy the code to the clipboard.

        It does not reformat code so supplied code should be preformatted into lines.
        The characters ``&``, ``<`` and ``>`` in the code are escaped so that they are displayed
        (and copied) literally rather than being interpreted as markup.

        .. note::
           As the notebook environment uses IFrames in rendering html within ``displayHTML``, it cannot use
           the newer ``navigator`` based functionality as this is blocked for cross domain IFrames by default.
        """
        import html  # local import: keeps this block self-contained

        # timestamp is used to build the id of the element that the copy button selects
        ts = system_time_millis()

        # escape markup characters; quotes are left untouched as the text is element content, not an attribute
        escapedCode = html.escape(f"{codeText}", quote=False)

        formattedCode = f"""
<h3>Generated Code</h3>
<div style="outline: 1px dashed blue;"><p ><pre><code id="generated_code_{ts}">
{escapedCode}
</code></pre></p></br>
</div>
<p><button type="button" onclick="dbldatagen_copy_code_to_clipboard()">Copy code to clipboard!</button> </p>
<script>
function dbldatagen_copy_code_to_clipboard() {{
try {{
var r = document.createRange();
r.selectNode(document.getElementById("generated_code_{ts}"));
window.getSelection().removeAllRanges();
window.getSelection().addRange(r);
document.execCommand('copy');
window.getSelection().removeAllRanges();
}}
catch {{
console.error("copy to clipboard failed")
}}
}}
</script>
"""

        return formattedCode

    @classmethod
    def formatTextAsHtml(cls, textContent, title="Output"):
        """ Formats supplied text as HTML suitable for use with notebook ``displayHTML``

        :param textContent: Text to be wrapped in html section
        :param title: Title text to be used
        :return: Html string

        This will wrap the text content with Html formatting and add a copy to clipboard button.
        The characters ``&``, ``<`` and ``>`` in both the text and the title are escaped so that they are
        displayed literally rather than being interpreted as markup.
        """
        import html  # local import: keeps this block self-contained

        # timestamp is used to build the id of the element that the copy button selects
        ts = system_time_millis()

        escapedTitle = html.escape(f"{title}", quote=False)
        escapedContent = html.escape(f"{textContent}", quote=False)

        formattedContent = f"""
<h3>{escapedTitle}</h3>
<div style="outline: 1px dashed blue;"><p ><pre id="generated_content_{ts}">
{escapedContent}
</pre></p></br>
</div>
<p><button type="button" onclick="dbldatagen_copy_to_clipboard()">Copy output to clipboard!</button></p>
<script>
function dbldatagen_copy_to_clipboard() {{
try {{
var r = document.createRange();
r.selectNode(document.getElementById("generated_content_{ts}"));
window.getSelection().removeAllRanges();
window.getSelection().addRange(r);
document.execCommand('copy');
window.getSelection().removeAllRanges();
}}
catch {{
console.error("copy to clipboard failed")
}}
}}
</script>
"""

        return formattedContent
36 changes: 36 additions & 0 deletions dbldatagen/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
import warnings
from datetime import timedelta
import re
import json
import time
import jmespath


def deprecated(message=""):
Expand Down Expand Up @@ -321,3 +324,36 @@ def match_condition(matchList, matchFn):

# filter out empty lists
return [el for el in retval if el != []]


def json_value_from_path(searchPath, jsonData, defaultValue):
    """ Look up a value inside a JSON document using a `jmespath` search path

    The JSON text is parsed with ``json.loads`` and then queried with ``jmespath.search``
    (see https://jmespath.org/ for the path syntax).

    :param searchPath: A `jmespath` compatible JSON search path (must be non-empty)
    :param jsonData: The json data to search, as a string representation of the JSON data (must be non-empty)
    :param defaultValue: Value to hand back when the search yields ``None``
    :return: The value found by the search, or `defaultValue` when the search result is ``None``
    """
    # both inputs are mandatory - reject None as well as zero-length values
    assert not (searchPath is None or len(searchPath) == 0), "search path cannot be empty"
    assert not (jsonData is None or len(jsonData) == 0), "JSON data cannot be empty"

    found = jmespath.search(searchPath, json.loads(jsonData))

    return defaultValue if found is None else found


def system_time_millis():
    """ return system time as milliseconds since start of epoch

    :return: system time millis as an integer
    """
    # time.time() returns seconds (as a float), so scale up by 1000 to obtain milliseconds
    curr_time = round(time.time() * 1000)
    return curr_time
return curr_time
2 changes: 2 additions & 0 deletions docs/utils/mk_quick_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
"grouping": "internal classes"},
"utils.py": {"briefDesc": "",
"grouping": "internal classes"},
"html_utils.py": {"briefDesc": "",
"grouping": "internal classes"},

"beta.py": {"briefDesc": "Beta distribution related code",
"grouping": "data distribution"},
Expand Down
1 change: 1 addition & 0 deletions python/dev_require.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pyspark>=3.1.3
python-dateutil==2.8.1
six==1.15.0
pyparsing==2.4.7
jmespath==0.10.0

# The following packages are required for development only
wheel==0.36.2
Expand Down
3 changes: 3 additions & 0 deletions python/require.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ pyspark>=3.1.3
python-dateutil==2.8.1
six==1.15.0
pyparsing==2.4.7
jmespath==0.10.0

# The following packages are required for development only
wheel==0.36.2
Expand All @@ -31,3 +32,5 @@ recommonmark
sphinx-markdown-builder
rst2pdf==0.98
Jinja2 < 3.1
sphinx-copybutton

33 changes: 33 additions & 0 deletions tests/test_html_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pytest

from dbldatagen import HtmlUtils, SparkSingleton

# Created at import time via SparkSingleton.getLocalInstance; none of the tests shown in this
# module reference `spark` directly
spark = SparkSingleton.getLocalInstance("unit tests")


class TestHtmlUtils:
    """ Checks that the ``HtmlUtils`` formatters embed the supplied content (and title) in their output """

    @pytest.mark.parametrize("content",
                             ["""
for x in range(10):
print(x)
"""]
                             )
    def test_html_format_code(self, content):
        """ the code passed to ``formatCodeAsHtml`` must appear in the generated html """
        html_output = HtmlUtils.formatCodeAsHtml(content)
        assert html_output is not None
        assert content in html_output

    @pytest.mark.parametrize("content, heading",
                             [("""
this is a test
this is another one
""", "testing"
                               )])
    def test_html_format_content(self, content, heading):
        """ both the text and the title passed to ``formatTextAsHtml`` must appear in the generated html """
        html_output = HtmlUtils.formatTextAsHtml(content, title=heading)

        assert html_output is not None, "formatted output is None"

        # the original text and the heading must both survive formatting
        assert content in html_output, "original content missing"
        assert heading in html_output, "heading missing from content"

0 comments on commit 80ca02b

Please sign in to comment.