In [1]:
# Prompt template for the rule-generation step. It lists PyDeequ check method
# signatures plus example constraints, then asks for a JSON object that maps
# column names to lists of constraint snippets.
# `{expectations}` is the only substitution field. The doubled braces around
# the JSON example look like str.format escapes for literal braces -- presumably
# the template is rendered with str.format; confirm against the caller.
# NOTE: every example below must agree with the signature list above it, since
# the model tends to copy the examples verbatim.
RULE_GENERATION_PROMPT = """You are part of a context-aware data validation system.
You are asked to transform the user's intuitions into formal validation rules to ensure the data meets the user's expectations. We are using PyDeequ as the validation library so the rules should be in PyDeequ format.

The function signature is as follows:
    def hasSize(self, assertion, hint=None):
    def isComplete(self, column, hint=None):
    def hasCompleteness(self, column, assertion, hint=None):
    def areComplete(self, columns, hint=None):
    def haveCompleteness(self, columns, assertion, hint=None):
    def areAnyComplete(self, columns, hint=None):
    def haveAnyCompleteness(self, columns, assertion, hint=None):
    def isUnique(self, column, hint=None):
    def isPrimaryKey(self, column, *columns, hint=None):
    def hasUniqueness(self, columns, assertion, hint=None):
    def hasDistinctness(self, columns, assertion, hint=None):
    def hasUniqueValueRatio(self, columns, assertion, hint=None):
    def hasNumberOfDistinctValues(self, column, assertion, binningUdf, maxBins, hint=None):
    def hasHistogramValues(self, column, assertion, binningUdf, maxBins, hint=None):
    def kllSketchSatisfies(self, column, assertion, kllParameters=None, hint=None):
    def _isNewestPointNonAnomalous(self):
    def hasEntropy(self, column, assertion, hint=None):
    def hasMutualInformation(self, columnA, columnB, assertion, hint=None):
    def hasApproxQuantile(self, column, quantile, assertion, hint=None):
    def hasMinLength(self, column, assertion, hint=None):
    def hasMaxLength(self, column, assertion, hint=None):
    def hasMin(self, column, assertion, hint=None):
    def hasMax(self, column, assertion, hint=None):
    def hasMean(self, column, assertion, hint=None):
    def hasSum(self, column, assertion, hint=None):
    def hasStandardDeviation(self, column, assertion, hint=None):
    def hasApproxCountDistinct(self, column, assertion, hint=None):
    def hasCorrelation(self, columnA, columnB, assertion, hint=None):
    def satisfies(self, columnCondition, constraintName, assertion=None, hint=None):
    def hasPattern(self, column, pattern, assertion=None, name=None, hint=None):
    def containsCreditCardNumber(self, column, assertion=None, hint=None):
    def containsEmail(self, column, assertion=None, hint=None):
    def containsURL(self, column, assertion=None, hint=None):
    def containsSocialSecurityNumber(self, column, assertion=None, hint=None):
    def hasDataType(self, column, datatype: ConstrainableDataTypes, assertion=None, hint=None):
    def isNonNegative(self, column, assertion=None, hint=None):
    def isPositive(self, column, assertion=None, hint=None):
    def isLessThan(self, columnA, columnB, assertion=None, hint=None):
    def isLessThanOrEqualTo(self, columnA, columnB, assertion=None, hint=None):
    def isGreaterThan(self, columnA, columnB, assertion=None, hint=None):
    def isGreaterThanOrEqualTo(self, columnA, columnB, assertion=None, hint=None):
    def isContainedIn(self, column, allowed_values, assertion=None, hint=None):

To help you understand the signature, here are the valid PyDeequ constraints:

    .hasMin('person_age', lambda x: x > 18)
    .hasMax('person_age', lambda x: x < 120)
    .isComplete('loan_status')
    .hasCompleteness('loan_status', lambda x: x == 1.0)
    .isUnique('id')
    .hasUniqueValueRatio(['id'], lambda x: x > 0.8)
    .hasEntropy('loan_status', lambda x: x > 0.4)
    .hasMutualInformation('loan_grade', 'loan_amnt', lambda x: x < 0.1)
    .hasApproxQuantile('person_income', 0.5, lambda x: x > 0.8)
    .hasMinLength('loan_intent', lambda x: x > 1)
    .hasMaxLength('loan_intent', lambda x: x < 20)
    .hasStandardDeviation('person_income', lambda x: x > 0.8)
    .hasApproxCountDistinct('loan_intent', lambda x: x > 0.8)
    .hasCorrelation('person_income', 'loan_amnt', lambda x: x > 0.3)
    .satisfies('loan_amnt <= 0 OR person_income > 0', 'positive_income_when_loan', lambda x: x > 0.8)
    .hasPattern('person_home_ownership', 'RENT|OWN|MORTGAGE|OTHER', lambda x: x > 0.8)
    .isContainedIn('loan_grade', ['A', 'B', 'C', 'D', 'E', 'F', 'G'])
    .containsURL('loan_intent', lambda x: x == 0)
    .isPositive('person_income')
    .isGreaterThan('person_income', 'loan_amnt', lambda x: x > 0.8)

The Intuitions generated by the *Expectation Extraction* component are:
{expectations}
Please generate validation rules as a JSON object with the column names as keys and a list of assumptions as values.
e.g., ```
{{
    "column_name_1": ["code_for_assumption_1", "code_for_assumption_2", ...],
    "column_name_2": ["code_for_assumption_1", "code_for_assumption_2", ...],
    ...
}}```
Each assumption should be a PyDeequ constraint. For example, `.isComplete("column_name")` or `.isContainedIn("column_name", ["value_1", "value_2"])`.
"""


In [2]:
from inspect import cleandoc

# Print the cleandoc-processed prompt and then the raw prompt, separated by a
# "..." marker line, so the two renderings can be compared by eye.
for rendering in (cleandoc(RULE_GENERATION_PROMPT), "...", RULE_GENERATION_PROMPT):
    print(rendering)

You are part of a context-aware data validation system.
You are asked to transform the user's intuitions into formal validation rules to ensure the data meets the user's expectations. We are using PyDeequ as the validation library so the rules should be in PyDeequ format.

The function signature is as follows:
    def hasSize(self, assertion, hint=None):
    def isComplete(self, column, hint=None):
    def hasCompleteness(self, column, assertion, hint=None):
    def areComplete(self, columns, hint=None):
    def haveCompleteness(self, columns, assertion, hint=None):
    def areAnyComplete(self, columns, hint=None):
            def haveAnyCompleteness(self, columns, assertion, hint=None):
    def isUnique(self, column, hint=None):
    def isPrimaryKey(self, column, *columns, hint=None):
    def hasUniqueness(self, columns, assertion, hint=None):
    def hasDistinctness(self, columns, assertion, hint=None):
    def hasUniqueValueRatio(self, columns, assertion, hint=None):
    def hasNumb