## import libraries

In [2]:
from cadv_exploration.utils import load_dotenv

# Deliberately called before the remaining imports -- presumably so that the
# environment variables are in place when those modules are imported (TODO confirm).
load_dotenv()
from scripts.python.utils import setup_logger
from llm.langchain.downstream_task_prompt import CD_TASK_DESCRIPTION
from inspector.deequ.deequ_inspector_manager import DeequInspectorManager
from llm.langchain import LangChainCADV
from data_models import Constraints

from loader import FileLoader

from cadv_exploration.dq_manager import DeequDataQualityManager
from cadv_exploration.utils import get_project_root
import pandas as pd

# Shared logger handed to every LangChainCADV instance created in this notebook.
# NOTE(review): the argument looks like a log file name -- confirm against setup_logger.
logger = setup_logger("toy_example.log")



## Load the data

In [3]:
dq_manager = DeequDataQualityManager()

# Both toy CSVs live in the same folder under the project root.
toy_files_dir = get_project_root() / "data" / "toy_example" / "files"
train_file_path = toy_files_dir / "hospitalisations_train.csv"
test_file_path = toy_files_dir / "hospitalisations_test.csv"

# Load train first, then test, with identical loader options.
# NOTE(review): na_values=["NULL"] presumably marks the literal "NULL" as missing -- confirm in FileLoader.
train_data, test_data = (
    FileLoader.load_csv(csv_path, na_values=["NULL"])
    for csv_path in (train_file_path, test_file_path)
)

# spark_df_from_pandas_df returns a pair; the second element is the Spark handle
# that is stopped at the end of the notebook. The "validation" pair is built from
# the test CSV.
spark_train_data, spark_train = dq_manager.spark_df_from_pandas_df(train_data)
spark_validation_data, spark_validation = dq_manager.spark_df_from_pandas_df(test_data)

:: loading settings :: url = jar:file:/Users/haochen/Library/Caches/pypoetry/virtualenvs/cadv-exploration-4wWqlI_J-py3.9/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/haochen/.ivy2/cache
The jars for the packages stored in: /Users/haochen/.ivy2/jars
com.amazon.deequ#deequ added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-9997ded6-8731-46f1-b31d-e72eb52351fb;1.0
	confs: [default]
	found com.amazon.deequ#deequ;2.0.7-spark-3.5 in central
	found org.scala-lang#scala-reflect;2.12.10 in local-m2-cache
	found org.scalanlp#breeze_2.12;2.1.0 in local-m2-cache
	found org.scalanlp#breeze-macros_2.12;2.1.0 in local-m2-cache
	found org.typelevel#spire_2.12;0.17.0 in local-m2-cache
	found org.typelevel#spire-macros_2.12;0.17.0 in local-m2-cache
	found org.typelevel#algebra_2.12;2.0.1 in local-m2-cache
	found org.typelevel#cats-kernel_2.12;2.1.1 in local-m2-cache
	found org.typelevel#spire-platform_2.12;0.17.0 in local-m2-cache
	found org.typelevel#spire-util_2.12;0.17.0 in local-m2-cache
	found dev.ludovic.netlib#blas;3.0.1 in local-m2-cache
	found dev.ludovic.netlib#lapack;3.0.1 in local-m2-c

## get constraints with deequ

In [4]:
# Ask Deequ for constraint suggestions, passing the training pair followed by
# the validation pair (the validation pair was built from the test CSV).
constraints = dq_manager.get_constraints_for_spark_df(
    spark_train,
    spark_train_data,
    spark_validation,
    spark_validation_data,
)

# Last expression of the cell: display the suggestions as a plain dict.
constraints.to_dict()

25/01/21 11:27:07 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Python Callback server started!


{'constraints': {'admission_day': {'code': [['.isComplete("admission_day")',
     'Invalid'],
    ['.isNonNegative("admission_day")', 'Valid']],
   'assumptions': []},
  'bloodtype': {'code': [['.isComplete("bloodtype")', 'Valid'],
    ['.isContainedIn("bloodtype", ["A pos", "O pos"])', 'Invalid']],
   'assumptions': []},
  'complications': {'code': [['.isComplete("complications")', 'Valid'],
    ['.isContainedIn("complications", ["N"])', 'Invalid']],
   'assumptions': []},
  'cost': {'code': [['.isComplete("cost")', 'Valid'],
    ['.isNonNegative("cost")', 'Valid']],
   'assumptions': []},
  'diagnosis': {'code': [['.isComplete("diagnosis")', 'Invalid']],
   'assumptions': []},
  'discharge_day': {'code': [['.isComplete("discharge_day")', 'Valid'],
    ['.isNonNegative("discharge_day")', 'Valid'],
    ['.isUnique("discharge_day")', 'Valid']],
   'assumptions': []},
  'gender': {'code': [['.isComplete("gender")', 'Invalid'],
    ['.isNonNegative("gender")', 'Valid']],
   'assumptions':

## validate constraints on test data

In [5]:
# Collect every suggested constraint (valid or not); the keys of the map are the
# Deequ constraint code snippets.
code_column_map = constraints.get_suggestions_code_column_map(valid_only=False)
code_list = list(code_column_map)

# Build a Spark pair from the test data and evaluate each snippet on it.
# NOTE(review): spark_test is an additional Spark handle and must be shut down
# together with the others at the end of the notebook.
spark_test_data, spark_test = dq_manager.spark_df_from_pandas_df(test_data)
status_on_test_data = dq_manager.validate_on_spark_df(
    spark_test, spark_test_data, code_list, return_raw=True
)

# Pair each snippet with the raw result at the same position; results are
# indexed positionally, exactly as code_list is ordered.
code_list_for_constraints = [
    (
        code,
        status_on_test_data[idx].constraint_status,
        status_on_test_data[idx].constraint_message,
    )
    for idx, code in enumerate(code_list)
]
writable_code_list_for_constraints = [
    f"{code}, {status}, {message}" for code, status, message in code_list_for_constraints
]

# The first column holds the constraint code (e.g. `.isComplete("race")`), not a
# column name, so it is labelled accordingly.
pd.DataFrame(
    code_list_for_constraints,
    columns=["constraint_code", "constraint_status", "constraint_message"],
)


Unnamed: 0,column_name,constraint_status,constraint_message
0,".isComplete(""race"")",Success,
1,".isComplete(""admission_day"")",Failure,Value: 0.6666666666666666 does not meet the co...
2,".isNonNegative(""admission_day"")",Success,
3,".isComplete(""ssn"")",Success,
4,".isUnique(""ssn"")",Success,
5,".isContainedIn(""bloodtype"", [""A pos"", ""O pos""])",Failure,Value: 0.6666666666666666 does not meet the co...
6,".isComplete(""bloodtype"")",Success,
7,".isComplete(""insurance"")",Success,
8,".isComplete(""discharge_day"")",Success,
9,".isNonNegative(""discharge_day"")",Success,


## prepare context for LLM

In [6]:
# Description of the training columns; passed to the LLM as "column_desc" in the
# cells below.
column_desc = DeequInspectorManager().spark_df_to_column_desc(spark_train_data, spark_train)
# Text of the downstream script, passed to the LLM as "script" in the cells below.
# It is only used as a string in this notebook and is never executed here, so it
# must be kept exactly as written.
context = """
nonsensitive_df = duckdb.sql("SELECT * EXCLUDE ssn, gender, race
FROM 's3://datalake/latest/hospitalisations.csv'").df()
hosp_df = nonsensitive_df.dropna()
strokes_total = duckdb.sql("SELECT COUNT(*) FROM hosp_df
WHERE diagnosis = 'stroke'").fetch()
strokes_for_rare_bloodtypes = duckdb.sql("SELECT COUNT(*)
FROM hosp_df WHERE diagnosis = 'stroke'
AND bloodtype IN ('AB negative', 'B negative')").fetch()
generate_report(strokes_total, strokes_for_rare_bloodtypes)"""
# NOTE(review): not referenced by any other cell visible in this notebook --
# presumably the intended output path for the generated constraints; confirm or remove.
result_path_cadv = "toy_example_cadv_constraints.yaml"

In [7]:
## run LLM on toy model with default settings

In [8]:
# Default configuration: no assumption-generation trick.
lc = LangChainCADV(
    model_name="gpt-4o",
    downstream_task_description=CD_TASK_DESCRIPTION,
    assumption_generation_trick=None,
    logger=logger,
)

llm_inputs = {"column_desc": column_desc, "script": context}
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables=llm_inputs,
    num_stages=3,
    max_retries=3,
)

# Flatten the per-key lists in `suggestions` into a single list of constraint snippets.
code_list_for_constraints = []
for suggested_codes in suggestions.values():
    code_list_for_constraints.extend(suggested_codes)

# Keep only the snippets that pass filter_constraints on the validation pair
# (built from the test CSV); this checks that the generated code is syntactically
# usable.
code_list_for_constraints_valid = dq_manager.filter_constraints(
    code_list_for_constraints,
    spark_validation,
    spark_validation_data,
)
constraints = Constraints.from_llm_output(
    relevant_columns_list,
    expectations,
    suggestions,
    code_list_for_constraints_valid,
)

# Last expression of the cell: display the result as a plain dict.
constraints.to_dict()

{'constraints': {'bloodtype': {'code': [[".isContainedIn('bloodtype', ['A positive', 'A negative', 'B positive', 'B negative', 'AB positive', 'AB negative', 'O positive', 'O negative'])",
     'Invalid']],
   'assumptions': ["The column should support the presence of 'AB negative' and 'B negative' as valid blood types."]},
  'diagnosis': {'code': [[".isContainedIn('diagnosis', ['stroke', 'heart attack', 'pneumonia', 'diabetes', 'cancer', 'flu', 'asthma', 'allergy', 'infection', 'fracture', 'sprain', 'burn', 'laceration', 'migraine', 'arthritis', 'depression', 'anxiety', 'hypertension', 'covid-19', 'common cold'])",
     'Invalid']],
   'assumptions': ["The column should contain the value 'stroke' as a valid diagnosis."]}}}

## run LLM on toy model with add_deequ trick

With the `add_deequ` trick, the LLM is given the constraints suggested by Deequ and may add to, delete, or modify them.

In [11]:
# Re-derive the Deequ suggestions from the training pair only and serialise them
# to text, so they can be handed to the LLM as an extra input.
deequ_constraints = dq_manager.get_constraints_for_spark_df(spark_train, spark_train_data)
deequ_assumptions = deequ_constraints.to_string()

lc = LangChainCADV(
    model_name="gpt-4o",
    downstream_task_description=CD_TASK_DESCRIPTION,
    assumption_generation_trick='add_deequ',
    logger=logger,
)

llm_inputs = {
    "column_desc": column_desc,
    "script": context,
    "deequ_assumptions": deequ_assumptions,
}
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables=llm_inputs,
    num_stages=3,
    max_retries=3,
)

# Flatten the per-key lists in `suggestions` into a single list of constraint snippets.
code_list_for_constraints = []
for suggested_codes in suggestions.values():
    code_list_for_constraints.extend(suggested_codes)

# Keep only the snippets that pass filter_constraints on the validation pair
# (built from the test CSV); this checks that the generated code is syntactically
# usable.
code_list_for_constraints_valid = dq_manager.filter_constraints(
    code_list_for_constraints,
    spark_validation,
    spark_validation_data,
)
constraints = Constraints.from_llm_output(
    relevant_columns_list,
    expectations,
    suggestions,
    code_list_for_constraints_valid,
)

# Last expression of the cell: display the result as a plain dict.
constraints.to_dict()

{'constraints': {'bloodtype': {'code': [[".isComplete('bloodtype')", 'Valid'],
    [".isContainedIn('bloodtype', ['A positive', 'A negative', 'B positive', 'B negative', 'AB positive', 'AB negative', 'O positive', 'O negative'])",
     'Invalid']],
   'assumptions': ["The 'bloodtype' column should be complete, with no missing values.",
    "The 'bloodtype' column should expect to contain values such as 'AB negative' and 'B negative'."]},
  'diagnosis': {'code': [[".isComplete('diagnosis')", 'Invalid'],
    [".isContainedIn('diagnosis', ['stroke', 'heart attack', 'cancer', 'diabetes', 'asthma', 'allergy', 'flu', 'covid-19', 'pneumonia', 'bronchitis'])",
     'Invalid']],
   'assumptions': ["The 'diagnosis' column should be complete, with no missing values.",
    "The 'diagnosis' column should contain the value 'stroke'."]}}}

In [None]:
## run LLM on toy model with add_experience trick

In [13]:
# Same pipeline as the default run, but with the "add_experience" trick enabled.
lc = LangChainCADV(
    model_name="gpt-4o",
    downstream_task_description=CD_TASK_DESCRIPTION,
    assumption_generation_trick="add_experience",
    logger=logger,
)

llm_inputs = {"column_desc": column_desc, "script": context}
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables=llm_inputs,
    num_stages=3,
    max_retries=3,
)

# Flatten the per-key lists in `suggestions` into a single list of constraint snippets.
code_list_for_constraints = []
for suggested_codes in suggestions.values():
    code_list_for_constraints.extend(suggested_codes)

# Keep only the snippets that pass filter_constraints on the validation pair
# (built from the test CSV); this checks that the generated code is syntactically
# usable.
code_list_for_constraints_valid = dq_manager.filter_constraints(
    code_list_for_constraints,
    spark_validation,
    spark_validation_data,
)
constraints = Constraints.from_llm_output(
    relevant_columns_list,
    expectations,
    suggestions,
    code_list_for_constraints_valid,
)

# Last expression of the cell: display the result as a plain dict.
constraints.to_dict()

{'constraints': {'bloodtype': {'code': [[".isComplete('bloodtype')", 'Valid'],
    [".isContainedIn('bloodtype', ['O pos', 'A pos', 'AB negative', 'B negative', 'O neg', 'A neg', 'B pos', 'AB pos'])",
     'Invalid']],
   'assumptions': ['The column should be NOT NULL.',
    "The column should have values in ['O pos', 'A pos', 'AB negative', 'B negative'] and possibly other blood types.",
    "The column should have an IS IN constraint with values ['O pos', 'A pos', 'AB negative', 'B negative']."]},
  'diagnosis': {'code': [[".isComplete('diagnosis')", 'Invalid'],
    [".isContainedIn('diagnosis', ['cancer', 'cough', 'fraction', 'stroke', 'heart attack', 'diabetes', 'flu', 'pneumonia', 'asthma', 'bronchitis', 'allergy', 'infection', 'migraine', 'anemia', 'arthritis', 'hypertension', 'depression', 'anxiety', 'obesity', 'malaria'])",
     'Invalid']],
   'assumptions': ['The column should be NOT NULL.',
    "The column should have values in ['cancer', 'cough', 'fraction', 'stroke'] and p

## stop spark

In [None]:
# Tear down every Spark handle this notebook created. spark_test (created in the
# "validate constraints on test data" cell) was previously never shut down.
# For each handle, the Py4J callback server is shut down before the session is
# stopped, in the same order the notebook used for the first two handles.
for session in (spark_train, spark_validation, spark_test):
    session.sparkContext._gateway.shutdown_callback_server()
    session.stop()