## import libraries

In [51]:
from cadv_exploration.utils import load_dotenv

# Called before the remaining imports on purpose.
# NOTE(review): presumably this loads environment variables (e.g. from a .env file) that the
# modules imported below read at import time — confirm before reordering the imports.
load_dotenv()
from scripts.python.utils import setup_logger
from llm.langchain.downstream_task_prompt import CD_TASK_DESCRIPTION
from inspector.deequ.deequ_inspector_manager import DeequInspectorManager
from llm.langchain import LangChainCADV
from data_models import Constraints

from loader import FileLoader

from cadv_exploration.dq_manager import DeequDataQualityManager
from cadv_exploration.utils import get_project_root
import pandas as pd
import oyaml as yaml

# Shared logger; it is handed to every LangChainCADV instance created in the cells below.
# NOTE(review): "toy_example.log" is presumably the log file name — confirm against setup_logger.
logger = setup_logger("toy_example.log")

## Load the data and utility functions

In [52]:
# One manager instance is shared by every later cell, including validate_on_test_data.
dq_manager = DeequDataQualityManager()

# Both CSV splits sit in the same directory below the project root.
toy_files_dir = get_project_root() / "data" / "toy_example" / "files"
train_file_path = toy_files_dir / "hospitalisations_train.csv"
test_file_path = toy_files_dir / "hospitalisations_test.csv"

# The same na_values=["NULL"] setting is applied to both splits.
train_data = FileLoader.load_csv(train_file_path, na_values=["NULL"])
test_data = FileLoader.load_csv(test_file_path, na_values=["NULL"])

# spark_df_from_pandas_df returns a pair; the second element of each pair is the
# object that gets stopped in the final "stop spark" cell.
spark_train_data, spark_train = dq_manager.spark_df_from_pandas_df(train_data)
spark_test_data, spark_test = dq_manager.spark_df_from_pandas_df(test_data)


def validate_on_test_data(constraints, test_data):
    """Run every suggested constraint in ``constraints`` against ``test_data``.

    The constraint code strings are the keys of
    ``constraints.get_suggestions_code_column_map(valid_only=False)``. ``test_data``
    is converted with the notebook-level ``dq_manager`` and then validated in one
    ``validate_on_spark_df`` call.

    Args:
        constraints: Object exposing ``get_suggestions_code_column_map``.
        test_data: Pandas data accepted by ``dq_manager.spark_df_from_pandas_df``.

    Returns:
        A ``pd.DataFrame`` with one row per constraint code and the columns
        ``column_name``, ``constraint_status`` and ``constraint_message``.
        Note that ``column_name`` holds the constraint *code* string (the map
        key), not the name of a data column.
    """
    codes = list(constraints.get_suggestions_code_column_map(valid_only=False).keys())

    # Local conversion of the pandas frame; these names are local to the function
    # and do not touch the notebook-level spark_test / spark_test_data.
    test_sdf, test_session = dq_manager.spark_df_from_pandas_df(test_data)
    results = dq_manager.validate_on_spark_df(test_session, test_sdf, codes, return_raw=True)

    # Results are matched to codes by position.
    rows = []
    for position, code in enumerate(codes):
        outcome = results[position]
        rows.append((code, outcome.constraint_status, outcome.constraint_message))

    return pd.DataFrame(rows, columns=["column_name", "constraint_status", "constraint_message"])

## get constraints with deequ

In [53]:
# Ask the manager for constraints, passing both the training and the test Spark objects.
constraints = dq_manager.get_constraints_for_spark_df(
    spark_train,
    spark_train_data,
    spark_test,
    spark_test_data,
)

# print() keeps the YAML line breaks readable; a bare expression would show the escaped repr.
constraints_yaml = yaml.dump(constraints.to_dict())
print(constraints_yaml)

constraints:
  admission_day:
    code:
    - - .isComplete("admission_day")
      - Invalid
    - - .isNonNegative("admission_day")
      - Valid
    assumptions: []
  bloodtype:
    code:
    - - .isComplete("bloodtype")
      - Valid
    - - .isContainedIn("bloodtype", ["A pos", "O pos"])
      - Invalid
    assumptions: []
  complications:
    code:
    - - .isComplete("complications")
      - Valid
    - - .isContainedIn("complications", ["N"])
      - Invalid
    assumptions: []
  cost:
    code:
    - - .isComplete("cost")
      - Valid
    - - .isNonNegative("cost")
      - Valid
    assumptions: []
  diagnosis:
    code:
    - - .isComplete("diagnosis")
      - Invalid
    assumptions: []
  discharge_day:
    code:
    - - .isComplete("discharge_day")
      - Valid
    - - .isNonNegative("discharge_day")
      - Valid
    - - .isUnique("discharge_day")
      - Valid
    assumptions: []
  gender:
    code:
    - - .isComplete("gender")
      - Invalid
    - - .isNonNegative("ge

## validate constraints on test data

In [54]:
# `constraints` is the Deequ result from the previous cell; the bare call lets the
# notebook render the returned DataFrame.
validate_on_test_data(constraints, test_data)

Unnamed: 0,column_name,constraint_status,constraint_message
0,".isComplete(""race"")",Success,
1,".isComplete(""admission_day"")",Failure,Value: 0.6666666666666666 does not meet the co...
2,".isNonNegative(""admission_day"")",Success,
3,".isComplete(""ssn"")",Success,
4,".isUnique(""ssn"")",Success,
5,".isContainedIn(""bloodtype"", [""A pos"", ""O pos""])",Failure,Value: 0.6666666666666666 does not meet the co...
6,".isComplete(""bloodtype"")",Success,
7,".isComplete(""insurance"")",Success,
8,".isComplete(""discharge_day"")",Success,
9,".isNonNegative(""discharge_day"")",Success,


## prepare context for LLM

In [55]:
# Column description built from the training Spark objects; passed to the LLM as the
# "column_desc" input variable in the cells below.
column_desc = DeequInspectorManager().spark_df_to_column_desc(spark_train_data, spark_train)
# Text of the downstream script; passed to the LLM as the "script" input variable in the
# cells below. It is only ever used as a string here, never executed by this notebook.
context = """
nonsensitive_df = duckdb.sql("SELECT * EXCLUDE ssn, gender, race
FROM 's3://datalake/latest/hospitalisations.csv'").df()
hosp_df = nonsensitive_df.dropna()
strokes_total = duckdb.sql("SELECT COUNT(*) FROM hosp_df
WHERE diagnosis = 'stroke'").fetch()
strokes_for_rare_bloodtypes = duckdb.sql("SELECT COUNT(*)
FROM hosp_df WHERE diagnosis = 'stroke'
AND bloodtype IN ('AB negative', 'B negative')").fetch()
generate_report(strokes_total, strokes_for_rare_bloodtypes)"""
# NOTE(review): result_path_cadv is not referenced by any visible cell — confirm it is
# still needed (presumably an output path for the generated constraints).
result_path_cadv = "toy_example_cadv_constraints.yaml"

In [56]:
## run LLM on toy model with default settings

In [57]:
# Default configuration: no assumption-generation trick.
lc = LangChainCADV(
    model_name="gpt-4o",
    downstream_task_description=CD_TASK_DESCRIPTION,
    assumption_generation_trick=None,
    logger=logger,
)

llm_inputs = {"column_desc": column_desc, "script": context}
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables=llm_inputs,
    num_stages=3,
    max_retries=3,
)

# Flatten the per-key suggestion lists into a single list of constraint code strings.
code_list_for_constraints = []
for codes in suggestions.values():
    code_list_for_constraints.extend(codes)

# Check the generated constraints; note that the test-split Spark objects are what
# gets passed to filter_constraints here.
code_list_for_constraints_valid = dq_manager.filter_constraints(
    code_list_for_constraints,
    spark_test,
    spark_test_data,
)
constraints = Constraints.from_llm_output(
    relevant_columns_list,
    expectations,
    suggestions,
    code_list_for_constraints_valid,
)

# print() keeps the YAML line breaks readable.
print(yaml.dump(constraints.to_dict()))

constraints:
  bloodtype:
    code:
    - - .isComplete('bloodtype')
      - Valid
    - - .isContainedIn('bloodtype', ['A positive', 'A negative', 'B positive', 'B
        negative', 'AB positive', 'AB negative', 'O positive', 'O negative'])
      - Invalid
    assumptions:
    - The value of the bloodtype column should be one of the expected blood types,
      such as 'AB negative' or 'B negative'.
  diagnosis:
    code:
    - - .isComplete('diagnosis')
      - Invalid
    - - .isContainedIn('diagnosis', ['stroke', 'heart attack', 'diabetes', 'cancer',
        'asthma', 'migraine', 'flu', 'allergy', 'hypertension', 'pneumonia'])
      - Invalid
    assumptions:
    - The value of the diagnosis column should be one of the expected medical conditions,
      such as 'stroke'.



## run LLM on toy model with add_deequ trick

With this trick, the constraints generated by Deequ are passed to the LLM, which can add to, delete from, and modify them.

In [58]:
# Deequ's constraints for the training data, converted to a string for the prompt.
deequ_constraints = dq_manager.get_constraints_for_spark_df(spark_train, spark_train_data)
deequ_assumptions = deequ_constraints.to_string()

lc = LangChainCADV(
    model_name="gpt-4o",
    downstream_task_description=CD_TASK_DESCRIPTION,
    assumption_generation_trick="add_deequ",
    logger=logger,
)

# Same inputs as the default run, plus the Deequ output under "deequ_assumptions".
llm_inputs = {
    "column_desc": column_desc,
    "script": context,
    "deequ_assumptions": deequ_assumptions,
}
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables=llm_inputs,
    num_stages=3,
    max_retries=3,
)

# Flatten the per-key suggestion lists into a single list of constraint code strings.
code_list_for_constraints = []
for codes in suggestions.values():
    code_list_for_constraints.extend(codes)

# Check the generated constraints; note that the test-split Spark objects are what
# gets passed to filter_constraints here.
code_list_for_constraints_valid = dq_manager.filter_constraints(
    code_list_for_constraints,
    spark_test,
    spark_test_data,
)
constraints = Constraints.from_llm_output(
    relevant_columns_list,
    expectations,
    suggestions,
    code_list_for_constraints_valid,
)

# print() keeps the YAML line breaks readable.
print(yaml.dump(constraints.to_dict()))

constraints:
  bloodtype:
    code:
    - - .isComplete('bloodtype')
      - Valid
    - - .isContainedIn('bloodtype', ['A positive', 'A negative', 'B positive', 'B
        negative', 'AB positive', 'AB negative', 'O positive', 'O negative'])
      - Invalid
    assumptions:
    - The column 'bloodtype' should be complete, meaning no missing values are expected.
    - The column 'bloodtype' is assumed to potentially include rare blood types 'AB
      negative' and 'B negative' as they are explicitly queried in the code.
  diagnosis:
    code:
    - - .isComplete('diagnosis')
      - Invalid
    - - .isContainedIn('diagnosis', ['stroke', 'heart attack', 'flu', 'pneumonia',
        'cancer', 'diabetes', 'asthma', 'allergy', 'infection', 'injury'])
      - Invalid
    assumptions:
    - The column 'diagnosis' should be complete, meaning no missing values are expected.
    - The values in the column 'diagnosis' are assumed to include 'stroke' for the
      specific queries in the downstrea

In [59]:
# `constraints` now holds the result of the add_deequ run in the previous cell; the bare
# call lets the notebook render the returned DataFrame.
validate_on_test_data(constraints, test_data)

Unnamed: 0,column_name,constraint_status,constraint_message
0,.isComplete('diagnosis'),Failure,Value: 0.6666666666666666 does not meet the co...
1,".isContainedIn('diagnosis', ['stroke', 'heart ...",Failure,Value: 0.6666666666666666 does not meet the co...
2,.isComplete('bloodtype'),Success,
3,".isContainedIn('bloodtype', ['A positive', 'A ...",Failure,Value: 0.0 does not meet the constraint requir...


## run LLM on toy model with add_experience trick

In [60]:
# Same pipeline as the default run, with the "add_experience" trick switched on.
lc = LangChainCADV(
    model_name="gpt-4o",
    downstream_task_description=CD_TASK_DESCRIPTION,
    assumption_generation_trick="add_experience",
    logger=logger,
)

llm_inputs = {"column_desc": column_desc, "script": context}
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables=llm_inputs,
    num_stages=3,
    max_retries=3,
)

# Flatten the per-key suggestion lists into a single list of constraint code strings.
code_list_for_constraints = []
for codes in suggestions.values():
    code_list_for_constraints.extend(codes)

# Check the generated constraints; note that the test-split Spark objects are what
# gets passed to filter_constraints here.
code_list_for_constraints_valid = dq_manager.filter_constraints(
    code_list_for_constraints,
    spark_test,
    spark_test_data,
)
constraints = Constraints.from_llm_output(
    relevant_columns_list,
    expectations,
    suggestions,
    code_list_for_constraints_valid,
)

# print() keeps the YAML line breaks readable.
print(yaml.dump(constraints.to_dict()))

constraints:
  bloodtype:
    code:
    - - .isComplete('bloodtype')
      - Valid
    - - .isContainedIn('bloodtype', ['O pos', 'A pos', 'AB negative', 'B negative',
        'O negative', 'A negative', 'B pos', 'AB pos'])
      - Invalid
    assumptions:
    - 'The column ''bloodtype'' should have values within the observed range: ''O
      pos'', ''A pos''. Additional reasonable values could include ''AB negative'',
      ''B negative'', etc.'
    - The column 'bloodtype' should be NOT NULL as it is complete in the sample.
  diagnosis:
    code:
    - - .isComplete('diagnosis')
      - Invalid
    - - .isContainedIn('diagnosis', ['cancer', 'cough', 'fraction', 'stroke', 'flu',
        'infection', 'fever', 'asthma', 'diabetes', 'hypertension', 'heart attack',
        'pneumonia', 'allergy', 'migraine', 'arthritis', 'depression', 'anxiety',
        'tuberculosis', 'malaria', 'HIV'])
      - Invalid
    assumptions:
    - 'The column ''diagnosis'' should have values within the observed

In [61]:
# `constraints` now holds the result of the add_experience run in the previous cell; the
# bare call lets the notebook render the returned DataFrame.
validate_on_test_data(constraints, test_data)

Unnamed: 0,column_name,constraint_status,constraint_message
0,.isComplete('diagnosis'),Failure,Value: 0.6666666666666666 does not meet the co...
1,".isContainedIn('diagnosis', ['cancer', 'cough'...",Failure,Value: 0.6666666666666666 does not meet the co...
2,.isComplete('bloodtype'),Success,
3,".isContainedIn('bloodtype', ['O pos', 'A pos',...",Failure,Value: 0.6666666666666666 does not meet the co...


## stop spark

In [62]:
# Tear down train first, then test. For each one, the gateway's callback server is shut
# down before the session itself is stopped.
for spark_session in (spark_train, spark_test):
    spark_session.sparkContext._gateway.shutdown_callback_server()
    spark_session.stop()