## import libraries

In [38]:
from cadv_exploration.utils import load_dotenv

load_dotenv()
from scripts.python.utils import setup_logger
from llm.langchain.downstream_task_prompt import SQL_QUERY_TASK_DESCRIPTION, ML_INFERENCE_TASK_DESCRIPTION
from inspector.deequ.deequ_inspector_manager import DeequInspectorManager
from llm.langchain import LangChainCADV
from data_models import Constraints

from loader import FileLoader

from cadv_exploration.dq_manager import DeequDataQualityManager
from cadv_exploration.utils import get_project_root
import pandas as pd
import oyaml as yaml

# Shared logger for this notebook; it is passed to every LangChainCADV instance below.
# NOTE(review): "toy_example.log" is presumably the log file name -- confirm in setup_logger.
logger = setup_logger("toy_example.log")

## Load the data and utility functions

In [39]:
# Manager used throughout the notebook to build Spark frames, obtain Deequ constraint
# suggestions and validate constraints.
dq_manager = DeequDataQualityManager()
# Train/test CSVs of the toy hospitalisations example, located under the project root.
train_file_path = get_project_root() / "data" / "toy_example" / "files" / "hospitalisations_train.csv"
test_file_path = get_project_root() / "data" / "toy_example" / "files" / "hospitalisations_test.csv"
# NOTE(review): na_values=["NULL"] presumably makes the literal string "NULL" load as a
# missing value (pandas convention) -- confirm in FileLoader.load_csv.
train_data = FileLoader.load_csv(train_file_path, na_values=["NULL"])
test_data = FileLoader.load_csv(test_file_path, na_values=["NULL"])
# spark_df_from_pandas_df returns a pair. Judging by later usage (the second element is
# the object that .stop() is called on at the end of the notebook), the first element is
# the Spark DataFrame and the second the Spark session.
spark_train_data, spark_train = dq_manager.spark_df_from_pandas_df(train_data)
spark_test_data, spark_test = dq_manager.spark_df_from_pandas_df(test_data)


def validate_on_test_data(constraints, test_data):
    """Run every suggested constraint against ``test_data`` and tabulate the outcome.

    Args:
        constraints: Object exposing ``get_suggestions_code_column_map``; all of its
            suggestions are used (``valid_only=False``), not only the valid ones.
        test_data: pandas DataFrame that is converted to a Spark frame via the
            notebook-level ``dq_manager`` before validation.

    Returns:
        pandas.DataFrame with one row per constraint and the columns
        ``column_name``, ``constraint_status`` and ``constraint_message``.
        Note that, despite its name, ``column_name`` holds the constraint *code*
        string (the keys of the code/column map), not the name of a data column.
    """
    suggestion_codes = list(constraints.get_suggestions_code_column_map(valid_only=False).keys())

    # Build a Spark frame/session pair for the supplied pandas frame.
    test_frame, test_session = dq_manager.spark_df_from_pandas_df(test_data)
    raw_results = dq_manager.validate_on_spark_df(
        test_session, test_frame, suggestion_codes, return_raw=True
    )

    # Results are matched to constraints by position in the list.
    rows = []
    for position, code in enumerate(suggestion_codes):
        rows.append(
            (code, raw_results[position].constraint_status, raw_results[position].constraint_message)
        )
    return pd.DataFrame(rows, columns=["column_name", "constraint_status", "constraint_message"])

In [40]:
# Display the training frame loaded above (bare last expression -> rich cell output).
train_data

Unnamed: 0,ssn,gender,race,bloodtype,diagnosis,admission_day,discharge_day,insurance,cost,complications
0,420-64-XXXX,1,white,A pos,cough,10,10,UHG,10,N
1,423-33-XXXX,1,asian,O pos,fraction,10,12,KP,1000,N
2,545-31-XXXX,2,white,A pos,fraction,17,19,UHG,1000,N
3,222-24-XXXX,1,hispanic,O pos,cancer,20,25,CG,10000,N


In [41]:
# Display the test frame loaded above (bare last expression -> rich cell output).
test_data

Unnamed: 0,ssn,gender,race,bloodtype,diagnosis,admission_day,discharge_day,insurance,cost,complications
0,221-04-XXXX,1.0,black,AB neg,stroke,21.0,21,UHG,0,N
1,434-29-XXXX,,asian,O pos,,,22,KP,1000,Y
2,212-56-XXXX,1.0,white,O pos,bloodknot,22.0,29,UHG,1000,N


## get constraints with deequ

In [42]:
# Ask the manager for Deequ constraint suggestions; both the train and the test
# session/frame pairs are passed in.
# NOTE(review): the Valid/Invalid tag in the printed output presumably reflects whether a
# suggestion holds on the test data -- confirm in DeequDataQualityManager.
constraints = dq_manager.get_constraints_for_spark_df(spark_train, spark_train_data, spark_test,
                                                      spark_test_data)
# Dump the constraints as YAML for a readable nested view.
print(yaml.dump(constraints.to_dict()))

PythonCallback server restarted!
constraints:
  admission_day:
    code:
    - - .isComplete("admission_day")
      - Invalid
    - - .isNonNegative("admission_day")
      - Valid
    assumptions: []
  bloodtype:
    code:
    - - .isComplete("bloodtype")
      - Valid
    - - .isContainedIn("bloodtype", ["A pos", "O pos"])
      - Invalid
    assumptions: []
  complications:
    code:
    - - .isComplete("complications")
      - Valid
    - - .isContainedIn("complications", ["N"])
      - Invalid
    assumptions: []
  cost:
    code:
    - - .isComplete("cost")
      - Valid
    - - .isNonNegative("cost")
      - Valid
    assumptions: []
  diagnosis:
    code:
    - - .isComplete("diagnosis")
      - Invalid
    assumptions: []
  discharge_day:
    code:
    - - .isComplete("discharge_day")
      - Valid
    - - .isNonNegative("discharge_day")
      - Valid
    - - .isUnique("discharge_day")
      - Valid
    assumptions: []
  gender:
    code:
    - - .isComplete("gender")
      - I

In [43]:
# Evaluate the Deequ-suggested constraints from the previous cell on the test data;
# the returned DataFrame is rendered as the cell output.
validate_on_test_data(constraints, test_data)

Unnamed: 0,column_name,constraint_status,constraint_message
0,".isComplete(""discharge_day"")",Success,
1,".isNonNegative(""discharge_day"")",Success,
2,".isUnique(""discharge_day"")",Success,
3,".isContainedIn(""complications"", [""N""])",Failure,Value: 0.6666666666666666 does not meet the co...
4,".isComplete(""complications"")",Success,
5,".isContainedIn(""bloodtype"", [""A pos"", ""O pos""])",Failure,Value: 0.6666666666666666 does not meet the co...
6,".isComplete(""bloodtype"")",Success,
7,".isComplete(""insurance"")",Success,
8,".isComplete(""cost"")",Success,
9,".isNonNegative(""cost"")",Success,


# Downstream task 1

## prepare context for LLM

In [44]:
# Description of the training data's columns; supplied to the LLM as "column_desc".
column_desc = DeequInspectorManager().spark_df_to_column_desc(spark_train_data, spark_train)
# Downstream task 1: source text of the script the LLM should reason about. It is passed
# to the LLM as plain text under the "script" key and is not executed in this notebook.
context = """
nonsensitive_df = duckdb.sql("SELECT * EXCLUDE ssn, gender, race
FROM 's3://datalake/latest/hospitalisations.csv'").df()
hosp_df = nonsensitive_df.dropna()
strokes_total = duckdb.sql("SELECT COUNT(*) FROM hosp_df
WHERE diagnosis = 'stroke'").fetch()
strokes_for_rare_bloodtypes = duckdb.sql("SELECT COUNT(*)
FROM hosp_df WHERE diagnosis = 'stroke'
AND bloodtype IN ('AB negative', 'B negative')").fetch()
generate_report(strokes_total, strokes_for_rare_bloodtypes)"""

## run LLM on toy model with default settings

In [45]:
# Baseline run for the SQL-query task: no assumption-generation trick is applied.
lc = LangChainCADV(model_name="gpt-4o", downstream_task_description=SQL_QUERY_TASK_DESCRIPTION,
                   assumption_generation_trick=None, logger=logger)

# NOTE(review): this presumably calls an external LLM, so results may differ between
# runs -- confirm before relying on the recorded output.
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables={"column_desc": column_desc, "script": context},
    num_stages=3,
    max_retries=3
)
# Flatten the per-key collections in `suggestions` into one list of constraint code items.
code_list_for_constraints = [item for v in suggestions.values() for item in v]

# Validate the constraints to see if they are grammatically correct.
# NOTE(review): the check runs against the *test* session/frame (spark_test,
# spark_test_data), not the training data -- confirm that this is intended.
code_list_for_constraints_valid = dq_manager.filter_constraints(code_list_for_constraints, spark_test,
                                                                spark_test_data)
# Rebinds the notebook-level `constraints`; the following validation cell reads it.
constraints = Constraints.from_llm_output(relevant_columns_list, expectations, suggestions,
                                          code_list_for_constraints_valid)

print(yaml.dump(constraints.to_dict()))

constraints:
  bloodtype:
    code:
    - - .isContainedIn('bloodtype', ['AB negative', 'B negative'])
      - Invalid
    assumptions:
    - The column 'bloodtype' should be able to contain the values 'AB negative' and
      'B negative'.
  diagnosis:
    code:
    - - .isContainedIn('diagnosis', ['stroke'])
      - Invalid
    assumptions:
    - The column 'diagnosis' should contain the value 'stroke'.



In [46]:
# Evaluate the constraints from the baseline LLM run (task 1) on the test data.
validate_on_test_data(constraints, test_data)

Unnamed: 0,column_name,constraint_status,constraint_message
0,".isContainedIn('diagnosis', ['stroke'])",Failure,Value: 0.6666666666666666 does not meet the co...
1,".isContainedIn('bloodtype', ['AB negative', 'B...",Failure,Value: 0.0 does not meet the constraint requir...


## run LLM on toy model with add_deequ trick

With this trick, the constraints suggested by Deequ are passed to the LLM, which can add, delete, and modify them.

In [47]:
# Deequ suggestions computed with only the train session/frame (no test pair is passed
# here), converted to a string and handed to the LLM as "deequ_assumptions".
deequ_assumptions = dq_manager.get_constraints_for_spark_df(spark_train, spark_train_data).to_string()
# SQL-query task with the 'add_deequ' assumption-generation trick.
lc = LangChainCADV(model_name="gpt-4o", downstream_task_description=SQL_QUERY_TASK_DESCRIPTION,
                   assumption_generation_trick='add_deequ', logger=logger)

# NOTE(review): this presumably calls an external LLM, so results may differ between
# runs -- confirm before relying on the recorded output.
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables={"column_desc": column_desc, "script": context, "deequ_assumptions": deequ_assumptions},
    num_stages=3,
    max_retries=3
)
# Flatten the per-key collections in `suggestions` into one list of constraint code items.
code_list_for_constraints = [item for v in suggestions.values() for item in v]

# Validate the constraints to see if they are grammatically correct.
# NOTE(review): the check runs against the *test* session/frame (spark_test,
# spark_test_data), not the training data -- confirm that this is intended.
code_list_for_constraints_valid = dq_manager.filter_constraints(code_list_for_constraints, spark_test,
                                                                spark_test_data)
# Rebinds the notebook-level `constraints`; the following validation cell reads it.
constraints = Constraints.from_llm_output(relevant_columns_list, expectations, suggestions,
                                          code_list_for_constraints_valid)

print(yaml.dump(constraints.to_dict()))

constraints:
  bloodtype:
    code:
    - - .isComplete('bloodtype')
      - Valid
    - - .isContainedIn('bloodtype', ['A positive', 'A negative', 'B positive', 'B
        negative', 'AB positive', 'AB negative', 'O positive', 'O negative'])
      - Invalid
    - - .isContainedIn('bloodtype', ['AB negative', 'B negative'])
      - Invalid
    assumptions:
    - The column 'bloodtype' should be capable of containing valid blood types.
    - The column 'bloodtype' should be capable of containing 'AB negative' and 'B
      negative'.
  diagnosis:
    code:
    - - .isComplete('diagnosis')
      - Invalid
    - - .isContainedIn('diagnosis', ['stroke'])
      - Invalid
    assumptions:
    - The column 'diagnosis' should contain valid medical conditions.
    - The column 'diagnosis' should include the value 'stroke'.



In [48]:
# Evaluate the constraints from the 'add_deequ' LLM run (task 1) on the test data.
validate_on_test_data(constraints, test_data)

Unnamed: 0,column_name,constraint_status,constraint_message
0,.isComplete('diagnosis'),Failure,Value: 0.6666666666666666 does not meet the co...
1,".isContainedIn('diagnosis', ['stroke'])",Failure,Value: 0.6666666666666666 does not meet the co...
2,.isComplete('bloodtype'),Success,
3,".isContainedIn('bloodtype', ['A positive', 'A ...",Failure,Value: 0.0 does not meet the constraint requir...
4,".isContainedIn('bloodtype', ['AB negative', 'B...",Failure,Value: 0.0 does not meet the constraint requir...


## run LLM on toy model with add_experience trick

In [49]:
# SQL-query task with the "add_experience" assumption-generation trick.
lc = LangChainCADV(model_name="gpt-4o", downstream_task_description=SQL_QUERY_TASK_DESCRIPTION,
                   assumption_generation_trick="add_experience", logger=logger)

# NOTE(review): this presumably calls an external LLM, so results may differ between
# runs -- confirm before relying on the recorded output.
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables={"column_desc": column_desc, "script": context},
    num_stages=3,
    max_retries=3
)
# Flatten the per-key collections in `suggestions` into one list of constraint code items.
code_list_for_constraints = [item for v in suggestions.values() for item in v]

# Validate the constraints to see if they are grammatically correct.
# NOTE(review): the check runs against the *test* session/frame (spark_test,
# spark_test_data), not the training data -- confirm that this is intended.
code_list_for_constraints_valid = dq_manager.filter_constraints(code_list_for_constraints, spark_test,
                                                                spark_test_data)
# Rebinds the notebook-level `constraints`; the following validation cell reads it.
constraints = Constraints.from_llm_output(relevant_columns_list, expectations, suggestions,
                                          code_list_for_constraints_valid)

print(yaml.dump(constraints.to_dict()))

constraints:
  bloodtype:
    code:
    - - .isComplete('bloodtype')
      - Valid
    - - .isContainedIn('bloodtype', ['O pos', 'A pos', 'AB negative', 'B negative'])
      - Invalid
    assumptions:
    - Values should be in ['O pos', 'A pos', 'AB negative', 'B negative']
    - Should not have null values
  diagnosis:
    code:
    - - .isComplete('diagnosis')
      - Invalid
    - - .isContainedIn('diagnosis', ['cancer', 'cough', 'fraction', 'stroke'])
      - Invalid
    assumptions:
    - Values should be in ['cancer', 'cough', 'fraction', 'stroke']
    - Should not have null values



In [50]:
# Evaluate the constraints from the "add_experience" LLM run (task 1) on the test data.
validate_on_test_data(constraints, test_data)

Unnamed: 0,column_name,constraint_status,constraint_message
0,".isContainedIn('diagnosis', ['cancer', 'cough'...",Failure,Value: 0.6666666666666666 does not meet the co...
1,.isComplete('diagnosis'),Failure,Value: 0.6666666666666666 does not meet the co...
2,".isContainedIn('bloodtype', ['O pos', 'A pos',...",Failure,Value: 0.6666666666666666 does not meet the co...
3,.isComplete('bloodtype'),Success,


# Downstream task 2

## prepare context for LLM

In [51]:
# Description of the training data's columns; recomputed with the same arguments as in
# the downstream task 1 section and supplied to the LLM as "column_desc".
column_desc = DeequInspectorManager().spark_df_to_column_desc(spark_train_data, spark_train)
# Downstream task 2: source text of the script the LLM should reason about. It is passed
# to the LLM as plain text under the "script" key and is not executed in this notebook.
context = """
df = pd.read_csv("s3://datalake/latest/hospitalisations.csv") 
df['cost_smoothed'] = np.log(df['cost'])
df['admission_day'].fillna(df['discharge_day'])
df['duration'] = df['discharge_day'] - df['admission_day']
categorical_cols = ['diagnosis', 'insurance']
for col in categorical_cols:
 df[col] = pd.get_dummies(df[col], dummy_na=True)
features = df[categorical_cols + ['duration', 'cost_smoothed']]
labels = label_binarize(df['complications'], classes=['Y', 'N'])
model = sklearn.tree.DecisionTreeClassifier()
model.fit(train_features, train_labels)
deploy_to_production(model)
"""

## run LLM on toy model with default settings

In [52]:
# Baseline run for the ML-inference task: no assumption-generation trick is applied.
lc = LangChainCADV(model_name="gpt-4o", downstream_task_description=ML_INFERENCE_TASK_DESCRIPTION,
                   assumption_generation_trick=None, logger=logger)

# NOTE(review): this presumably calls an external LLM, so results may differ between
# runs -- confirm before relying on the recorded output.
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables={"column_desc": column_desc, "script": context},
    num_stages=3,
    max_retries=3
)
# Flatten the per-key collections in `suggestions` into one list of constraint code items.
code_list_for_constraints = [item for v in suggestions.values() for item in v]

# Validate the constraints to see if they are grammatically correct.
# NOTE(review): the check runs against the *test* session/frame (spark_test,
# spark_test_data), not the training data -- confirm that this is intended.
code_list_for_constraints_valid = dq_manager.filter_constraints(code_list_for_constraints, spark_test,
                                                                spark_test_data)
# Rebinds the notebook-level `constraints`; the following validation cell reads it.
constraints = Constraints.from_llm_output(relevant_columns_list, expectations, suggestions,
                                          code_list_for_constraints_valid)

print(yaml.dump(constraints.to_dict()))

constraints:
  admission_day:
    code:
    - - .isNonNegative('admission_day')
      - Valid
    assumptions:
    - Values in the column 'admission_day' should be non-negative, as days should
      not be negative.
  complications:
    code:
    - - .isContainedIn('complications', ['Y', 'N'])
      - Valid
    assumptions:
    - The column 'complications' should only contain the values 'Y' or 'N' as it is
      label-binarized with these classes.
  cost:
    code:
    - - .isPositive('cost')
      - Invalid
    assumptions:
    - Values in the column 'cost' should be greater than 0 to ensure the logarithm
      operation is valid.
  diagnosis:
    code:
    - - .isContainedIn('diagnosis', ['cancer', 'cough', 'fraction'])
      - Invalid
    assumptions:
    - The column 'diagnosis' should only contain the values present in the dataset,
      which are 'cancer', 'cough', and 'fraction'.
  discharge_day:
    code:
    - - .isGreaterThanOrEqualTo('discharge_day', 'admission_day')
      -

In [53]:
# Evaluate the constraints from the baseline LLM run (task 2) on the test data.
validate_on_test_data(constraints, test_data)

Unnamed: 0,column_name,constraint_status,constraint_message
0,.isPositive('cost'),Failure,Value: 0.6666666666666666 does not meet the co...
1,.isNonNegative('admission_day'),Success,
2,.isNonNegative('discharge_day'),Success,
3,".isGreaterThanOrEqualTo('discharge_day', 'admi...",Failure,Value: 0.6666666666666666 does not meet the co...
4,".isContainedIn('diagnosis', ['cancer', 'cough'...",Failure,Value: 0.3333333333333333 does not meet the co...
5,".isContainedIn('insurance', ['CG', 'UHG', 'KP'])",Success,
6,".isContainedIn('complications', ['Y', 'N'])",Success,


## run LLM on toy model with add_deequ trick

With this trick, the constraints suggested by Deequ are passed to the LLM, which can add, delete, and modify them.

In [54]:
# Deequ suggestions computed with only the train session/frame (no test pair is passed
# here), converted to a string and handed to the LLM as "deequ_assumptions".
deequ_assumptions = dq_manager.get_constraints_for_spark_df(spark_train, spark_train_data).to_string()
# ML-inference task with the 'add_deequ' assumption-generation trick.
lc = LangChainCADV(model_name="gpt-4o", downstream_task_description=ML_INFERENCE_TASK_DESCRIPTION,
                   assumption_generation_trick='add_deequ', logger=logger)

# NOTE(review): this presumably calls an external LLM, so results may differ between
# runs -- confirm before relying on the recorded output.
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables={"column_desc": column_desc, "script": context, "deequ_assumptions": deequ_assumptions},
    num_stages=3,
    max_retries=3
)
# Flatten the per-key collections in `suggestions` into one list of constraint code items.
code_list_for_constraints = [item for v in suggestions.values() for item in v]

# Validate the constraints to see if they are grammatically correct.
# NOTE(review): the check runs against the *test* session/frame (spark_test,
# spark_test_data), not the training data -- confirm that this is intended.
code_list_for_constraints_valid = dq_manager.filter_constraints(code_list_for_constraints, spark_test,
                                                                spark_test_data)
# Rebinds the notebook-level `constraints`; the following validation cell reads it.
constraints = Constraints.from_llm_output(relevant_columns_list, expectations, suggestions,
                                          code_list_for_constraints_valid)

print(yaml.dump(constraints.to_dict()))

constraints:
  admission_day:
    code:
    - - .isComplete('admission_day')
      - Invalid
    - - .isNonNegative('admission_day')
      - Valid
    assumptions:
    - Admission day should not have missing values as it is used in calculations.
    - Admission day should be non-negative.
  complications:
    code:
    - - .isContainedIn('complications', ['Y', 'N'])
      - Valid
    assumptions:
    - Complications should only contain 'Y' or 'N' as it is binarized with these classes.
  cost:
    code:
    - - .isPositive('cost')
      - Invalid
    assumptions:
    - Cost values should be positive as log transformation is applied.
  diagnosis:
    code:
    - - .isComplete('diagnosis')
      - Invalid
    - - .isContainedIn('diagnosis', ['cancer', 'cough', 'fraction'])
      - Invalid
    assumptions:
    - Diagnosis should not have missing values as it is one-hot encoded.
    - Diagnosis values should be among ['cancer', 'cough', 'fraction'] for accurate
      encoding.
  discharge_d

In [55]:
# Evaluate the constraints from the 'add_deequ' LLM run (task 2) on the test data.
validate_on_test_data(constraints, test_data)

Unnamed: 0,column_name,constraint_status,constraint_message
0,.isPositive('cost'),Failure,Value: 0.6666666666666666 does not meet the co...
1,.isComplete('admission_day'),Failure,Value: 0.6666666666666666 does not meet the co...
2,.isNonNegative('admission_day'),Success,
3,.isComplete('discharge_day'),Success,
4,.isNonNegative('discharge_day'),Success,
5,".isGreaterThanOrEqualTo('discharge_day', 'admi...",Failure,Value: 0.6666666666666666 does not meet the co...
6,.isComplete('diagnosis'),Failure,Value: 0.6666666666666666 does not meet the co...
7,".isContainedIn('diagnosis', ['cancer', 'cough'...",Failure,Value: 0.3333333333333333 does not meet the co...
8,.isComplete('insurance'),Success,
9,".isContainedIn('insurance', ['CG', 'UHG', 'KP'])",Success,


## run LLM on toy model with add_experience trick

In [56]:
# ML-inference task with the "add_experience" assumption-generation trick.
lc = LangChainCADV(model_name="gpt-4o", downstream_task_description=ML_INFERENCE_TASK_DESCRIPTION,
                   assumption_generation_trick="add_experience", logger=logger)

# NOTE(review): this presumably calls an external LLM, so results may differ between
# runs -- confirm before relying on the recorded output.
relevant_columns_list, expectations, suggestions = lc.invoke(
    input_variables={"column_desc": column_desc, "script": context},
    num_stages=3,
    max_retries=3
)
# Flatten the per-key collections in `suggestions` into one list of constraint code items.
code_list_for_constraints = [item for v in suggestions.values() for item in v]

# Validate the constraints to see if they are grammatically correct.
# NOTE(review): the check runs against the *test* session/frame (spark_test,
# spark_test_data), not the training data -- confirm that this is intended.
code_list_for_constraints_valid = dq_manager.filter_constraints(code_list_for_constraints, spark_test,
                                                                spark_test_data)
# Rebinds the notebook-level `constraints`; the following validation cell reads it.
constraints = Constraints.from_llm_output(relevant_columns_list, expectations, suggestions,
                                          code_list_for_constraints_valid)

print(yaml.dump(constraints.to_dict()))

constraints:
  admission_day:
    code:
    - - '.hasMax(''admission_day'', lambda x: x <= 20)'
      - Invalid
    - - '.hasMin(''admission_day'', lambda x: x >= 10)'
      - Valid
    - - .isNonNegative('admission_day')
      - Valid
    assumptions:
    - The column 'admission_day' should be non-negative as negative days are not logical.
    - The column 'admission_day' should have a minimum value of 10 and a maximum value
      of 20.
  complications:
    code:
    - - .isComplete('complications')
      - Valid
    - - .isContainedIn('complications', ['Y', 'N'])
      - Valid
    assumptions:
    - The column 'complications' should be complete and not null, as it is complete
      in the sample.
    - The column 'complications' should be within the values ['Y', 'N'], as it is
      used for label binarization.
  cost:
    code:
    - - '.hasMax(''cost'', lambda x: x <= 10000)'
      - Valid
    - - '.hasMin(''cost'', lambda x: x >= 10)'
      - Invalid
    - - .isNonNegative('cost'

In [57]:
# Evaluate the constraints from the "add_experience" LLM run (task 2) on the test data.
validate_on_test_data(constraints, test_data)

Unnamed: 0,column_name,constraint_status,constraint_message
0,.isNonNegative('cost'),Success,
1,".hasMin('cost', lambda x: x >= 10)",Failure,Value: 0.0 does not meet the constraint requir...
2,".hasMax('cost', lambda x: x <= 10000)",Success,
3,.isNonNegative('admission_day'),Success,
4,".hasMin('admission_day', lambda x: x >= 10)",Success,
5,".hasMax('admission_day', lambda x: x <= 20)",Failure,Value: 22.0 does not meet the constraint requi...
6,.isNonNegative('discharge_day'),Success,
7,".hasMin('discharge_day', lambda x: x >= 10)",Success,
8,".hasMax('discharge_day', lambda x: x <= 25)",Failure,Value: 29.0 does not meet the constraint requi...
9,".isContainedIn('diagnosis', ['cancer', 'cough'...",Failure,Value: 0.3333333333333333 does not meet the co...


## stop spark

In [58]:
# Tear down both Spark sessions, train first and then test: for each one, shut down the
# gateway's callback server and then stop the session itself.
for session in (spark_train, spark_test):
    session.sparkContext._gateway.shutdown_callback_server()
    session.stop()