In [1]:

from pathlib import Path
import pandas as pd

from tadv.utils import get_project_root

result_table_path = Path("../result_tables/")

# Every CSV stem is split on "__" into exactly three parts (dataset name,
# downstream task, error injection label); a stem with a different number of
# parts makes the tuple unpacking below raise ValueError. The three parts are
# attached to every row of that file as extra columns.
# The glob result is sorted so the row order of the stacked frame does not
# depend on the order in which the file system happens to list the files.
dfs_list = []
for file in sorted(result_table_path.glob('*.csv')):
    dataset_name, downstream_task, error_injection_label = file.stem.split("__")
    file_df = pd.read_csv(file, index_col=0)
    file_df['Dataset'] = dataset_name
    file_df['Downstream Task'] = downstream_task
    file_df['Error Injection Label'] = error_injection_label
    dfs_list.append(file_df)

# Fail with a message that names the directory instead of letting pd.concat
# raise its generic "No objects to concatenate" error.
if not dfs_list:
    raise FileNotFoundError(
        f"No result tables (*.csv) found in {result_table_path.resolve()}")

# stack csv files
df = pd.concat(dfs_list, ignore_index=True)

In [2]:
def performance_drop(result_on_clean_data, result_on_corrupted_data, threshold=0.05):
    """Decide whether the run on corrupted data counts as a performance drop.

    Parameters
    ----------
    result_on_clean_data
        Baseline score; anything ``float()`` accepts. It is only read when the
        corrupted result is neither ``"error"`` nor ``"success"``.
    result_on_corrupted_data
        ``"error"`` (always a drop), ``"success"`` (never a drop), or a score
        that ``float()`` accepts.
    threshold : float, default 0.05
        Relative decrease ``(clean - corrupted) / clean`` that must be
        exceeded (strictly) for the result to count as a drop.

    Returns
    -------
    bool
        ``True`` when the corrupted run errored or its score fell by more than
        ``threshold`` relative to the clean score, otherwise ``False``.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``float()`` when a value needed for the numeric
        comparison cannot be converted.
    """
    if result_on_corrupted_data == "error":
        return True
    if result_on_corrupted_data == "success":
        return False

    clean_score = float(result_on_clean_data)
    corrupted_score = float(result_on_corrupted_data)

    if clean_score == 0:
        # A relative drop is undefined for a zero baseline (the division used
        # to raise ZeroDivisionError). Treat any score below the baseline as a
        # drop, which is the limit of the relative formula as clean -> 0+.
        return corrupted_score < clean_score

    return (clean_score - corrupted_score) / clean_score > threshold


def _row_performance_drop(row):
    """Apply performance_drop to the clean/corrupted execution results of one row."""
    return performance_drop(row['Execution Result on Clean Data'],
                            row['Execution Result on Corrupted Data'])


# Row-wise flag: did the downstream task degrade on the corrupted data?
df['performance_drop'] = df.apply(_row_performance_drop, axis=1)

# Share of constraints that passed, computed separately for the clean and the
# corrupted data (passed / (passed + failed)).
passed_clean = df['Passed Constraints (Clean)']
failed_clean = df['Failed Constraints (Clean)']
df['passed_constraints_ratio_on_clean_data'] = passed_clean / (passed_clean + failed_clean)

passed_corrupted = df['Passed Constraints (Corrupted)']
failed_corrupted = df['Failed Constraints (Corrupted)']
df['passed_constraints_ratio_on_corrupted_data'] = passed_corrupted / (passed_corrupted + failed_corrupted)

df.columns

Index(['Script', 'Execution Result on Clean Data',
       'Execution Result on Corrupted Data', 'Model', 'Strategy',
       'Passed Constraints (Clean)', 'Passed Constraints (Corrupted)',
       'Failed Constraints (Clean)', 'Failed Constraints (Corrupted)',
       'Dataset', 'Downstream Task', 'Error Injection Label',
       'performance_drop', 'passed_constraints_ratio_on_clean_data',
       'passed_constraints_ratio_on_corrupted_data'],
      dtype='object')

In [3]:
# Assign the filled column back instead of calling fillna(..., inplace=True)
# on df['Strategy']: the in-place call operates on an intermediate object,
# triggers a FutureWarning, and stops updating df from pandas 3.0 on.
df['Strategy'] = df['Strategy'].fillna("None")

# Distinct (Model, Strategy) pairs; the cells below iterate over them.
# NOTE(review): rows with a missing Model produce a (nan, "None") pair. Because
# `df['Model'] == nan` is never true, that pair selects no rows when it is used
# as a filter, so it shows up as an all-zero row in the summary tables.
unique_combinations = df[['Model', 'Strategy']].drop_duplicates()
for model, strategy in unique_combinations.values:
    print(model, strategy)


deequ None
deequ column_skipped
gpt-4o None
gpt-4o with_deequ
gpt-4o with_experience
gpt-4.5-preview None
gpt-4.5-preview with_deequ
gpt-4.5-preview with_experience
nan None


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Strategy'].fillna("None", inplace=True)


In [4]:
def _fail_pass_counts(frame, failed_column):
    """Return (#rows with at least one failed constraint, #rows with none) for failed_column."""
    failed = frame[failed_column]
    return (failed > 0).sum(), (failed == 0).sum()


def _summarize_combination(model, strategy):
    """Build one summary row of fail/pass counts for a (model, strategy) pair."""
    selected = df[(df['Model'] == model) & (df['Strategy'] == strategy)]
    # Split the corrupted runs by whether the downstream task degraded.
    without_impact = selected[selected['performance_drop'] == False]
    with_impact = selected[selected['performance_drop'] == True]
    return [
        model, strategy,
        *_fail_pass_counts(selected, 'Failed Constraints (Clean)'),
        *_fail_pass_counts(without_impact, 'Failed Constraints (Corrupted)'),
        *_fail_pass_counts(with_impact, 'Failed Constraints (Corrupted)'),
    ]


results = [_summarize_combination(model, strategy)
           for model, strategy in unique_combinations.values]

result_columns = [
    'Model', 'Strategy',
    'Clean Fail', 'Clean Pass',
    'Error NoImpact Fail', 'Error NoImpact Pass',
    'Error Impact Fail', 'Error Impact Pass',
]
result_df = pd.DataFrame(results, columns=result_columns)

result_df

Unnamed: 0,Model,Strategy,Clean Fail,Clean Pass,Error NoImpact Fail,Error NoImpact Pass,Error Impact Fail,Error Impact Pass
0,deequ,,180,75,190,20,41,4
1,deequ,column_skipped,0,255,137,73,34,11
2,gpt-4o,,27,228,66,144,22,23
3,gpt-4o,with_deequ,45,209,76,133,33,12
4,gpt-4o,with_experience,21,233,70,139,28,17
5,gpt-4.5-preview,,12,21,6,5,19,3
6,gpt-4.5-preview,with_deequ,12,21,8,3,19,3
7,gpt-4.5-preview,with_experience,18,9,10,0,15,2
8,,,0,0,0,0,0,0


In [5]:
import pandas as pd

results = []

for model, strategy in unique_combinations.values:
    df_ = df[(df['Model'] == model) & (df['Strategy'] == strategy)]
    df_with_no_impact_errors = df_[df_['performance_drop'] == False]
    df_with_impact_errors = df_[df_['performance_drop'] == True]

    # Confusion matrix with "all constraints pass" as the positive prediction
    # and "no impact on the downstream task" as the positive ground truth.
    # Every clean run is counted as "no impact", together with the corrupted
    # runs whose performance did not drop.
    # NOTE(review): the F1 below therefore describes the "pass / no impact"
    # class; an F1 for detecting task failures would use impact_fail as the
    # true positives and give different numbers. Confirm which one is intended.

    # True positive: no impact and constraints passed
    no_impact_pass = (df_['Failed Constraints (Clean)'] == 0).sum() + (
        df_with_no_impact_errors['Failed Constraints (Corrupted)'] == 0).sum()

    # False negative (false alarm): no impact but constraints failed
    no_impact_fail = (df_['Failed Constraints (Clean)'] > 0).sum() + (
        df_with_no_impact_errors['Failed Constraints (Corrupted)'] > 0).sum()

    # True negative (detection): impact and constraints failed
    impact_fail = (df_with_impact_errors['Failed Constraints (Corrupted)'] > 0).sum()

    # False positive (miss): impact but constraints passed
    impact_pass = (df_with_impact_errors['Failed Constraints (Corrupted)'] == 0).sum()

    # precision = TP / (TP + FP), recall = TP / (TP + FN); both fall back to
    # 0.0 when their denominator is empty. F1 is symmetric in the two, so its
    # value does not depend on which ratio carries which name.
    precision = no_impact_pass / (no_impact_pass + impact_pass) if (no_impact_pass + impact_pass) > 0 else 0.0
    recall = no_impact_pass / (no_impact_pass + no_impact_fail) if (no_impact_pass + no_impact_fail) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    results.append([model, strategy, no_impact_pass, no_impact_fail, impact_fail, impact_pass, f1])

result_df = pd.DataFrame(results,
                         columns=['Model', 'Strategy', 'No Impact Pass', 'No Impact False alarm',
                                  'Task failure Detection', 'Task failure Miss', 'F1'])

result_df

Unnamed: 0,Model,Strategy,No Impact Pass,No Impact False alarm,Task failure Detection,Task failure Miss,F1
0,deequ,,95,370,41,4,0.336879
1,deequ,column_skipped,328,137,34,11,0.81592
2,gpt-4o,,372,93,22,23,0.865116
3,gpt-4o,with_deequ,342,121,33,12,0.837209
4,gpt-4o,with_experience,372,91,28,17,0.873239
5,gpt-4.5-preview,,26,18,19,3,0.712329
6,gpt-4.5-preview,with_deequ,24,20,19,3,0.676056
7,gpt-4.5-preview,with_experience,9,28,15,2,0.375
8,,,0,0,0,0,0.0


In [6]:
results = []

for model, strategy in unique_combinations.values:
    df_ = df[(df['Model'] == model) & (df['Strategy'] == strategy)]

    # Confusion matrix with "all constraints pass" as the positive prediction
    # and "data is clean" as the positive ground truth; the task impact
    # (performance_drop) is not considered in this table.
    clean_pass = (df_['Failed Constraints (Clean)'] == 0).sum()  # true positive
    clean_fail = (df_['Failed Constraints (Clean)'] > 0).sum()  # false negative (false alarm)
    corrupted_fail = (df_['Failed Constraints (Corrupted)'] > 0).sum()  # true negative (detection)
    corrupted_pass = (df_['Failed Constraints (Corrupted)'] == 0).sum()  # false positive (miss)

    # precision = TP / (TP + FP), recall = TP / (TP + FN); both fall back to
    # 0.0 when their denominator is empty. F1 is symmetric in the two, so its
    # value does not depend on which ratio carries which name.
    precision = clean_pass / (clean_pass + corrupted_pass) if (clean_pass + corrupted_pass) > 0 else 0.0
    recall = clean_pass / (clean_pass + clean_fail) if (clean_pass + clean_fail) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    results.append([model, strategy, clean_pass, clean_fail, corrupted_fail, corrupted_pass, f1])

# Convert to DataFrame
result_df = pd.DataFrame(results,
                         columns=['Model', 'Strategy', 'Clean Pass', 'Clean False alarm', 'Corrupted Detection',
                                  'Corrupted Miss',
                                  'F1'])
result_df

Unnamed: 0,Model,Strategy,Clean Pass,Clean False alarm,Corrupted Detection,Corrupted Miss,F1
0,deequ,,75,180,231,24,0.423729
1,deequ,column_skipped,255,0,171,84,0.858586
2,gpt-4o,,228,27,88,167,0.701538
3,gpt-4o,with_deequ,209,45,109,145,0.6875
4,gpt-4o,with_experience,233,21,98,156,0.724728
5,gpt-4.5-preview,,21,12,25,8,0.677419
6,gpt-4.5-preview,with_deequ,21,12,27,6,0.7
7,gpt-4.5-preview,with_experience,9,18,25,2,0.473684
8,,,0,0,0,0,0.0
