In [1]:
%config Completer.use_jedi = False
import sys
# Add the parent directory to the module search path.
# NOTE(review): presumably this is what makes the local `checklist` package
# (imported in the next cell) resolvable — confirm against the repo layout.
sys.path.append('../')

In [2]:
import os
from math import ceil

import pandas as pd

from checklist.test_suite import TestSuite

# Remove the column-width limit so long text cells are displayed without truncation.
pd.set_option('display.max_colwidth', None)

def load_suite(path):
    """Load a saved test suite and build its visual summary table.

    Args:
        path: Location of the suite file, handed to ``TestSuite.from_file``.

    Returns:
        A ``(suite, summary_table)`` tuple: the loaded suite and the object
        returned by its ``visual_summary_table()`` method.
    """
    loaded_suite = TestSuite.from_file(path)
    return loaded_suite, loaded_suite.visual_summary_table()


def proccess_summary_do_dataframe(suite, summary_table):
    """Flatten the test cases of every test in the summary into one DataFrame.

    For each entry in ``summary_table.get_state()['test_infos']`` the template
    name is taken as the second ``' - '``-separated piece of the entry's
    ``'name'``, and the test's ``filtered_testcases`` are read from
    ``suite.visual_summary_by_test(name)``. Only the first example of each
    test case is used.

    Args:
        suite: Object exposing ``visual_summary_by_test(name)``.
        summary_table: Object exposing ``get_state()``.

    Returns:
        A ``(df, templates)`` tuple where ``df`` has the columns
        ``text, label, pred, succeed, template`` (one row per test case) and
        ``templates`` lists the template names in the order they were visited.
    """
    column_names = ['text', 'label', 'pred', 'succeed', 'template']
    rows = []
    template_names = []

    for info in summary_table.get_state()['test_infos']:
        full_name = info['name']
        # The test name is expected to look like "<prefix> - <template>".
        short_name = full_name.split(' - ')[1]
        summary = suite.visual_summary_by_test(full_name)
        template_names.append(short_name)

        for case in summary.filtered_testcases:
            # Only the first example of each test case is exported.
            example = case['examples'][0]
            candidate = example['new']
            text, pred = candidate['text'], candidate['pred']
            label, succeed = example['label'], example['succeed']
            rows.append([text, label, pred, succeed, short_name])

    return pd.DataFrame(rows, columns=column_names), template_names
    


def sample_size(n, dp=0.8, e=0.05, z=1.96):
    """Return how many items to sample from a finite population of size ``n``.

    Computes ``x / (1 + x / n)`` with ``x = z**2 * dp * (1 - dp) / e**2`` and
    rounds the result up. This has the form of Cochran's sample-size formula
    with a finite-population correction.

    Args:
        n: Population size.
        dp: Proportion used in the ``dp * (1 - dp)`` variance term.
        e: Error term; the formula divides by ``e**2``
            (presumably the tolerated margin of error).
        z: Multiplier squared in the numerator
            (presumably the z-score of the confidence level; 1.96 ~ 95%).

    Returns:
        The sample size as an ``int``. A non-positive population yields 0
        instead of dividing by zero.
    """
    # An empty population needs no samples; this also avoids the division by
    # ``(e**2) * n`` below when n == 0.
    if n <= 0:
        return 0

    variance = dp * (1 - dp)
    numerator = ((z ** 2) * variance) / (e ** 2)
    denominator = 1 + (((z ** 2) * variance) / ((e ** 2) * n))
    return ceil(numerator / denominator)

def sample_df(df_template, df_total_size):
    """Draw this template's proportional share of the overall sample.

    The overall sample size comes from ``sample_size(df_total_size)``; the
    template receives the fraction ``len(df_template) / df_total_size`` of it,
    rounded up, with a minimum of one row for any non-empty template.

    Args:
        df_template: DataFrame holding the rows of a single template.
        df_total_size: Number of rows in the whole population the template
            belongs to.

    Returns:
        A DataFrame of sampled rows (fixed ``random_state=42``). An empty
        ``df_template`` yields an empty frame with the same columns.
    """
    template_size = len(df_template)

    # Nothing to sample from. Returning early also avoids calling
    # sample_size(0) and dividing by zero when the whole population is empty.
    if template_size == 0:
        return df_template.iloc[:0].copy()

    total_samples = sample_size(df_total_size)
    total = ceil(total_samples * (template_size / df_total_size))

    # At least one row per non-empty template, but never more rows than exist:
    # DataFrame.sample raises ValueError when asked for more than the population.
    total = min(max(1, total), template_size)

    return df_template.sample(total, random_state=42)

In [3]:
# Load the generated templates for approach2 and show the frame as the cell output.
templates_df = pd.read_csv("generated_templates/generated_templates_approach2.csv")
templates_df

Unnamed: 0.1,Unnamed: 0,template_index,label,original_text,masked_text,template_text


In [4]:
from math import ceil

# !rmdir /s /q test_cases
# !mkdir test_cases

approachs = ['approach1','approach2','approach3','approach4','approach5','random']

# Columns written to every per-template sheet; the 'template' column is left
# out because each template gets its own sheet.
columns = ['text', 'label', 'pred', 'succeed']

# Make sure the output folder exists so the notebook runs on a fresh checkout
# (the manual mkdir above is commented out).
os.makedirs('test_cases', exist_ok=True)

for appr in approachs:
    suite, summary_table = load_suite(f'./suites/posneg-{appr}.suite')
    df, template_names = proccess_summary_do_dataframe(suite, summary_table)
    print(f'Proccessing {appr}...')

    filename = f'test_cases/{appr}.xlsx'

    # Read the templates before opening the writer so that a missing or
    # malformed CSV fails here, before any workbook is created.
    templates_df = pd.read_csv(f"generated_templates/generated_templates_{appr}.csv")

    # Split the test cases by their `succeed` flag; each group is sampled separately.
    df0 = df[df['succeed'] == 0]
    df1 = df[df['succeed'] == 1]

    sample0_count = 0
    sample1_count = 0
    with pd.ExcelWriter(filename) as writer:
        templates_df.to_excel(writer, sheet_name="templates", index=False)

        # One sheet per template, holding the samples drawn from both groups.
        for template_name in template_names:
            df_template0 = df0[df0['template'] == template_name]
            df_template1 = df1[df1['template'] == template_name]

            # Each template's share is computed relative to the size of its own group.
            df_sampled0 = sample_df(df_template0, len(df0))
            df_sampled1 = sample_df(df_template1, len(df1))

            df_filtered = pd.concat([df_sampled0, df_sampled1], axis=0)

            df_filtered.to_excel(writer, sheet_name=template_name, index=False, columns=columns)
            sample0_count += len(df_sampled0)
            sample1_count += len(df_sampled1)

    print(appr, f'samples: (class 0: {sample0_count}, class 1: {sample1_count})')

Please wait as we prepare the table data...
Proccessing approach1...
approach1 samples: (class 0: 0, class 1: 0)
Please wait as we prepare the table data...
Proccessing approach2...
approach2 samples: (class 0: 0, class 1: 0)
Please wait as we prepare the table data...
Proccessing approach3...
approach3 samples: (class 0: 121, class 1: 222)
Please wait as we prepare the table data...
Proccessing approach4...
approach4 samples: (class 0: 169, class 1: 234)
Please wait as we prepare the table data...
Proccessing approach5...
approach5 samples: (class 0: 207, class 1: 243)
Please wait as we prepare the table data...
Proccessing random...
random samples: (class 0: 147, class 1: 220)
