In [1]:
import sys
import os

# Base directory (relative to the notebook's working directory) under which
# `src/`, `API_KEY.txt` and `results/` are looked up by the cells below.
ROOT_DIR = '../../../../../../..'

# Make the project sources importable. The membership check keeps this cell
# idempotent: re-running it no longer stacks duplicate entries onto sys.path.
_src_dir = os.path.join(ROOT_DIR, 'src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

import os
import sys

In [2]:
# Load the OpenAI credential from a local text file under ROOT_DIR (so it is
# never hard-coded in the notebook) and expose it through the environment.
api_key_path = f"{ROOT_DIR}/API_KEY.txt"
with open(api_key_path, "r") as key_file:
    # Strip surrounding whitespace, e.g. a trailing newline in the file.
    api_key = key_file.read().strip()

os.environ['OPENAI_API_KEY'] = api_key
# os.environ['CACHE_DIR'] = os.path.join(ROOT_DIR, 'cache_dir_synthchain_stability')

In [3]:
# --- Registry keys: looked up in DATA_CONFIGS / METHOD_CONFIGS / MODEL_CONFIGS ---
data_name = 'synthchain_vanilla3_no_repeat_10_2'
method_name = 'cert_binary_temp0_nonexact'
model_name = 'gpt-4o-mini'
# Alternative models:
# model_name = 'qwen2.5-7b-instruct'
# model_name = 'phi-4'

# --- Run parameters ---
split = 'val'        # dataset split handed to get_dataset
p = 0.95             # forwarded to get_stability_scorer; also part of the results dir name
suffix = '.no_cot'   # appended to the results dir name and recorded in each saved result

# --- Execution switches ---
debug = True         # when True, the scoring loop processes at most 100 examples
cache = True         # not referenced by any cell visible in this notebook

# Experiment Config

In [4]:
# Possible Configs
# exp_name = 'eb1_simple', 'eb1_overcomplete' etc.
# Inside the config:
# task (eb1, eb2, eb3, nsf_scify): str
# overcomplete: bool
import importlib
import exp_helpers
import exp_helpers.exp_configs

# Pick up edits to the config registries without restarting the kernel.
# importlib.reload() on a package only re-executes the package's __init__;
# already-imported submodules stay cached, so the module that actually defines
# the registries is reloaded explicitly as well.
importlib.reload(exp_helpers)
importlib.reload(exp_helpers.exp_configs)
from exp_helpers.exp_configs import DATA_CONFIGS, METHOD_CONFIGS, MODEL_CONFIGS


# Resolve the names chosen in the parameter cell to their config entries.
data_config = DATA_CONFIGS[data_name]
method_config = METHOD_CONFIGS[method_name]
model_config = MODEL_CONFIGS[model_name]

# One results directory per (dataset, split, model, method, p, suffix) combination,
# so different runs never overwrite each other's per-example files.
results_dir = os.path.join(
    ROOT_DIR,
    'results',
    'stability_rate',
    f'{data_name}.{split}.{model_name}.{method_name}.p{p}{suffix}'
)
os.makedirs(results_dir, exist_ok=True)

# Load Data

In [5]:
from exp_helpers.datasets import get_dataset

[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Load the configured dataset for the selected split ('val' in this run).
dataset = get_dataset(data_config, split)

In [7]:
# Number of examples in the loaded split.
len(dataset)

1000

In [8]:
# Inspect the first example: its 'raw_claims' and 'derived_claims' fields.
dataset[0]['raw_claims'], dataset[0]['derived_claims']

(['Rule: H3 -> AZ (meaning that if I have H3, I can derive AZ)',
  'Rule: SG -> C6 (meaning that if I have SG, I can derive C6)',
  'Rule: C6 -> GM (meaning that if I have C6, I can derive GM)',
  'Rule: VD -> H3 (meaning that if I have VD, I can derive H3)',
  'Rule: G8 -> VD (meaning that if I have G8, I can derive VD)',
  'Rule: D8 -> U8 (meaning that if I have D8, I can derive U8)',
  'I have D8',
  'Rule: U8 -> DG (meaning that if I have U8, I can derive DG)',
  'Rule: DG -> G8 (meaning that if I have DG, I can derive G8)'],
 ['I have D8, I use rule (D8 -> U8) to derive U8, now I have U8',
  'I have U8, I use rule (U8 -> DG) to derive DG, now I have DG',
  'I have DG, I use rule (DG -> G8) to derive G8, now I have G8',
  'I have G8, I use rule (G8 -> VD) to derive VD, now I have VD',
  'I have VD, I use rule (VD -> H3) to derive H3, now I have H3',
  'I have H3, I use rule (H3 -> AZ) to derive AZ, now I have AZ',
  'I have AZ, I use rule (AZ -> SG) to derive SG, now I have SG',
  

# Load Model

In [9]:
from exp_helpers.models import get_llm, EntailmentModel

In [10]:
# Instantiate the LLM from the selected model's config entry (unpacked as keyword arguments).
llm = get_llm(**model_config)
# Wrap the LLM in the project's EntailmentModel.
# NOTE(review): max_new_tokens=1000 presumably caps the length of each judgment — confirm in EntailmentModel.
entailment_model = EntailmentModel(llm=llm, max_new_tokens=1000)

# Generate Entailment Prob

In [11]:
# Instructions given to the entailment judge. The text is sent to the model
# verbatim, so any wording change alters the experiment.
entailment_system_prompt = """
You are an expert judge for evaluating entailment. Given claims/reasoning chain as evidence and a hypothesis, determine if all the claims together supports that hypothesis is correct. 

The hypothesis is entailed if it can be derived from the premises.
For the hypothesis, please first make your reasoning about what is the precondition the hypothesis is (which clues or what previously derived results it uses etc.), and what conclusion it is making based on the precondition.
If the hypothesis says it is only use particular rules, then you should only make the judgement based on those clues, without relying on other previously stated claims, even if having access to those claims allows you to reach the conclusion.
Please only say it is entailed if the both the precondition and the logical derivation of the conclusion based on the precondition are correct.
Provide your judgment as one of the following: "YES", "NO", where YES means the hypothesis is entailed (given the premise), and NO means that the hypothesis contradicts to some premise logically.

Input format:
Context:
<claims as evidence>

Hypothesis
<hypothesis claim>

The output format must be the following format without additional words.
```json
{
"Entailment": "<YES/NO>"
}
```
###
"""

# Numeric score assigned to each verdict string.
verdict_to_score = {
    'YES': 1.0,
    'NO': 0.0,
}

# Register a custom entailment mode: the verdict is read from the "Entailment"
# field of the model's JSON answer.
# NOTE(review): default_value='NO' is presumably the fallback when that field
# cannot be read — confirm in EntailmentModel.create_custom_config.
entailment_mode = entailment_model.create_custom_config(
    system_prompt=entailment_system_prompt,
    mapping=verdict_to_score,
    field_name='Entailment',
    default_value='NO',
)

# Hand the custom mode to the scorer through the method's keyword arguments.
method_config['kwargs']['entailment_mode'] = entailment_mode

In [12]:
from exp_helpers.methods import get_stability_scorer

# Build the scorer for the selected method; the method's kwargs (including the
# custom entailment mode set above in method_config) are unpacked into the call.
stability_scorer = get_stability_scorer(
    method_config['method'],
    entailment_model,
    p,
    **method_config['kwargs'],
)

In [13]:
# Display the directory the per-example result files are written to.
results_dir

'../../../../../../../results/stability_rate/synthchain_vanilla3_no_repeat_10_2.val.gpt-4o-mini.cert_binary_temp0_nonexact.p0.95.no_cot'

In [14]:
from tqdm.auto import tqdm
import json
import random
import time

def _write_json_atomic(path, payload):
    """Write `payload` as indented JSON to `path` without ever exposing a partial file.

    The data is first dumped to `<path>.tmp` and then moved into place with
    os.replace. If serialization fails or the run is interrupted mid-write,
    `path` is never created, so the resume check in the loop below cannot
    mistake a truncated file for a finished result.

    Args:
        path: Final destination of the JSON file.
        payload: JSON-serializable object to store.

    Raises:
        TypeError / ValueError: propagated from json.dump for unserializable payloads.
        OSError: propagated from the file operations.
    """
    tmp_path = f'{path}.tmp'
    try:
        with open(tmp_path, 'wt') as output_file:
            json.dump(payload, output_file, indent=4)
        os.replace(tmp_path, path)
    finally:
        # Only present if the dump or the move failed; never leave it behind.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# In debug mode only the first 100 examples (at most) are scored.
total = len(dataset)
if debug:
    total = min(total, 100)

for di in tqdm(range(total)):
    result_path = os.path.join(results_dir, f'{di}.json')
    # Resume support: examples that already have a result file are skipped,
    # before paying the cost of fetching the example.
    if os.path.exists(result_path):
        continue

    data_entry = dataset[di]
    results_all = stability_scorer.get_stability_rate(data_entry)

    # The scorer's result exposes _asdict(); convert it to a plain dict for JSON.
    results_save = results_all._asdict()
    # Record the run suffix on every entry, under the 'entailment_mode' key.
    entries = results_save['stab_rate_results']
    for i in range(len(entries)):
        entries[i]['stab_rate_results']['entailment_mode'] = suffix

    # sleep_duration = random.uniform(0.5, 2)
    # time.sleep(sleep_duration)

    _write_json_atomic(result_path, results_save)

  0%|          | 0/100 [00:00<?, ?it/s]

# Sanity Check: Show An Example Tree

# Evaluate Against Ground Truth