In [4]:
import sys
import os

# Repository root, expressed relative to the directory the notebook runs from.
ROOT_DIR = '..'

# Make modules under <ROOT_DIR>/src importable (exp_helpers is imported from
# there further down). The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path each time.
_src_dir = os.path.join(ROOT_DIR, 'src')
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

import os
import sys

In [5]:
# Read the OpenAI API key from a local file next to the repository root and
# export it through the environment variable OPENAI_API_KEY.
api_key_path = os.path.join(ROOT_DIR, 'API_KEY.txt')
with open(api_key_path, 'r', encoding='utf-8') as file:
    api_key = file.read().strip()

# Fail here rather than exporting an empty key: a blank file would otherwise
# only surface later as an authentication error far from its cause.
if not api_key:
    raise ValueError(f"{api_key_path} is empty; expected it to contain the OpenAI API key")

os.environ['OPENAI_API_KEY'] = api_key
# os.environ['CACHE_DIR'] = os.path.join(ROOT_DIR, 'cache_dir')

In [6]:
# ---- Run settings: everything a reader might want to change lives here ----

# Which dataset / split / scoring method to run. data_name and method_name
# are used as lookup keys for the config registries in the next code cell.
data_name = 'synthchain_vanilla3_no_repeat_10_2'
split = 'val'
method_name = 'cert_granular_temp0_nonexact'

# Model to evaluate (also a registry key). Alternatives are kept commented
# out for quick switching.
# model_name = 'qwen2.5-7b-instruct'
# model_name = 'phi-4'
# model_name = 'gpt-4.1-mini'
model_name = 'gpt-4o-mini'

# p is handed to the stability scorer and is part of the results directory
# name. NOTE(review): its exact meaning is defined by the scorer — confirm.
p = 0.95

# suffix is appended to the results directory name and is also stored in each
# saved result under the 'entailment_mode' key.
suffix = '.no_cot'

# debug=True limits the scoring loop to at most 5 dataset entries.
debug = True

# NOTE(review): cache is not read by any cell visible in this notebook — confirm it is still needed.
cache = True

# Experiment Config

In [7]:
# Resolve the experiment configuration for the names chosen in the settings
# cell and prepare the output directory for this run.
#
# NOTE(review): the notes below are kept from the original cell. They mention
# exp_name / task / overcomplete, none of which are used in this cell —
# confirm they still describe the current config layout.
# Possible Configs
# exp_name = 'eb1_simple', 'eb1_overcomplete' etc.
# Inside the config:
# task (eb1, eb2, eb3, nsf_scify): str
# overcomplete: bool
import importlib
import exp_helpers
importlib.reload(exp_helpers)
# importlib.reload() re-executes only the module object it is given.
# Reloading the package therefore does not refresh the exp_configs submodule
# that is already cached in sys.modules, so edits to the registries would not
# be picked up. Reload the submodule explicitly before importing from it.
importlib.reload(importlib.import_module('exp_helpers.exp_configs'))
from exp_helpers.exp_configs import DATA_CONFIGS, METHOD_CONFIGS, MODEL_CONFIGS


# Look up the settings by name (an unknown name fails right here, assuming
# the registries are plain dicts).
data_config = DATA_CONFIGS[data_name]
method_config = METHOD_CONFIGS[method_name]
model_config = MODEL_CONFIGS[model_name]

# One results directory per (dataset, split, model, method, p, suffix)
# combination, so different runs never write into the same folder.
results_dir = os.path.join(
    ROOT_DIR,
    'results',
    'stability_rate',
    f'{data_name}.{split}.{model_name}.{method_name}.p{p}{suffix}'
)
os.makedirs(results_dir, exist_ok=True)

# Load data

In [8]:
from exp_helpers.datasets import get_dataset

[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Load the configured dataset split, resolving data paths against ROOT_DIR.
# NOTE(review): the original comment said "Assuming data is in
# mythbusters/data/scify" — confirm that still applies to the dataset
# selected by data_name.
dataset = get_dataset(
    data_config,
    split,
    root_dir=ROOT_DIR,
)

In [10]:
# Sanity check: number of entries in the loaded split.
len(dataset)

1000

In [11]:
# Peek at the first entry: the given premises ('raw_claims') followed by the
# step-by-step derivations built on them ('derived_claims').
dataset[0]['raw_claims'], dataset[0]['derived_claims']

(['Rule: H3 -> AZ (meaning that if I have H3, I can derive AZ)',
  'Rule: SG -> C6 (meaning that if I have SG, I can derive C6)',
  'Rule: C6 -> GM (meaning that if I have C6, I can derive GM)',
  'Rule: VD -> H3 (meaning that if I have VD, I can derive H3)',
  'Rule: G8 -> VD (meaning that if I have G8, I can derive VD)',
  'Rule: D8 -> U8 (meaning that if I have D8, I can derive U8)',
  'I have D8',
  'Rule: U8 -> DG (meaning that if I have U8, I can derive DG)',
  'Rule: DG -> G8 (meaning that if I have DG, I can derive G8)'],
 ['I have D8, I use rule (D8 -> U8) to derive U8, now I have U8',
  'I have U8, I use rule (U8 -> DG) to derive DG, now I have DG',
  'I have DG, I use rule (DG -> G8) to derive G8, now I have G8',
  'I have G8, I use rule (G8 -> VD) to derive VD, now I have VD',
  'I have VD, I use rule (VD -> H3) to derive H3, now I have H3',
  'I have H3, I use rule (H3 -> AZ) to derive AZ, now I have AZ',
  'I have AZ, I use rule (AZ -> SG) to derive SG, now I have SG',
  

In [12]:
# Same inspection for another entry; change di to look at a different example.
di = 1
dataset[di]['raw_claims'], dataset[di]['derived_claims']

(['Rule: G8 -> VD (meaning that if I have G8, I can derive VD)',
  'Rule: H3 -> DG (meaning that if I have H3, I can derive DG)',
  'Rule: VD -> SG (meaning that if I have VD, I can derive SG)',
  'Rule: SG -> H3 (meaning that if I have SG, I can derive H3)',
  'Rule: DG -> AZ (meaning that if I have DG, I can derive AZ)',
  'Rule: D8 -> G8 (meaning that if I have D8, I can derive G8)',
  'I have GM',
  'Rule: U8 -> C6 (meaning that if I have U8, I can derive C6)',
  'Rule: GM -> U8 (meaning that if I have GM, I can derive U8)'],
 ['I have GM, I use rule (GM -> U8) to derive U8, now I have U8',
  'I have U8, I use rule (U8 -> C6) to derive C6, now I have C6',
  'I have C6, I use rule (C6 -> D8) to derive D8, now I have D8',
  'I have D8, I use rule (D8 -> G8) to derive G8, now I have G8',
  'I have G8, I use rule (G8 -> VD) to derive VD, now I have VD',
  'I have VD, I use rule (VD -> SG) to derive SG, now I have SG',
  'I have SG, I use rule (SG -> H3) to derive H3, now I have H3',
  

In [13]:
# TODO: append to the end of the base claims: "Is claim {claim} scientifically feasible, where scientifically feasible means XXX?"
# TODO: append one more claim at the very end of the derived claims: "Therefore, the claim {claim} is {mapping[likert]}"

# Load Model

In [14]:
from exp_helpers.models import get_llm, EntailmentModel

INFO 01-12 19:13:10 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 01-12 19:13:11 [__init__.py:239] Automatically detected platform cuda.


In [15]:
# Build the LLM from the selected model config, then wrap it in the
# entailment judge used by the stability scorer.
llm = get_llm(**model_config)
entailment_model = EntailmentModel(
    llm=llm,
    max_new_tokens=1000,
)

# Generate Entailment Prob

In [16]:
# Define a custom entailment-judging configuration: a system prompt that asks
# for a seven-level Likert verdict, plus a mapping from each verdict label to
# a numeric score.
# NOTE(review): the prompt text contains a few grammatical slips (e.g. "the
# both the precondition", "it is only use particular rules"). It is left
# untouched on purpose: it is runtime text, and editing it would change what
# the model sees and make new results incomparable with already saved ones.
entailment_mode = entailment_model.create_custom_config(
    system_prompt = """
You are an expert judge for evaluating entailment. Given claims/reasoning chain as evidence and a hypothesis, determine if all the claims together supports that hypothesis is correct. 

The hypothesis is entailed if it can be derived from the premises.
For the hypothesis, please first make your reasoning about what is the precondition the hypothesis is (which clues or what previously derived results it uses etc.), and what conclusion it is making based on the precondition.
If the hypothesis says it is only use particular rules, then you should only make the judgement based on those clues, without relying on other previously stated claims, even if having access to those claims allows you to reach the conclusion.
Please only say it is entailed if the both the precondition and the logical derivation of the conclusion based on the precondition are correct.
Provide your judgment as one of the following: "Very Likely", "Likely", "Somewhat Likely", "Neutral", "Somewhat Unlikely", "Unlikely", "Very Unlikely", where Very Likely means the hypothesis is very likely to be entailed (given the premise), and Very Unlikely means that the hypothesis contradicts to some premise logically.

Input format:
Context:
<claims as evidence>

Hypothesis
<hypothesis claim>

The output format must be one of the following without additional words:
Very Likely/Likely/Somewhat Likely/Neutral/Somewhat Unlikely/Unlikely/Very Unlikely
""",
    # Numeric score for each verdict label, from 1.0 ("Very Likely") down to
    # 0.0 ("Very Unlikely"); the labels match the ones listed in the prompt.
    mapping = {
        'Very Likely': 1.0,
        'Likely': 0.8,
        'Somewhat Likely': 0.6,
        'Neutral': 0.5,
        'Somewhat Unlikely': 0.4,
        'Unlikely': 0.2,
        'Very Unlikely': 0.0
    },
    field_name='Entailment',
    # NOTE(review): presumably the label used when the model's answer cannot
    # be matched to one of the mapping keys — confirm in EntailmentModel.
    default_value='Neutral'
)

# Hand the custom entailment mode to the scorer through the method kwargs.
# method_config is the very object stored in the imported METHOD_CONFIGS
# registry, so assigning into method_config['kwargs'] would modify that shared
# registry for the rest of the kernel session. Build fresh dicts instead and
# rebind the local name; re-running this cell gives the same result.
method_config = {
    **method_config,
    'kwargs': {**method_config['kwargs'], 'entailment_mode': entailment_mode},
}

In [17]:
from exp_helpers.methods import get_stability_scorer

# Build the stability scorer for the selected method, passing the entailment
# judge, the threshold/level p, and any method-specific keyword arguments.
stability_scorer = get_stability_scorer(
    method_config['method'],
    entailment_model,
    p,
    **method_config['kwargs'],
)

In [18]:
# Show where this run's per-entry JSON results are written.
results_dir

'../results/stability_rate/synthchain_vanilla3_no_repeat_10_2.val.gpt-4o-mini.cert_granular_temp0_nonexact.p0.95.no_cot'

In [19]:
from tqdm.auto import tqdm
import json
import random
import time

def _write_json_atomic(path, payload):
    """Write ``payload`` as indented JSON to ``path`` without exposing a partial file.

    The data goes to a sibling ``<path>.tmp`` file first and is then moved into
    place with ``os.replace``. The scoring loop treats "file exists" as "entry
    already done", so a run interrupted mid-write must not leave a truncated
    file at ``path``; a leftover ``.tmp`` file is simply overwritten next time.
    """
    tmp_path = f'{path}.tmp'
    with open(tmp_path, 'wt') as output_file:
        json.dump(payload, output_file, indent=4)
    os.replace(tmp_path, path)


total = len(dataset)
if debug:
    # Debug runs score at most the first 5 entries.
    total = min(total, 5)

for di in tqdm(range(total)):
    data_entry = dataset[di]
    result_path = os.path.join(results_dir, f'{di}.json')
    # Resume support: an existing result file means this entry is already scored.
    if os.path.exists(result_path):
        continue
    results_all = stability_scorer.get_stability_rate(data_entry)

    # The scorer result exposes _asdict(); convert it to a plain dict for JSON.
    results_save = results_all._asdict()
    # Store the run suffix under 'entailment_mode' in every per-claim record.
    # NOTE(review): presumably this replaces a value that is not
    # JSON-serializable — confirm against the scorer's output.
    for claim_result in results_save['stab_rate_results']:
        claim_result['stab_rate_results']['entailment_mode'] = suffix

    # sleep_duration = random.uniform(0.5, 2)
    # time.sleep(sleep_duration)

    _write_json_atomic(result_path, results_save)

  0%|          | 0/5 [00:00<?, ?it/s]

# Look at the results

In [20]:
# Index of the saved entry to inspect.
di = 0

# Read back the JSON file written for that entry by the scoring loop.
load_path = os.path.join(results_dir, f'{di}.json')
with open(load_path, 'rt') as input_file:
    results = json.loads(input_file.read())

# Matching inputs from the dataset: the premises and the derived steps.
base_claims, derived_claims = dataset[di]['raw_claims'], dataset[di]['derived_claims']

In [21]:
# Per-claim stability rates, followed by two lengths that should agree:
# there should be exactly one rate per derived claim.
results['stability_rates'], len(results['stability_rates']), len(derived_claims)

([0.9746154546737671,
  0.9676923751831055,
  0.9299999475479126,
  0.8892307281494141,
  0.9307692050933838,
  0.88692307472229,
  0.01230769231915474,
  0.01230769231915474,
  0.016153845936059952],
 9,
 9)

In [22]:
# Plain-text report: the premises first, then every derived claim prefixed
# with its stability rate (two decimals).
# Observation from the saved output for di = 0: the rates fall to about 0.01
# from claim 7 onward. Claim 7 is the first step that uses rule (AZ -> SG),
# and that rule does not appear among the base claims of this entry.
print("=== Base Claims ===")
for position, base_claim in enumerate(base_claims, start=1):
    print(f"{position:2d}. {base_claim}")

print("\n=== Derived Claims with Stability Rates ===")
scored_claims = zip(derived_claims, results['stability_rates'])
for position, (derived_claim, stability_rate) in enumerate(scored_claims, start=1):
    print(f"{position:2d}. [{stability_rate:.2f}] {derived_claim}")

=== Base Claims ===
 1. Rule: H3 -> AZ (meaning that if I have H3, I can derive AZ)
 2. Rule: SG -> C6 (meaning that if I have SG, I can derive C6)
 3. Rule: C6 -> GM (meaning that if I have C6, I can derive GM)
 4. Rule: VD -> H3 (meaning that if I have VD, I can derive H3)
 5. Rule: G8 -> VD (meaning that if I have G8, I can derive VD)
 6. Rule: D8 -> U8 (meaning that if I have D8, I can derive U8)
 7. I have D8
 8. Rule: U8 -> DG (meaning that if I have U8, I can derive DG)
 9. Rule: DG -> G8 (meaning that if I have DG, I can derive G8)

=== Derived Claims with Stability Rates ===
 1. [0.97] I have D8, I use rule (D8 -> U8) to derive U8, now I have U8
 2. [0.97] I have U8, I use rule (U8 -> DG) to derive DG, now I have DG
 3. [0.93] I have DG, I use rule (DG -> G8) to derive G8, now I have G8
 4. [0.89] I have G8, I use rule (G8 -> VD) to derive VD, now I have VD
 5. [0.93] I have VD, I use rule (VD -> H3) to derive H3, now I have H3
 6. [0.89] I have H3, I use rule (H3 -> AZ) to de

In [23]:
import pandas as pd

# Same report as tables. Both frames carry a 1-based "Index" column so rows
# can be referred to by the same numbers as in the text report.
base_positions = range(1, len(base_claims) + 1)
df_base = pd.DataFrame({"Index": base_positions, "Base Claim": base_claims})

derived_positions = range(1, len(derived_claims) + 1)
df_derived = pd.DataFrame({
    "Index": derived_positions,
    "Stability Rate": results['stability_rates'],
    "Derived Claim": derived_claims,
})

# Rich display; the stability rate column is rendered with two decimals.
print("=== Base Claims ===")
display(df_base)

print("\n=== Derived Claims with Stability Rates ===")
rate_format = {"Stability Rate": "{:.2f}"}
display(df_derived.style.format(rate_format))


=== Base Claims ===


Unnamed: 0,Index,Base Claim
0,1,"Rule: H3 -> AZ (meaning that if I have H3, I c..."
1,2,"Rule: SG -> C6 (meaning that if I have SG, I c..."
2,3,"Rule: C6 -> GM (meaning that if I have C6, I c..."
3,4,"Rule: VD -> H3 (meaning that if I have VD, I c..."
4,5,"Rule: G8 -> VD (meaning that if I have G8, I c..."
5,6,"Rule: D8 -> U8 (meaning that if I have D8, I c..."
6,7,I have D8
7,8,"Rule: U8 -> DG (meaning that if I have U8, I c..."
8,9,"Rule: DG -> G8 (meaning that if I have DG, I c..."



=== Derived Claims with Stability Rates ===


Unnamed: 0,Index,Stability Rate,Derived Claim
0,1,0.97,"I have D8, I use rule (D8 -> U8) to derive U8, now I have U8"
1,2,0.97,"I have U8, I use rule (U8 -> DG) to derive DG, now I have DG"
2,3,0.93,"I have DG, I use rule (DG -> G8) to derive G8, now I have G8"
3,4,0.89,"I have G8, I use rule (G8 -> VD) to derive VD, now I have VD"
4,5,0.93,"I have VD, I use rule (VD -> H3) to derive H3, now I have H3"
5,6,0.89,"I have H3, I use rule (H3 -> AZ) to derive AZ, now I have AZ"
6,7,0.01,"I have AZ, I use rule (AZ -> SG) to derive SG, now I have SG"
7,8,0.01,"I have SG, I use rule (SG -> C6) to derive C6, now I have C6"
8,9,0.02,"I have C6, I use rule (C6 -> GM) to derive GM, now I have GM"
