In [1]:
import os
import sys

# Repository root, relative to the directory this notebook is run from.
ROOT_DIR = '..'

# Make <ROOT_DIR>/src importable (presumably where the project's helper
# packages live — confirm). Guarded so that re-running this cell does not keep
# appending the same entry to sys.path.
src_dir = os.path.join(ROOT_DIR, 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
# Read the OpenAI key from API_KEY.txt under ROOT_DIR (surrounding whitespace
# stripped) and expose it to the process through the environment.
with open(f"{ROOT_DIR}/API_KEY.txt", "r") as key_file:
    api_key = key_file.read().strip()
os.environ['OPENAI_API_KEY'] = api_key
# os.environ['CACHE_DIR'] = os.path.join(ROOT_DIR, 'cache_dir')

In [5]:
# --- What to run ------------------------------------------------------------
# data_name / method_name / model_name are used as keys into DATA_CONFIGS /
# METHOD_CONFIGS / MODEL_CONFIGS in the experiment-config cell.
data_name = 'recipe_graph3'
split = 'val'
method_name = 'cert_granular_temp0_nonexact'

# Alternative models kept for reference:
# model_name = 'qwen2.5-7b-instruct'
# model_name = 'phi-4'
# model_name = 'gpt-4.1-mini'
model_name = 'gpt-4o-mini'

# --- Run options ------------------------------------------------------------
# p is passed to get_stability_scorer and is part of the results directory name.
p = 0.95
# suffix is appended to the results directory name and stored as
# 'entailment_mode' in each saved result.
suffix = '.no_cot'
# When True, the scoring loop processes at most 5 entries.
debug = True
# NOTE(review): `cache` is not referenced in the cells visible here — confirm
# whether it is still needed.
cache = True

# Experiment Config

In [6]:
# Resolve the names chosen in the configuration cell into concrete settings and
# create the directory that per-entry result files are written to.
# NOTE(review): an earlier comment here described configs selected by an
# `exp_name` (e.g. 'eb1_simple', 'eb1_overcomplete') carrying `task`
# (eb1, eb2, eb3, nsf_scify) and `overcomplete` fields. The lookups below use
# data_name / method_name / model_name instead — confirm which scheme is current.
import importlib
import exp_helpers
import exp_helpers.exp_configs

# importlib.reload() re-executes only the module it is given. Reloading the
# package alone leaves an already-imported exp_helpers.exp_configs untouched,
# so the config submodule is reloaded explicitly to pick up edits made on disk.
importlib.reload(exp_helpers)
importlib.reload(exp_helpers.exp_configs)
from exp_helpers.exp_configs import DATA_CONFIGS, METHOD_CONFIGS, MODEL_CONFIGS


data_config = DATA_CONFIGS[data_name]
method_config = METHOD_CONFIGS[method_name]
model_config = MODEL_CONFIGS[model_name]

# One directory per (dataset, split, model, method, p, suffix) combination.
results_dir = os.path.join(
    ROOT_DIR,
    'results',
    'stability_rate',
    f'{data_name}.{split}.{model_name}.{method_name}.p{p}{suffix}'
)
os.makedirs(results_dir, exist_ok=True)

# Load data

In [7]:
from exp_helpers.datasets import get_dataset

[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/runai-home/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# Load the configured dataset split through the project loader.
# Original note: assuming data is in mythbusters/data/scify
# NOTE(review): data_name is set to 'recipe_graph3' in the config cell, so the
# scify location above may be outdated — confirm.
dataset = get_dataset(data_config, split, root_dir=ROOT_DIR)

In [10]:
# Number of entries in the loaded split.
len(dataset)

1000

In [11]:
# Peek at the first entry: its raw claims next to its derived claims.
dataset[0]['raw_claims'], dataset[0]['derived_claims']

(['We have pepper.',
  'We have garlic.',
  'We have onion.',
  'Only after the necessary preceding steps (START), And if we have all the ingredients, we can then Peel-Peel 1 garlic clove.',
  'Only after the necessary preceding steps (Stir-Stir the contents in the microwave with a spoon), And if we have all the ingredients, we can then Microwave-Microwave the plate, covered, on high for 1.5 minutes.',
  'Only after the necessary preceding steps (Cut-Cut 1/8 garlic clove), And if we have all the ingredients, we can then Mince-Mince 1/8 garlic clove.',
  'Only after the necessary preceding steps (Mix-Mix 1/4 cup sweet-and-sour sauce and 1/2 teaspoon soy sauce in a small bowl, and Top-Top the plate with the carrots, onion, garlic and 1/4 tsp pepper powder), And if we have all the ingredients, we can then Pour-Pour the sauces over the meatballs.',
  'Only after the necessary preceding steps (START), And if we have all the ingredients, we can then Cut-Cut 1/4 medium carrot into short, thin

In [12]:
# Same inspection for another entry; change `di` to browse.
# Note: `di` is also the loop variable of the scoring loop further down, so its
# value depends on which cell ran last.
di = 1
dataset[di]['raw_claims'], dataset[di]['derived_claims']

(['We have corn.',
  'Only after the necessary preceding steps (Mix-Mix the contents of the bowl well), And if we have all the ingredients, we can then END.',
  'Only after the necessary preceding steps (Measure-Measure 2 cups of frozen corn), And if we have all the ingredients, we can then Thaw-Thaw the frozen corn by putting it in a sieve and running it under cold water.',
  'We have lime.',
  'Only after the necessary preceding steps (Microwave-Microwave the corn for 3 more minutes), And if we have all the ingredients, we can then Add-Add 1 teaspoon salt to the bowl.',
  'Only after the necessary preceding steps (Microwave-Microwave the corn for 3 more minutes, and Extract-Extract lime juice from 1/3 lime), And if we have all the ingredients, we can then add-add lime juice to the bowl.',
  'Only after the necessary preceding steps (START), And if we have all the ingredients, we can then Measure-Measure 2 cups of frozen corn.',
  'Only after the necessary preceding steps (Add-Add 1 t

In [None]:
# TODO: add to the end of the base claims: "Is claim {claim} scientifically feasible, where scientifically feasible means XXX?"
# At the very end of derived claim, there should be another claim added: "Therefore, the claim {claim} is {mapping[likert]}"

# Load Model

In [13]:
from exp_helpers.models import get_llm, EntailmentModel

INFO 01-12 19:10:28 [importing.py:53] Triton module has been replaced with a placeholder.
INFO 01-12 19:10:28 [__init__.py:239] Automatically detected platform cuda.


In [14]:
# Build the LLM from the selected model settings, then wrap it in the
# entailment judge used by the stability scorer.
llm = get_llm(**model_config)
# NOTE(review): max_new_tokens=1000 presumably caps the generation length per
# judge query — confirm in EntailmentModel.
entailment_model = EntailmentModel(llm=llm, max_new_tokens=1000)

# Generate Entailment Prob

In [15]:
# Custom judging configuration: the system prompt asks the model for one of
# seven Likert-style verdicts, and `mapping` pairs each verdict label with a
# numeric score between 0.0 and 1.0.
# The prompt text is model input; edit it deliberately, since any change
# (including typo fixes) changes what is sent to the model.
entailment_mode = entailment_model.create_custom_config(
    system_prompt = """
You are an expert judge for evaluating entailment. Given claims/reasoning chain as evidence and a hypothesis, determine if all the claims together supports that hypothesis is correct. 

The hypothesis is entailed if it can be derived from the premises.
For the hypothesis, please first make your reasoning about what is the precondition the hypothesis is (which clues or what previously derived results it uses etc.), and what conclusion it is making based on the precondition.
If the hypothesis says it is only use particular rules, then you should only make the judgement based on those clues, without relying on other previously stated claims, even if having access to those claims allows you to reach the conclusion.
Please only say it is entailed if the both the precondition and the logical derivation of the conclusion based on the precondition are correct.
Provide your judgment as one of the following: "Very Likely", "Likely", "Somewhat Likely", "Neutral", "Somewhat Unlikely", "Unlikely", "Very Unlikely", where Very Likely means the hypothesis is very likely to be entailed (given the premise), and Very Unlikely means that the hypothesis contradicts to some premise logically.
If there are certain ingredients needed in the hypothesis that are not in the premise, then it should be Very Unlikely.
If there are certain previous steps that need to be completed before doing the step in the hypothesis, and that step is not completed, then it should also be Very Unlikely.

Input format:
Context:
<claims as evidence>

Hypothesis
<hypothesis claim>

The output format must be one of the following without additional words:
Very Likely/Likely/Somewhat Likely/Neutral/Somewhat Unlikely/Unlikely/Very Unlikely
""",
    # Verdict label -> numeric score; 'Neutral' sits at 0.5.
    mapping = {
        'Very Likely': 1.0,
        'Likely': 0.8,
        'Somewhat Likely': 0.6,
        'Neutral': 0.5,
        'Somewhat Unlikely': 0.4,
        'Unlikely': 0.2,
        'Very Unlikely': 0.0
    },
    field_name='Entailment',
    # NOTE(review): presumably the label used when no verdict can be parsed
    # from the model output — confirm in EntailmentModel.
    default_value='Neutral'
)

# Pass the custom configuration to the scorer via the method kwargs.
# This writes into the dict taken from METHOD_CONFIGS (in-place mutation).
method_config['kwargs']['entailment_mode'] = entailment_mode

In [16]:
from exp_helpers.methods import get_stability_scorer

# Build the scorer for the selected method. It receives the entailment judge,
# the value of p, and the method kwargs (which include the custom
# entailment_mode set in the previous cell).
stability_scorer = get_stability_scorer(
    method_config['method'],
    entailment_model,
    p,
    **method_config['kwargs'],
)

In [17]:
# Show where per-entry result files are written.
results_dir

'../results/stability_rate/recipe_graph3.val.gpt-4o-mini.cert_granular_temp0_nonexact.p0.95.no_cot'

In [18]:
from tqdm.auto import tqdm
import json
import random
import time

# Score each dataset entry and save one JSON file per entry in results_dir.
# In debug mode only the first 5 entries (at most) are processed.
total = len(dataset)
if debug:
    total = min(total, 5)

for di in tqdm(range(total)):
    out_path = os.path.join(results_dir, f'{di}.json')
    # Resume support: an entry that already has a result file is skipped.
    # Checked before fetching the entry so skipped entries cost nothing.
    if os.path.exists(out_path):
        continue

    data_entry = dataset[di]
    results_all = stability_scorer.get_stability_rate(data_entry)

    results_save = results_all._asdict()
    # Overwrite the stored 'entailment_mode' with the run suffix before saving
    # (presumably because the config object is not JSON-serialisable — confirm).
    for i in range(len(results_save['stab_rate_results'])):
        results_save['stab_rate_results'][i]['stab_rate_results']['entailment_mode'] = suffix

    # Optional throttling between calls (disabled):
    # sleep_duration = random.uniform(0.5, 2)
    # time.sleep(sleep_duration)

    # Write to a temporary file and rename it into place. If the run is
    # interrupted or json.dump fails mid-write, no truncated `{di}.json` is
    # left behind for the skip check above to mistake for a finished result.
    tmp_path = out_path + '.tmp'
    with open(tmp_path, 'wt') as output_file:
        json.dump(results_save, output_file, indent=4)
    os.replace(tmp_path, out_path)

  0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Look at the results

In [16]:
# Entry whose saved results are inspected in the cells below.
di = 0

# Read back the JSON file written for this entry by the scoring loop.
load_path = os.path.join(results_dir, f'{di}.json')
with open(load_path, 'rt') as fh:
    results = json.load(fh)

# Claims of the same entry, shown next to the scores further down.
base_claims, derived_claims = (
    dataset[di][key] for key in ('raw_claims', 'derived_claims')
)

In [19]:
# Show the saved stability rates together with their count and the number of
# derived claims, so a length mismatch between the two is visible.
results['stability_rates'], len(results['stability_rates']), len(derived_claims)

([0.962598443031311,
  0.9960628747940063,
  0.9803148508071899,
  0.9625983834266663,
  0.960629940032959,
  0.9999998807907104,
  0.9999998807907104,
  0.9999998807907104],
 8,
 8)

In [22]:
# Plain-text listing: numbered base claims first, then each derived claim
# prefixed with its stability rate (two decimals).
print("=== Base Claims ===")
for idx, text in enumerate(base_claims, start=1):
    print(f"{idx:2d}. {text}")

print("\n=== Derived Claims with Stability Rates ===")
scored_claims = zip(derived_claims, results['stability_rates'])
for idx, (text, score) in enumerate(scored_claims, start=1):
    print(f"{idx:2d}. [{score:.2f}] {text}")

=== Base Claims ===
 1. A kinetic energy cutoff of 40 Ry for the wave function and of 240 Ry for the density together with a (10 × 10 × 12) Monkhorst–Pack k-point grid were used in geometry optimization; a (12 × 12 × 16) k-point grid was instead used for density of states calculations. Since IrO2 is a metal, we applied the Marzari-Vanderbilt cold smearing (22) of 0.02 Ry to the computed eigenvalues so as to improve the k-point convergence of our calculations.
 2. We mentioned already the different demands for sampling density in semiconducting vs. metallic systems. Even if a material is semiconducting in the bulk, it may possess surface or interface states that are metallic.
 3. Interestingly, transition metals require more k-points in general, independently from being part of a metallic (Fig. 6c) or of a non-metallic (Fig. 6d) system.
 4. A baseline k-point mesh of 1000/(number of atoms in the cell) is used for all computations... As mentioned, we currently employ a k-point mesh of 10

In [23]:
import pandas as pd

# Table of base claims, numbered from 1.
base_index = range(1, len(base_claims) + 1)
df_base = pd.DataFrame({"Index": base_index, "Base Claim": base_claims})

# Table of derived claims, numbered from 1, with one stability rate per claim.
derived_index = range(1, len(derived_claims) + 1)
df_derived = pd.DataFrame(
    {
        "Index": derived_index,
        "Stability Rate": results['stability_rates'],
        "Derived Claim": derived_claims,
    }
)

# Rich display of both tables; rates are rendered with two decimals.
print("=== Base Claims ===")
display(df_base)

print("\n=== Derived Claims with Stability Rates ===")
rate_format = {"Stability Rate": "{:.2f}"}
display(df_derived.style.format(rate_format))


=== Base Claims ===


Unnamed: 0,Index,Base Claim
0,1,A kinetic energy cutoff of 40 Ry for the wave ...
1,2,We mentioned already the different demands for...
2,3,"Interestingly, transition metals require more ..."
3,4,A baseline k-point mesh of 1000/(number of ato...
4,5,"For bulk calculations, a 4 × 4 × 4 Monkhorst-P..."
5,6,Metallic systems require accurate Fermi surfac...
6,7,The tetragonal symmetry of rutile (space group...
7,8,The comparison between insulating rutile struc...
8,9,Based on convergence studies showing that meta...



=== Derived Claims with Stability Rates ===


Unnamed: 0,Index,Stability Rate,Derived Claim
0,1,0.96,The claim that a 5x5x5 k-point grid is sufficient for converged DFT energy calculations of rutile-IrO2 bulk is extremely infeasible.
1,2,1.0,"Published DFT studies on rutile IrO2 employ significantly denser k-point grids, with 10×10×12 for geometry optimization and 12×12×16 for density of states calculations."
2,3,0.98,"IrO2 is a metallic conductor, which requires denser k-point sampling compared to insulators due to the need to accurately capture the Fermi surface."
3,4,0.96,Transition metal compounds generally require higher k-point densities regardless of whether they are metallic or non-metallic.
4,5,0.96,"The rutile structure has tetragonal symmetry (a=b≠c), which typically requires different k-point densities along different crystallographic directions."
5,6,1.0,"Materials Project uses a baseline of 1000 k-points per reciprocal atom for transition metal oxides, which would translate to much denser grids than 5×5×5 for a 6-atom rutile unit cell."
6,7,1.0,"Even for insulating rutile structures, coarser grids like 4×4×4 are considered minimal, and metallic systems require significantly denser sampling."
7,8,1.0,"A 5×5×5 grid would likely result in errors of several meV/atom or more, which is insufficient for reliable total energy calculations."
