In [1]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [24]:
import numpy as np
import pandas as pd

In [2]:
from pathlib import Path
from bench.spider.schema import load_schemas
from bench.spider.dialogue import load_spider_data
from bench.spider.prompt_formatter import SpiderPromptFormatter

# Root of the raw Spider dataset, relative to the notebook's working directory.
raw_spider_dir = Path('../../spider/data/spider')

# Training and dev examples come from the two JSON files in the dataset root.
train_data = load_spider_data(raw_spider_dir / 'train_spider.json')
dev_data = load_spider_data(raw_spider_dir / 'dev.json')

# Schemas are built from tables.json, with the database directory passed alongside.
spider_schemas = load_schemas(
    db_path=raw_spider_dir / 'database',
    schemas_path=raw_spider_dir / 'tables.json',
)

In [5]:
import os
import sys
import json

# Put the parent of the current working directory on the import path; the
# `evaluation_utils` import right below presumably relies on this.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so that re-running this cell does not pile up duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from evaluation_utils import get_final_particles_from_record, create_particle_approx

In [10]:

# Base name (without extension) of the raw inference output to evaluate.
results_name = 'table_column_potential_n_particles_50'

# The file is JSON Lines: decode one record per line.
results = []
with open(results_name + '.jsonl', 'r') as f:
    results.extend(json.loads(line) for line in f)

### Run evaluation

In [11]:
from evaluation_utils import run_and_add_evaluation

# Attach evaluation output to every record (8 workers, 10-unit timeout per
# query, existing evaluations kept because overwrite is False).
results = run_and_add_evaluation(
    results,
    raw_spider_dir=raw_spider_dir,
    n_workers=8,
    overwrite=False,
    timeout=10,
)

# Persist the evaluated records as JSON Lines under "<results_name>-evaluated".
evaluated_path = results_name + '-evaluated' + '.jsonl'
with open(evaluated_path, 'w') as f:
    f.writelines(json.dumps(r) + '\n' for r in results)

 87%|████████▋ | 901/1034 [31:18<06:39,  3.00s/it]  

Query execution timed out after 10 seconds.


100%|██████████| 1034/1034 [37:30<00:00,  2.18s/it]
 68%|██████▊   | 705/1034 [38:39<07:40,  1.40s/it]  

Query execution timed out after 10 seconds.


 79%|███████▉  | 819/1034 [42:05<11:29,  3.21s/it]

Query execution timed out after 10 seconds.


 79%|███████▉  | 820/1034 [42:30<34:57,  9.80s/it]

Query execution timed out after 10 seconds.


 80%|███████▉  | 824/1034 [42:40<15:35,  4.45s/it]

Query execution timed out after 10 seconds.


 80%|███████▉  | 826/1034 [42:42<10:07,  2.92s/it]

Query execution timed out after 10 seconds.


 80%|███████▉  | 827/1034 [42:58<23:00,  6.67s/it]

Query execution timed out after 10 seconds.


 82%|████████▏ | 846/1034 [44:04<14:29,  4.63s/it]

Query execution timed out after 10 seconds.
Query execution timed out after 10 seconds.


 82%|████████▏ | 849/1034 [44:25<19:28,  6.31s/it]

Query execution timed out after 10 seconds.


 83%|████████▎ | 858/1034 [44:42<05:07,  1.75s/it]

Query execution timed out after 10 seconds.


 83%|████████▎ | 859/1034 [45:48<1:01:19, 21.02s/it]

Query execution timed out after 10 seconds.


100%|██████████| 1034/1034 [52:29<00:00,  3.05s/it] 
100%|██████████| 1034/1034 [54:52<00:00,  3.18s/it] 
100%|██████████| 1034/1034 [1:03:22<00:00,  3.68s/it]


### Downstream accuracy

In [1]:
import json

# Evaluated records produced by the "Run evaluation" section, which writes
# `table_column_potential_n_particles_50-evaluated.jsonl`. The name must agree
# with that file: the summary table and the exported CSV in this section are
# both for n_particles=50 (a `..._5-evaluated` name matches neither).
results_name = 'table_column_potential_n_particles_50-evaluated'

# JSON Lines: one evaluated record per line.
results = []
with open(results_name + '.jsonl', 'r') as f:
    for line in f:
        results.append(json.loads(line))

In [6]:
from pathlib import Path
from bench.spider.schema import load_schemas
from run_inference import table_column_potential

# Location of the raw Spider dataset relative to the working directory.
raw_spider_dir = Path('../../spider/data/spider')

# These schemas are handed to `table_column_potential` inside `make_df`.
spider_schemas = load_schemas(
    db_path=raw_spider_dir / 'database',
    schemas_path=raw_spider_dir / 'tables.json',
)

In [20]:
from tqdm.notebook import tqdm

def make_df(new_results):
    """Flatten evaluated inference records into one accuracy row per method.

    Every record yields a row carrying its own ``posterior_weighted_acc``
    result. Two record types yield one additional, derived row:

    * ``sis_no_potential`` -> ``local_poe``: unweighted fraction of particles
      whose ``particle_results`` entry has a truthy ``[1][0]`` flag
      (presumably the per-particle correctness flag).
    * ``sis`` -> ``local_poe_with_potential``: the same fraction, restricted
      to particles for which ``table_column_potential`` returns 0. The
      accuracy is 0 when no particle qualifies.

    The ``sis`` branch reads the module-level ``spider_schemas``.

    Args:
        new_results: Iterable of evaluated record dicts. Each needs
            ``question``, ``n_particles``, ``method``, ``n_replicate`` and
            ``results['posterior_weighted_acc']``; ``sis`` records also need
            ``record`` and ``db_name``. ``ess_threshold`` and
            ``resample_method`` are optional and default to ``0`` and
            ``'multinomial'``.

    Returns:
        pandas.DataFrame with one row per (record, method) pair.
        ``instance_idx`` is the position of the source record in
        ``new_results``.

    Raises:
        AssertionError: If the number of particle results differs from
            ``n_particles``, or a particle's evaluated text differs from its
            recorded context.
    """

    def build_row(result, instance_idx, method, accuracy, resample_method, **extra):
        # Key order here defines the column order of the resulting frame.
        return {
            'question': result['question'],
            'ess_threshold': result.get('ess_threshold', 0),
            'n_particles': result['n_particles'],
            'method': method,
            'result': accuracy,
            'n_replicate': result['n_replicate'],
            'instance_idx': instance_idx,
            'resample_method': resample_method,
            **extra,
        }

    df_lines = []
    # Wrap the sequence itself (not the enumerate object) so tqdm can see its
    # length and render real progress.
    for instance_idx, result in enumerate(tqdm(new_results)):
        posterior = result['results']['posterior_weighted_acc']

        df_lines.append(build_row(
            result,
            instance_idx,
            result['method'],
            posterior['result'],
            result.get('resample_method', 'multinomial'),
        ))

        if result['method'] == 'sis_no_potential':
            particle_results = posterior['particle_results']
            assert result['n_particles'] == len(particle_results)

            n_correct = sum(
                1 for particle_result in particle_results if particle_result[1][0]
            )

            df_lines.append(build_row(
                result,
                instance_idx,
                'local_poe',
                n_correct / result['n_particles'],
                'multinomial',
            ))

        elif result['method'] == 'sis':
            particle_results = posterior['particle_results']
            assert result['n_particles'] == len(particle_results)

            particles = get_final_particles_from_record(result['record'])

            # Renormalize the distribution over the particles the potential
            # accepts. Invalid cases were weighted with log_eps, so the
            # potential values cannot be systematically recovered from the
            # particle weights; the potential is re-evaluated here instead of
            # thresholding the weights.
            potential_values = table_column_potential(
                particles=create_particle_approx(particles),
                schema_name=result['db_name'],
                grammar_path='../spider_grammars',
                spider_schemas=spider_schemas
            )

            n_correct = 0
            n_valid = 0
            # The particle index must not reuse the record index variable,
            # otherwise the row appended below would report the last particle
            # index as its `instance_idx`.
            for particle_idx, particle_result in enumerate(particle_results):
                assert ''.join(particle_result[0]) == ''.join(particles[particle_idx]['context'])
                if potential_values[particle_idx] == 0:
                    n_valid += 1
                    if particle_result[1][0]:
                        n_correct += 1

            accuracy = n_correct / n_valid if n_valid > 0 else 0

            df_lines.append(build_row(
                result,
                instance_idx,
                'local_poe_with_potential',
                accuracy,
                'multinomial',
                num_resample_steps=0,
            ))

    return pd.DataFrame(df_lines)

In [21]:
df = make_df(results)

# Mean, standard deviation and row count of `result` per (method, n_particles).
accuracy_stats = {'result': ['mean', 'std', 'size']}
df.groupby(['method', 'n_particles']).agg(accuracy_stats)

0it [00:00, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,result,result,result
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,size
method,n_particles,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
lm_baseline,50,0.538356,0.364068,1034
local_poe,50,0.564391,0.366432,1034
local_poe_with_potential,50,0.598549,0.363924,1034
sis,50,0.625974,0.368413,1034
sis_no_potential,50,0.603244,0.370643,1034
smc,50,0.621714,0.37736,1034
smc_no_potential,50,0.598601,0.376286,1034


In [22]:
# Export the rows built by `make_df`, without the DataFrame index column.
csv_path = 'table_column_potential_n_particles50_llama3.1-8B-instruct_results.csv'
df.to_csv(csv_path, index=False)