## Imports

In [9]:
import openai
import tiktoken

import pandas as pd
import json

from dateutil import parser as date_parser
from unidecode import unidecode

import os
import time
import re

## Table Generator

In [10]:
class TableGenerator_JSON():
    TEMPLATE = """
    List %s - as many as possible to fit into response.
    The response will be formatted as JSON shown below.
    Each element of the response will contain %d fields: %s.
    Do not output any additional text that is not in JSON format.
    
    RESPONSE FORMAT:
    [{
        %s
    }]
    """   
    
    def _norm_field(self, s):
        s = s.lower().replace(" ","_").replace("-","_").replace(".", "").replace(",","_")\
                .replace("(", "").replace(")", "").replace(":", "").replace('"','').replace("'","")\
                .replace("/", "")
        return re.sub('_+', '_', s)
        
    def generate_prompts(self, query, fields):
        system_msg = "You are a retriever of facts."

        num_fields = len(fields)
        fields_json = []
        fields = [self._norm_field(f) for f in fields]
        for field in fields:
            fields_json.append('"%s": "%s"' % ('_'.join(field.replace("-", " ").split()), field))
        response_format = ', '.join(fields_json)
        user_msg = self.TEMPLATE % (query, num_fields, fields, response_format)
        return system_msg, user_msg        
    
    def parse_llm_response(self, response): 
        res = []
        try:
            if not response.startswith("[") and "[" in response:
                response = response[response.find("["):]

            if not response.endswith("]") and "]" in response:
                response = response[:response.rfind("]")+1]

            if '[' not in response and ']' not in response and '{' in response and '}' in response:
                response = '[' + response + ']'    

            response_json = json.loads(response)

            if isinstance(response_json, dict) and len(response_json.keys()) == 1:
                response_json = list(response_json.values())[0]    
        except:  
            split_response = response.split("{")
            response_json = []
            for s in split_response[1:]:
                split_s = s.split("}")
                if len(split_s) > 1:
                    content = split_s[0]
                    attributes = content.split(",")
                    elements = {}
                    for attr in attributes:
                        knv = attr.split(":")   
                        if len(knv) > 1:
                            parsed_k = "%s" % knv[0].replace('"','').strip()
                            parsed_v = "%s" % knv[1].replace('"','').strip()
                            elements[parsed_k] = parsed_v

                    response_json.append(elements)  

        df = pd.DataFrame.from_records(response_json) 
        return df

## Experiment Runner

In [None]:
class ExperimentRunner():
    OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY')
    client = openai.OpenAI(
        api_key=OPENROUTER_API_KEY,
        base_url="https://openrouter.ai/api/v1"# Model to use for LLM queries
    )
    MODEL = 'x-ai/grok-code-fast-1'  # OpenRouter format: provider/model
    NOTE = 'full_table'
        
    def __init__(self, table_generator, metadata_path):
        with open(metadata_path, "rb") as f:
            self.metadata = json.load(f)
            
        self.table_generator = table_generator
        
        self.result_folder = "DATA/%s_%s_%s" % (self.MODEL.replace('-', '_'), 
                                                   self.NOTE,
                                                   time.strftime("%Y%m%d-%H%M%S"))
        
        print("Experiment result folder: %s" % self.result_folder)
        
        os.makedirs(self.result_folder)
        os.makedirs("%s/Tables" % self.result_folder)
        
        self.result = {}
        
    def fetch_data(self, idx):
        task = self.metadata[idx]
        
        task_name = task['name']        
        print("Fetching data for %s" % task_name)
        
        query, columns = task['table_title'], task['columns']            
        print("Query: ", query) 
        
        system_msg, user_msg = self.table_generator.generate_prompts(query, columns)        

        self.result[idx] = {'system_msg': system_msg, 'user_msg': user_msg}
        
        response = self.client.chat.completions.create(
            model=self.MODEL,
            messages=[{"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg}],
            temperature=0
        )
        response = response.choices[0].message.content.strip()

        if 'response' in self.result[idx]:
            self.result[idx]['response'].append(response)
        else:    
            self.result[idx]['response'] = [response]

        df = self.table_generator.parse_llm_response(response)          
        df_ref = pd.read_csv(task['path'])          
        df.columns = df_ref.columns
        df = df.drop_duplicates(subset=task['keys'])

        table_path = "%s/Tables/%s.csv" % (self.result_folder, task_name)
        self.result[idx]['table_path'] = table_path                
        df.to_csv(table_path, index=False)

        print("Created table with %d rows" % len(df))

        return df
    
    def save_result(self):
        with open("%s/result.json" % self.result_folder, "w") as outfile:
            result_json = json.dumps(self.result, indent=4)
            outfile.write(result_json)

## Test

In [12]:
tg = TableGenerator_JSON()

runner = ExperimentRunner(tg, metadata_path="cfg.json")

print("\n====================\n")

for i in range(100):
    print("Table # %d" % (i+1))
    idx = "%d" % i
    table = runner.fetch_data(idx)
    print("\n====================\n")
    
runner.save_result()

Experiment result folder: DATA/x_ai/grok_code_fast_1_full_table_20251005-000106


Table # 1
Fetching data for republican_straw_polls_2012
Query:  results of straw polls for the Republican Party presidential primaries, 2012


KeyError: 'path'