In [1]:
from utils import *
from prompt_generators import create_basic_prompt, generate_coding_prompt, generate_prompt_with_image, generate_code_fixing_prompt, create_reasoning_model_prompt
from llm_utils import generate_llm_response, initialize_client
from eval import extract_solution, evaluate_solution, convert_output_to_array, extract_code, execute_transform_function

import os

import ast
import time
import json
import asyncio

import numpy as np
import pandas as pd
     
# Run settings shared by the cells below. 'model' is the model name passed to
# generate_llm_response in the evaluation loop; 'solution_type' is not read in
# any of the visible cells.
config = dict(
    model='pixtral-large-latest',
    solution_type='simple',
)

# Build the API client once, up front; the evaluation loop reuses this object
# for every request.
client = initialize_client('mistral')
print("Client initialized!")

Client initialized!


In [2]:
# Create test_set: the mapping of task name -> task content that the evaluation
# loop iterates over with .items().
# check_dataset, load_files_from_json and filter_tasks_by_grid_size are not
# imported by name, so they presumably come from `from utils import *` -- confirm.
check_dataset()
# NOTE(review): binding the second split to `eval` shadows Python's built-in
# eval() for the rest of the notebook; the variable is not used again in the
# visible cells.
train, eval = load_files_from_json()
# Only the `train` split is filtered into test_set; the size criterion itself
# lives inside filter_tasks_by_grid_size.
test_set = filter_tasks_by_grid_size(train)
print(f"Test set has {len(test_set)} tasks in it.")

ARC data complete.
Test set has 300 tasks in it.


In [3]:
##################################
#   Visual Reasoning
##################################
# For every task in test_set: build the image-based prompt, query the model,
# parse the answer into an array, score it against the first test output and
# checkpoint all rows collected so far to CSV.

# Single definition of the output file (previously repeated at each save).
RESULTS_CSV = "results/vl_models/Pixtral-Large_visual-reasoning.csv"
# DataFrame.to_csv does not create missing directories, so make sure the target
# folder exists before the first (slow) model call instead of failing after it.
os.makedirs(os.path.dirname(RESULTS_CSV), exist_ok=True)

results = []
for task_index, (task_name, task_content) in enumerate(test_set.items()):
    print(f"Processing task {task_name} using visual reasoning.")
    print(f"Task #{task_index + 1} out of {len(test_set)}")

    messages = generate_prompt_with_image(task_name, task_content)
    response = generate_llm_response(messages=messages,
                                     client=client,
                                     model=config['model'])
    print("Evaluating answer")

    # Stays None when parsing fails; evaluate_solution is still called with it.
    extracted_answer = None
    try:
        extracted_answer = convert_output_to_array(extract_solution(response))
    except ValueError as e:
        # NOTE(review): only ValueError is handled here. If the extraction
        # helpers can raise anything else on a malformed answer, the whole run
        # stops at that task -- confirm against their implementation.
        print(f"Error converting extracted solution to NumPy array: {e}")

    # Only the first test pair of each task is scored.
    correct_solution = np.array(task_content['test'][0]['output'])
    result = evaluate_solution(extracted_answer, correct_solution)

    print(result)

    # One CSV row per task: identifiers and raw/parsed answers, plus whatever
    # metrics evaluate_solution returned.
    row = {
        'task': task_name,
        'llm_full_answer': response,
        'llm_extracted_answer': extracted_answer,
    }
    row.update(result)
    results.append(row)

    # Checkpoint before pausing, so an interrupt during the sleep cannot lose
    # the row that was just computed.
    df = pd.DataFrame(results)
    df.to_csv(RESULTS_CSV, index=False)
    print(f"Data saved to CSV after {task_index + 1} iterations.")

    # Fixed one-second pause between consecutive requests.
    print("Waiting for one second...")
    time.sleep(1)

# Final save. Also covers an empty test_set, where the loop body never ran and
# `df` would otherwise be undefined for later cells.
df = pd.DataFrame(results)
df.to_csv(RESULTS_CSV, index=False)
print("Data saved to CSV at the end of the loop.")

Processing task 75b8110e using visual reasoning.
Task #1 out of 300
Time taken for 1 sample(s): 99.56 seconds
Evaluating answer
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.06}
Waiting for one second...
Data saved to CSV after 1 iterations.
Processing task 3618c87e using visual reasoning.
Task #2 out of 300
Time taken for 1 sample(s): 44.28 seconds
Evaluating answer
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 1.0}
Waiting for one second...
Data saved to CSV after 2 iterations.
Processing task 3ac3eb23 using visual reasoning.
Task #3 out of 300
Time taken for 1 sample(s): 67.13 seconds
Evaluating answer
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.88}
Waiting for one second...
Data saved to CSV after 3 iterations.
Processing task 3c9b0459 using visual reasoning.
Task #4 out of 300
Time taken for 1 sample(s): 59.11 seconds
Evaluating answer
{'answer_extracted': True, 'correct_grid_size': Tru

In [None]:
# Keep only the rows whose percentage_correct equals 1, then display the
# per-column non-null counts for those rows.
df[df["percentage_correct"] == 1].count()