In [None]:
from utils import check_dataset, load_files_from_json, find_task, create_prompt, generate_coding_prompt
from llm_utils import generate_llm_response, initialize_client
from eval import extract_solution, evaluate_solution, fix_output_string, extract_code, execute_transform_function

import time
import json

import numpy as np
import pandas as pd


check_dataset()
train, eval = load_files_from_json()

config = {
    'model': 'mistral-small-latest',
    'solution_type': 'program_synthesis',
    #'temperatures': [0.1, 0.3, 0.5, 0.7, 0.9],
    'temperatures': [0.5, 0.6],
    'samples_per_step': 2,
    'code_correction_steps': 4
}

# Initializing client:
client = initialize_client('mistral')
print("Client initialized!")

# Create smaller dataset for testing purposes
test_set1 = dict(list(train.items())[0:2])

###################################
##   Use this for program synthesis
###################################
# Start looping over tasks!    
for task_index, (task_name, task_content) in enumerate(test_set1.items()):
        
    print(f"Processing task {task_name} using program synthesis!")
    
    user_prompt = generate_coding_prompt(task_content)
    system_prompt = "You are a talented python programmer."
    
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": [{"type": "text", "text": user_prompt}]}
    ]
    
    # Generate 10 coding solutions for one task
    # will return a list, as multiple samples are drawn
    print("Generating code...")
    responses_list = []
    additional_infos = []
    temperatures = config['temperatures']
    
    for temp in temperatures:
        print(f"Generating {config['samples_per_step']} solutions using temperature {temp}")
        responses, additional_info = generate_llm_response(messages, client=client, model=config['model'],
                                                    num_samples=config['samples_per_step'], temperature=temp)
        
        # Add 
        responses_list.extend(responses)
        
        additional_info.insert(0, task_name)
        additional_info.append(temp)
        additional_infos.append(additional_info)

ARC data complete.
Client initialized!
Processing task 75b8110e using program synthesis!
Generating code...
Generating 2 solutions using temperature 0.5
Time taken for 2 sample(s): 19.46 seconds
Generating 2 solutions using temperature 0.6
Time taken for 2 sample(s): 12.59 seconds
Processing task 3618c87e using program synthesis!
Generating code...
Generating 2 solutions using temperature 0.5
Time taken for 2 sample(s): 12.08 seconds
Generating 2 solutions using temperature 0.6
Time taken for 2 sample(s): 9.52 seconds


In [81]:
import pandas as pd
dfxy = pd.read_csv("results/codestral_test_results_temp04.csv")

In [82]:
dfxy

Unnamed: 0,task,sample_num,extracted_code,average_percentage_correct,count_perfect_scores
0,f5b8619d,1,def transform(input_grid):\n output_grid = ...,0.436667,0
1,f5b8619d,2,def transform(input_grid):\n output_grid = ...,0.383333,0
2,f5b8619d,3,def transform(input_grid):\n rows = len(inp...,0.456667,0
3,f5b8619d,4,def transform(input_grid):\n if not input_g...,0.413333,0
4,f5b8619d,5,def transform(input_grid):\n rows = len(inp...,0.456667,0
...,...,...,...,...,...
85,025d127b,6,def transform(grid: list[list[int]]) -> list[l...,0.780000,0
86,025d127b,7,def transform(grid):\n rows = len(grid)\n ...,0.665000,0
87,025d127b,8,def transform(grid: list[list[int]]) -> list[l...,0.680000,0
88,025d127b,9,def transform(grid: list[list[int]]) -> list[l...,0.895000,0


In [79]:
type(additional_info)

list

In [77]:
df1 = pd.DataFrame(additional_infos, 
                   columns=['task_name', 'processing_time', 'samples_drawn', 'input_tokens', 'output_tokens', 'temperature'])
df1

Unnamed: 0,task_name,processing_time,samples_drawn,input_tokens,output_tokens,temperature
0,3618c87e,12.08,2,833,2580,0.5
1,3618c87e,9.52,2,833,1696,0.6


In [80]:
responses_list

["<reasoning>\nThe transformation rule appears to involve moving the '1's and '5's in the grid to the rightmost position in their respective rows, while preserving the order of '1's and '5's. Specifically, '1's are moved to the rightmost position in their respective rows, and '5's are moved to the leftmost position in their respective rows. The '0's remain in their original positions.\n\nLet's break down the examples:\n\nExample 1:\n- Input:\n  ```\n  [\n  [0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0],\n  [0, 0, 1, 0, 0],\n  [0, 0, 5, 0, 0],\n  [5, 5, 5, 5, 5]\n  ]\n  ```\n- Output:\n  ```\n  [\n  [0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0],\n  [0, 0, 5, 0, 0],\n  [5, 5, 1, 5, 5]\n  ]\n  ```\n  Here, the '1' in the third row is moved to the rightmost position, and the '5's in the fourth and fifth rows are moved to the leftmost position.\n\nExample 2:\n- Input:\n  ```\n  [\n  [0, 0, 0, 0, 0],\n  [0, 0, 0, 0, 0],\n  [0, 1, 0, 1, 0],\n  [0, 5, 0, 5, 0],\n  [5, 5, 5, 5, 5]\n  ]\n  ```\n-

In [30]:
code_results = []

for i, generated_code in enumerate(generated_programs):
        
    print(f"Attempting to run program #{i}...")

    dicts_list = []
    # Set up individual input/output examples as arrays
    for example in task_content['train']:
        example_input = example['input']
        example_output = np.array(example['output'])
        
        # Then execute generated code for each example
        output_grid_with_llm_code = execute_transform_function(generated_code, example_input)
        try:
            solution_array = np.array(output_grid_with_llm_code)
        except:
            print("No numpy array could be made :(")
            
        result = evaluate_solution(solution_array, example_output)
        dicts_list.append(result)
        
        for d in dicts_list:
            print(d)
            
    percentage_correct_values = [d['percentage_correct'] for d in dicts_list]
    average_percentage_correct = sum(percentage_correct_values) / len(percentage_correct_values)

    # count_perfect_scores = 0   
    # for d in dicts_list:
    #     if d['percentage_correct'] == 1.0:
    #         count_perfect_scores += 1       
    # simplified to:
    count_perfect_scores = sum(1 for d in dicts_list if d['percentage_correct'] == 1.0)

    print(i, average_percentage_correct, count_perfect_scores)
    # try to save results:

    row = (
    {
        'task': task_name,
        'sample_num': i+1,
        'full_answer': "xx",
        'extracted_code': generated_code,
        'average_percentage_correct': average_percentage_correct,
        'count_perfect_scores': count_perfect_scores
    })

    code_results.append(row)

Attempting to run program #0...
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.12}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.12}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.06}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.12}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.06}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.19}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.12}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.06}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.19}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.19}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_correct': 0.12}
{'answer_extracted': True, 'correct_grid_size': True, 'percentage_

In [115]:
df = pd.read_csv("results/program_synthesis/mistral-24b_code.csv")
df.query("average_percentage_correct == 1")

Unnamed: 0,task,sample_num,full_answer,extracted_code,generated_grids,average_percentage_correct,count_perfect_scores
1101,5582e5ca,2,<reasoning>\n\n1. **Grid Size**: The input and...,def transform_grid(input_grid):\n from coll...,"[[[4, 4, 4], [4, 4, 4], [4, 4, 4]], [[9, 9, 9]...",1.0,3
1110,5582e5ca,11,<reasoning>\n1. **Grid Size**: The input and o...,def transform_grid(input_grid):\n from coll...,"[[[4, 4, 4], [4, 4, 4], [4, 4, 4]], [[9, 9, 9]...",1.0,3
1112,5582e5ca,13,<reasoning>\n1. **Grid Size**: The input and o...,def transform_grid(input_grid):\n from coll...,"[[[4, 4, 4], [4, 4, 4], [4, 4, 4]], [[9, 9, 9]...",1.0,3
1116,5582e5ca,17,<reasoning>\n\n1. **Grid Size Observation**:\n...,def transform_grid(input_grid):\n from coll...,"[[[4, 4, 4], [4, 4, 4], [4, 4, 4]], [[9, 9, 9]...",1.0,3
1135,5582e5ca,36,<reasoning>\n\n1. **Grid Size**: The input and...,def transform_grid(input_grid):\n from coll...,"[[[4, 4, 4], [4, 4, 4], [4, 4, 4]], [[9, 9, 9]...",1.0,3
...,...,...,...,...,...,...,...
9496,c8f0f002,97,<reasoning>\n1. **Grid Size Observation**:\n ...,def transform_grid(grid):\n for i in range(...,"[[[1, 8, 8, 5, 5, 8], [1, 1, 5, 5, 1, 8], [5, ...",1.0,3
9497,c8f0f002,98,<reasoning>\n\n1. **Grid Size Observation**:\n...,def transform_grid(grid):\n for i in range(...,"[[[1, 8, 8, 5, 5, 8], [1, 1, 5, 5, 1, 8], [5, ...",1.0,3
9498,c8f0f002,99,<reasoning>\n1. **Grid Size**: The input and o...,def transform_grid(grid):\n for i in range(...,"[[[1, 8, 8, 5, 5, 8], [1, 1, 5, 5, 1, 8], [5, ...",1.0,3
9499,c8f0f002,100,<reasoning>\n1. **Grid Size**: The input and o...,def transform_grid(input_grid):\n output_gr...,"[[[1, 8, 8, 5, 5, 8], [1, 1, 5, 5, 1, 8], [5, ...",1.0,3


In [145]:
df2 = pd.read_csv("results/program_synthesis/codestral_code_p1.csv")
df2.query("average_percentage_correct == 1")['task'].unique()

df3 = pd.read_csv("results/program_synthesis/codestral_code_p2.csv")
df4 = pd.read_csv("results/program_synthesis/codestral_code_p2_2.csv")

df5 = pd.read_csv("results/program_synthesis/codestral_code_p3.csv")


set1 = set(df2["task"].unique())
set2 = set(df3["task"].unique())
set3 = set(df4["task"].unique())
set4 = set(df5["task"].unique())


In [146]:
common_12 = set1.intersection(set2)  # Common in df2 and df3
common_13 = set1.intersection(set3)  # Common in df2 and df4
common_23 = set2.intersection(set3)  # Common in df3 and df4
common_all = set1.intersection(set2, set3, set4)  # Common in all three

# Print results
print("Common between df2 and df3:", common_12)
print("Common between df2 and df4:", common_13)
print("Common between df3 and df4:", common_23)
print("Common in all three:", common_all)

Common between df2 and df3: set()
Common between df2 and df4: set()
Common between df3 and df4: {'fcc82909'}
Common in all three: set()


In [147]:
df5

Unnamed: 0,task,sample_num,full_answer,extracted_code,generated_grids,average_percentage_correct,count_perfect_scores
0,f8ff0b80,1,"<reasoning>\nFirst, let's analyze the given ex...",def transform_grid(input_grid):\n colors = ...,"[[[8], [2], [3]], [[1], [2], [7]], [[1], [2], ...",0.330000,0
1,f8ff0b80,2,"<reasoning>\nFirst, let's analyze the given ex...",def transform_grid(input_grid):\n def most_...,"[[[3], [2], [0]], [[1], [2], [0]], [[2], [4], ...",0.333333,0
2,f8ff0b80,3,"<reasoning>\nFirst, let's analyze the given ex...",,"[None, None, None]",0.000000,0
3,f8ff0b80,4,"<reasoning>\nFirst, let's analyze the given ex...",,"[None, None, None]",0.000000,0
4,f8ff0b80,5,"<reasoning>\nFirst, let's analyze the given ex...",def transform_grid(input_grid):\n output_gr...,"[[[8], [2], [0]], [[1], [2], [0]], [[2], [4], ...",0.220000,0
...,...,...,...,...,...,...,...
4995,3af2c5a8,46,<reasoning>\n\nTo determine the transformation...,def transform_grid(input_grid):\n output_gr...,"[[[0, 0], [0, 0], [0, 0], [8, 8], [0, 0], [0, ...",0.000000,0
4996,3af2c5a8,47,"<reasoning>\nFirst, let's determine the input ...",def transform_grid(input_grid):\n m = len(i...,"[[[0, 0, 0, 0, 8, 8, 0, 0], [0, 0, 0, 0, 8, 8,...",0.580000,0
4997,3af2c5a8,48,"<reasoning>\n\nFirst, let's analyze the given ...",def transform_grid(input_grid):\n rows = le...,"[[[0, 0, 8, 0, 0, 0, 8, 0], [0, 8, 0, 8, 0, 8,...",0.610000,0
4998,3af2c5a8,49,<reasoning>\n\nTo determine the transformation...,def transform_grid(input_grid):\n output_gr...,"[[[0, 0, 0, 0, 8, 8, 0, 0], [0, 0, 0, 0, 8, 8,...",0.580000,0


In [152]:
df5 = pd.read_csv("results/program_synthesis/codestral_code_p4.csv")
df5.query("average_percentage_correct == 1")['task'].unique()

array(['8be77c9e', 'b1948b0a', '4258a5f9', 'ce22a75a', '62c24649',
       '1fad071e', 'de1cd16c', '496994bd', 'bd4472b8', '253bf280',
       'be94b721', 'c3f564a4', '0d3d703e', 'a9f96cdd', '7fe24cdd',
       '1f85a75f', '5bd6f4ac', 'd10ecb37', 'c9f8e694', '74dd1130',
       '91714a58', 'e9afcf9a', 'b91ae062'], dtype=object)

In [128]:
df2 = pd.read_csv("results/program_synthesis/codestral_code_info_p2.csv")
df2

Unnamed: 0,task_name,processing_time,samples_drawn,input_tokens,output_tokens,temperature
0,e6721834,18.91,10,4476,15141,0.1
1,e6721834,18.22,10,4476,12160,0.3
2,e6721834,17.97,10,4476,13306,0.5
3,e6721834,18.79,10,4476,18746,0.7
4,e6721834,18.41,10,4476,18029,0.9
...,...,...,...,...,...,...
450,fcc82909,15.31,10,2271,14816,0.1
451,fcc82909,15.16,10,2271,17539,0.3
452,fcc82909,19.66,10,2271,17260,0.5
453,fcc82909,18.54,10,2271,21906,0.7


In [2]:
answer = "Answer: ``` [0, 8, 0], [8, 8, 8], [0, 8, 8] ```"

In [5]:
import re
from eval import fix_output_string
match = re.search(r"\s*Answer:\s*```\s*([\s\S]*?)\s*```\s*", answer, re.S)
extracted_answer = match.group(1).strip()
extracted_answer

'[0, 8, 0], [8, 8, 8], [0, 8, 8]'

In [11]:
str_removed_characters = re.sub(r'[a-zA-Z]', '', extracted_answer).strip()
str_removed_characters



'[0, 8, 0], [8, 8, 8], [0, 8, 8]'

In [10]:
import json
final = json.loads(str_removed_characters)

JSONDecodeError: Extra data: line 1 column 10 (char 9)