In [1]:
# conda environment (project)
# /home/student/.conda/envs/project/bin/python 
%load_ext autoreload
%autoreload 2
import os, gc

import numpy as np
import pandas as pd
import torch
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

from transformers import set_seed
from datasets import load_dataset
from evaluate import *
from arc.arc import ARCSolver

# Kept after the star import on purpose: these explicit bindings must win over
# any same-named symbol that `from evaluate import *` may bring in.
from datasets import Dataset
from utils import render_grid

In [2]:
# Load the dataset found under `data_path`, then draw a 30,000-row sample
# across the loaded tasks and preview its first five rows.
# NOTE(review): the name `df300` presumably refers to the 300 tasks, not the row count — confirm.
data_path = "dataset"
dataset, task_list = load_data(data_path)
df300 = sample_data(dataset, task_list, n_row=30_000)
df300.head()

Imported 300 different tasks in the dataset.
# of samples in a task: min(82), Q1(1000), Q2(1000), Q3(1000), max(1000), mean(897.7)


Unnamed: 0,task,train,test_input,test_output,test
0,239be575,"[{'input': [[0, 0, 6, 6, 2, 0, 0], [2, 0, 6, 6...","[{'input': [[0, 5, 5, 3, 3, 3, 0], [0, 5, 5, 0...",[[[0]]],"[{'input': [[0, 5, 5, 3, 3, 3, 0], [0, 5, 5, 0..."
1,4258a5f9,"[{'input': [[3, 3, 3, 3], [3, 3, 3, 3], [3, 3,...","[{'input': [[6, 6, 6, 6, 6, 6], [6, 6, 6, 6, 2...","[[[6, 6, 6, 1, 1, 1], [1, 1, 1, 1, 2, 1], [1, ...","[{'input': [[6, 6, 6, 6, 6, 6], [6, 6, 6, 6, 2..."
2,1caeab9d,"[{'input': [[6, 6, 6, 6, 6, 6, 6], [6, 6, 8, 8...","[{'input': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4...","[[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0,...","[{'input': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [4..."
3,623ea044,"[{'input': [[3, 3, 3, 3, 3, 3, 3], [3, 3, 3, 3...","[{'input': [[6, 6, 6, 6, 6], [6, 6, 6, 6, 6], ...","[[[6, 8, 6, 6, 6], [6, 6, 8, 6, 8], [6, 6, 6, ...","[{'input': [[6, 6, 6, 6, 6], [6, 6, 6, 6, 6], ..."
4,e9afcf9a,"[{'input': [[4, 4, 4, 4, 4], [6, 6, 6, 6, 6], ...","[{'input': [[5, 5, 5, 5, 5, 5, 5, 5, 5], [7, 7...","[[[5, 7, 5, 7, 5, 7, 5, 7, 5], [7, 0, 7, 0, 7,...","[{'input': [[5, 5, 5, 5, 5, 5, 5, 5, 5], [7, 7..."


In [3]:
# One sample frame per task: task_samples[i] is drawn with n_row=1000 from
# task index i only (indices=[i]), for task indices 0..299.
task_samples = [
    sample_data(dataset, task_list, n_row=1000, indices=[task_no])
    for task_no in range(300)
]

In [4]:
# EDA: for each chosen task, render the first train pair and the first test
# pair of `n_sample` randomly picked rows.
task_indices = [17]  # which tasks to inspect
n_sample = 1         # how many rows to show per task
separator = "=================================================="

for chosen_task in task_indices:
    print(chosen_task)
    picked_rows = (
        Dataset.from_pandas(task_samples[chosen_task])
        .shuffle()
        .select(range(n_sample))
    )
    for row in picked_rows:
        # Only the first train pair is shown; rows without train pairs show none.
        first_pair = next(iter(row['train']), None)
        if first_pair is not None:
            print(separator)
            print("Example input")
            render_grid(first_pair['input'])
            print("Example output")
            render_grid(first_pair['output'])
        print(separator)
        print("Example test input")
        render_grid(row['test'][0]['input'])
        print("Example test output")
        render_grid(row['test'][0]['output'])
    print(separator)

17
Example input


Example output


Example test input


Example test output




In [6]:
# Split the 300 task indices into `simple_tasks` (every inspected input/output
# pair keeps its grid shape) and `hard_tasks` (at least one pair changes shape).
# This is a sampling heuristic: only a few rows per task are inspected.
SHAPE_CHECK_SEED = 0    # fixed seed so the split is identical on every run
N_SHAPE_CHECK_ROWS = 3  # rows inspected per task


def _same_shape(pair):
    """Return True when pair['input'] and pair['output'] have the same height
    and the same first-row width."""
    grid_in, grid_out = pair['input'], pair['output']
    return len(grid_in) == len(grid_out) and len(grid_in[0]) == len(grid_out[0])


def _rows_preserve_shape(rows):
    """Return True when every train pair and the first test pair of every row
    in `rows` keep the grid shape."""
    return all(
        _same_shape(pair)
        for row in rows
        for pair in [*row['train'], row['test'][0]]
    )


simple_tasks = []
hard_tasks = []
for task_idx in range(300):
    # Without an explicit seed the shuffle falls back to ambient RNG state,
    # which is not seeded yet at this point of the notebook, so the
    # simple/hard split (and everything sized from it) would vary per run.
    inspected_rows = (
        Dataset.from_pandas(task_samples[task_idx])
        .shuffle(seed=SHAPE_CHECK_SEED)
        .select(range(N_SHAPE_CHECK_ROWS))
    )
    if _rows_preserve_shape(inspected_rows):
        simple_tasks.append(task_idx)
    else:
        hard_tasks.append(task_idx)
print(simple_tasks)

[0, 1, 3, 7, 8, 9, 10, 11, 13, 15, 18, 19, 20, 22, 27, 29, 30, 32, 33, 34, 35, 38, 40, 41, 44, 45, 46, 51, 52, 54, 55, 56, 57, 58, 61, 64, 65, 66, 67, 68, 69, 70, 72, 73, 75, 78, 79, 80, 81, 83, 85, 87, 89, 90, 91, 92, 93, 95, 97, 98, 99, 100, 101, 102, 104, 105, 106, 107, 110, 111, 113, 114, 116, 117, 118, 119, 120, 123, 124, 128, 129, 130, 133, 135, 137, 138, 139, 140, 141, 142, 143, 145, 147, 148, 150, 151, 153, 154, 155, 157, 158, 159, 160, 161, 162, 166, 167, 169, 170, 172, 173, 174, 175, 176, 177, 178, 179, 181, 182, 185, 188, 190, 193, 194, 195, 196, 197, 198, 201, 203, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 218, 220, 222, 224, 225, 226, 227, 228, 230, 231, 233, 234, 235, 236, 239, 244, 245, 246, 248, 249, 250, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 264, 267, 268, 270, 271, 273, 274, 275, 276, 277, 281, 282, 285, 286, 287, 291, 292, 293, 295, 296, 297]


In [57]:
# Fix the random seed, then build the solver. The Hugging Face token is read
# from the HF_TOKEN environment variable and is None when that is unset.
set_seed(1_234_567_890)
token = os.environ.get("HF_TOKEN")
solver = ARCSolver(hf_token=token, model_id="Qwen/Qwen3-1.7B")

In [7]:
# Build the frames used for training. The training calls themselves are
# currently disabled: `solver.prepare_train()` would run before this block and
# `solver.train(train_dataset)` after it.
n_train = 1000 * len(hard_tasks)
n_eval = 500
rows_per_frame = n_train + n_eval
dfsimple = sample_data(dataset, task_list, indices=simple_tasks, n_row=rows_per_frame, random=56)
dfhard = sample_data(dataset, task_list, indices=hard_tasks, n_row=rows_per_frame, random=56)
# NOTE(review): n_train is sized from hard_tasks, yet the training set is taken
# from dfsimple and dfhard is not used in this cell — confirm this is intended.
train_dataset = Dataset.from_pandas(dfsimple).select(range(n_train))

In [58]:
# Number of evaluation samples.
# NOTE(review): the evaluation cell that follows reassigns n_eval before using
# it, so this value appears to have no effect there — confirm it is still needed.
n_eval = 100
# Prepare the solver for evaluation with the adapter identified by this run id
# (presumably a timestamped checkpoint name — verify it matches the model you
# intend to evaluate).
solver.prepare_evaluation(select_adapter="20250702_220556") # make sure you set the right model

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [59]:
# Evaluate the model task by task and plot the per-task success rate (in %).
# A sample counts as solved when at least one candidate grid returned by
# `solver.predict_bfs` matches the expected test output.
n_eval = 20        # samples evaluated per task
n_eval_tasks = 20  # tasks 0 .. n_eval_tasks-1 are evaluated

scores_task = []
for task in range(n_eval_tasks):
    eval_dataset = Dataset.from_pandas(task_samples[task]).select(range(n_eval))
    scores = []  # per-sample results (0 or 1) for this task only
    for eval_data in tqdm(eval_dataset):
        expected = eval_data["test"][0]["output"]
        preds = solver.predict_bfs(eval_data)  # list of candidate grids; entries may be None

        solved = False
        for p in preds:
            if p is None:
                continue
            render_grid(p)
            # Several candidates may match (e.g. duplicates); the sample must
            # still count once, otherwise the task score can exceed 100.
            if check_match(p, expected):
                solved = True
        scores.append(int(solved))
    score = np.array(scores).mean() * 100
    scores_task.append(score)
    print(f"Evaluation score: {score:.2f}", flush=True)

# Plot only the tasks that were actually scored, so the figure still renders
# when the loop above was stopped early.
x = np.arange(len(scores_task))
fig, ax = plt.subplots()
ax.bar(x, scores_task)
ax.set_xticks(x)
ax.set_ylim(0, 100)
ax.set_xlabel("Task index")
ax.set_ylabel("Solved samples (%)")
ax.set_title("Evaluation score per task")
plt.show()

  0%|          | 0/20 [00:00<?, ?it/s]

[]
[]
[]
[tensor([151644,   8948,    198,   2610,    525,    458,   6203,    369,  21828,
         19819,  46523,     13,   1446,    525,   2661,   1045,  13530,    315,
          1946,    323,   2550,    438,    220,     17,     35,   5827,    448,
           279,   1852,  16533,  27979,   5383,   1948,   1105,     13,   1446,
           614,    311,  73045,  23583,    279,   4734,   5912,    429,  28475,
          1817,   1946,    311,    279,  12159,   2550,    624, 151645,    198,
        151644,    872,    198,   8420,    525,    279,   3110,   1946,    323,
          2550,  13530,    504,    892,    498,   1265,   3960,    279,  16533,
          5912,    311,   2937,   7023,    279,   2550,    369,    279,   2661,
          1273,   1946,    510,   1408,  28665,   1355,    510,     17,     20,
            20,     20,     20,     20,    198,     20,     20,     20,     20,
            23,     20,    198,     20,     20,     20,     20,     20,     23,
           198,     20,     20

[]
[]
[]
[]
[]


KeyboardInterrupt: 