In [16]:
pip install numpy faiss-cpu torch torchvision transformers sentencepiece


Looking in indexes: https://pypi.python.org/simple

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [17]:
# Import necessary libraries
import os
import json
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import faiss
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from transformers import AutoTokenizer, AutoModelForCausalLM


In [18]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cpu


In [19]:
# Define paths to the ARC dataset
arc_dataset_path = 'data/training'  # Adjust this path

# Function to load ARC tasks
def load_arc_tasks(task_dir):
    tasks = []
    for filename in os.listdir(task_dir):
        if filename.endswith('.json'):
            with open(os.path.join(task_dir, filename), 'r') as f:
                task = json.load(f)
                tasks.append(task)
    return tasks

# Load training tasks
train_tasks = load_arc_tasks(arc_dataset_path)
print(f"Total training tasks loaded: {len(train_tasks)}")


Total training tasks loaded: 400


In [20]:
# Define maximum grid size
MAX_GRID_SIZE = 30  # You can adjust this based on the dataset

# Function to preprocess grids
def preprocess_grid(grid, max_size=MAX_GRID_SIZE):
    height = len(grid)
    width = len(grid[0])
    # Initialize a grid filled with zeros
    processed_grid = np.zeros((max_size, max_size), dtype=np.int64)
    # Copy the original grid into the processed grid
    for i in range(height):
        for j in range(width):
            processed_grid[i, j] = grid[i][j]
    return processed_grid

# Build a set of all colors/symbols used in the dataset
symbols = set()
for task in train_tasks:
    for example in task['train']:
        for row in example['input']:
            symbols.update(row)
        for row in example['output']:
            symbols.update(row)

symbol_to_int = {symbol: idx for idx, symbol in enumerate(sorted(symbols))}
int_to_symbol = {idx: symbol for symbol, idx in symbol_to_int.items()}
num_symbols = len(symbol_to_int)
print(f"Total unique symbols: {num_symbols}")


Total unique symbols: 10


In [21]:
class GridEncoder(nn.Module):
    def __init__(self, embedding_dim=128, num_symbols=num_symbols):
        super(GridEncoder, self).__init__()
        self.embedding = nn.Embedding(num_symbols, 16)
        self.conv_layers = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # Reduces spatial dimensions by half
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # Reduces spatial dimensions by half again
        )
        # Initialize fc later after computing output size
        self.fc = None

    def forward(self, x):
        x = self.embedding(x)  # x: (batch_size, grid_h, grid_w, embedding_dim)
        x = x.permute(0, 3, 1, 2)  # x: (batch_size, channels, grid_h, grid_w)
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)

        # Initialize fc layer if not already done
        if self.fc is None:
            self.fc = nn.Linear(x.size(1), embedding_dim).to(x.device)

        x = self.fc(x)
        return x


In [22]:
class TripletGridDataset(Dataset):
    def __init__(self, tasks):
        self.samples = []
        for task in tasks:
            for example in task['train']:
                input_grid = preprocess_grid(example['input'])
                output_grid = preprocess_grid(example['output'])
                self.samples.append((input_grid, output_grid))
        
        # Create negatives (shuffle outputs)
        self.negatives = [output for _, output in self.samples]
        random.shuffle(self.negatives)
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        anchor, positive = self.samples[idx]
        negative = self.negatives[idx]
        # Convert to tensors
        anchor = torch.tensor(anchor, dtype=torch.long)
        positive = torch.tensor(positive, dtype=torch.long)
        negative = torch.tensor(negative, dtype=torch.long)
        return anchor, positive, negative


In [23]:
# Instantiate the dataset and data loader
triplet_dataset = TripletGridDataset(train_tasks)
triplet_loader = DataLoader(triplet_dataset, batch_size=32, shuffle=True)

# Initialize the encoder and move it to the device
encoder = GridEncoder().to(device)

# Define the loss function and optimizer
criterion = nn.TripletMarginLoss(margin=1.0)
optimizer = optim.Adam(encoder.parameters(), lr=1e-3)

# Training loop
num_epochs = 5  # Adjust as needed

for epoch in range(num_epochs):
    encoder.train()
    total_loss = 0
    for batch_idx, (anchor, positive, negative) in enumerate(triplet_loader):
        anchor = anchor.to(device)
        positive = positive.to(device)
        negative = negative.to(device)
        
        optimizer.zero_grad()
        anchor_emb = encoder(anchor)
        positive_emb = encoder(positive)
        negative_emb = encoder(negative)
        loss = criterion(anchor_emb, positive_emb, negative_emb)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(triplet_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")


NameError: name 'embedding_dim' is not defined

In [12]:
# Collect all input grids
all_input_grids = []
for task in train_tasks:
    for example in task['train']:
        input_grid = preprocess_grid(example['input'])
        all_input_grids.append(input_grid)

# Encode all input grids
encoder.eval()
embeddings = []
with torch.no_grad():
    for grid in all_input_grids:
        grid_tensor = torch.tensor(grid, dtype=torch.long).unsqueeze(0).to(device)
        embedding = encoder(grid_tensor)
        embeddings.append(embedding.cpu().numpy())

# Convert embeddings to numpy array
embeddings_np = np.vstack(embeddings).astype('float32')

# Build the FAISS index
embedding_dim = embeddings_np.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings_np)
print(f"FAISS index built with {index.ntotal} embeddings.")


RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.