In [1]:
import os
from collections import defaultdict

import torch
import numpy as np
import pandas as pd
from torch import nn
from torch_geometric.nn import GCNConv,SAGEConv
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm 
import matplotlib.pyplot as plt

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# Root directory of the layout collection that is handed to `load_layout`.
# The default is the original author's machine-specific location; set the
# GFOS_LAYOUT_DIR environment variable to point the notebook at another copy
# of the data without editing this cell.
layout_dir = os.environ.get(
    "GFOS_LAYOUT_DIR",
    r"H:\data\gfos\predict-ai-model-runtime\npz_all\npz\layout",
)

In [3]:
def load_layout(
    base_dir: str, compile_type: str, model_type: str | None = None
):
    if model_type is not None:
        assert model_type in (
            "nlp",
            "xla",
        ), f"model_type must be nlp or xla but got {model_type}"
    assert compile_type in (
        "default",
        "random",
    ), f"compile_type must be default or random but got {compile_type}"

    dfs = defaultdict(list)

    if model_type is None:
        model_types = ("nlp", "xla")
    else:
        model_types = (model_type,)

    dirs = [
        os.path.join(base_dir, model_type, compile_type, training)
        for model_type in model_types
        for training in ["train", "valid", "test"]
    ]

    for path in dirs:
        split = path.split("\\")[-1]
        files = os.listdir(path)

        dfs[split] += [os.path.join(path, file) for file in files]

    return dfs


# Paths of the XLA graphs compiled with the "random" configuration search,
# grouped by split. `model_type` is passed explicitly: leaving it at its
# default (None) makes `load_layout` return the NLP files as well, which does
# not match what this variable's name promises.
# NOTE(review): drop `model_type="xla"` if mixing NLP and XLA was intentional.
layout_xla_random = load_layout(
    layout_dir,
    compile_type="random",
    model_type="xla",
)

## Dataset

In [4]:
class LayoutDataset(Dataset):
    """Dataset that yields one item per layout ``.npz`` file.

    Each file is opened only while it is being read, so no archive handle
    stays open between calls (the dataset holds nothing but paths and counts).

    Every item is the tuple
    ``(config_feat, node_feat, node_opcode, edge_index, node_config_ids, target)``:

    * ``config_feat``: ``node_config_feat`` as float32.
    * ``node_feat``: ``node_feat`` as float32.
    * ``node_opcode``: ``node_opcode`` as int64.
    * ``edge_index``: ``edge_index`` with axes 0 and 1 swapped, as int64
      (presumably stored as ``(num_edges, 2)``, giving the ``(2, num_edges)``
      layout PyTorch Geometric expects; verify against the data).
    * ``node_config_ids``: ``node_config_ids`` as int64.
    * ``target``: ``config_runtime`` standardised per file to zero mean and
      unit (population) standard deviation, as float32.

    Index tensors are int64 because that is the index dtype PyTorch Geometric
    message passing expects.
    """

    def __init__(self, files: list[str]):
        self.files = list(files)  # paths of the .npz files, one graph each
        # Number of entries in `config_runtime` for every file.
        self.num_records = [self._count_records(file) for file in self.files]

    @staticmethod
    def _count_records(file: str) -> int:
        """Return the length of ``config_runtime`` stored in ``file``."""
        with np.load(file) as npz:
            return len(npz["config_runtime"])

    def __len__(self):
        # One item per file.
        return len(self.files)

    def __getitem__(self, idx):
        # Open the archive only for the duration of the read.
        with np.load(self.files[idx]) as row:
            config_feat = torch.tensor(row["node_config_feat"].astype(np.float32))
            node_config_ids = torch.tensor(row["node_config_ids"].astype(np.int64))
            node_feat = torch.tensor(row["node_feat"].astype(np.float32))
            node_opcode = torch.tensor(row["node_opcode"].astype(np.int64))
            edge_index = torch.tensor(
                np.swapaxes(row["edge_index"], 0, 1).astype(np.int64)
            )
            target = row["config_runtime"].astype(np.float32)

        # Standardise the runtimes of this graph (z-score); the small epsilon
        # guards against division by zero when all runtimes are equal.
        target = (target - np.mean(target)) / (np.std(target) + 1e-5)
        target = torch.tensor(target)
        return config_feat, node_feat, node_opcode, edge_index, node_config_ids, target


## Model

In [5]:
class SimpleModel(torch.nn.Module):
    """GCN encoder plus MLP head that scores every configuration of one graph.

    Node features are concatenated with a learned opcode embedding and passed
    through a stack of ``GCNConv`` layers. The node embeddings are averaged
    into one graph vector, which is concatenated with each configuration's
    node-averaged features and fed to a dense head. Scores are standardised
    across the configurations of the graph before being returned.

    Args:
        hidden_channels: Non-empty sequence with the output width of each
            hidden ``GCNConv`` layer.
        graph_feats: Output width of the final ``GCNConv`` layer, i.e. the
            size of the graph embedding.
        hidden_dim: Width of the two hidden layers of the dense head.
        num_opcodes: Size of the opcode vocabulary.
        op_embedding_dim: Dimension of the opcode embedding.
        node_feat_dim: Number of raw features per node.
        config_feat_dim: Number of features per (configuration, node) entry.
    """

    def __init__(
        self,
        hidden_channels,
        graph_feats,
        hidden_dim,
        *,
        num_opcodes=120,
        op_embedding_dim=4,
        node_feat_dim=140,
        config_feat_dim=18,
    ):
        super().__init__()

        self.embedding = torch.nn.Embedding(num_opcodes, op_embedding_dim)
        assert len(hidden_channels) > 0

        # input -> hidden_channels[0] -> ... -> hidden_channels[-1] -> graph_feats
        widths = [op_embedding_dim + node_feat_dim, *hidden_channels, graph_feats]
        self.convs = torch.nn.ModuleList(
            [GCNConv(n_in, n_out) for n_in, n_out in zip(widths[:-1], widths[1:])]
        )

        # Dense head over [config features | graph embedding].
        self.dense = torch.nn.Sequential(
            nn.Linear(graph_feats + config_feat_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
        )

        # Not used in `forward`; kept so that the set of registered parameters
        # (and therefore saved state_dict keys) stays the same.
        self.conv1d = nn.Conv1d(1, 1, 3, padding=1)

    def forward(
        self, x_cfg: torch.Tensor, x_feat: torch.Tensor, x_op: torch.Tensor, edge_index: torch.Tensor, node_config_ids: torch.Tensor
    ) -> torch.Tensor:
        """Score all configurations of a single graph.

        Args:
            x_cfg: 3-D configuration features; dim 0 indexes configurations,
                dim 1 is averaged away and dim 2 holds the features.
            x_feat: 2-D node features, one row per node.
            x_op: Opcode index of every node.
            edge_index: Graph connectivity passed to each ``GCNConv``.
            node_config_ids: Accepted for interface compatibility; currently
                not used by the computation.

        Returns:
            A 1-D tensor with one standardised score per configuration.
        """
        # Node inputs: raw features followed by the opcode embedding.
        x = torch.cat([x_feat, self.embedding(x_op)], dim=1)

        for conv in self.convs:
            x = conv(x, edge_index).relu()

        # Mean-pool nodes into one graph vector and, per configuration,
        # mean-pool the configurable nodes' features.
        x_graph = x.mean(dim=0)
        x_cfg = x_cfg.mean(dim=1)

        # Every configuration sees the same graph embedding.
        x = torch.cat(
            [x_cfg, x_graph.unsqueeze(0).expand(x_cfg.size(0), -1)], dim=1
        )

        x = self.dense(x).flatten()

        # Standardise across configurations. The population standard deviation
        # matches the `np.std` normalisation applied to the target and stays
        # finite when there is a single configuration (the sample standard
        # deviation of one value is NaN).
        x = (x - x.mean()) / (x.std(unbiased=False) + 1e-5)
        return x


# Build the network, then place its parameters on the selected device
# (CPU or GPU).
model = SimpleModel(hidden_channels=[16, 32, 16, 48], graph_feats=64, hidden_dim=64)
model = model.to(device)

In [None]:
# 🔄 Training loop on the fixed train/valid split (K-fold cross-validation is not used)


# Define the score_tile_mean function
def score_tile_mean(predictions, df):
    """Average, over the rows of ``df``, of ``2 - picked_mean / best5_mean``.

    For row ``i``, ``picked_mean`` is the mean of ``config_runtime`` at the
    indices ``predictions[i]`` and ``best5_mean`` is the mean of the five
    smallest values of ``config_runtime``.
    """

    def _row_score(row):
        runtimes = df.iloc[row]["config_runtime"]
        picked_mean = np.mean(runtimes[predictions[row]])
        best5_mean = np.mean(np.sort(runtimes)[:5])
        return 2 - picked_mean / best5_mean

    n_rows = len(df)
    return sum(_row_score(row) for row in range(n_rows)) / n_rows


# Define the score_tile_max function
def score_tile_max(predictions, df):
    """Average, over the rows of ``df``, of ``2 - picked_min / overall_min``.

    For row ``i``, ``picked_min`` is the smallest ``config_runtime`` among the
    indices ``predictions[i]`` and ``overall_min`` is the smallest value of
    ``config_runtime`` in that row.
    """

    def _row_score(row):
        runtimes = df.iloc[row]["config_runtime"]
        picked_min = np.min(runtimes[predictions[row]])
        overall_min = np.min(runtimes)
        return 2 - picked_min / overall_min

    n_rows = len(df)
    return sum(_row_score(row) for row in range(n_rows)) / n_rows


# Training / validation on the predefined train and valid splits.
# `score_means` / `score_maxs` collect the best scores per run; with a single
# split they end up holding one entry each.
score_means = []
score_maxs = []

# Hyperparameters
learning_rate = 5e-4
weight_decay = 1e-6
num_epochs = 90

# There is only one train/valid split; `fold` merely tags the checkpoint name.
fold = 0

train_dataset = LayoutDataset(layout_xla_random["train"])
val_dataset = LayoutDataset(layout_xla_random["valid"])


def _load_raw_runtimes(files):
    """Return a one-column frame holding each file's raw `config_runtime`."""
    runtimes = []
    for file in files:
        with np.load(file) as npz:
            runtimes.append(npz["config_runtime"])
    return pd.DataFrame({"config_runtime": runtimes})


def _to_device(sample):
    """Move every tensor of a dataset item to `device`."""
    return tuple(tensor.to(device) for tensor in sample)


# The scoring helpers need the unnormalised runtimes (the dataset only returns
# standardised targets), row-aligned with `val_dataset`.
val_runtimes = _load_raw_runtimes(val_dataset.files)

criterion = torch.nn.MSELoss()
# Step counts for a learning-rate scheduler; no scheduler is attached yet.
steps = len(train_dataset) * num_epochs
warmup_steps = int(steps * 0.1)
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=weight_decay
)

# Start below any reachable score so the first epoch is always checkpointed,
# even when the scores are zero or negative.
best_score = float("-inf")
best_score_max = float("-inf")

for epoch in range(num_epochs):
    model.train()
    pbar = tqdm(range(len(train_dataset)), leave=False)
    loss_sum = 0
    n = 0

    for i in pbar:
        cfg_ft, nd_ft, nd_op, ind, cids, target = _to_device(train_dataset[i])
        # PyTorch Geometric expects int64 connectivity (no-op if already so).
        ind = ind.long()

        # Gradients accumulate by default: reset them before every step.
        optimizer.zero_grad()
        out = model(cfg_ft, nd_ft, nd_op, ind, cids)
        loss = criterion(out, target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1e-2)
        optimizer.step()
        loss_sum += loss.item()
        n += 1
        pbar.set_description(
            f"running loss: {(loss_sum/n):.2f}, current loss: {(loss.item()):.2f}"
        )
    pbar.close()

    model.eval()
    tile_xla_predictions = []
    pbar = tqdm(range(len(val_dataset)), leave=False)

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for i in pbar:
            cfg_ft, nd_ft, nd_op, ind, cids, target = _to_device(val_dataset[i])
            ind = ind.long()

            out = model(cfg_ft, nd_ft, nd_op, ind, cids)
            # Indices of the five configurations with the lowest predicted score.
            tile_xla_predictions.append(np.argsort(out.cpu().numpy())[:5])

    pbar.close()

    # Calculate and display scores for the current epoch
    score_mean = score_tile_mean(tile_xla_predictions, val_runtimes)
    score_max = score_tile_max(tile_xla_predictions, val_runtimes)
    print(
        f"epoch {epoch}, comp_score = {score_max:.3f}, mean_score = {score_mean:.3f},"
    )

    # Update best scores and save the model if the mean score improves
    if score_mean > best_score:
        best_score = score_mean
        best_score_max = score_max
        torch.save(model.state_dict(), f"best_model_{fold}.pth")

# Record the best scores of this run
score_means.append(best_score)
score_maxs.append(best_score_max)

# Calculate and display the mean of the recorded best scores
print(
    f"comp_score = {np.mean(score_maxs)}, mean_score = {np.mean(score_means)},"
)