In [52]:
# Get total number of different nodes amd edges from given set
# Put Decoder together 
# Create a full training pipeline 

# Remove recalculating graphs for the nodes that are already calculated as part of another graph.

# Create Dataset
# Create Encoder training pipeline [DONE] 
# Put cells together to create a Embedding Network [Done]

In [53]:
# Load the dataset into PyTactician's visualizer.
from pytact import data_reader, graph_visualize_browse
import pathlib
from typing import Optional, List, DefaultDict
from pytact.data_reader import Node
from pytact.graph_api_capnp_cython import EdgeClassification
from pytact.graph_api_capnp_cython import Graph_Node_Label_Which
from collections import defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
from sklearn.metrics import classification_report

In [58]:
def get_node_id(node):
    prefix = f"{node.graph}-{node.nodeid}"
    return prefix
    
def get_file_size(dataset_location,
                 filename="coq-tactician-stdlib.8.11.dev/theories/Init/Logic.bin"): 
    with data_reader.data_reader(pathlib.Path(dataset_location)) as reader:
        datapath = pathlib.Path(filename)
        peano_dataset = reader[datapath] 
        pdl = peano_dataset.lowlevel
        size = len(pdl.graph.nodes)
        return size

def get_dag(index, dataset_location, filename="coq-tactician-stdlib.8.11.dev/theories/Init/Logic.bin", max_iterations=100000000000000):
    dag: DefaultDict[str, int] = defaultdict(int) 
    
    with data_reader.data_reader(pathlib.Path(dataset_location)) as reader:
        datapath = pathlib.Path(filename)
        peano_dataset = reader[datapath] 
    
        # Define Graph to Traverse
        current_node = peano_dataset.node_by_id(index)
        # Initialize the required data structures
        # Define initial variables
        node_type_list: List[Graph_Node_Label_Which] = []
        node_pytac_id_list: List[str] = []
        edge_list: List[tuple[int,int]] = [] #paretnt_id, child_id,
        edge_type_list: List[Optional[EdgeClassification]] = [] # edge_type
        node_to_children_dict: DefaultDict[str, str] = defaultdict(list)
        queue: List[tuple[Node, Optional[int], Optional[EdgeClassification]]] = [] #current_node, parent_id, edge_type
        
        # Initial Node Processing
        current_node_pytac_id = get_node_id(current_node)
        current_node_id = len(node_type_list)
        dag[current_node_pytac_id] = current_node_id
        node_type_list.append(current_node.label.which.name)
        node_pytac_id_list.append(get_node_id(current_node))
        
        # Add children of the initial node to the queue
        children = list(current_node.children)
        if not current_node.label.which.name == 'REL': 
            for edge_label, child_node in children:
                # Add child and edge details
                queue.append((child_node, current_node_id, edge_label))
                node_to_children_dict[get_node_id(current_node)].append(get_node_id(child_node)) 
        
        # Other nodes processing 
        # Limit the traversal to a maximum depth or iterations
        iteration = 1
        while len(queue) and  iteration < max_iterations:
            iteration += 1
            # Dequeue the next node
            current_node, parent_id, edge_type = queue.pop(0)
        
            # If the node is in the graph then
            # Add Edge, Dont expand Children, Dont add Node 
            current_node_pytac_id = get_node_id(current_node)
            if current_node_pytac_id in dag: 
                # Add current existing node details
                edge_list.append((parent_id, dag[current_node_pytac_id]))
                edge_type_list.append(edge_type.name)
                continue
                
            # Otherwise Add Edge, Node and Expand Children (besides children of REL node)    
            else: 
                current_node_id = len(node_type_list)
                dag[current_node_pytac_id] = current_node_id
                # Add current node details
                edge_list.append((parent_id, current_node_id))
                edge_type_list.append(edge_type.name)
                node_type_list.append(current_node.label.which.name)
                node_pytac_id_list.append(get_node_id(current_node))
                
                # Process children of the current node
                children = list(current_node.children)
                if current_node.label.which.name == 'REL': 
                    continue
                for edge_label, child_node in children:
                    # Add child and edge details
                    queue.append((child_node, current_node_id, edge_label))
                    node_to_children_dict[get_node_id(current_node)].append(get_node_id(child_node)) 
            
    return node_type_list, edge_list, edge_type_list, node_to_children_dict, node_pytac_id_list

In [None]:
dataset_location = '../../../../v15-stdlib-coq8.11/dataset'
datapath = pathlib.Path("coq-tactician-stdlib.8.11.dev/theories/Init/Logic.bin")
filename="coq-tactician-stdlib.8.11.dev/theories/Init/Logic.bin"
ss = get_dag(22, dataset_location, filename)
[len(i) for i in ss]

[36, 53, 53, 26, 36]

In [23]:
class BasicRNN(nn.Module):
    def __init__(self, n_inputs, n_neurons):
        super(BasicRNN, self).__init__()

        self.Wx = torch.randn(n_inputs, n_neurons) # n_inputs X n_neurons
        self.Wh = torch.randn(n_neurons, n_neurons) # n_neurons X n_neurons

        self.b = torch.zeros(1, n_neurons) # 1 X n_neurons

    def forward(self, x, hidden):
        return torch.tanh(torch.mm(x, self.Wx) + torch.mm(hidden, self.Wh) + self.b)

In [24]:
class BasicCSRNN(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_dim, vocab_size, edges_size):
        super(BasicCSRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.Wx = torch.randn(input_size, hidden_size) # n_inputs X n_neurons
        self.We = torch.randn(edges_size, 1, hidden_size) # n_edges X 1 X n_neurons
        self.Wh = torch.randn(hidden_size, hidden_size) # n_neurons X n_neurons
        self.hidden_size =hidden_size
        self.b = torch.zeros(1, hidden_size) # 1 X n_neurons

    def forward(self, node):
        return self.node_forward(node)

    def node_forward(self, node):
        x = self.embedding(torch.tensor(node.label.which.value))
        x = x.unsqueeze(0)
        if node.children and not node.label.which.name == 'REL':
            hidden = torch.sum(torch.stack([self.node_forward(child)*self.We[edge_type.value] for edge_type, child in list(node.children)]), dim=0)
            #hidden = torch.sum(torch.stack(self.node_forward(child) for child in node.children), dim=0)
        else:
            # Ensure that the zero tensor is of the correct shape [batch size, hidden size]
            hidden = torch.zeros(x.size(0), self.hidden_size, dtype=torch.float, device=x.device)
        return torch.tanh(torch.mm(x, self.Wx) + hidden + self.b)
tmp = BasicCSRNN(10,20,10,100,100)
index=3123
with data_reader.data_reader(pathlib.Path(dataset_location)) as reader:
    datapath = pathlib.Path(filename)
    peano_dataset = reader[datapath] 

    # Define Graph to Traverse
    current_node = peano_dataset.node_by_id(index)
    a = tmp.forward(current_node)
print(a)

tensor([[ 0.5614, -0.7634,  0.3944, -1.0000,  0.8981,  0.8435,  0.5908,  0.9964,
         -0.9985, -1.0000, -1.0000, -0.9984,  1.0000, -0.9841, -0.6364,  0.9939,
          0.9217, -0.9988,  0.9987,  0.9973]], grad_fn=<TanhBackward0>)


In [25]:


class BasicCSRNNClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, embedding_dim, vocab_size, edges_size, output_size):
        super(BasicCSRNNClassifier, self).__init__()
        self.cs_rnn = BasicCSRNN(input_size, hidden_size, embedding_dim, vocab_size, edges_size)
        self.fc = nn.Linear(hidden_size, output_size)  # Linear classification layer

    def forward(self, node):
        # x shape: (batch_size, sequence_length, input_size)
        hn = self.cs_rnn(node)  # hn is the last hidden state
        # output shape: (batch_size, sequence_length, hidden_size)
        # hn shape: (1, batch_size, hidden_size)
        hn = hn.squeeze(0)  # Remove the first dimension to match input of linear layer
        logits = self.fc(hn)
        probabilities = F.softmax(logits)  # Applying softmax on the logits for probabilities
        return probabilities

model = BasicCSRNNClassifier(10,20,10,100,100,100)
index=3123
with data_reader.data_reader(pathlib.Path(dataset_location)) as reader:
    datapath = pathlib.Path(filename)
    peano_dataset = reader[datapath] 

    # Define Graph to Traverse
    current_node = peano_dataset.node_by_id(index)
    a = model.forward(current_node)
    print(a)

tensor([0.0099, 0.0164, 0.0050, 0.0072, 0.0064, 0.0119, 0.0073, 0.0069, 0.0174,
        0.0102, 0.0074, 0.0304, 0.0058, 0.0045, 0.0159, 0.0091, 0.0066, 0.0110,
        0.0116, 0.0121, 0.0090, 0.0058, 0.0063, 0.0080, 0.0072, 0.0123, 0.0124,
        0.0157, 0.0057, 0.0046, 0.0114, 0.0128, 0.0095, 0.0040, 0.0067, 0.0045,
        0.0083, 0.0103, 0.0080, 0.0150, 0.0063, 0.0084, 0.0029, 0.0026, 0.0193,
        0.0156, 0.0059, 0.0104, 0.0054, 0.0085, 0.0079, 0.0037, 0.0071, 0.0091,
        0.0142, 0.0119, 0.0152, 0.0166, 0.0088, 0.0078, 0.0075, 0.0128, 0.0063,
        0.0057, 0.0207, 0.0205, 0.0098, 0.0150, 0.0065, 0.0107, 0.0049, 0.0067,
        0.0027, 0.0261, 0.0193, 0.0064, 0.0094, 0.0106, 0.0064, 0.0085, 0.0140,
        0.0029, 0.0114, 0.0126, 0.0072, 0.0033, 0.0051, 0.0170, 0.0096, 0.0088,
        0.0103, 0.0202, 0.0066, 0.0191, 0.0125, 0.0077, 0.0115, 0.0058, 0.0131,
        0.0069], grad_fn=<SoftmaxBackward0>)


  probabilities = F.softmax(logits)  # Applying softmax on the logits for probabilities


In [26]:
from torch.utils.data import Dataset, DataLoader
import torch

class TacticianCpuDatasetaset(Dataset):
    def __init__(self, peano_dataset):
        self.peano_dataset = peano_dataset

    def __len__(self):
        return 10
        # return len(self.data)

    def __getitem__(self, idx):
        sample = self.peano_dataset.node_by_id(idx), torch.tensor(self.peano_dataset.node_by_id(idx).label.which.value)
        return sample


In [None]:
criterion = nn.CrossEntropyLoss()  # Appropriate for classification tasks
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
accumulation_steps = 10 

with data_reader.data_reader(pathlib.Path(dataset_location)) as reader:
    datapath = pathlib.Path(filename)
    peano_dataset = reader[datapath] 
    
    custom_dataset = TacticianCpuDatasetaset(peano_dataset)

    for epoch in range(5):  # Loop over the dataset multiple times
        for i in range(3100):#random.sample(range(3100), 400): 
            graph, label = custom_dataset.__getitem__(i)
            model.train()
            optimizer.zero_grad()
            outputs = model(graph)
            loss = criterion(outputs, label)
            loss = loss / accumulation_steps  # Normalize our loss (if averaged)

            # Backward pass (accumulates gradients)
            loss.backward()

            if (i + 1) % accumulation_steps == 0:  # Perform a step every 'accumulation_steps'
                optimizer.step()  # Update parameters
                optimizer.zero_grad()  # Clear gradients

            #print(f'Batch {i+1}')
        print(f'Epoch {epoch+1}/{5} Loss: {loss.item()}')

In [None]:
import random
predictions = []
actuals = []
with data_reader.data_reader(pathlib.Path(dataset_location)) as reader:
    datapath = pathlib.Path(filename)
    peano_dataset = reader[datapath] 
    
    custom_dataset = TacticianCpuDatasetaset(peano_dataset)
    for i in range(3100): #random.sample(range(3100), 300):
        graph, label = custom_dataset.__getitem__(i)
        predictions.append(torch.argmax(model.forward(graph), dim=0))
        actuals.append(label)
        
        #print(torch.argmax((model.forward(graph))), label)

In [17]:
from sklearn.metrics import classification_report
import numpy as np
report = classification_report(actuals, predictions)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       213
           2       1.00      1.00      1.00        62
           3       0.99      1.00      1.00       187
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       1.00      1.00      1.00       485
          10       0.99      1.00      1.00       195
          11       1.00      1.00      1.00       183
          12       0.00      0.00      0.00         2
          13       1.00      1.00      1.00       358
          14       1.00      1.00      1.00       455
          15       0.00      0.00      0.00         8
          16       0.99      1.00      0.99       782
          17       0.99      1.00      0.99        79
          18       0.99    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
with data_reader.data_reader(pathlib.Path(dataset_location)) as reader:
    datapath = pathlib.Path(filename)
    peano_dataset = reader[datapath] 
    current_node = peano_dataset.node_by_id(index)
    print([current_node])

[node-170-3123]


In [None]:
# Create a Pygeometric dataset
# Train the 

In [89]:
dataset_location = '../../../../v15-stdlib-coq8.11/dataset'
datapath = pathlib.Path("coq-tactician-stdlib.8.11.dev/theories/Init/Logic.bin")
filename="coq-tactician-stdlib.8.11.dev/theories/Init/Logic.bin"
a = Tree(0, dataset_location, filename)
get_dag(0, dataset_location, filename)
get_file_size(dataset_location, filename)
def get_pytac_node_id(index, dataset_location, filename="coq-tactician-stdlib.8.11.dev/theories/Init/Logic.bin"):    
    with data_reader.data_reader(pathlib.Path(dataset_location)) as reader:
        datapath = pathlib.Path(filename)
        peano_dataset = reader[datapath] 
        current_node = peano_dataset.node_by_id(index)
        node_pytac_id = get_node_id(current_node)
    return node_pytac_id


def get_graph_nodes_depth(index, dataset_location, filename="coq-tactician-stdlib.8.11.dev/theories/Init/Logic.bin"):    
    depth = defaultdict(int)  # None indicates not yet processed
    visited = defaultdict(bool)
    stack = []  
    with data_reader.data_reader(pathlib.Path(dataset_location)) as reader:
        datapath = pathlib.Path(filename)
        peano_dataset = reader[datapath] 
        current_node = peano_dataset.node_by_id(index)
        # Process initial node 

        stack.append((current_node, 0))
        depth
        while stack: 
            current_node, state = stack.pop()
            node_pytac_id = get_node_id(current_node)
            if state == 0:  # First time visiting the node
                    stack.append((current_node, 1))  # Push back with state 1
                    # Add all children to the stack
                    for _, child in list(current_node.children):

                        if get_node_id(child) not in visited:  # Process only unprocessed children
                            stack.append((child, 0))
                            visited[get_node_id(child)]=True
            else: # all children processed
                if len(list(current_node.children))==0 or current_node.label.which.name=='REL': 
                    depth[node_pytac_id] = 0
                else:
                    depth[node_pytac_id] = max([depth[get_node_id(child)] for _,child in list(current_node.children)])+1
    return depth
get_current_node_id(0, dataset_location)   

defaultdict(int, {'170-187': 0, '170-1': 1, '170-0': 2, '170-188': 0})

In [99]:
import os
import torch
from torch_geometric.data import Dataset, Data

# Define a custom dataset class with an option to save and load
class PytacPygeomDataset(Dataset):
    def __init__(self, root, dataset_location, files_list, transform=None, pre_transform=None):
        super().__init__(root, dataset_location, files_list, transform, pre_transform)
        self.root = root
        self.graphs = []
        self.dataset_location = dataset_location
        self.files_list = files_list
    
    def _process(self):
        # Example: Generate some graphs
        for filename in self.files_list:    
            size = get_file_size(self.dataset_location, filename=filename)
 

            for index in range(size):
                if not index%100: 
                    print(index)
                dag_info = get_dag(index, self.dataset_location, filename=filename, max_iterations=100000000000000)
                
                execution_order_dict = get_graph_nodes_depth(index, self.dataset_location, filename=filename)
                x = dag_info[0] # node_attributes
                edge_index = dag_info[1] 
                edge_attr = dag_info[2]
                execution_order = [execution_order_dict[i] for i in dag_info[4]] # execution order for the TreeLstm from low to high - first Leaf nodes

                self.graphs.append(Data(x=x, edge_index=edge_index, edge_attr=edge_attr, execution_order=execution_order))

            # Save the processed data
        self.save_to_disk()
    
    def save_to_disk(self):
        os.makedirs(self.processed_dir, exist_ok=True)
        for idx, graph in enumerate(self.graphs):
            path = os.path.join(self.processed_dir, f'graph_{idx}.pt')
            torch.save(graph, path)
    
    def load_from_disk(self):
        self.graphs = []
        for filename in os.listdir(self.processed_dir):
            if filename.endswith('.pt'):
                path = os.path.join(self.processed_dir, filename)
                graph = torch.load(path)
                self.graphs.append(graph)

    def len(self):
        return len(self.graphs)

    def get(self, idx):
        return self.graphs[idx]

# Define paths and process dataset
root_dir = './graph_dataset'
dataset = PytacPygeomDataset(root_dir, dataset_location, [filename])
dataset._process()  # Create and save the dataset

0
100


KeyboardInterrupt: 

In [117]:
a = dataset.get(2)
a.x, a.edge_index, a.edge_attr,a.execution_order

(['DEFINITION',
  'LAMBDA',
  'PROD',
  'SORT_TYPE',
  'LAMBDA',
  'PROD',
  'REL',
  'LAMBDA',
  'REL',
  'PROD',
  'DEFINITION',
  'CASE',
  'SORT_PROP',
  'DEFINITION',
  'LAMBDA',
  'REL',
  'CASE_BRANCH',
  'REL',
  'REL'],
 [(0, 1),
  (0, 2),
  (1, 3),
  (1, 4),
  (2, 3),
  (2, 5),
  (4, 6),
  (4, 7),
  (5, 8),
  (5, 9),
  (7, 10),
  (7, 11),
  (9, 10),
  (9, 8),
  (10, 12),
  (10, 13),
  (11, 10),
  (11, 14),
  (11, 15),
  (11, 16),
  (13, 17),
  (14, 10),
  (14, 6),
  (16, 13),
  (16, 18)],
 ['CONST_DEF',
  'CONST_TYPE',
  'LAMBDA_TYPE',
  'LAMBDA_TERM',
  'PROD_TYPE',
  'PROD_TERM',
  'LAMBDA_TYPE',
  'LAMBDA_TERM',
  'PROD_TYPE',
  'PROD_TERM',
  'LAMBDA_TYPE',
  'LAMBDA_TERM',
  'PROD_TYPE',
  'PROD_TERM',
  'IND_TYPE',
  'IND_CONSTRUCT',
  'CASE_IND',
  'CASE_RETURN',
  'CASE_TERM',
  'CASE_BRANCH_POINTER',
  'CONSTRUCT_TERM',
  'LAMBDA_TYPE',
  'LAMBDA_TERM',
  'C_B_CONSTRUCT',
  'C_B_TERM'],
 [8, 7, 5, 0, 6, 4, 0, 5, 0, 3, 2, 4, 0, 1, 3, 0, 2, 0, 0])

In [None]:
#node_type_list, edge_list, edge_type_list, node_to_children_dict, node_pytac_id_list
id = 2
ss = get_dag(id, dataset_location, filename)
st = get_current_node_id(id, dataset_location)   

x = ss[0] # node_attributes
edge_list = ss[1] 
edge_attr = ss[2]
distnace_from_leaf = [st[i] for i in ss[4]] # execution order for the TreeLstm from low to high - first Leaf nodes



([8, 7, 5, 0, 6, 4, 0, 5, 0, 3, 2, 4, 0, 1, 3, 0, 2, 0, 0],
 ['DEFINITION',
  'LAMBDA',
  'PROD',
  'SORT_TYPE',
  'LAMBDA',
  'PROD',
  'REL',
  'LAMBDA',
  'REL',
  'PROD',
  'DEFINITION',
  'CASE',
  'SORT_PROP',
  'DEFINITION',
  'LAMBDA',
  'REL',
  'CASE_BRANCH',
  'REL',
  'REL'])

In [None]:
a 