In [118]:
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '../src')

%load_ext autoreload
%autoreload 2

import torch
import torch_geometric
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt


from torch_geometric.datasets import TUDataset
from preprocessing import data_transformation
from similarity import calculate_similarity_matrix

from model import GCN

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [182]:
dataset = TUDataset(root='datasets/', name='MUTAG')
torch.manual_seed(1234)
dataset = dataset.shuffle()

## Preprocessing

#### Split: Train test validation

```train_dataset```: for training model<br/>
```val_dataset```: evaluate model for hyperparameter tunning<br/>
```test_dataset```: testing model after complete training<br/>

In [183]:
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data

In [184]:
tr, ts, vl = 0.8, 0.1, 0.1
dslen = len(dataset)
tri = round(tr*dslen)
tsi = round((tr+ts)*dslen)
train_dataset = dataset[:tri]
test_dataset = dataset[tri:tsi]
val_dataset = dataset[tsi:]

In [185]:
dataset.y

tensor([1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
        1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
        0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
        1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1,
        0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])

In [186]:
print(len(train_dataset))
train_dataset.y

150


tensor([1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
        1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
        0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
        1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1,
        1, 1, 0, 1, 1, 1])

In [187]:
len(test_dataset)
test_dataset.y

tensor([0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0])

In [188]:
len(val_dataset)
val_dataset.y

tensor([0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1])

#### Batching

In [189]:
# paper 128
batch_size = 2

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [190]:
batch1 = None
for batch in train_loader:
    batch1 = batch
    break
print(batch1)
print(batch1.batch)
print("edge_index", batch1.edge_index)
print("batch",batch1.edge_attr)
print("ptr",batch1.ptr)

DataBatch(edge_index=[2, 92], x=[40, 7], edge_attr=[92, 4], y=[2], batch=[40], ptr=[3])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
edge_index tensor([[ 0,  0,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  5,  5,  5,  6,  6,  6,
          7,  8,  9,  9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15,
         15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23,
         23, 24, 24, 25, 25, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30,
         30, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37,
         38, 39],
        [ 1,  5,  0,  2,  1,  3, 11,  2,  4,  9,  3,  5,  0,  4,  6,  5,  7,  8,
          6,  6,  3, 10,  9, 11, 15,  2, 10, 12, 11, 13, 12, 14, 13, 15, 16, 10,
         14, 14, 18, 22, 17, 19, 18, 20, 19, 21, 30, 20, 22, 23, 17, 21, 21, 24,
         28, 23, 25, 24, 26, 25, 27, 26, 28, 36, 23, 27, 29, 28, 30, 34, 20, 29,
         31, 30, 

In [191]:
print('train loader')
for data in train_loader:
    print(data.y)
    
print('val loader')
for data in val_loader:
    print(data.y)
    
print('test loader')
for data in test_loader:
    print(data.y)

train loader
tensor([1, 1])
tensor([0, 1])
tensor([0, 1])
tensor([1, 1])
tensor([0, 1])
tensor([1, 1])
tensor([1, 1])
tensor([0, 1])
tensor([1, 0])
tensor([1, 0])
tensor([1, 1])
tensor([0, 0])
tensor([1, 0])
tensor([1, 0])
tensor([1, 1])
tensor([1, 0])
tensor([1, 0])
tensor([1, 1])
tensor([0, 1])
tensor([1, 1])
tensor([1, 1])
tensor([0, 0])
tensor([1, 0])
tensor([1, 1])
tensor([0, 0])
tensor([1, 0])
tensor([1, 0])
tensor([1, 0])
tensor([1, 1])
tensor([1, 0])
tensor([1, 0])
tensor([1, 1])
tensor([1, 1])
tensor([1, 0])
tensor([1, 1])
tensor([1, 0])
tensor([0, 1])
tensor([1, 0])
tensor([1, 1])
tensor([0, 1])
tensor([1, 1])
tensor([1, 0])
tensor([1, 1])
tensor([1, 0])
tensor([1, 1])
tensor([1, 1])
tensor([1, 1])
tensor([1, 1])
tensor([1, 0])
tensor([0, 0])
tensor([1, 1])
tensor([1, 1])
tensor([0, 1])
tensor([1, 1])
tensor([1, 1])
tensor([1, 0])
tensor([1, 1])
tensor([1, 1])
tensor([1, 1])
tensor([0, 1])
tensor([1, 1])
tensor([1, 0])
tensor([0, 1])
tensor([1, 1])
tensor([0, 1])
tensor([0, 1

## Building Model

In [192]:
from torch_geometric.nn import GCNConv
from torch.nn import Linear

from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import global_add_pool

### Base model

In [193]:
class Base(torch.nn.Module):
    # merging type: o --> complement only, s --> substraction, c --> concatenation
    def __init__(self, dataset, hidden_channels):
        super(Base, self).__init__()
        
        # weight seed
        torch.manual_seed(42)
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        # classification layer
        
        self.lin = Linear(hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch):
        # Embed original
        embedding = self.conv1(x, edge_index)
        embedding = embedding.relu()
        embedding = self.conv2(embedding, edge_index)
        embedding = embedding.relu()
        embedding = self.conv3(embedding, edge_index)
        embedding = embedding.relu()
        # subgraph_embedding = subgraph_embedding.relu()
        
        embedding = global_mean_pool(embedding, batch)
        h = self.lin(embedding)
        h = h.relu()
        h = self.lin2(h)
        
        return embedding, h

In [194]:
base = Base(dataset, 64)
base

Base(
  (conv1): GCNConv(7, 64)
  (conv2): GCNConv(64, 64)
  (conv3): GCNConv(64, 64)
  (lin): Linear(in_features=64, out_features=64, bias=True)
  (lin2): Linear(in_features=64, out_features=2, bias=True)
)

In [197]:
def train_base(model, loader, experiment_mode=False):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
    
    model.train()
    
    for data in loader:
        if experiment_mode:
            emb, h = model(data.x, data.edge_index, data.batch, data.ptr)
        else:
            emb, h = model(data.x, data.edge_index, data.batch)
        loss = criterion(h, data.y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return loss
    #     print(h[0])
    # print(loss)

@torch.no_grad()
def test_base(model, loader, experiment_mode=False):
    model.eval()
    correct = 0
    for data in loader:
        if experiment_mode:
            emb, h = model(data.x, data.edge_index, data.batch, data.ptr)
        else:
            emb, h = model(data.x, data.edge_index, data.batch)
        pred = h.argmax(dim=1)
        correct += int((pred == data.y).sum())
    return correct/len(loader.dataset)

base = Base(dataset, 64)
train_base(base, train_loader)

tensor(0.4234, grad_fn=<NllLossBackward0>)

In [198]:
# epoch = 100

# base = Base(dataset, 64)
# train_base(base, train_loader)

# # Train
# for _ in range(epoch):
#     loss = round(train_base(base, train_loader).item(), 2)
#     train_acc = round(test_base(base, train_loader), 2)
#     val_acc = round(test_base(base, val_loader), 2)
    
#     print(f'epoch {_}; loss: {loss}; train_acc: {train_acc}; test_acc: {val_acc}')

# # Test
# test = test_base(base, test_loader)
# print(f'Accuracy: {test}')

### Experiment Model

In [199]:
from sklearn.cluster import AffinityPropagation

In [200]:
batch1 = None
for batch in train_loader:
    batch1 = batch
    break
print(batch1)
print(batch1.batch)
print("edge_index", batch1.edge_index)
print("batch",batch1.edge_attr)
print("ptr",batch1.ptr)

DataBatch(edge_index=[2, 92], x=[40, 7], edge_attr=[92, 4], y=[2], batch=[40], ptr=[3])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
edge_index tensor([[ 0,  0,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  5,  5,  5,  6,  6,  6,
          7,  8,  9,  9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15,
         15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20, 21, 21, 21, 22, 22, 23, 23,
         23, 24, 24, 25, 25, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30,
         30, 31, 31, 31, 32, 32, 33, 33, 34, 34, 34, 35, 35, 36, 36, 37, 37, 37,
         38, 39],
        [ 1,  5,  0,  2,  1,  3, 11,  2,  4,  9,  3,  5,  0,  4,  6,  5,  7,  8,
          6,  6,  3, 10,  9, 11, 15,  2, 10, 12, 11, 13, 12, 14, 13, 15, 16, 10,
         14, 14, 18, 22, 17, 19, 18, 20, 19, 21, 30, 20, 22, 23, 17, 21, 21, 24,
         28, 23, 25, 24, 26, 25, 27, 26, 28, 36, 23, 27, 29, 28, 30, 34, 20, 29,
         31, 30, 

In [201]:
print(max(batch1.edge_index[0]))
print(max(batch1.edge_index[1]))
print((dataset[0].edge_index))
print((batch1.ptr), '; len:', len(batch1.ptr))

tensor(39)
tensor(39)
tensor([[ 0,  0,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  5,  5,  5,  6,  6,  6,
          7,  8,  9,  9, 10, 10, 10, 11, 11, 11, 12, 12, 13, 13, 14, 14, 14, 15,
         15, 16],
        [ 1,  5,  0,  2,  1,  3, 11,  2,  4,  9,  3,  5,  0,  4,  6,  5,  7,  8,
          6,  6,  3, 10,  9, 11, 15,  2, 10, 12, 11, 13, 12, 14, 13, 15, 16, 10,
         14, 14]])
tensor([ 0, 17, 40]) ; len: 3


Below --> Subgraph extractor with batch information

#### Model modification

In [365]:
# paper 128
batch_size = 5

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

batch1 = next(iter(train_loader))

In [371]:


from similarity import calculate_similarity_matrix, testt


# AP Clustering
from sklearn.cluster import AffinityPropagation

from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import global_max_pool

import torch.nn.functional as F

class Experiment(torch.nn.Module):
    # merging type: o --> complement only, s --> substraction, c --> concatenation
    def __init__(self, dataset, hidden_channels):
        super(Experiment, self).__init__()
        
        # weight seed
        torch.manual_seed(42)
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        # self.conv3 = GCNConv(hidden_channels, hidden_channels)
        
        # embeddings for subgraph
        self.conv4 = GCNConv(hidden_channels, hidden_channels)
        self.conv5 = GCNConv(hidden_channels, hidden_channels)
        
        # classification layer
        self.lin = Linear(hidden_channels*2, hidden_channels)
        self.lin2 = Linear(hidden_channels, dataset.num_classes)

    def forward(self, x, edge_index, batch, ptr):
        # Embed original
        embedding = self.conv1(x, edge_index)
        embedding = embedding.relu()
        embedding = self.conv2(embedding, edge_index)
        # embedding = embedding.relu()
        # embedding = self.conv3(embedding, edge_index)
        # embedding = embedding.relu()
        
        # generate subgraph based on embeddings
        feature_emb = embedding.detach()
        
        subgraph_edge_index, communities, S, batch_communities = self.subgraph_generator(feature_emb, edge_index, batch, ptr)
        subgraph_embedding = self.conv4(embedding, subgraph_edge_index)
        subgraph_embedding = subgraph_embedding.relu()
        subgraph_embedding = self.conv5(subgraph_embedding, subgraph_edge_index)
        # subgraph_embedding = subgraph_embedding.relu()
        
        self.subgraph_pooling(subgraph_embedding, communities, batch, ptr, batch_communities)
        
        embedding = global_mean_pool(embedding, batch)
        subgraph_embedding = global_max_pool(subgraph_embedding, batch)
        
        
        h = torch.cat((embedding, subgraph_embedding), 1)
        
        h = F.dropout(h, p=0.3, training=self.training)
        h = self.lin(h)
        h = h.relu()
        x = F.dropout(h, p=0.3, training=self.training)
        h = self.lin2(h)
        
        return embedding, h, S, communities
    
    def subgraph_generator(self, embeddings, batch_edge_index, batch, ptr):
        '''
        Return subgraph_edge_index (edge_index of created subgraph)
        '''
        graph_counter = 0
        edge_index = [[],[]]
        subgraph_edge_index = [[],[]]
        # Gs = []
        sub_created = False
        graph_bound = {}
        all_communities = []
        batch_communities = {}
        S = []

        for i in range(len(ptr)-1):
            graph_bound[i] = [ptr[i].item(), ptr[i+1].item()]
        
        for i, (src, dst) in enumerate(zip(batch_edge_index[0], batch_edge_index[1])):
            lower_bound = graph_bound[graph_counter][0]
            upper_bound = graph_bound[graph_counter][1]
            if ((src >= lower_bound and src < upper_bound) or
                (dst >= lower_bound and dst < upper_bound)):
                
                edge_index[0].append(src - lower_bound)
                edge_index[1].append(dst - lower_bound)
            else:
                sub_created = True
                
            if (i == len(batch_edge_index[0]) - 1) or sub_created:
                sub_created = False
                
                embs = []
                # make new graph
                for i, (b, emb) in enumerate(zip(batch, embeddings)):
                    if (b == graph_counter):
                        embs.append(emb)
                # print('emb x', embs)
                G = data_transformation(edge_index, embs)
                # dont need this at the moment
                # Gs.append(G)
                
                # Calculate similarity matrix
                S = calculate_similarity_matrix(G)
                
                # print('S matrix', S)
                # AP Clustering        
                clustering = AffinityPropagation(affinity='precomputed', damping=0.9, random_state=0, convergence_iter=15, max_iter=1000)
                clustering.fit(S)
                
                print('clustering label', clustering.labels_)
                
                # Get community
                communities = {}
                for lab in clustering.labels_:
                    communities[lab] = []
                    all_communities.append(lab)
                for nd, clust in enumerate(clustering.labels_):
                    communities[clust].append(nd)
                print(communities)
                
                edge_index = [[],[]]
                batch_communities[graph_counter] = communities
                
                graph_counter+=1
                
                # Make subgraph edge_index
                for c in communities:
                    w = G.subgraph(communities[c])
                    for sub in w.edges:
                        subgraph_edge_index[0].append(sub[0] + lower_bound)
                        subgraph_edge_index[1].append(sub[1] + lower_bound)
                
                print('all com', all_communities)
                # break # sementara aja
        
        # print('batch communities', batch_communities)
        return torch.tensor(subgraph_edge_index), all_communities, S, batch_communities
    
        
    def subgraph_pooling(self, embeddings, communities, batch, ptr, batch_communities):
        # batch communities: batch -> communities -> member
        pool_type = 'mean'
        curr_batch = 0
        emb_temp = None
        emb_pool = []
        print('batch communities', batch_communities)
        
        print('batch loop')
        print('')
        
        # LOOP THROUGH BATCH
        for b in batch_communities:
            print(f'==== BATCH {b} ====')
            print('lower bound', ptr[b].item())
            print('len communities on this batch', len(batch_communities[b]))
            
            # initialize array
            emb_temp = [[] for _ in range(len(batch_communities[b]))]
            emb_pool = [[] for _ in range(len(batch_communities[b]))]
            for comm in batch_communities[b]:
                for member in batch_communities[b][comm]:
                    # emb_temp[comm].append(member + ptr[b].item())
                    index_used = member + ptr[b].item()
                    emb_temp[comm].append(embeddings[index_used].detach().tolist())
                    # print('embtemp-log', emb_temp)
                    # print(comm, "-",member)
                print('break new community', comm)

                # Pooling per sub graph                
                if pool_type == 'mean': # mean pool
                    emb_pool[comm] = np.array(emb_temp[comm]).mean(axis=0)
                elif pool_type == 'add': # add pool
                    emb_pool[comm] = np.array(emb_temp[comm]).sum(axis=0)
                else:
                    print('TODO: fill later')
                
            print("pool subgraph === ", np.array(emb_pool))
            print("Pool size ", np.array(emb_pool).shape)
            print()
            # emb_temp = np.array(emb_temp)
            # print("emb_temp", emb_temp)
            # print("len emb_temp", len(emb_temp))
            # print("emb_temp average")
            # print("size emb_temp", len(emb_temp))
        # print(len(emb_temp))
                    
        # curr_batch+=1
        # # print('num iteration', curr_batch)
        # print("communities", communities)
        # print("batch", batch)
        # print("ptr", ptr)
        # print('calling subgraph pooling')
        

experiment = Experiment(dataset, 64)
emb, h, S, communities = experiment(batch1.x, batch1.edge_index, batch1.batch, batch1.ptr)

clustering label [0 0 0 0 0 0 1 0 0 0 1 2 2 2 2 1 2]
{0: [0, 1, 2, 3, 4, 5, 7, 8, 9], 1: [6, 10, 15], 2: [11, 12, 13, 14, 16]}
all com [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 2, 1, 2]
clustering label [0 1 1 3 0 0 2 2 2 2 2 3 3 1 3 3 3 3 3 2 1 3 3]
{0: [0, 4, 5], 1: [1, 2, 13, 20], 3: [3, 11, 12, 14, 15, 16, 17, 18, 21, 22], 2: [6, 7, 8, 9, 10, 19]}
all com [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 2, 1, 2, 0, 1, 1, 3, 0, 0, 2, 2, 2, 2, 2, 3, 3, 1, 3, 3, 3, 3, 3, 2, 1, 3, 3]
clustering label [0 0 0 0 1 1 1 2 2 2 2 2 2 2 2 2 2]
{0: [0, 1, 2, 3], 1: [4, 5, 6], 2: [7, 8, 9, 10, 11, 12, 13, 14, 15, 16]}
all com [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 2, 1, 2, 0, 1, 1, 3, 0, 0, 2, 2, 2, 2, 2, 3, 3, 1, 3, 3, 3, 3, 3, 2, 1, 3, 3, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
clustering label [0 0 0 0 1 1 1 1 1 2 1 1 0 0 3 1 2 2 3 3 2 1 1]
{0: [0, 1, 2, 3, 12, 13], 1: [4, 5, 6, 7, 8, 10, 11, 15, 21, 22], 2: [9, 16, 17, 20], 3: [14, 18, 19]}
all com [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,

In [350]:
test = np.array([[1,2,3],[1,3,2],[2,2,2]])

print('mean', np.mean(test, axis=0))
print('sum', np.sum(test, axis=0))

mean [1.33333333 2.33333333 2.33333333]
sum [4 7 7]


In [295]:
emb_temp = [[] for _ in range(3)]
emb_temp[1].append(2)
print(emb_temp)

[[], [2], []]


In [252]:
emb.size()
# len(communities)

torch.Size([2, 64])

In [229]:
print(len(communities) == 
batch1.batch.size()[0])

True


In [207]:
clustering = AffinityPropagation(affinity='precomputed', damping=0.7, random_state=42, convergence_iter=15, max_iter=3000)
clustering.fit(S)
clustering.labels_
# clustering.

array([0, 0, 0, 0, 3, 0, 1, 1, 2, 1, 2, 2, 3, 3, 3, 4, 3, 3, 3, 3, 4, 4,
       4], dtype=int64)

In [208]:
dataset

MUTAG(188)

In [210]:
def expTrain(train_loader, val_loader, test_loader, epoch = 2):
    import warnings
    warnings.filterwarnings("ignore", category=UserWarning) 

    experiment = Experiment(dataset, 64)

    # Train
    print('process training')
    for _ in range(epoch):
        loss = round(train_base(experiment, train_loader, True).item(), 5)
        train_acc = round(test_base(experiment, train_loader, True), 5)
        val_acc = round(test_base(experiment, val_loader, True), 5)
        
        print(f'epoch {_}; loss: {loss}; train_acc: {train_acc}; test_acc: {val_acc}')

    # Test
    print('process testing')
    test = test_base(experiment, test_loader, True)
    print(f'Accuracy: {test}')

# expTrain(train_loader, val_loader, test_loader, epoch = 1)

In [211]:
def baseTrain(train_loader, val_loader, test_loader, epoch = 10):
    base = Base(dataset, 64)

    # Train
    for _ in range(epoch):
        loss = round(train_base(base, train_loader).item(), 5)
        train_acc = round(test_base(base, train_loader), 5)
        val_acc = round(test_base(base, val_loader), 5)
        
        print(f'epoch {_}; loss: {loss}; train_acc: {train_acc}; val_acc: {val_acc}; test: {round(test_base(base, test_loader), 2)}')

    # Test
    test = test_base(base, test_loader)
    print(f'Accuracy: {test}')

#### Cross validation 10

In [30]:
from sklearn.model_selection import KFold

In [45]:
train_dataset = dataset[:round(len(dataset) * 0.8)]
test_dataset = dataset[round(len(dataset) * 0.8):]

In [None]:
S

In [46]:
# 
train_dataset
test_dataset
k = 10

splits = KFold(n_splits=k,shuffle=True,random_state=42)
k_counter = 0

for fold, (train_idx,val_idx) in enumerate(splits.split(np.arange(len(train_dataset)))):
    # print('Fold {}'.format(fold + 1))
    # print(f'Fold',fold,'Train_idx',train_idx,'Val_idx',val_idx)
    print(f'Fold {fold}/{k}')
    #if k_counter > 2:
    #    break
    
    fold_train = []
    for key in train_idx:
        fold_train.append(train_dataset[key])

    fold_val = [] 
    for key in val_idx:
        fold_val.append(train_dataset[key])

    tr = DataLoader(fold_train, batch_size=batch_size, shuffle=True)
    vd = DataLoader(fold_val, batch_size=batch_size, shuffle=True)
    ts = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
    
    # Base model
    print("=== Base model ===")
    baseTrain(tr, vd, ts, 10)
    print("=== Experiment model ===")
    expTrain(tr, vd, ts, 10)
    
    k_counter += 1

Fold 0/10
=== Base model ===
epoch 0; loss: 0.69358; train_acc: 0.7125; val_acc: 0.675; test: 0.81
epoch 1; loss: 0.51878; train_acc: 0.72083; val_acc: 0.675; test: 0.73
epoch 2; loss: 0.57897; train_acc: 0.71944; val_acc: 0.6375; test: 0.73
epoch 3; loss: 0.55901; train_acc: 0.72778; val_acc: 0.6875; test: 0.74
epoch 4; loss: 0.58857; train_acc: 0.70556; val_acc: 0.575; test: 0.68
epoch 5; loss: 0.49834; train_acc: 0.7125; val_acc: 0.6625; test: 0.72
epoch 6; loss: 0.55777; train_acc: 0.73472; val_acc: 0.7125; test: 0.77
epoch 7; loss: 0.59016; train_acc: 0.74306; val_acc: 0.725; test: 0.76
epoch 8; loss: 0.52072; train_acc: 0.74722; val_acc: 0.6625; test: 0.77
epoch 9; loss: 0.58987; train_acc: 0.7125; val_acc: 0.75; test: 0.77
Accuracy: 0.77
=== Experiment model ===
process training
epoch 0; loss: 0.67796; train_acc: 0.475; test_acc: 0.575
epoch 1; loss: 0.66832; train_acc: 0.65417; test_acc: 0.7125
epoch 2; loss: 0.7253; train_acc: 0.60972; test_acc: 0.6625
epoch 3; loss: 0.65472; 