# For RAND baseline, it suffices to compute log(1/n) where n is the number of nodes in the graph

### if the regression target is continuous and multivariate, we create an uniform distribution for all output dimensions, then we multiply all dimensions' probabilities together and take the natural log of the result

In [14]:
import os
import json
import yaml
import torch
import seaborn as sns
from pathlib import Path
from pydoc import locate
from pydgn.experiment.experiment import s2c
import matplotlib.pyplot as plt

from pydgn.experiment.experiment import Experiment

from pydgn.evaluation.grid import Grid
from pydgn.data.provider import DataProvider
from gmdn_dataset import BarabasiAlbertDataset, ErdosRenyiDataset, TUDatasetInterface

In [15]:
splits_folder = Path('SPLITS')

num_workers = 0

barabasi_albert_root = Path('DATA', 'BARABASI_ALBERT')
erdos_renyi_root = Path('DATA', 'ERDOS_RENYI')
tudataset_root = Path('DATA')

### Define Barabasi-Albert dataset loaders

In [16]:
def get_barabasi_albert(size, connectivity):
    dataset_name = Path(f'barabasi_albert_{size}_{connectivity}')
    if not os.path.exists(barabasi_albert_root / dataset_name):
        print("Create your dataset first")
        return None
    else:
        d = BarabasiAlbertDataset(barabasi_albert_root, 'barabasi_albert', size, connectivity)
        batch_size = len(d)
        shuffle = False
        dataset_getter_class = s2c('pydgn.data.provider.DataProvider')
        dataset_getter = dataset_getter_class(barabasi_albert_root,
                                              'SPLITS',
                                              s2c('data.dataset.BarabasiAlbertDataset'),
                                              dataset_name,
                                              1, # outer_folds
                                              1, # inner folds
                                              0, # num_workers
                                              False)  # pin memory
        dataset_getter.set_outer_k(0)
        dataset_getter.set_inner_k(0)

        # Instantiate the Dataset Loaders
        train_loader = dataset_getter.get_outer_train(batch_size=batch_size, shuffle=shuffle)
        val_loader = dataset_getter.get_outer_val(batch_size=batch_size, shuffle=shuffle)
        test_loader = dataset_getter.get_outer_test(batch_size=batch_size, shuffle=shuffle)
                
        return train_loader, val_loader, test_loader

### Istantiate Erdos-Renyi dataset

In [17]:
def get_erdos_renyi(size, connectivity):
    dataset_name = Path(f'erdos_renyi_{size}_{connectivity}')
    if not os.path.exists(erdos_renyi_root / dataset_name):
        print("Create your dataset first")
        return None
    else:
        d = ErdosRenyiDataset(erdos_renyi_root, 'erdos_renyi', size, connectivity)
        batch_size = len(d)
        shuffle = False
        dataset_getter_class = s2c('pydgn.data.provider.DataProvider')
        dataset_getter = dataset_getter_class(erdos_renyi_root,
                                              'SPLITS',
                                              s2c('data.dataset.ErdosRenyiDataset'),
                                              dataset_name,
                                              1, # outer_folds
                                              1, # inner folds
                                              0, # num_workers
                                              False)  # pin memory
        dataset_getter.set_outer_k(0)
        dataset_getter.set_inner_k(0)

        # Instantiate the Dataset Loaders
        train_loader = dataset_getter.get_outer_train(batch_size=batch_size, shuffle=shuffle)
        val_loader = dataset_getter.get_outer_val(batch_size=batch_size, shuffle=shuffle)
        test_loader = dataset_getter.get_outer_test(batch_size=batch_size, shuffle=shuffle)
                
        return train_loader, val_loader, test_loader

### Istantiate TuDataset

In [24]:
def get_TUDataset(dataset_name):

    if not os.path.exists(tudataset_root / dataset_name):
        print("Create your dataset first")
        return None
    else:
        d = TUDatasetInterface(tudataset_root, dataset_name, use_node_attr=True)
        batch_size = len(d)
        shuffle = False
        dataset_getter_class = s2c('pydgn.data.provider.DataProvider')
        dataset_getter = dataset_getter_class(tudataset_root,
                                              'SPLITS',
                                              s2c('data.dataset.TUDatasetInterface'),
                                              dataset_name,
                                              1, # outer_folds
                                              1, # inner folds
                                              0, # num_workers
                                              False)  # pin memory
        dataset_getter.set_outer_k(0)
        dataset_getter.set_inner_k(0)

        # Instantiate the Dataset Loaders
        train_loader = dataset_getter.get_outer_train(batch_size=batch_size, shuffle=shuffle)
        val_loader = dataset_getter.get_outer_val(batch_size=batch_size, shuffle=shuffle)
        test_loader = dataset_getter.get_outer_test(batch_size=batch_size, shuffle=shuffle)
                
        return d, train_loader, val_loader, test_loader

### Pick a dataset and get the mean log likelihood using a model of your choice

#### Barabasi-Albert dataset

In [18]:
size = 100
connectivity= "2@5@10@20"

barabasi_albert_name = Path(f'barabasi_albert_{size}_{connectivity}')
# Get full batch
barabasi_albert_train_loader, barabasi_albert_val_loader, barabasi_albert_test_loader = get_barabasi_albert(size, connectivity)

# NB: size of graphs is fixed
for train_data in barabasi_albert_train_loader:
    train_y = train_data.y
    
# NB: size of graphs is fixed
for val_data in barabasi_albert_val_loader:
    val_y = val_data.y
    
# NB: size of graphs is fixed
for test_data in barabasi_albert_test_loader:
    test_y = test_data.y
    
train_bins = torch.bincount(train_y.squeeze())
normalized_train_bins = train_bins.float()/torch.sum(train_bins)
# sns.distplot(train_y.squeeze().numpy(), bins=100, kde=True)

p_y = normalized_train_bins[train_y] + 1e-8
print("HIST performance on TRAIN is ", p_y.log().mean())

p_y = normalized_train_bins[val_y] + 1e-8
print("HIST performance on VAL is ", p_y.log().mean())

p_y = normalized_train_bins[test_y] + 1e-8
print("HIST performance on TEST is ", p_y.log().mean())

HIST performance on TRAIN is  tensor(-1.1418)
HIST performance on VAL is  tensor(-1.1545)
HIST performance on TEST is  tensor(-1.1607)


#### Erdos-Renyi dataset

In [21]:
size = 100
connectivity= "0.01@0.05@0.1@0.2"
erdos_renyi_name = Path(f'erdos_renyi_{size}_{connectivity}')
erdos_renyi_train_loader, erdos_renyi_val_loader, erdos_renyi_test_loader = get_erdos_renyi(size, connectivity)

# NB: size of graphs is fixed
for train_data in erdos_renyi_train_loader:
    train_y = train_data.y
    
# NB: size of graphs is fixed
for val_data in erdos_renyi_val_loader:
    val_y = val_data.y
    
# NB: size of graphs is fixed
for test_data in erdos_renyi_test_loader:
    test_y = test_data.y
    
train_bins = torch.bincount(train_y.squeeze())
normalized_train_bins = train_bins.float()/torch.sum(train_bins)
# sns.distplot(train_y.squeeze().numpy(), bins=100, kde=True)

p_y = normalized_train_bins[train_y] + 1e-8
print("HIST performance on TRAIN is ", p_y.log().mean())

p_y = normalized_train_bins[val_y] + 1e-8
print("HIST performance on VAL is ", p_y.log().mean())

p_y = normalized_train_bins[test_y] + 1e-8
print("HIST performance on TEST is ", p_y.log().mean())

HIST performance on TRAIN is  tensor(-2.3129)
HIST performance on VAL is  tensor(-2.2881)
HIST performance on TEST is  tensor(-2.3249)


#### alchemy_full

In [81]:
dataset_name = 'alchemy_full'
dataset, train_loader, val_loader, test_loader = get_TUDataset(dataset_name)

max_val, _ = dataset.data.y.max(dim=0)
min_val, _ = dataset.data.y.min(dim=0)
    
# NB: size of graphs is fixed
for train_data in train_loader:
    train_y = train_data.y
    
# NB: size of graphs is fixed
for val_data in val_loader:
    val_y = val_data.y
    
# NB: size of graphs is fixed
for test_data in test_loader:
    test_y = test_data.y


# BEST CHOSEN BY TRIALS WHILE LOOKING AT THE VALIDATION SCORE, THEN TEST SCORE WAS CHECKED
no_bins = 17

train_p_y_by_component = []
val_p_y_by_component = []
test_p_y_by_component = []
for i in range(train_y.shape[1]):    
    # print(f'Component {i+1}')
    
    bin_size = (max_val[i]-min_val[i])/no_bins
    train_hist = torch.histc(train_y[:,i], bins=no_bins, min=min_val[i], max=max_val[i])
    normalized_train_hist = train_hist.float()/torch.sum(train_bins)

    train_bin_assignment = ((train_y[:,i]-min_val[i])//bin_size).long() - 1
    val_bin_assignment = ((val_y[:,i]-min_val[i])//bin_size).long() - 1
    test_bin_assignment = ((test_y[:,i]-min_val[i])//bin_size).long() - 1

    p_y = normalized_train_hist[train_bin_assignment] + 1e-8
    train_p_y_by_component.append(p_y.log())
    # print(f"HIST performance on TRAIN for component {i+1} is ", p_y.log().mean())
  
    p_y = normalized_train_hist[val_bin_assignment] + 1e-8
    val_p_y_by_component.append(p_y.log())
    # print(f"HIST performance on VAL for component {i+1} is ", p_y.log().mean())

    p_y = normalized_train_hist[test_bin_assignment] + 1e-8
    test_p_y_by_component.append(p_y.log())
    # print(f"HIST performance on TEST for component {i+1} is ", p_y.log().mean())

train_p_y  = torch.stack(train_p_y_by_component, dim=1)
val_p_y  = torch.stack(val_p_y_by_component, dim=1)
test_p_y  = torch.stack(test_p_y_by_component, dim=1)
    
print("HIST performance on TRAIN is ", train_p_y.sum(dim=1).mean())
print("HIST performance on VAL is ", val_p_y.sum(dim=1).mean())
print("HIST performance on TEST is ", test_p_y.sum(dim=1).mean())

#### ZINC_full

In [63]:
dataset_name = 'ZINC_full'
dataset, train_loader, val_loader, test_loader = get_TUDataset(dataset_name)

max_val, _ = dataset.data.y.max(dim=0)
min_val, _ = dataset.data.y.min(dim=0)
    
# NB: size of graphs is fixed
for train_data in train_loader:
    train_y = train_data.y
    
# NB: size of graphs is fixed
for val_data in val_loader:
    val_y = val_data.y
    
# NB: size of graphs is fixed
for test_data in test_loader:
    test_y = test_data.y


# BEST CHOSEN BY TRIALS WHILE LOOKING AT THE VALIDATION SCORE, THEN TEST SCORE WAS CHECKED
no_bins = 31

train_p_y_by_component = []
val_p_y_by_component = []
test_p_y_by_component = []
for i in range(train_y.shape[1]):    
    print(f'Component {i+1}')
    
    bin_size = (max_val[i]-min_val[i])/no_bins
    train_hist = torch.histc(train_y[:,i], bins=no_bins, min=min_val[i], max=max_val[i])
    normalized_train_hist = train_hist.float()/torch.sum(train_bins)

    train_bin_assignment = ((train_y[:,i]-min_val[i])//bin_size).long() - 1
    val_bin_assignment = ((val_y[:,i]-min_val[i])//bin_size).long() - 1
    test_bin_assignment = ((test_y[:,i]-min_val[i])//bin_size).long() - 1

    p_y = normalized_train_hist[train_bin_assignment] + 1e-8
    train_p_y_by_component.append(p_y.log())
  
    p_y = normalized_train_hist[val_bin_assignment] + 1e-8
    val_p_y_by_component.append(p_y.log())

    p_y = normalized_train_hist[test_bin_assignment] + 1e-8
    test_p_y_by_component.append(p_y.log())


train_p_y  = torch.stack(train_p_y_by_component, dim=1)
val_p_y  = torch.stack(val_p_y_by_component, dim=1)
test_p_y  = torch.stack(test_p_y_by_component, dim=1)
    
print("HIST performance on TRAIN is ", train_p_y.sum(dim=1).mean())
print("HIST performance on VAL is ", val_p_y.sum(dim=1).mean())
print("HIST performance on TEST is ", test_p_y.sum(dim=1).mean())

Component 1
HIST performance on TRAIN is  tensor(-1.2752)
HIST performance on VAL is  tensor(-1.2793)
HIST performance on TEST is  tensor(-1.2841)
