In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.sparse as sp
import os.path as osp
import os
import torch
import torch_geometric as tg
import pickle
from collections import defaultdict

In [2]:
%cd ..

/nfs/homedirs/fuchsgru/MastersThesis


In [3]:
import data.constants as dc
from util import all_equal

In [4]:
df = []

for dataset in (dc.OGBN_ARXIV, dc.CORA_FULL, dc.CORA_ML, dc.AMAZON_COMPUTERS, dc.AMAZON_PHOTO, dc.CITESEER, dc.COAUTHOR_CS, dc.COAUTHOR_PHYSICS, dc.PUBMED, ):
    print(f'Analyzing {dataset}')
    base_dir = osp.join('.exported_datasets', dataset)
    for setting in ('hybrid', 'transductive'):
        for ood_type, (_ood_val, _ood_test) in (
            ('left-out-classes', ('ood-val', 'ood-test')), 
            # Both bernoulli and normal perturbations follow the same structure
            ('perturbations', ('ood-val-ber', 'ood-test-ber'))):


            split_dir = osp.join(base_dir, f'{setting}-{ood_type}')
            for fn in [osp.join(split_dir, fn) for fn in os.listdir(split_dir)]:
                with open(fn, 'rb') as f:
                    storage = pickle.load(f)
                    datasets = storage['data']

                    n_train = datasets['train'].x.size(0)
                    n_full = datasets['ood-test'].x.size(0)

                    label_u = datasets['train'].y.numpy()[datasets['train'].edge_index[0]]
                    label_v = datasets['train'].y.numpy()[datasets['train'].edge_index[1]]
                    
                    label_utest = datasets['ood-test'].y.numpy()[datasets['ood-test'].edge_index[0]]
                    label_vtest = datasets['ood-test'].y.numpy()[datasets['ood-test'].edge_index[1]]

                    train_labels = set(datasets['train'].y[datasets['train'].mask].numpy())
                    ood_labels = set(datasets['ood-test'].y[datasets['ood-test'].mask].numpy()) - train_labels

                    is_ood_label = np.zeros(datasets['ood-test'].x.size(0))
                    for label in ood_labels:
                        is_ood_label[datasets['ood-test'].y == label] = True

                    # For both ood-val and ood-test count many have the 'ood' tag
                    mask_ood_val_test = datasets['ood-val'].mask | datasets['ood-test'].mask
                    is_ood = ~(datasets[_ood_test].is_in_distribution[mask_ood_val_test])
                    fraction_ood_tag = is_ood.sum() / mask_ood_val_test.sum()

                    df.append({
                        'Train Vertices' : n_train,
                        'Features' : datasets['train'].x.size(1),
                        'Train Edges' : datasets['train'].edge_index.size(1),
                        'Fraction Dropped' : 1 - (n_train / n_full),
                        'Fraction OOD labels' : is_ood_label.sum() / is_ood_label.shape[0],
                        'Train Edge Density' : datasets['train'].edge_index.size(1) / (datasets['train'].x.size(0)**2),
                        'Fraction OOD tag in OOD dataset' : fraction_ood_tag.item(),
                        'Train Classes' : datasets['train'].y.max().item() + 1,
                        'Setting' : setting,
                        'OOD Experiment' : ood_type,
                        'Homophily Train Graph' : (label_u == label_v).sum() / label_u.shape[0],
                        'Homophily OOD Graph' : (label_utest == label_vtest).sum() / label_utest.shape[0],
                        'Dataset' : dataset,
                    })









Analyzing ogbn_arxiv
Analyzing cora_full
Analyzing cora_ml
Analyzing amazon_computers
Analyzing amazon_photo
Analyzing citeseer
Analyzing coauthor_cs
Analyzing coauthor_physics
Analyzing pubmed


In [5]:
df = pd.DataFrame(df)

In [6]:
df.index

RangeIndex(start=0, stop=164, step=1)

In [7]:
means = df.groupby(by=['Dataset', 'Setting', 'OOD Experiment']).mean()
stds = df.groupby(by=['Dataset', 'Setting', 'OOD Experiment']).std()

In [8]:
means[:8]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Train Vertices,Features,Train Edges,Fraction Dropped,Fraction OOD labels,Train Edge Density,Fraction OOD tag in OOD dataset,Train Classes,Homophily Train Graph,Homophily OOD Graph
Dataset,Setting,OOD Experiment,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
amazon_computers,hybrid,left-out-classes,5636.8,767.0,129952.0,0.578746,0.52724,0.00409,0.911005,8.0,0.83097,0.777193
amazon_computers,hybrid,perturbations,10780.4,767.0,324593.2,0.19435,0.0,0.002793,0.513353,10.0,0.778021,0.777193
amazon_computers,transductive,left-out-classes,13381.0,767.0,491556.0,0.0,0.52724,0.002745,0.918022,10.0,0.777193,0.777193
amazon_computers,transductive,perturbations,13381.0,767.0,491556.0,0.0,0.0,0.002745,0.52642,10.0,0.777193,0.777193
amazon_photo,hybrid,left-out-classes,5047.8,745.0,147060.4,0.325791,0.246961,0.005771,0.758037,5.0,0.79816,0.827206
amazon_photo,hybrid,perturbations,6031.0,745.0,156033.4,0.19447,0.0,0.00429,0.512379,8.0,0.826991,0.827206
amazon_photo,transductive,left-out-classes,7487.0,745.0,238087.0,0.0,0.246961,0.004247,0.766584,8.0,0.827206,0.827206
amazon_photo,transductive,perturbations,7487.0,745.0,238087.0,0.0,0.0,0.004247,0.526836,8.0,0.827206,0.827206


In [9]:
means.loc[('amazon_computers',       'hybrid', 'left-out-classes')]

Train Vertices                       5636.800000
Features                              767.000000
Train Edges                        129952.000000
Fraction Dropped                        0.578746
Fraction OOD labels                     0.527240
Train Edge Density                      0.004090
Fraction OOD tag in OOD dataset         0.911005
Train Classes                           8.000000
Homophily Train Graph                   0.830970
Homophily OOD Graph                     0.777193
Name: (amazon_computers, hybrid, left-out-classes), dtype: float64

In [10]:
df_clean = []
for dataset in (dc.OGBN_ARXIV, dc.CORA_FULL, dc.CORA_ML, dc.AMAZON_COMPUTERS, dc.AMAZON_PHOTO, dc.CITESEER, dc.COAUTHOR_CS, dc.COAUTHOR_PHYSICS, dc.PUBMED, ):
    homophily = means.loc[(dataset, 'transductive', 'left-out-classes')]['Homophily Train Graph']
    loc_frac = means.loc[(dataset, 'hybrid', 'left-out-classes')]['Fraction OOD labels']
    edge_density = means.loc[(dataset, 'hybrid', 'left-out-classes')]['Train Edge Density']
    
    # Homophily on inductive train graphs
    homophily_loc_train_mean = means.loc[(dataset, 'hybrid', 'left-out-classes')]['Homophily Train Graph']
    homophily_loc_train_std = stds.loc[(dataset, 'hybrid', 'left-out-classes')]['Homophily Train Graph']
    
    homophily_per_train_mean = means.loc[(dataset, 'hybrid', 'perturbations')]['Homophily Train Graph']
    homophily_per_train_std = stds.loc[(dataset, 'hybrid', 'perturbations')]['Homophily Train Graph']
    
    # Dropped fraction on inductive train graphs
    frac_dropped_loc_train_mean = means.loc[(dataset, 'hybrid', 'left-out-classes')]['Fraction Dropped']
    frac_dropped_loc_train_std = stds.loc[(dataset, 'hybrid', 'left-out-classes')]['Fraction Dropped']
    
    frac_dropped_per_train_mean = means.loc[(dataset, 'hybrid', 'perturbations')]['Fraction Dropped']
    frac_dropped_per_train_std = stds.loc[(dataset, 'hybrid', 'perturbations')]['Fraction Dropped']
    
    # OOD fraction on inductive ood graphs
    frac_ood_loc_train_mean = means.loc[(dataset, 'hybrid', 'left-out-classes')]['Fraction OOD tag in OOD dataset']
    frac_ood_loc_train_std = stds.loc[(dataset, 'hybrid', 'left-out-classes')]['Fraction OOD tag in OOD dataset']
    
    frac_ood_per_train_mean = means.loc[(dataset, 'hybrid', 'perturbations')]['Fraction OOD tag in OOD dataset']
    frac_ood_per_train_std = stds.loc[(dataset, 'hybrid', 'perturbations')]['Fraction OOD tag in OOD dataset']
    
    df_clean.append({
        'Dataset' : dataset,
        'Vertices' : int(means.loc[(dataset, 'transductive', 'left-out-classes')]['Train Vertices']),
        'Features' : int(means.loc[(dataset, 'transductive', 'left-out-classes')]['Features']),
        'Edges' : int(means.loc[(dataset, 'transductive', 'left-out-classes')]['Train Edges']),
        'Classes' : int(means.loc[(dataset, 'transductive', 'left-out-classes')]['Train Classes']),
        'Edge Density' : f'{edge_density:.4f}',
        '#LOC' : int(means.loc[(dataset, 'transductive', 'left-out-classes')]['Train Classes']) - int(means.loc[(dataset, 'hybrid', 'left-out-classes')]['Train Classes']),
        'LOC fraction' : f'{loc_frac:.3f}',
        'Homophily' : f'{homophily:.3f}',
        'Hybrid LOC train homophily' : f'{homophily_loc_train_mean:.2f} +- {homophily_loc_train_std:.2f}',
        'Hybrid Per. train homophily' : f'{homophily_per_train_mean:.2f} +- {homophily_per_train_std:.2f}',
        'Hybrid LOC dropped fraction' : f'{frac_dropped_loc_train_mean:.2f} +- {frac_dropped_loc_train_std:.2f}',
        'Hybrid Per. dropped fraction' : f'{frac_dropped_per_train_mean:.2f} +- {frac_dropped_per_train_std:.2f}',
        'Hybrid LOC ood fraction' : f'{frac_ood_loc_train_mean:.2f} +- {frac_ood_loc_train_std:.2f}',
        'Hybrid Per. ood fraction' : f'{frac_ood_per_train_mean:.2f} +- {frac_ood_per_train_std:.2f}',
    })

df_clean = pd.DataFrame(df_clean)
df_clean.set_index('Dataset') 


Unnamed: 0_level_0,Vertices,Features,Edges,Classes,Edge Density,#LOC,LOC fraction,Homophily,Hybrid LOC train homophily,Hybrid Per. train homophily,Hybrid LOC dropped fraction,Hybrid Per. dropped fraction,Hybrid LOC ood fraction,Hybrid Per. ood fraction
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ogbn_arxiv,169343,128,2315598,40,0.0001,10,0.491,0.654,0.74 +- nan,0.65 +- nan,0.54 +- nan,0.10 +- nan,0.91 +- nan,0.48 +- nan
cora_full,4553,8710,28542,11,0.0022,4,0.374,0.8,0.78 +- 0.01,0.78 +- 0.01,0.47 +- 0.00,0.24 +- 0.01,0.80 +- 0.00,0.41 +- 0.02
cora_ml,2810,2879,15962,7,0.0027,2,0.26,0.784,0.77 +- 0.01,0.78 +- 0.01,0.37 +- 0.01,0.24 +- 0.01,0.70 +- 0.01,0.41 +- 0.02
amazon_computers,13381,767,491556,10,0.0041,2,0.527,0.777,0.83 +- 0.00,0.78 +- 0.01,0.58 +- 0.00,0.19 +- 0.00,0.91 +- 0.00,0.51 +- 0.00
amazon_photo,7487,745,238087,8,0.0058,3,0.247,0.827,0.80 +- 0.00,0.83 +- 0.00,0.33 +- 0.00,0.19 +- 0.00,0.76 +- 0.00,0.51 +- 0.00
citeseer,2110,3703,7388,6,0.0032,2,0.398,0.738,0.83 +- 0.01,0.74 +- 0.01,0.57 +- 0.01,0.34 +- 0.02,0.70 +- 0.01,0.29 +- 0.02
coauthor_cs,18333,6805,163788,15,0.0008,3,0.413,0.808,0.82 +- 0.00,0.81 +- 0.00,0.48 +- 0.00,0.21 +- 0.00,0.86 +- 0.00,0.47 +- 0.01
coauthor_physics,34493,8415,495924,5,0.0006,1,0.505,0.931,0.93 +- 0.00,0.93 +- 0.00,0.56 +- 0.00,0.20 +- 0.00,0.89 +- 0.00,0.50 +- 0.00
pubmed,19717,500,88648,3,0.0005,1,0.393,0.802,0.85 +- 0.00,0.80 +- 0.00,0.53 +- 0.00,0.28 +- 0.00,0.74 +- 0.00,0.36 +- 0.01
