In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.sparse as sp
import os.path as osp
import os
import torch
import torch_geometric as tg
import pickle
from collections import defaultdict

In [2]:
%cd ..

/nfs/homedirs/fuchsgru/MastersThesis


In [3]:
%ls .exported_datasets/coauthor_cs/hybrid-perturbations

split-0.pkl  split-1.pkl  split-2.pkl  split-3.pkl  split-4.pkl


In [4]:
with open('.exported_datasets/amazon_photo/hybrid-perturbations/split-0.pkl', 'rb') as f:
    storage = pickle.load(f)
    storage.keys()

In [5]:
storage['data']['test']['x'].size()

torch.Size([6036, 745])

In [6]:
import data.constants as dc
from util import all_equal

In [7]:
df = []

for dataset in (dc.OGBN_ARXIV, dc.CORA_FULL, dc.CORA_ML, dc.AMAZON_PHOTO, dc.CITESEER, dc.COAUTHOR_CS, dc.PUBMED, ):
    print(f'Analyzing {dataset}')
    base_dir = osp.join('.exported_datasets', dataset)
    for setting in ('hybrid', 'transductive'):
        for ood_type, (_ood_val, _ood_test) in (
            ('left-out-classes', ('ood-val', 'ood-test')), 
            # Both bernoulli and normal perturbations follow the same structure
            ('perturbations', ('ood-val-ber', 'ood-test-ber'))):


            split_dir = osp.join(base_dir, f'{setting}-{ood_type}')
            for fn in [osp.join(split_dir, fn) for fn in os.listdir(split_dir)]:
                with open(fn, 'rb') as f:
                    storage = pickle.load(f)
                    datasets = storage['data']

                    n_train = datasets['train'].x.size(0)
                    n_full = datasets['ood-test'].x.size(0)

                    label_u = datasets['train'].y.numpy()[datasets['train'].edge_index[0]]
                    label_v = datasets['train'].y.numpy()[datasets['train'].edge_index[1]]
                    
                    label_utest = datasets['ood-test'].y.numpy()[datasets['ood-test'].edge_index[0]]
                    label_vtest = datasets['ood-test'].y.numpy()[datasets['ood-test'].edge_index[1]]

                    train_labels = set(datasets['train'].y[datasets['train'].mask].numpy())
                    ood_labels = set(datasets['ood-test'].y[datasets['ood-test'].mask].numpy()) - train_labels

                    is_ood_label = np.zeros(datasets['ood-test'].x.size(0))
                    for label in ood_labels:
                        is_ood_label[datasets['ood-test'].y == label] = True

                    # For both ood-val and ood-test count many have the 'ood' tag
                    mask_ood_val_test = datasets['ood-val'].mask | datasets['ood-test'].mask
                    is_ood = ~(datasets[_ood_test].is_in_distribution[mask_ood_val_test])
                    fraction_ood_tag = is_ood.sum() / mask_ood_val_test.sum()

                    df.append({
                        'Train Vertices' : n_train,
                        'Features' : datasets['train'].x.size(1),
                        'Train Edges' : datasets['train'].edge_index.size(1),
                        'Fraction Dropped' : 1 - (n_train / n_full),
                        'Fraction OOD labels' : is_ood_label.sum() / is_ood_label.shape[0],
                        'Train Edge Density' : datasets['train'].edge_index.size(1) / (datasets['train'].x.size(0)**2),
                        'Fraction OOD tag in OOD dataset' : fraction_ood_tag.item(),
                        'Train Classes' : datasets['train'].y.max().item() + 1,
                        'Setting' : setting,
                        'OOD Experiment' : ood_type,
                        'Homophily Train Graph' : (label_u == label_v).sum() / label_u.shape[0],
                        'Homophily OOD Graph' : (label_utest == label_vtest).sum() / label_utest.shape[0],
                        'Dataset' : dataset,
                    })









Analyzing ogbn_arxiv
Analyzing cora_full
Analyzing cora_ml
Analyzing amazon_photo
Analyzing citeseer
Analyzing coauthor_cs
Analyzing pubmed


In [8]:
df = pd.DataFrame(df)

In [9]:
df.index

RangeIndex(start=0, stop=124, step=1)

In [10]:
means = df.groupby(by=['Dataset', 'Setting', 'OOD Experiment']).mean()
stds = df.groupby(by=['Dataset', 'Setting', 'OOD Experiment']).std()

In [11]:
means.loc[(dc.CORA_FULL, 'transductive', dc.PERTURBATION)]

Train Vertices                      2810.000000
Features                            8710.000000
Train Edges                        15962.000000
Fraction Dropped                       0.000000
Fraction OOD labels                    0.000000
Train Edge Density                     0.002022
Fraction OOD tag in OOD dataset        0.527410
Train Classes                          7.000000
Homophily Train Graph                  0.784363
Homophily OOD Graph                    0.784363
Name: (cora_full, transductive, perturbations), dtype: float64

In [12]:
#means.loc[('amazon_computers',       'hybrid', 'left-out-classes')]

In [13]:
df_clean = []
for dataset in (dc.OGBN_ARXIV, dc.CORA_FULL, dc.AMAZON_PHOTO, dc.CITESEER, dc.COAUTHOR_CS, dc.PUBMED, ):
    homophily = means.loc[(dataset, 'transductive', 'left-out-classes')]['Homophily Train Graph']
    loc_frac = means.loc[(dataset, 'hybrid', 'left-out-classes')]['Fraction OOD labels']
    edge_density = means.loc[(dataset, 'hybrid', 'left-out-classes')]['Train Edge Density']
    
    # Homophily on inductive train graphs
    homophily_loc_train_mean = means.loc[(dataset, 'hybrid', 'left-out-classes')]['Homophily Train Graph']
    homophily_loc_train_std = stds.loc[(dataset, 'hybrid', 'left-out-classes')]['Homophily Train Graph']
    
    homophily_per_train_mean = means.loc[(dataset, 'hybrid', 'perturbations')]['Homophily Train Graph']
    homophily_per_train_std = stds.loc[(dataset, 'hybrid', 'perturbations')]['Homophily Train Graph']
    
    # Dropped fraction on inductive train graphs
    frac_dropped_loc_train_mean = means.loc[(dataset, 'hybrid', 'left-out-classes')]['Fraction Dropped']
    frac_dropped_loc_train_std = stds.loc[(dataset, 'hybrid', 'left-out-classes')]['Fraction Dropped']
    
    frac_dropped_per_train_mean = means.loc[(dataset, 'hybrid', 'perturbations')]['Fraction Dropped']
    frac_dropped_per_train_std = stds.loc[(dataset, 'hybrid', 'perturbations')]['Fraction Dropped']
    
    # OOD fraction on inductive ood graphs
    frac_ood_loc_train_mean = means.loc[(dataset, 'hybrid', 'left-out-classes')]['Fraction OOD tag in OOD dataset']
    frac_ood_loc_train_std = stds.loc[(dataset, 'hybrid', 'left-out-classes')]['Fraction OOD tag in OOD dataset']
    
    frac_ood_per_train_mean = means.loc[(dataset, 'hybrid', 'perturbations')]['Fraction OOD tag in OOD dataset']
    frac_ood_per_train_std = stds.loc[(dataset, 'hybrid', 'perturbations')]['Fraction OOD tag in OOD dataset']
    
    df_clean.append({
        'Dataset' : dataset,
        'Vertices' : int(means.loc[(dataset, 'transductive', 'left-out-classes')]['Train Vertices']),
        'Features' : int(means.loc[(dataset, 'transductive', 'left-out-classes')]['Features']),
        'Edges' : int(means.loc[(dataset, 'transductive', 'left-out-classes')]['Train Edges']),
        'Classes' : int(means.loc[(dataset, 'transductive', 'left-out-classes')]['Train Classes']),
        'Edge Density' : f'{edge_density:.4f}',
        '#LOC' : int(means.loc[(dataset, 'transductive', 'left-out-classes')]['Train Classes']) - int(means.loc[(dataset, 'hybrid', 'left-out-classes')]['Train Classes']),
        'LOC fraction' : f'{loc_frac:.3f}',
        'Homophily' : f'{homophily:.3f}',
        'Hybrid LOC train homophily' : f'{100 * homophily_loc_train_mean:.2f}\\% +- {100 * homophily_loc_train_std:.2f}\\%',
        'Hybrid Per. train homophily' : f'{100 * homophily_per_train_mean:.2f}\\% +- {100 * homophily_per_train_std:.2f}\\%',
        'Hybrid LOC dropped fraction' : f'{100 * frac_dropped_loc_train_mean:.2f}\\% +- {100 * frac_dropped_loc_train_std:.2f}\\%',
        'Hybrid Per. dropped fraction' : f'{100 * frac_dropped_per_train_mean:.2f}\\% +- {100 * frac_dropped_per_train_std:.2f}\\%',
        'Hybrid LOC ood fraction' : f'{100 * frac_ood_loc_train_mean:.2f}\\% +- {100 * frac_ood_loc_train_std:.2f}\\%',
        'Hybrid Per. ood fraction' : f'{100 * frac_ood_per_train_mean:.2f}\\% +- {100 * frac_ood_per_train_std:.2f}\\%',
    })

df_clean = pd.DataFrame(df_clean)

df_clean.set_index('Dataset').loc[[dc.CORA_FULL, dc.CITESEER, dc.PUBMED, dc.COAUTHOR_CS, dc.AMAZON_PHOTO, dc.OGBN_ARXIV]].T


Dataset,cora_full,citeseer,pubmed,coauthor_cs,amazon_photo,ogbn_arxiv
Vertices,4553,2110,19717,18333,7487,169343
Features,8710,3703,500,6805,745,128
Edges,28542,7388,88648,163788,238087,2315598
Classes,11,6,3,15,8,40
Edge Density,0.0022,0.0032,0.0005,0.0008,0.0058,0.0001
#LOC,4,2,1,3,3,10
LOC fraction,0.374,0.398,0.393,0.413,0.247,0.491
Homophily,0.800,0.738,0.802,0.808,0.827,0.654
Hybrid LOC train homophily,78.29\% +- 0.58\%,83.43\% +- 1.11\%,85.13\% +- 0.27\%,82.28\% +- 0.21\%,79.82\% +- 0.28\%,74.84\% +- nan\%
Hybrid Per. train homophily,78.44\% +- 0.80\%,73.92\% +- 1.19\%,80.27\% +- 0.41\%,80.71\% +- 0.20\%,82.70\% +- 0.43\%,65.73\% +- nan\%


In [14]:
dfx = df_clean.set_index('Dataset').loc[[dc.CORA_FULL, dc.CITESEER, dc.PUBMED, dc.COAUTHOR_CS, dc.AMAZON_PHOTO, dc.OGBN_ARXIV]].T
for name, row in zip(dfx.index, dfx.to_numpy()):
    print(name)
    print(' & '.join(f'${i}$' for i in row))
    print()
    print()

Vertices
$4553$ & $2110$ & $19717$ & $18333$ & $7487$ & $169343$


Features
$8710$ & $3703$ & $500$ & $6805$ & $745$ & $128$


Edges
$28542$ & $7388$ & $88648$ & $163788$ & $238087$ & $2315598$


Classes
$11$ & $6$ & $3$ & $15$ & $8$ & $40$


Edge Density
$0.0022$ & $0.0032$ & $0.0005$ & $0.0008$ & $0.0058$ & $0.0001$


#LOC
$4$ & $2$ & $1$ & $3$ & $3$ & $10$


LOC fraction
$0.374$ & $0.398$ & $0.393$ & $0.413$ & $0.247$ & $0.491$


Homophily
$0.800$ & $0.738$ & $0.802$ & $0.808$ & $0.827$ & $0.654$


Hybrid LOC train homophily
$78.29\% +- 0.58\%$ & $83.43\% +- 1.11\%$ & $85.13\% +- 0.27\%$ & $82.28\% +- 0.21\%$ & $79.82\% +- 0.28\%$ & $74.84\% +- nan\%$


Hybrid Per. train homophily
$78.44\% +- 0.80\%$ & $73.92\% +- 1.19\%$ & $80.27\% +- 0.41\%$ & $80.71\% +- 0.20\%$ & $82.70\% +- 0.43\%$ & $65.73\% +- nan\%$


Hybrid LOC dropped fraction
$46.63\% +- 0.17\%$ & $57.02\% +- 0.68\%$ & $52.89\% +- 0.22\%$ & $48.35\% +- 0.16\%$ & $32.58\% +- 0.05\%$ & $63.03\% +- nan\%$


Hybrid Per. dropp

In [15]:
dfx = df_clean[[
    'Dataset',
    'Hybrid LOC train homophily',
    'Hybrid Per. train homophily',
    'Hybrid LOC dropped fraction',
    'Hybrid Per. dropped fraction',
    'Hybrid LOC ood fraction',
    'Hybrid Per. ood fraction',
    ]].set_index('Dataset')
for name, row in zip(dfx.index, dfx.to_numpy()):
    print(name)
    print(' & '.join(f'${i}$' for i in row))
    print()
    print()


ogbn_arxiv
$74.84\% +- nan\%$ & $65.73\% +- nan\%$ & $63.03\% +- nan\%$ & $21.97\% +- nan\%$ & $77.88\% +- nan\%$ & $45.46\% +- nan\%$


cora_full
$78.29\% +- 0.58\%$ & $78.44\% +- 0.80\%$ & $46.63\% +- 0.17\%$ & $24.16\% +- 1.18\%$ & $80.26\% +- 0.29\%$ & $41.18\% +- 2.06\%$


amazon_photo
$79.82\% +- 0.28\%$ & $82.70\% +- 0.43\%$ & $32.58\% +- 0.05\%$ & $19.45\% +- 0.12\%$ & $75.80\% +- 0.13\%$ & $51.24\% +- 0.32\%$


citeseer
$83.43\% +- 1.11\%$ & $73.92\% +- 1.19\%$ & $57.02\% +- 0.68\%$ & $34.27\% +- 2.34\%$ & $69.82\% +- 0.84\%$ & $28.88\% +- 1.96\%$


coauthor_cs
$82.28\% +- 0.21\%$ & $80.71\% +- 0.20\%$ & $48.35\% +- 0.16\%$ & $21.18\% +- 0.25\%$ & $85.52\% +- 0.29\%$ & $47.06\% +- 0.56\%$


pubmed
$85.13\% +- 0.27\%$ & $80.27\% +- 0.41\%$ & $52.89\% +- 0.22\%$ & $27.56\% +- 0.50\%$ & $74.22\% +- 0.31\%$ & $36.26\% +- 0.65\%$




In [16]:
dfx

Unnamed: 0_level_0,Hybrid LOC train homophily,Hybrid Per. train homophily,Hybrid LOC dropped fraction,Hybrid Per. dropped fraction,Hybrid LOC ood fraction,Hybrid Per. ood fraction
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ogbn_arxiv,74.84\% +- nan\%,65.73\% +- nan\%,63.03\% +- nan\%,21.97\% +- nan\%,77.88\% +- nan\%,45.46\% +- nan\%
cora_full,78.29\% +- 0.58\%,78.44\% +- 0.80\%,46.63\% +- 0.17\%,24.16\% +- 1.18\%,80.26\% +- 0.29\%,41.18\% +- 2.06\%
amazon_photo,79.82\% +- 0.28\%,82.70\% +- 0.43\%,32.58\% +- 0.05\%,19.45\% +- 0.12\%,75.80\% +- 0.13\%,51.24\% +- 0.32\%
citeseer,83.43\% +- 1.11\%,73.92\% +- 1.19\%,57.02\% +- 0.68\%,34.27\% +- 2.34\%,69.82\% +- 0.84\%,28.88\% +- 1.96\%
coauthor_cs,82.28\% +- 0.21\%,80.71\% +- 0.20\%,48.35\% +- 0.16\%,21.18\% +- 0.25\%,85.52\% +- 0.29\%,47.06\% +- 0.56\%
pubmed,85.13\% +- 0.27\%,80.27\% +- 0.41\%,52.89\% +- 0.22\%,27.56\% +- 0.50\%,74.22\% +- 0.31\%,36.26\% +- 0.65\%
