In [3]:
import matplotlib.pyplot as plt
import networkx as nx
import json
import os
from os import path
from addict import Dict
import sys
import numpy as np
import pandas as pd
sys.path.append(os.path.expanduser('~/mlp/lgw/'))
from lgw.graph_generator import GraphGen, BigGraphGen
from lgw.args import get_args
from codes.utils.inspect_utils import *

In [4]:
# Dataset selection: `data_name` picks one generated dataset under `data_loc`.
data_name = 'comp_r10_n100_ov'
data_loc = os.path.expanduser('~/checkpoint/lgw/data')
# `loc` is the directory of the selected dataset; later cells read the
# per-split sub-folders from here.
loc = os.path.join(data_loc, data_name)

In [5]:
# Collect, per split, the names of the world folders found on disk.
modes = ["train", "valid", "test"]
all_paths = {}
for mode in modes:
    data_path = os.path.join(loc, mode)
    world_folders = []
    for folder in os.listdir(data_path):
        folder_path = os.path.join(data_path, folder)
        # Only directories that carry a config.json count as worlds.
        if not os.path.isdir(folder_path):
            continue
        if os.path.exists(os.path.join(folder_path, 'config.json')):
            world_folders.append(folder)
    all_paths[mode] = world_folders
len(all_paths['train'])

51

In [6]:
# worlds[split][world_name] -> object returned by load_world for that folder.
worlds = {
    mode: {
        rule_world: load_world(get_paths(mode, rule_world, loc))
        for rule_world in rule_worlds
    }
    for mode, rule_worlds in all_paths.items()
}

In [7]:
# Relation pattern of every graph in the `.train` set of every loaded world,
# across all splits, flattened into one list.
all_patterns = [
    get_rel_pattern(graph)
    for split_worlds in worlds.values()
    for world in split_worlds.values()
    for graph in world.train
]

In [8]:
## Stats methods
def get_graph_stats(world):
    """Average size and degree statistics over all graphs of a world.

    The graphs stored under the 'train', 'valid' and 'test' keys of ``world``
    are converted with ``load_networkx_graphs`` and measured through the graph
    API. The earlier implementation parsed the text produced by ``nx.info``;
    that helper was removed in NetworkX 3.0 and its output rounds the average
    degrees to four decimals, so values computed here are exact instead.

    Args:
        world: mapping indexable by 'train', 'valid' and 'test'; each entry is
            passed unchanged to ``load_networkx_graphs``.

    Returns:
        tuple: (mean number of nodes, mean number of edges,
        mean average in-degree, mean average out-degree).

    Note:
        The graphs are expected to expose ``in_degree`` / ``out_degree``
        (i.e. to be directed) - the previous version also required this, since
        it read the 'Average in degree' field. A graph without nodes now
        contributes a degree of 0.0 instead of raising ``KeyError``.
    """
    num_nodes = []
    num_edges = []
    in_deg = []
    out_deg = []
    for mode in ('train', 'valid', 'test'):
        nx_graphs, _unused = load_networkx_graphs(world[mode])
        for nxg in nx_graphs:
            n_nodes = nxg.number_of_nodes()
            num_nodes.append(n_nodes)
            num_edges.append(nxg.number_of_edges())
            if n_nodes == 0:
                # No nodes: an average degree is undefined, record 0.0.
                in_deg.append(0.0)
                out_deg.append(0.0)
                continue
            in_deg.append(sum(deg for _node, deg in nxg.in_degree()) / float(n_nodes))
            out_deg.append(sum(deg for _node, deg in nxg.out_degree()) / float(n_nodes))
    return np.mean(num_nodes), np.mean(num_edges), np.mean(in_deg), np.mean(out_deg)

def get_simple_graph_stats(world):
    """Summarise node and edge counts of a world's graphs as one string.

    Every graph under the 'train', 'valid' and 'test' keys of ``world`` is a
    mapping with an 'edges' sequence; the first two items of each edge are
    taken as its end points. The number of distinct end points is the graph's
    node count.

    Returns:
        str: "<mean nodes>-<std nodes>,<mean edges>-<std edges>", with means
        rounded to 3 decimals and standard deviations to 2.
    """
    node_counts, edge_counts = [], []
    for split in ('train', 'valid', 'test'):
        for graph in world[split]:
            edge_counts.append(len(graph['edges']))
            endpoints = {node for edge in graph['edges'] for node in edge[:2]}
            node_counts.append(len(endpoints))
    node_mean = round(np.mean(node_counts), 3)
    node_std = round(np.std(node_counts), 2)
    edge_mean = round(np.mean(edge_counts), 3)
    edge_std = round(np.std(edge_counts), 2)
    return "{}-{},{}-{}".format(node_mean, node_std, edge_mean, edge_std)

### Dataset Statistics

In [9]:
# One row per world: identifiers, class/descriptor counts, resolution length
# and the "mean-std" node/edge summaries (still strings at this point).
world_modes = ['train', 'valid', 'test']
world_ids, world_type = [], []
num_class, num_des = [], []
avg_resolution_length = []
num_nodes, num_edges = [], []
for world_mode in world_modes:
    for world_id, world in worlds[world_mode].items():
        world_ids.append(world_id)
        num_class.append(len(get_class(world)))
        num_des.append(len(get_descriptors(world)))
        world_type.append(world_mode)
        avg_resolution_length.append(get_avg_resolution_length(world))
        # get_simple_graph_stats returns "<nodes>,<edges>" in a single string.
        stats = get_simple_graph_stats(world)
        stat_fields = stats.split(',')
        num_nodes.append(stat_fields[0])
        num_edges.append(stat_fields[1])

stats_columns = {
    'world_id': world_ids,
    # Numeric suffix of the world name, used for sorting.
    'world_id_num': [int(name.rsplit('_', 1)[-1]) for name in world_ids],
    'num_class': num_class,
    'ND': num_des,
    'Average Resolution Length': avg_resolution_length,
    'Split': world_type,
    'Average Nodes': num_nodes,
    'Average Edges': num_edges,
}
graphlog_stats = pd.DataFrame(stats_columns)

In [10]:
## post processing
# 'Average Nodes' / 'Average Edges' hold "mean-std" strings: move the std into
# its own column and keep the mean, both as floats.
for std_prefix, mean_column in (('nodes', 'Average Nodes'), ('edges', 'Average Edges')):
    mean_std_parts = graphlog_stats[mean_column].str.split('-')
    graphlog_stats[std_prefix + '-std'] = mean_std_parts.str[-1].astype(float)
    graphlog_stats[mean_column] = mean_std_parts.str[0].astype(float)

# The ratio is computed from the unrounded resolution length; rounding for
# display happens afterwards.
resolution_length = graphlog_stats['Average Resolution Length']
graphlog_stats['edge_to_noise_ratio'] = resolution_length / graphlog_stats['Average Edges']
graphlog_stats['Average Resolution Length'] = resolution_length.apply(lambda value: round(value, 2))

#### Add Supervised Learning Results

In [11]:
## Add supervised learning results
# Maps the class name at the end of a model path to the short name used in
# the tables below.
clean_model_names = {
    "GatedNodeGatEncoder": "GAT",
    "GatedGatEncoder": "E-GAT",
    "RepresentationGCNEncoder": "GCN",
    "CompositionRGCNEncoder": "RGCN",
    "Param": "Param",
}


def _load_supervised_results(csv_path):
    """Read one supervised-results CSV and add short model-name columns.

    Adds 'rep_fn' and 'comp_fn', derived from the last dotted component of
    'model_representation_fn_path' and 'model_composition_fn_path' through
    ``clean_model_names``. Replaces three copy-pasted load/clean sequences.

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        pandas.DataFrame: the loaded results with the two extra columns.

    Raises:
        KeyError: if a path ends in a class name missing from
            ``clean_model_names``.
    """
    results = pd.read_csv(csv_path)
    for short_column, path_column in (
        ('rep_fn', 'model_representation_fn_path'),
        ('comp_fn', 'model_composition_fn_path'),
    ):
        results[short_column] = results[path_column].apply(
            lambda x: clean_model_names[x.split('.')[-1]]
        )
    return results


supervised_all = _load_supervised_results('raw_data/supervised_result_complete.csv')
supervised_test = _load_supervised_results('raw_data/supervised_result_complete_test.csv')
supervised_valid = _load_supervised_results('raw_data/supervised_result_complete_valid.csv')



def get_baseline(rep_fn, comp_fn, rule):
    """Look up the supervised accuracy of one model pair on one rule world.

    The result table and accuracy column depend on ``rule``:
    'rule_54'..'rule_56' read 'test_test_accuracy' from ``supervised_test``,
    'rule_51'..'rule_53' read 'valid_test_accuracy' from ``supervised_valid``,
    and every other rule reads 'train_test_accuracy' from ``supervised_all``.

    The column is selected directly instead of being copied onto a filtered
    slice, which used to trigger pandas' SettingWithCopyWarning.

    Args:
        rep_fn: short representation-function name to match against 'rep_fn'.
        comp_fn: short composition-function name to match against 'comp_fn'.
        rule: world id to match against 'general_train_rule'.

    Returns:
        The accuracy of the first matching row.

    Raises:
        IndexError: if no row matches the requested combination.
    """
    if rule in ['rule_54', 'rule_55', 'rule_56']:
        results, accuracy_column = supervised_test, 'test_test_accuracy'
    elif rule in ['rule_51', 'rule_52', 'rule_53']:
        results, accuracy_column = supervised_valid, 'valid_test_accuracy'
    else:
        results, accuracy_column = supervised_all, 'train_test_accuracy'
    mask = (
        (results['rep_fn'] == rep_fn)
        & (results['comp_fn'] == comp_fn)
        & (results['general_train_rule'] == rule)
    )
    matches = results.loc[mask, accuracy_column]
    if matches.empty:
        # Same exception type as the former `.values[0]` on an empty result,
        # but with a message that names the missing combination.
        raise IndexError(
            "no supervised result for rep_fn={!r}, comp_fn={!r}, rule={!r}".format(
                rep_fn, comp_fn, rule
            )
        )
    return matches.values[0]

# "<representation>-<composition>" model pairs, in the order the result columns
# are displayed: all E-GAT compositions first, then all RGCN compositions.
model_order = [
    rep_name + "-" + comp_name
    for comp_name in ("E-GAT", "RGCN")
    for rep_name in ("GAT", "GCN", "Param")
]

In [12]:
# Compact labels M1..M6 for the model pairs, numbered in display order.
shortened_model_names = {
    full_name: "M{}".format(position)
    for position, full_name in enumerate(
        [
            'GAT-E-GAT',
            'GCN-E-GAT',
            'Param-E-GAT',
            'GAT-RGCN',
            'GCN-RGCN',
            'Param-RGCN',
        ],
        start=1,
    )
}

In [13]:
# Add one accuracy column per model pair, looked up per world via get_baseline.
for model in model_order:
    # Only the first hyphen separates representation from composition
    # (e.g. "GAT-E-GAT" -> "GAT", "E-GAT").
    rep_fn, sep, comp_fn = model.partition('-')
    graphlog_stats[model] = pd.Series(
        [
            round(get_baseline(rep_fn, comp_fn, world_id), 3)
            for world_id in graphlog_stats['world_id']
        ],
        index=graphlog_stats.index,
        dtype=float,
    )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [14]:
## Set Difficulty
def _difficulty_label(accuracy):
    """Bucket a GAT-E-GAT accuracy: >= 0.7 Easy, >= 0.54 Medium, otherwise Hard."""
    if accuracy >= 0.7:
        return "Easy"
    if accuracy >= 0.54:
        return "Medium"
    return "Hard"


graphlog_stats['Difficulty'] = graphlog_stats["GAT-E-GAT"].apply(_difficulty_label)

In [15]:
# Columns shown in the exported table: world metadata first, then one accuracy
# column per model pair.
display_columns = [
    'world_id',
    'num_class',
    'ND',
    'Split',
    'Average Resolution Length',
    'Average Nodes',
    'Average Edges',
    'Difficulty',
]
display_columns = display_columns + model_order

In [16]:
# Hard training worlds, ordered by numeric world id. The mask is applied to
# the frame it was built from and the result is sorted afterwards; indexing
# the already-sorted frame with it made pandas warn that the boolean Series
# key "will be reindexed to match DataFrame index".
hard_train_mask = (graphlog_stats["Split"] == 'train') & (graphlog_stats["Difficulty"] == "Hard")
graphlog_stats[hard_train_mask].sort_values(by='world_id_num')

  """Entry point for launching an IPython kernel.


Unnamed: 0,world_id,world_id_num,num_class,ND,Average Resolution Length,Split,Average Nodes,Average Edges,nodes-std,edges-std,edge_to_noise_ratio,GAT-E-GAT,GCN-E-GAT,Param-E-GAT,GAT-RGCN,GCN-RGCN,Param-RGCN,Difficulty
14,rule_0,0,17,286,4.49,train,15.487,19.295,12.37,18.16,0.232481,0.481,0.5,0.494,0.486,0.462,0.462,Hard
35,rule_1,1,15,239,4.1,train,11.565,13.615,7.62,11.16,0.301013,0.432,0.411,0.428,0.406,0.4,0.408,Hard
21,rule_2,2,17,157,3.21,train,9.809,11.165,6.11,8.92,0.287429,0.412,0.357,0.373,0.347,0.347,0.319,Hard
34,rule_3,3,16,189,3.63,train,11.137,13.273,6.09,9.17,0.273649,0.429,0.404,0.473,0.373,0.401,0.451,Hard
24,rule_5,5,14,275,4.41,train,14.545,18.872,9.21,14.33,0.233611,0.526,0.539,0.548,0.429,0.461,0.455,Hard
36,rule_6,6,16,249,5.06,train,16.257,20.164,11.62,16.55,0.251162,0.528,0.514,0.536,0.498,0.495,0.476,Hard
2,rule_13,13,16,149,3.58,train,11.238,13.549,7.13,10.87,0.263973,0.453,0.402,0.419,0.347,0.298,0.344,Hard
11,rule_14,14,16,224,4.14,train,11.371,13.403,7.29,10.6,0.308929,0.448,0.457,0.401,0.314,0.318,0.332,Hard
33,rule_15,15,14,224,3.82,train,12.661,15.105,7.56,10.79,0.252896,0.494,0.423,0.501,0.402,0.397,0.435,Hard
28,rule_16,16,16,205,3.59,train,11.345,13.293,7.23,10.3,0.270099,0.318,0.332,0.292,0.328,0.306,0.291,Hard


In [17]:
# Print the table as LaTeX source (plain text, meant to be copied), one row
# per world in numeric world-id order.
stats_by_world_id = graphlog_stats.sort_values(by='world_id_num')
latex_table = stats_by_world_id[display_columns].to_latex(index=False)
print(latex_table)

\begin{tabular}{lrrlrrrlrrrrrr}
\toprule
world\_id &  num\_class &    ND &  Split &  Average Resolution Length &  Average Nodes &  Average Edges & Difficulty &  GAT-E-GAT &  GCN-E-GAT &  Param-E-GAT &  GAT-RGCN &  GCN-RGCN &  Param-RGCN \\
\midrule
  rule\_0 &         17 &   286 &  train &                       4.49 &         15.487 &         19.295 &       Hard &      0.481 &      0.500 &        0.494 &     0.486 &     0.462 &       0.462 \\
  rule\_1 &         15 &   239 &  train &                       4.10 &         11.565 &         13.615 &       Hard &      0.432 &      0.411 &        0.428 &     0.406 &     0.400 &       0.408 \\
  rule\_2 &         17 &   157 &  train &                       3.21 &          9.809 &         11.165 &       Hard &      0.412 &      0.357 &        0.373 &     0.347 &     0.347 &       0.319 \\
  rule\_3 &         16 &   189 &  train &                       3.63 &         11.137 &         13.273 &       Hard &      0.429 &      0.404 &        0.473 

In [18]:
## best ranking
# For every world, rank the model columns by accuracy (highest first, ties
# keep the order of shortened_model_names) and store the short label of the
# top one.
graphlog_stats['best'] = ""
graphlog_stats['best'] = [
    shortened_model_names[
        sorted(shortened_model_names, key=lambda name: row[name], reverse=True)[0]
    ]
    for _idx, row in graphlog_stats.iterrows()
]
## Aggregating
graphlog_stats.best.value_counts()

M1    26
M3    20
M2    10
M4     1
Name: best, dtype: int64

In [19]:
# Persist the per-world statistics table.
# The bare `graphlog_stats.mean()` that used to precede this was dropped: its
# result was neither displayed nor stored, and on a frame with string columns
# it raises TypeError under pandas >= 2.0 (numeric_only now defaults to False).
# Create the output directory so a fresh checkout can run the cell.
os.makedirs('clean_data', exist_ok=True)
graphlog_stats.to_csv('clean_data/graphlog_stats.csv')

In [23]:
# Row-wise mean accuracy over all model columns.
model_accuracies = graphlog_stats.loc[:, model_order]
graphlog_stats['mean_acc'] = model_accuracies.mean(axis=1)

In [28]:
# Training worlds from highest to lowest mean accuracy, as one comma-separated string.
train_stats = graphlog_stats[graphlog_stats['Split'] == 'train']
train_by_accuracy = train_stats.sort_values(by='mean_acc', ascending=False)
','.join(train_by_accuracy['world_id'].tolist())

'rule_34,rule_46,rule_20,rule_48,rule_37,rule_19,rule_35,rule_31,rule_38,rule_9,rule_33,rule_24,rule_36,rule_27,rule_18,rule_47,rule_32,rule_39,rule_49,rule_10,rule_12,rule_50,rule_8,rule_21,rule_22,rule_4,rule_45,rule_7,rule_41,rule_28,rule_40,rule_44,rule_43,rule_26,rule_6,rule_5,rule_11,rule_23,rule_0,rule_15,rule_42,rule_3,rule_1,rule_29,rule_25,rule_14,rule_13,rule_30,rule_2,rule_16,rule_17'

In [35]:
# Numeric ids of all worlds labelled "Hard", in ascending order.
is_hard = graphlog_stats['Difficulty'] == "Hard"
graphlog_stats.loc[is_hard, 'world_id_num'].sort_values().tolist()

[0, 1, 2, 3, 5, 6, 13, 14, 15, 16, 17, 23, 25, 29, 30, 42, 44, 53]