In [1]:
%cd /data/bruingjde/on-going/SNAM2021-code/

import numpy as np
import pandas as pd
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
import sklearn.metrics
import sklearn.linear_model
from tqdm.auto import tqdm

/data/bruingjde/on-going/SNAM2021-code


In [2]:
# Network indices 1 through 30, leaving out 15, 17, 26 and 27.
networks = [idx for idx in np.arange(1, 31) if idx not in (15, 17, 26, 27)]

In [5]:
def get_performance(network: int, nswap_perc: int = 0, feature_set: str = 'II-A', model: str = 'LogisticRegression'):
    """Load the stored performance value for one network / feature set / model.

    The value is read from
    ``data/<network>/<nswap_perc>/properties/<feature_set>_<model>.float``
    relative to the current working directory. The network index is
    zero-padded to two digits and ``nswap_perc`` is rendered with an explicit
    sign and zero-padded to four characters (``0`` becomes ``+000``).

    Returns:
        The file's content parsed as a ``float``.

    Raises:
        FileNotFoundError: if the file does not exist.
        ValueError: if the file content is not a valid float.
    """
    score_path = f'data/{network:02}/{nswap_perc:+04.0f}/properties/{feature_set}_{model}.float'
    with open(score_path) as handle:
        raw_value = handle.read()
    return float(raw_value)

In [32]:
def read_file(path):
    """Read a single number from a text file, typed by the file's extension.

    Args:
        path: Path of the file as a string. The text after the last ``.``
            selects the parser: ``int`` or ``float``.

    Returns:
        The whole file content converted with ``int`` or ``float``.

    Raises:
        ValueError: if the path has no extension or the extension is neither
            ``int`` nor ``float``. The message shows the offending extension.
    """
    # Use the text after the LAST dot, so paths such as './data/nodes.int' or
    # 'run.v1/nodes.int' are classified by the file's real extension instead
    # of by whatever follows the first dot in the path.
    _, dot, extension = path.rpartition('.')
    parsers = {'int': int, 'float': float}
    if not dot or extension not in parsers:
        raise ValueError(f'{extension=}')
    with open(path) as file:
        return parsers[extension](file.read())

def get_stats(network: int):
    """Collect one table row of descriptive statistics for a network.

    Numeric properties are read with ``read_file`` from
    ``data/<network>/+000/properties/`` (network index zero-padded to two
    digits). Label, category and source come from the row of
    ``networks.jsonl`` whose ``index`` equals ``network``.

    Args:
        network: Index of the network, as used in the data directory name and
            in the ``index`` field of ``networks.jsonl``.

    Returns:
        A dict with the label, domain, the ratio edges / connected pairs,
        node and edge counts, density (computed from connected pairs), the
        assortativity and average-clustering values, the diameter, and a
        LaTeX citation command for the source under the empty-string key.
    """
    properties_dir = f'data/{network:02}/+000/properties/'
    # Each property file is listed once; the dict key is the file stem.
    property_files = ['nodes.int', 'edges.int', 'connected_pairs.int',
                      'assortativity.float', 'average_clustering.float', 'diameter.int']
    properties = {prop.split('.')[0]: read_file(properties_dir + prop)
                  for prop in property_files}
    info = pd.read_json('networks.jsonl', lines=True).set_index('index').loc[network]
    return {
        'Label': info['label'],
        'Domain': info['category'],
        '\\bar e': properties['edges'] / properties['connected_pairs'],
        'Nodes': properties['nodes'],
        'Edges': properties['edges'],
        'Density': 2*properties['connected_pairs'] / (properties['nodes']*(properties['nodes'] - 1)),
        'D.a.': properties['assortativity'],
        'A.c.c': properties['average_clustering'],
        'Diameter': properties['diameter'],
        # The backslash is escaped explicitly: '\c' is not a valid escape
        # sequence and triggers a warning on recent Python versions. The
        # resulting string is unchanged.
        '': '\\cite{' + info['source'] + '}' #type: ignore
    }

In [56]:
# Network metadata, indexed by network number.
info = pd.read_json('networks.jsonl', lines=True).set_index('index')
# One row of statistics per selected network.
table = pd.DataFrame({network: get_stats(network) for network in networks}).T
# Combine metadata, size statistics and one performance column per feature
# set; rows with any missing value are dropped and the rest ordered by size.
df = (
    pd.DataFrame({
        'label': info['label'],
        'domain': info['category'],
        '$\\bar e$': table['\\bar e'],
        'Nodes $(n)$': table['Nodes'],
        **{
            feature_set: {network: get_performance(network, feature_set=feature_set) for network in networks}
            for feature_set in ('I', 'II-A', 'II-B', 'III')
        },
    })
    .dropna()
    .sort_values('Nodes $(n)$')
)
df

Unnamed: 0,label,domain,$\bar e$,Nodes $(n)$,I,II-A,II-B,III
12,Rado,Social,25.500308,167,0.699348,0.819987,0.768923,0.861764
13,UC,Information,4.793275,899,0.840002,0.874328,0.854075,0.959605
30,EU,Social,20.688123,986,0.726028,0.814727,0.779272,0.829504
19,Dem,Social,8.535812,1866,0.930208,0.942431,0.931164,0.957685
18,bitA,Social,1.746068,3683,0.937751,0.953392,0.958232,0.972897
20,bitOT,Social,1.722823,5573,0.882368,0.943998,0.942185,0.985327
21,chess,Information,1.050273,6050,0.83454,0.841566,0.834713,0.896547
6,HepTh,Information,1.353547,6798,0.704736,0.786951,0.758985,0.831005
2,HepPh,Information,1.944224,16959,0.643445,0.764387,0.757369,0.827235
5,Condm,Social,1.593577,17218,0.772864,0.83293,0.824342,0.821004


In [59]:
# Mean difference between the II-A and I columns across the listed networks.
df['II-A'].sub(df['I']).mean()

0.04215327987609877

In [60]:
# Sample standard deviation of the II-A minus I difference.
df['II-A'].sub(df['I']).std()

0.034623752733070584