In [1]:
import itertools, json, os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
from tqdm.auto import tqdm

%cd /data/bruingjde/on-going/SNAM2021-code/

/data/bruingjde/on-going/SNAM2021-code


In [4]:
def get_result():
  """Collect the stored properties of every (network, rewiring) experiment.

  Per-network metadata is read from ``networks.jsonl`` (JSON lines); its rows
  are numbered 1..30. For every network, every rewiring percentage
  (-100..100 in steps of 20) and every method ('a' or 'b'), the files in
  ``data/<network>/<nswap_perc><method>/properties/`` are loaded. The
  ``nswap_perc == 0`` case has no method suffix in its directory name, is
  visited only once, and its record carries no ``'method'`` key.

  Each regular file contributes one entry: the key is the file name up to the
  last dot, the value is the file content, parsed with ``int``/``float`` when
  the extension is ``int``/``float`` and kept as raw text otherwise.
  Sub-directories inside ``properties/`` are skipped.

  Returns:
    list[dict]: one record per visited combination, in iteration order. A
    record whose directory does not exist only holds 'network', 'nswap_perc',
    the metadata columns and (when rewired) 'method'.
  """
  network_information = (
    pd.read_json('networks.jsonl', lines=True)
    .set_index(np.arange(1, 31))
    .to_dict(orient='index')
  )
  iterations = [(network, nswap_perc, method)
                for network in np.arange(1, 31)
                for nswap_perc in np.arange(-100, 101, 20)
                for method in ['a', 'b']
                # The 0% case is method-independent: keep it once (as 'a').
                if not (nswap_perc == 0 and method == 'b')]
  result = []
  for network, nswap_perc, method in tqdm(iterations):
    stats = {'network': network, 'nswap_perc': nswap_perc, **network_information[network]}
    if nswap_perc == 0:
      directory = f'data/{network:02}/{nswap_perc:+04.0f}/properties/'
    else:
      directory = f'data/{network:02}/{nswap_perc:+04.0f}{method}/properties/'
      stats['method'] = method
    if os.path.isdir(directory):
      # The context manager closes the directory iterator even on errors.
      with os.scandir(directory) as entries:
        for file in entries:
          # Skip sub-directories (e.g. checkpoint folders); open() would fail.
          if not file.is_file():
            continue
          with open(file.path) as f:
            content = f.read()
          # Split on the LAST dot only, so names with extra dots (or with no
          # dot at all) no longer raise on tuple unpacking.
          name_parts = file.name.rsplit('.', 1)
          file_name = name_parts[0]
          file_extension = name_parts[1] if len(name_parts) == 2 else ''
          if file_extension == 'int':
            content = int(content)
          elif file_extension == 'float':
            content = float(content)
          stats[file_name] = content
    result.append(stats)
  return result
# Load every experiment record once; the cells below build their tables and
# plots from this list via pd.DataFrame(result).
result = get_result()

  0%|          | 0/630 [00:00<?, ?it/s]

# Assortativity

In [5]:
# Assortativity per network (rows, ordered by node count) and rewiring
# percentage (columns), using method 'b' plus the unrewired 0% baseline.
# NOTE(review): the reason networks 15, 17, 26 and 27 are excluded is not
# visible in this notebook - confirm and document it.
df = (
  pd.DataFrame(result)
  .query("(method == 'b' or nswap_perc == 0) and (network not in [15, 17, 26, 27])")
  # Keyword arguments: positional arguments to DataFrame.pivot are deprecated
  # in pandas 1.x and rejected by pandas 2.x.
  .pivot(index=['network', 'label', 'nodes'], columns='nswap_perc', values='assortativity')
  .sort_values('nodes')
  .reset_index(['nodes', 'network'], drop=True)
  .rename(columns=lambda x: f"{x}%")
  .round(2)
  .dropna(how='all')
)
# Column-wise mean over all networks, appended as an extra row.
df.loc['mean'] = df.mean()
df

nswap_perc,-100%,-80%,-60%,-40%,-20%,0%,20%,40%,60%,80%,100%
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Rado,0.01,0.01,0.07,0.09,-0.0,0.15,0.14,0.18,0.16,0.09,0.19
UC,-0.05,-0.03,-0.02,0.01,0.06,0.1,0.14,0.17,0.18,0.21,0.23
EU,0.23,0.16,0.36,0.34,0.15,0.05,0.12,0.11,0.1,-0.18,-0.11
Dem,-0.21,-0.21,-0.16,-0.14,-0.14,-0.15,-0.06,-0.0,0.06,0.09,0.13
bitA,-0.25,-0.24,-0.22,-0.19,-0.17,-0.15,-0.1,-0.04,0.01,0.1,0.22
bitOT,-0.23,-0.22,-0.2,-0.17,-0.16,-0.15,-0.11,-0.07,-0.02,0.04,0.14
chess,-0.17,-0.14,-0.05,0.04,0.18,0.36,0.52,0.62,0.69,0.74,0.78
HepTh,-0.18,-0.13,-0.08,-0.03,0.03,0.08,0.18,0.31,0.46,0.57,0.61
HepPh,-0.11,-0.07,-0.02,0.04,0.1,0.17,0.26,0.35,0.43,0.48,0.52
Condm,-0.04,0.0,0.05,0.11,0.2,0.29,0.42,0.53,0.59,0.62,0.63


In [6]:
# Same assortativity table as above, unrounded and without axis names, printed
# as a LaTeX table.
df = (
  pd.DataFrame(result)
  .query("(method == 'b' or nswap_perc == 0) and (network not in [15, 17, 26, 27])")
  # Keyword arguments: positional arguments to DataFrame.pivot are deprecated
  # in pandas 1.x and rejected by pandas 2.x.
  .pivot(index=['network', 'label', 'nodes'], columns='nswap_perc', values='assortativity')
  .sort_values('nodes')
  .reset_index(['network', 'nodes'], drop=True)
  .rename_axis(index=None, columns=None)
  .rename(columns=lambda x: f"{x}%")
  .dropna(how='all')
)
# 'l' for the label column, then one right-aligned column per rewiring
# percentage, separated by 1em. A raw string avoids the invalid '\h' escape,
# and deriving the count from the frame keeps the spec in sync with the table.
column_format = 'l' + r'@{\hspace{1em}}'.join(['r'] * len(df.columns))
print(
  df.round(2)
    .to_latex(caption="Assortativity of all networks after rewiring.",
              label='tab:rewire-assortativity',
              index=True,
              column_format=column_format))

\begin{table}
\centering
\caption{Assortatvity of all networks after rewiring.}
\label{tab:rewire-assortativity}
\begin{tabular}{lr@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r}
\toprule
{} &  -100\% &  -80\% &  -60\% &  -40\% &  -20\% &    0\% &   20\% &   40\% &   60\% &   80\% &  100\% \\
\midrule
Rado  &   0.01 &  0.01 &  0.07 &  0.09 & -0.00 &  0.15 &  0.14 &  0.18 &  0.16 &  0.09 &  0.19 \\
UC    &  -0.05 & -0.03 & -0.02 &  0.01 &  0.06 &  0.10 &  0.14 &  0.17 &  0.18 &  0.21 &  0.23 \\
EU    &   0.23 &  0.16 &  0.36 &  0.34 &  0.15 &  0.05 &  0.12 &  0.11 &  0.10 & -0.18 & -0.11 \\
Dem   &  -0.21 & -0.21 & -0.16 & -0.14 & -0.14 & -0.15 & -0.06 & -0.00 &  0.06 &  0.09 &  0.13 \\
bitA  &  -0.25 & -0.24 & -0.22 & -0.19 & -0.17 & -0.15 & -0.10 & -0.04 &  0.01 &  0.10 &  0.22 \\
bitOT &  -0.23 & -0.22 & -0.20 & -0.17 & -0.16 & -0.15 & -0.11 & -0.07 & -0.02 &  0.04 &  0.1

In [None]:
# Both strip plots use the same prepared frame; build it once instead of
# repeating the whole pipeline per plot. Rows without an assortativity value
# are dropped, only method 'b' and the 0% baseline are kept, and remaining
# NaNs (e.g. the missing method of the baseline rows) are filled with 0 before
# sorting.
assortativity_rows = (
  pd.DataFrame(result)
  .dropna(subset=['assortativity'])
  .query('(method == "b") or (nswap_perc == 0)')
  .fillna(0)
  .sort_values(['method', 'assortativity'], ascending=[True, False])
)
# Rewired networks (method 'b'), coloured by rewiring percentage.
sns.stripplot(
  data=assortativity_rows.query('(method == "b")'),
  x='assortativity', y='label', hue='nswap_perc', palette='flare'
)
# Unrewired baseline, drawn with larger triangular markers.
sns.stripplot(
  data=assortativity_rows.query('(nswap_perc == 0)'),
  x='assortativity', y='label', hue='nswap_perc', palette='flare', marker='v', size=10
)

# Triangle

In [None]:
# Triangle counts per network and rewiring percentage, passed row-wise through
# `normalize` and plotted on a logarithmic x axis.
# NOTE(review): `normalize` is not defined in any visible cell of this
# notebook, so this cell fails on a fresh kernel - define it above (presumably
# it rescales each row relative to its 0% value; confirm).
ax = sns.catplot(
  data=(
    pd.DataFrame(result)
    .dropna(subset=['triangles'])
    .query('(method == "b") or (nswap_perc == 0)')
    # Keyword arguments: positional arguments to DataFrame.pivot are
    # deprecated in pandas 1.x and rejected by pandas 2.x.
    .pivot(index=['label'], columns='nswap_perc', values='triangles')
    .apply(normalize, axis=1)
    .dropna()
    .melt(ignore_index=False)
    .reset_index()),
  x='value', y='label', hue='nswap_perc', palette='flare'
)
plt.xscale('log')
# Reference lines at x = 0.5 and x = 2; the y extent 0..23 is hard-coded.
plt.vlines([.5, 2], 0, 23)

# Performance

In [20]:
# Baseline score per network label: the 'II-A_LogisticRegression' value of the
# unrewired (nswap_perc == 0) records, ordered by node count. Records without
# that score are left out.
norm = (
  pd.DataFrame(result)
  .query('nswap_perc == 0')
  .dropna(subset=['II-A_LogisticRegression'])
  .sort_values('nodes')
  .set_index('label')
  .loc[:, 'II-A_LogisticRegression']
)

In [23]:
# Change in 'II-A_LogisticRegression' score relative to the unrewired baseline
# (`norm`), per network (rows, ordered by node count) and rewiring percentage
# (columns), for method 'b'.
(
  pd.DataFrame(result)
  .query('(method == "b")')
  .dropna(subset=['II-A_LogisticRegression'])
  # Keyword arguments: positional arguments to DataFrame.pivot are deprecated
  # in pandas 1.x and rejected by pandas 2.x.
  .pivot(index=['label', 'nodes'], columns='nswap_perc', values='II-A_LogisticRegression')
  .sort_values('nodes')
  .reset_index('nodes', drop=True)
  # Subtract each network's baseline score, aligned on the label index.
  .sub(norm, axis='index')
  .rename_axis(index=None, columns=None)
  .round(3)
)

Unnamed: 0,-100,-80,-60,-40,-20,20,40,60,80,100
Rado,-0.074,-0.107,-0.106,-0.096,-0.103,-0.131,-0.126,-0.136,-0.138,0.024
UC,-0.311,-0.266,-0.27,-0.356,-0.297,-0.312,-0.388,-0.373,-0.303,-0.083
EU,-0.061,-0.119,-0.088,-0.084,-0.074,-0.07,-0.106,-0.067,-0.107,-0.109
Dem,-0.152,-0.162,-0.134,-0.171,-0.105,-0.13,-0.124,-0.123,-0.169,-0.021
bitA,-0.259,-0.243,-0.267,-0.28,-0.245,-0.309,-0.373,-0.413,-0.39,-0.052
bitOT,-0.252,-0.263,-0.264,-0.308,-0.325,-0.376,-0.395,-0.353,-0.371,-0.014
chess,-0.317,-0.349,-0.368,-0.377,-0.41,-0.406,-0.403,-0.281,-0.382,0.036
HepTh,-0.142,-0.189,-0.202,-0.234,-0.276,-0.248,-0.249,-0.22,-0.177,-0.02
HepPh,-0.162,-0.193,-0.208,-0.213,-0.226,-0.234,-0.201,-0.177,-0.137,-0.034
Condm,-0.243,-0.252,-0.269,-0.294,-0.344,-0.273,-0.263,-0.252,-0.243,-0.095


In [24]:
# Score change relative to the unrewired baseline (`norm`) for method 'b',
# with a column-wise mean row, printed as a LaTeX table.
df = (
  pd.DataFrame(result)
  .query('(method == "b")')
  .dropna(subset=['II-A_LogisticRegression'])
  # Keyword arguments: positional arguments to DataFrame.pivot are deprecated
  # in pandas 1.x and rejected by pandas 2.x.
  .pivot(index=['label', 'nodes'], columns='nswap_perc', values='II-A_LogisticRegression')
  .sort_values('nodes')
  .reset_index('nodes', drop=True)
  .sub(norm, axis='index')
  .rename_axis(index=None, columns=None)
  .rename(columns=lambda x: f"{x}%")
)
df.loc['mean'] = df.mean()
# 'l' for the label column plus one right-aligned column per data column. The
# previous hard-coded spec declared 11 data columns although this table has no
# 0% column, and it relied on the invalid '\h' escape in a non-raw string.
column_format = 'l' + r'@{\hspace{1em}}'.join(['r'] * len(df.columns))
print(
  df.round(3)
  .to_latex(caption="Performance of all networks after rewiring.",
          label='tab:rewire-performance',
          index=True,
          column_format=column_format)
)

\begin{table}
\centering
\caption{Performance of all networks after rewiring.}
\label{tab:rewire-performance}
\begin{tabular}{lr@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}r}
\toprule
{} &  -100\% &   -80\% &   -60\% &   -40\% &   -20\% &    20\% &    40\% &    60\% &    80\% &   100\% \\
\midrule
Rado  & -0.074 & -0.107 & -0.106 & -0.096 & -0.103 & -0.131 & -0.126 & -0.136 & -0.138 &  0.024 \\
UC    & -0.311 & -0.266 & -0.270 & -0.356 & -0.297 & -0.312 & -0.388 & -0.373 & -0.303 & -0.083 \\
EU    & -0.061 & -0.119 & -0.088 & -0.084 & -0.074 & -0.070 & -0.106 & -0.067 & -0.107 & -0.109 \\
Dem   & -0.152 & -0.162 & -0.134 & -0.171 & -0.105 & -0.130 & -0.124 & -0.123 & -0.169 & -0.021 \\
bitA  & -0.259 & -0.243 & -0.267 & -0.280 & -0.245 & -0.309 & -0.373 & -0.413 & -0.390 & -0.052 \\
bitOT & -0.252 & -0.263 & -0.264 & -0.308 & -0.325 & -0.376 & -0.395 & -0.353 & -0.371 & -0

In [None]:
# Kruskal-Wallis test comparing the unrewired (nswap_perc == 0) scores of the
# logistic-regression and random-forest classifiers; missing scores are dropped
# per classifier.
scipy.stats.kruskal(*(
  pd.DataFrame(result).loc[lambda x: x['nswap_perc'] == 0, classifier].dropna()
  for classifier in ('II-A_LogisticRegression', 'II-A_RandomForest')
))

# Other classifiers

In [14]:
# Unrewired (nswap_perc == 0) scores of the three classifiers per network,
# ordered by node count; networks missing any of the three scores are dropped.
df = (
  pd.DataFrame(result)
  .query('nswap_perc == 0')
  .sort_values('nodes')
  .set_index('label')
  .loc[:, ['II-A_LogisticRegression', 'II-A_RandomForest', 'II-A_XGBoost']]
  .rename(columns={
    'II-A_LogisticRegression': 'Logistic Regression',
    'II-A_RandomForest': 'Random Forest',
    'II-A_XGBoost': 'XGBoost',
  })
  .dropna()
  .rename_axis(None)
)
df

Unnamed: 0,Logistic Regression,Random Forest,XGBoost
Rado,0.819987,0.951218,0.954909
UC,0.874328,0.942416,0.945951
EU,0.814727,0.953162,0.942272
Dem,0.942431,0.983947,0.980768
bitA,0.953392,0.974478,0.974042
bitOT,0.943998,0.972525,0.967135
chess,0.841566,0.83315,0.830273
HepTh,0.786951,0.867389,0.855604
HepPh,0.764387,0.815839,0.797847
Condm,0.83293,0.875349,0.869749


In [17]:
# Classifier comparison on the unrewired networks (ordered by node count) with
# a column-wise mean row appended, printed as a LaTeX table with three decimals.
df = (
  pd.DataFrame(result)
  .query('nswap_perc == 0')
  .sort_values('nodes')
  .set_index('label')
  .loc[:, ['II-A_LogisticRegression', 'II-A_RandomForest', 'II-A_XGBoost']]
  .rename(columns={
    'II-A_LogisticRegression': 'Logistic Regression',
    'II-A_RandomForest': 'Random Forest',
    'II-A_XGBoost': 'XGBoost',
  })
  .dropna()
  .rename_axis(None)
)
df.loc['mean'] = df.mean()
print(
  df.to_latex(
    float_format="%.3f",
    caption="Performance obtained with the II-A feature set (See Section~\\ref{sec:experimental-setup}).",
    label='tab:classifiers',
    position='h',
  )
)

\begin{table}[h]
\centering
\caption{Performance obtained with the II-A feature set (See Section~\ref{sec:experimental-setup}).}
\label{tab:classifiers}
\begin{tabular}{lrrr}
\toprule
{} &  Logistic Regression &  Random Forest &  XGBoost \\
\midrule
Rado  &                0.820 &          0.951 &    0.955 \\
UC    &                0.874 &          0.942 &    0.946 \\
EU    &                0.815 &          0.953 &    0.942 \\
Dem   &                0.942 &          0.984 &    0.981 \\
bitA  &                0.953 &          0.974 &    0.974 \\
bitOT &                0.944 &          0.973 &    0.967 \\
chess &                0.842 &          0.833 &    0.830 \\
HepTh &                0.787 &          0.867 &    0.856 \\
HepPh &                0.764 &          0.816 &    0.798 \\
Condm &                0.833 &          0.875 &    0.870 \\
SX-MO &                0.932 &          0.959 &    0.959 \\
D-rep &                0.969 &          0.973 &    0.976 \\
Rbody &                0.909 &