In [67]:
%cd /scratch/bruingjde/SNAM2021-code/

from constants import *

/scratch/bruingjde/SNAM2021-code


In [185]:
# Build the descriptive (non-numeric) part of the dataset table from `networks`.
df = networks.copy()
# Keep the network index as a regular column as well.
df['index'] = df.index
# Rows whose index appears in `hypergraph_indices` are labelled 'multi',
# all other rows 'simple'.
df['edge type'] = df['index'].isin(hypergraph_indices).replace(
  {True: 'multi', False: 'simple'}
)
# Raw f-string: '\c' is not a valid escape sequence, so the non-raw form
# triggers a SyntaxWarning on Python >= 3.12. The produced text is unchanged.
df['source'] = df['source'].apply(lambda x: rf'\cite{{{x}}}')
# Short domain names used in the table.
df['cat'] = df['category'].replace(
  {'social': 'Social', 'information': 'Inf.', 'technological': 'Tech.'})
df.drop(columns=['old_category', 'category'], inplace=True)

In [186]:
def get_size(network: int):
  """Return the node and edge count of one network.

  Reads `data/<network, zero-padded to two digits>/edgelist.pkl` and builds an
  `nx.MultiGraph` from it (so repeated rows of the edge list are presumably
  counted as separate edges -- confirm against the networkx docs).

  Returns a `pd.Series` named after `network` with the entries 'nodes' and
  'edges'.
  """
  multigraph = nx.from_pandas_edgelist(
    pd.read_pickle(f'data/{network:02}/edgelist.pkl'),
    create_using=nx.MultiGraph
  )
  counts = {
    'nodes': multigraph.number_of_nodes(),
    'edges': multigraph.number_of_edges(),
  }
  return pd.Series(counts, name=network)

# Compute the size of every network in parallel, using as many jobs as there
# are networks, and stack the resulting Series into one frame.
size = tlp.ProgressParallel(n_jobs=network_count, total=network_count)(
  joblib.delayed(get_size)(network) for network in network_indices
)
size = pd.DataFrame(size)
# The columns intentionally stay 'nodes' / 'edges': later cells sort on and
# select exactly those names. The former
# `size.rename(dict(nodes='n', edges='m'), inplace=True)` was dropped because a
# positional mapper passed to DataFrame.rename targets the index, so it never
# renamed any column and only suggested otherwise.

  0%|          | 0/26 [00:00<?, ?it/s]

In [187]:
# Read the pre-computed statistics of every network from its stats.json file,
# keyed by network index.
stats = {}
for network in tqdm(network_indices):
  with open(f'data/{network:02}/stats.json') as file:
    stats[network] = json.load(file)

# Only these three statistics are kept; every other entry of stats.json is
# discarded by the column selection below.
columns = ['density', 'degree assortativity',
           'average clustering coefficient']

# One row per network; the '(nx.Graph)' variants supply the short column names.
stats = pd.DataFrame.from_dict(stats, orient='index').rename(
  columns={
    'density (nx.Graph)': 'density',
    'degree assortativity (nx.Graph)': 'degree assortativity',
  }
)[columns].copy()

  0%|          | 0/26 [00:00<?, ?it/s]

In [216]:
# Show the networks ordered from lowest to highest average clustering coefficient.
stats.sort_values(by='average clustering coefficient')

Unnamed: 0,density,degree assortativity,average clustering coefficient
24,0.000836,-0.078326,0.004867
9,0.000187,0.008341,0.005286
21,0.001101,0.335905,0.054398
13,0.017419,-0.092248,0.070458
10,4e-05,-0.055707,0.089571
22,4e-05,-0.143106,0.109331
23,4.2e-05,-0.087745,0.116163
3,7.9e-05,-0.166689,0.119342
8,0.000219,0.221522,0.123693
4,0.000219,0.221522,0.123693


In [188]:
# Wrap whatever get_diameter() (defined outside this cell) returns in a Series
# named 'diameter', so it becomes a column of that name when concatenated.
diameter = pd.Series(data=get_diameter(), name='diameter')

  0%|          | 0/26 [00:00<?, ?it/s]

In [189]:
def convert_int_to_short(x: int):
  """Abbreviate a count with a 'K' (thousands) or 'M' (millions) suffix.

  Values from one up to and including two units are shown with one decimal
  ('1.5K', '2.0M'), larger values without decimals ('34K', '29M'), and values
  below 1000 are returned unchanged as a string.

  The lower bounds are inclusive so that exactly 1000 gives '1.0K' and exactly
  1000000 gives '1.0M'; with strict comparisons they fell through to the
  smaller unit and were rendered as '1000' and '1000K'.

  Note: values just below 1,000,000 can still round up to '1000K'.
  """
  if x > 2000000:
    return f'{x/1000000:.0f}M'
  elif x >= 1000000:
    return f'{x/1000000:.1f}M'
  elif x > 2000:
    return f'{x/1000:.0f}K'
  elif x >= 1000:
    return f'{x/1000:.1f}K'
  else:
    return str(x)
  
def scientific_notation(x):
  r"""Format a number as LaTeX math with a one-digit mantissa.

  Example: 0.017 -> '$2\!\cdot\!10^{-2}$'.

  The mantissa and exponent are taken from both sides of the 'e' in the
  '{x:.0e}' representation. Slicing single characters, as done previously,
  returned the letter 'e' as exponent for values >= 1, kept only the last digit
  of multi-digit exponents, and used the sign as mantissa for negative values.

  Values whose representation has no exponent part (nan, inf) are returned as
  that plain text.
  """
  text = f'{x:.0e}'
  mantissa, _, exponent = text.partition('e')
  if not exponent:
    return text
  # int() strips the explicit '+' sign and the zero padding ('+05' -> 5).
  # Raw f-string: '\!' and '\c' are not valid escape sequences.
  return rf'${mantissa}\!\cdot\!10^{{{int(exponent)}}}$'

def thousand_sep(x):
  """Render a number with ',' as thousands separator, e.g. 82927 -> '82,927'."""
  return format(x, ',')

In [190]:
# Persist the per-network node and edge counts to disk.
size.to_pickle(path='code/figures/size.pkl')

In [191]:
# Join metadata, statistics, sizes and diameters column-wise on the network
# index, keep only complete rows, and store the diameter as an integer.
result = (
  pd.concat([df, stats, size, diameter], axis=1)
  .dropna()
  .astype({'diameter': int})
  .reset_index(drop=True)
)
# Number the remaining rows from 1 (assigned before the rows are sorted).
result.index = result.index + 1
# Order by network size and format the statistics for display: density as
# LaTeX scientific notation, the other two rounded to two decimals.
result = result.sort_values('nodes').assign(**{
  'density': lambda frame: frame['density'].apply(scientific_notation),
  'degree assortativity':
    lambda frame: frame['degree assortativity'].round(2),
  'average clustering coefficient':
    lambda frame: frame['average clustering coefficient'].round(2),
})

In [192]:
# Columns of the dataset table, in the order in which they are selected.
columns = [
  'label',
  'cat',
  'edge type',
  'nodes',
  'edges',
  'density',
  'degree assortativity',
  'average clustering coefficient',
  'diameter',
  'source',
]

In [193]:
# Select the table columns and shorten their headers; the citation column
# ('source') gets a single space as header.
header_names = {
  'source': ' ',
  'cat': 'domain',
  'degree assortativity': 'd.a.',
  'average clustering coefficient': 'a.c.c.',
  'diameter': 'diam.',
}
table = result[columns].rename(columns=header_names)

In [197]:
# Render the dataset table as LaTeX source.
latex_table = table.to_latex(
  # Node and edge counts get a thousands separator.
  formatters={
    'nodes': thousand_sep, 'edges': thousand_sep
  },
  # One alignment letter per column, each pair separated by a 1em gap. The
  # separator is a raw string because '\h' is not a valid escape sequence
  # (SyntaxWarning on Python >= 3.12); the resulting text is unchanged.
  column_format=r'@{\hspace{1em}}'.join('lccrrccccc'),
  index=False,
  caption=(
    'Networks used in this work. '
    'The following abbreviations are used in the columns; '
    # The abbreviation is spelled 'a.c.c.' to match the column header.
    'd.a.: degree assortativity, a.c.c.: average clustering coefficient, diam.: diameter. '
    'In the column (scientific) domain, technological is abbreviated to Tech. and Information to Inf.'
  ),
  label='table:datasets',
  # Cells already contain LaTeX (\cite{...}, math mode), so nothing is escaped.
  escape=False,
  multicolumn=False,
  position='H'
)
print(latex_table)

\begin{table}[H]
\centering
\caption{Networks used in this work. The following abbreviations are used in the columns; d.a.: degree assortativity, acc: average clustering coefficient, diam.: diameter. In the column (scientific) domain, technological is abbreviated to Tech. and Information to Inf.}
\label{table:datasets}
\begin{tabular}{l@{\hspace{1em}}c@{\hspace{1em}}c@{\hspace{1em}}r@{\hspace{1em}}r@{\hspace{1em}}c@{\hspace{1em}}c@{\hspace{1em}}c@{\hspace{1em}}c@{\hspace{1em}}c}
\toprule
label & domain & edge type &     nodes &      edges &             density &  d.a. &  a.c.c. &  diam. &                          \\
\midrule
 Rado & Social &     multi &       167 &     82,927 & $2\!\cdot\!10^{-1}$ & -0.29 &    0.59 &      5 &     \cite{Michalski2011} \\
   UC &   Inf. &     multi &       899 &     33,720 & $2\!\cdot\!10^{-2}$ & -0.09 &    0.07 &      6 &        \cite{Opsahl2013} \\
   EU & Social &     multi &       986 &    332,334 & $3\!\cdot\!10^{-2}$ & -0.03 &    0.41 &      7 &   

In [202]:
# Mean and standard deviation of every numeric column per domain.
# The numeric columns are selected explicitly: `result` also holds text columns
# (e.g. the formatted density), and pandas >= 2.0 raises instead of silently
# dropping columns that cannot be aggregated.
numeric_columns = result.select_dtypes('number').columns.tolist()
result.groupby(['cat'])[numeric_columns].agg(['mean', 'std'])

Unnamed: 0_level_0,index,index,degree assortativity,degree assortativity,average clustering coefficient,average clustering coefficient,nodes,nodes,edges,edges,diameter,diameter
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
cat,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Inf.,12.833333,8.471521,0.001667,0.193227,0.273333,0.329768,43230.666667,57552.952288,1513486.0,1572926.0,8.333333,3.265986
Social,13.666667,8.408679,-0.043333,0.163887,0.272778,0.213454,212595.833333,449233.760986,3558844.0,8486513.0,13.444444,4.97313
Tech.,28.5,0.707107,-0.115,0.035355,0.18,0.0,44014.0,12733.578916,388139.0,173450.5,14.0,4.242641


In [212]:
# Minimum, median and maximum of every numeric column per edge type.
# The numeric columns are selected explicitly: `result` also holds text columns
# (e.g. the formatted density), and pandas >= 2.0 raises instead of silently
# dropping columns for which the median is undefined.
numeric_columns = result.select_dtypes('number').columns.tolist()
result.groupby(['edge type'])[numeric_columns].agg(['min', 'median', 'max'])

Unnamed: 0_level_0,index,index,index,degree assortativity,degree assortativity,degree assortativity,average clustering coefficient,average clustering coefficient,average clustering coefficient,nodes,nodes,nodes,edges,edges,edges,diameter,diameter,diameter
Unnamed: 0_level_1,min,median,max,min,median,max,min,median,max,min,median,max,min,median,max,min,median,max
edge type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
multi,1,13.5,30,-0.3,-0.09,0.18,0.07,0.335,0.77,167,29914,1824701,33720,508668.5,29487744,5,11.0,23
simple,4,13.5,24,-0.23,-0.055,0.34,0.0,0.12,0.17,3683,55387,279374,21163,335708.0,3394979,4,13.5,18


In [219]:
# Quartiles of the average clustering coefficient for each edge type.
(
  result
  .groupby(['edge type'])['average clustering coefficient']
  .quantile(q=[0.25, 0.5, 0.75])
)

edge type      
multi      0.25    0.1650
           0.50    0.3350
           0.75    0.6100
simple     0.25    0.0600
           0.50    0.1200
           0.75    0.1375
Name: average clustering coefficient, dtype: float64