In [7]:
import datetime
import itertools
import math
import os
from typing import List, Tuple, Dict, Literal

import joblib
import networkx as nx
import numpy as np
import pandas as pd

from linkprediction import construct_edges, convert_to_set, get_graph, giant_component, read_edges

def weighted_median(d: np.ndarray) -> float:
    """Return the weighted arithmetic mean of column 0 of `d`, weighted by column 1.

    NOTE: despite its name this is a weighted *mean*, not a median. The name
    is kept because the cells in this notebook call it to report the
    "Mean distance".

    Parameters
    ----------
    d : np.ndarray
        Two-column array: values in column 0 and their weights in column 1.
        A single row given as a 1-D array (what ``np.loadtxt`` returns for a
        one-line file) is accepted as well.

    Returns
    -------
    float
        ``sum(values * weights) / sum(weights)``.

    Raises
    ------
    ZeroDivisionError
        Raised by ``np.average`` when the weights sum to zero.
    """
    # np.loadtxt collapses a one-line file to a 1-D array; promote it to a
    # single row so the column indexing below keeps working.
    rows = np.atleast_2d(np.asarray(d, dtype=float))
    return float(np.average(rows[:, 0], weights=rows[:, 1]))

# Condmat

In [18]:
%%time

file = 'data/condmat.hg2'
print(f'Size of file: {os.path.getsize(file):.1e}')
edges = construct_edges(file)
g = get_graph(edges, directed=False)

!printf '%s\n' 'load_undirected temp/network.edges' 'dist_distri' > "temp/input.txt"
nx.write_edgelist(g, 'temp/network.edges', data=False)
! /local/bruingjde/teexgraph/teexgraph < temp/input.txt > temp/output.txt

gc = giant_component(g)
n = g.number_of_nodes()
edges_gc = edges['source'].isin(set(g.nodes)).sum()
print(f'Number of edges (gc): {len(edges)} ({edges_gc/len(edges):.0%})')
print(f'Number of nodes (gc): {n} ({gc.number_of_nodes() / n:.0%})')

Number of edges (gc): 88090 (100%)
Number of nodes (gc): 17218 (88%)


In [17]:
# Summary statistics for the graph `g` built in the Condmat cell above.
print(f'Density: {nx.density(g):.1e}')
# Parse the teexgraph result once instead of re-reading the file per statistic.
# weighted_median() uses column 0 as values and column 1 as weights.
dist_distri = np.loadtxt('temp/output.txt')
print(f"Mean distance: {weighted_median(dist_distri):.1f}")
# The diameter is read from column 0 of the last row
# (assumes rows are sorted by increasing distance; TODO confirm).
print(f"Diameter: {int(dist_distri[-1,0])}")

Size of file: 1e+06
*** Welcome to teexGraph ***
- Use standard input (cin) to give commands
- Read standard output (cout) to catch the result
- Observe standard log (clog) and (cerr) for status and error messages
- Graphs up to MAXN = 10000000 nodes are accepted
Input a command: Loading an undirected graph. Enter filename: 
Loading graph from temp/network.edges ...
- 55276 edges added (m = 55276) in total
- 0 edges skipped
- 6 self-edges added

Sorting edge list...
Sorting done.
Loading done.

Making graph undirected (m = 55276)...
Sorting edge list...
Sorting done.
  Verify that the graph is actually undirected.
Undirected-making done (m = 110546).
Loading file succeeded.
WCC computed.

> Computing distance distribution (based on a 100% sample of 17218 nodes) with 128 CPUs...
            0          000   0   0 0   0% 0% 0% 0% 0% 0% 0 %  0% 0%0% 0% 0% 0% 0%0     000%0   0%0% 0% 0% 0% 0% 0% 0% 0    0% 0% 0% 0% 0% 0% 0% 0%0% 0% 0% 0% 0% 0% 0%0%  0%0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0%0% 0

# Enron

In [22]:
# Enron: read the edge list, build the directed graph and its giant component.
edges = read_edges('data/enron.edges')
g = get_graph(edges, directed=True)
gc = giant_component(g)
# Count edge rows whose source lies in the giant component. Testing against
# `g.nodes` would always give 100%, since `g` is built from `edges`.
edges_gc = edges['source'].isin(set(gc.nodes)).sum()
# `n` and `gc` are reused by the statistics cell further down.
n = g.number_of_nodes()
print(f'Number of edges (gc): {len(edges)} ({edges_gc/len(edges):.0%})')

Number of edges (gc): 1148072 (100%)


In [64]:
%%time
%%capture
!printf '%s\n' 'load_undirected temp/network.edges' 'dist_distri' > "temp/input.txt"
nx.write_edgelist(g, 'temp/network.edges', data=False)
! /local/bruingjde/teexgraph/teexgraph < temp/input.txt > temp/output.txt

CPU times: user 543 ms, sys: 68.8 ms, total: 612 ms
Wall time: 25.5 s


In [65]:
# Summary statistics for Enron; `n`, `gc` and `g` come from the Enron load cell.
print(f'Number of nodes (gc): {n} ({gc.number_of_nodes() / n:.0%})')
print(f'Density: {nx.density(g):.1e}')
# Parse the teexgraph result once instead of re-reading the file per statistic.
# weighted_median() uses column 0 as values and column 1 as weights.
dist_distri = np.loadtxt('temp/output.txt')
print(f"Mean distance: {weighted_median(dist_distri):.1f}")
# The diameter is read from column 0 of the last row
# (assumes rows are sorted by increasing distance; TODO confirm).
print(f"Diameter: {int(dist_distri[-1,0])}")

Number of edges (gc): 299220 (99%)
Number of edges (gc): 87273 (97%)
Density: 7.9e-05
Mean distance: 4.9
Diameter: 14


# Askubuntu

In [23]:
# Askubuntu: read the tab-separated edge list, build the directed graph and
# its giant component.
edges = read_edges('data/au.edges', sep='\t')
g = get_graph(edges, directed=True)
gc = giant_component(g)
# Count edge rows whose source lies in the giant component. Testing against
# `g.nodes` would always give 100%, since `g` is built from `edges`.
edges_gc = edges['source'].isin(set(gc.nodes)).sum()
n = g.number_of_nodes()
print(f'Number of edges (gc): {len(edges)} ({edges_gc/len(edges):.0%})')

Number of edges (gc): 964437 (100%)


In [73]:
%%time
%%capture
!printf '%s\n' 'load_undirected temp/network.edges' 'dist_distri' > "temp/input.txt"
nx.write_edgelist(g, 'temp/network.edges', data=False)
! /local/bruingjde/teexgraph/teexgraph < temp/input.txt > temp/output.txt

CPU times: user 1.26 s, sys: 193 ms, total: 1.45 s
Wall time: 2min 3s


In [74]:
# Summary statistics for Askubuntu: giant-component share, density, and the
# distance statistics derived from the teexgraph output.
gc = giant_component(g)
e = g.number_of_edges()
n = g.number_of_nodes()
print(f'Number of edges (gc): {e} ({gc.number_of_edges() / e:.0%})')
# This line reports the node count; it used to carry the "edges" label.
print(f'Number of nodes (gc): {n} ({gc.number_of_nodes() / n:.0%})')
print(f'Density: {nx.density(g):.1e}')
# Parse the teexgraph result once instead of re-reading the file per statistic.
# weighted_median() uses column 0 as values and column 1 as weights.
dist_distri = np.loadtxt('temp/output.txt')
print(f"Mean distance: {weighted_median(dist_distri):.1f}")
# The diameter is read from column 0 of the last row
# (assumes rows are sorted by increasing distance; TODO confirm).
print(f"Diameter: {int(dist_distri[-1,0])}")

Number of edges (gc): 508003 (99%)
Number of edges (gc): 159316 (96%)
Density: 4.0e-05
Mean distance: 3.9
Diameter: 13


# Digg

In [25]:
# Digg: read the edge list, build the directed graph and its giant component.
edges = read_edges('data/digg.edges')
g = get_graph(edges, directed=True)
gc = giant_component(g)
# Count edge rows whose source lies in the giant component. Testing against
# `g.nodes` would always give 100%, since `g` is built from `edges`.
edges_gc = edges['source'].isin(set(gc.nodes)).sum()
n = g.number_of_nodes()
print(f'Number of edges (gc): {len(edges)} ({edges_gc/len(edges):.0%})')

Number of edges (gc): 87627 (100%)


In [84]:
assert g.number_of_nodes() < 1e8
%%time
!printf '%s\n' 'load_undirected temp/network.edges' 'dist_distri' > "temp/input.txt"
nx.write_edgelist(g, 'temp/network.edges', data=False)
! /local/bruingjde/teexgraph/teexgraph < temp/input.txt > temp/output.txt

*** Welcome to teexGraph ***
- Use standard input (cin) to give commands
- Read standard output (cout) to catch the result
- Observe standard log (clog) and (cerr) for status and error messages
- Graphs up to MAXN = 10000000 nodes are accepted
Input a command: Loading an undirected graph. Enter filename: 
Loading graph from temp/network.edges ...
- 86312 edges added (m = 86312) in total
- 0 edges skipped
- 1157 self-edges added

Sorting edge list...
Sorting done.
Loading done.

Making graph undirected (m = 86312)...
Sorting edge list...
Sorting done.
  Verify that the graph is actually undirected.
Undirected-making done (m = 171467).
Loading file succeeded.
WCC computed.

> Computing distance distribution (based on a 100% sample of 30398 nodes) with 128 CPUs...
    0         0  0  0 0% 0% 0%  0  00  0%0   0%   0% 0% 0% 0% 0% 0%0% 0%0% 0 0%  0%0% 0% 0% 0% 0% 0% 0%0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0%  0% 0%0% 0%0% 0%0% 0% 0% 0%0% 0%0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0% 0%% 0%0% 0%0

In [85]:
# Summary statistics for Digg: giant-component share, density, and the
# distance statistics derived from the teexgraph output.
gc = giant_component(g)
e = g.number_of_edges()
n = g.number_of_nodes()
print(f'Number of edges (gc): {e} ({gc.number_of_edges() / e:.0%})')
# This line reports the node count; it used to carry the "edges" label.
print(f'Number of nodes (gc): {n} ({gc.number_of_nodes() / n:.0%})')
print(f'Density: {nx.density(g):.1e}')
# Parse the teexgraph result once instead of re-reading the file per statistic.
# weighted_median() uses column 0 as values and column 1 as weights.
dist_distri = np.loadtxt('temp/output.txt')
print(f"Mean distance: {weighted_median(dist_distri):.1f}")
# The diameter is read from column 0 of the last row
# (assumes rows are sorted by increasing distance; TODO confirm).
print(f"Diameter: {int(dist_distri[-1,0])}")

Number of edges (gc): 86312 (100%)
Number of edges (gc): 30398 (98%)
Density: 1.9e-04
Mean distance: 4.7
Diameter: 12


# Slashdot

In [26]:
# Slashdot: read the edge list (passing skiprows=2 to read_edges), build the
# directed graph and its giant component.
edges = read_edges('data/slashdot.edges', skiprows=2)
g = get_graph(edges, directed=True)
gc = giant_component(g)
# Count edge rows whose source lies in the giant component. Testing against
# `g.nodes` would always give 100%, since `g` is built from `edges`.
edges_gc = edges['source'].isin(set(gc.nodes)).sum()
n = g.number_of_nodes()
print(f'Number of edges (gc): {len(edges)} ({edges_gc/len(edges):.0%})')

Number of edges (gc): 140778 (100%)


In [23]:
%%time
assert g.number_of_nodes() < 1e8
!printf '%s\n' 'load_undirected temp/network.edges' 'dist_distri' > "temp/input.txt"
nx.write_edgelist(g, 'temp/network.edges', data=False)
! /tmp/bruingjde/teexgraph/teexgraph < temp/input.txt > temp/output.txt

*** Welcome to teexGraph ***
- Use standard input (cin) to give commands
- Read standard output (cout) to catch the result
- Observe standard log (clog) and (cerr) for status and error messages
- Graphs up to MAXN = 10000000 nodes are accepted
Input a command: Loading an undirected graph. Enter filename: 
Loading graph from temp/network.edges ...
- 117378 edges added (m = 117378) in total
- 0 edges skipped
- 805 self-edges added

Sorting edge list...
Sorting done.
Loading done.

Making graph undirected (m = 117378)...
Sorting edge list...
Sorting done.
  Verify that the graph is actually undirected.
Undirected-making done (m = 233951).
Loading file succeeded.
WCC computed.

> Computing distance distribution (based on a 100% sample of 51083 nodes) with 256 CPUs...
    00        00% 0%   0 0 %%  0%    0      0     0%      00   00  0%0%   0%0%   % 0%0  0%0%0% 0%%00%0  %  0%  0    0%0%  0%0  0% 0%0  0% 0% 0% 0% 0% 0% 0%0 %0 0%   00% %           00% 0%0 0% 0% 0  0 0%00%  00 %0 %         0% 

In [24]:
# Summary statistics for Slashdot: giant-component share, density, and the
# distance statistics derived from the teexgraph output.
gc = giant_component(g)
e = g.number_of_edges()
n = g.number_of_nodes()
print(f'Number of edges (gc): {e} ({gc.number_of_edges() / e:.0%})')
# This line reports the node count; it used to carry the "edges" label.
print(f'Number of nodes (gc): {n} ({gc.number_of_nodes() / n:.0%})')
print(f'Density: {nx.density(g):.1e}')
# Parse the teexgraph result once instead of re-reading the file per statistic.
# weighted_median() uses column 0 as values and column 1 as weights.
dist_distri = np.loadtxt('temp/output.txt')
print(f"Mean distance: {weighted_median(dist_distri):.1f}")
# The diameter is read from column 0 of the last row
# (assumes rows are sorted by increasing distance; TODO confirm).
print(f"Diameter: {int(dist_distri[-1,0])}")

Number of edges (gc): 117378 (100%)
Number of edges (gc): 51083 (100%)
Density: 9.0e-05
Mean distance: 4.5
Diameter: 17


# Stackoverflow

In [27]:
# Stackoverflow: read the tab-separated edge list, build the directed graph
# and its giant component.
edges = read_edges('data/so.edges', sep='\t')
g = get_graph(edges, directed=True)
gc = giant_component(g)
# Count edge rows whose source lies in the giant component. Testing against
# `g.nodes` would always give 100%, since `g` is built from `edges`.
edges_gc = edges['source'].isin(set(gc.nodes)).sum()
n = g.number_of_nodes()
print(f'Number of edges (gc): {len(edges)} ({edges_gc/len(edges):.0%})')

Number of edges (gc): 63497050 (100%)


In [7]:
%%time
assert g.number_of_nodes() < 1e8
!printf '%s\n' 'load_undirected temp/network.edges' 'dist_distri' > "temp/input.txt"
nx.write_edgelist(g, 'temp/network.edges', data=False)

In [11]:
# Run teexgraph with the command file on stdin; its stdout (the result of the
# `dist_distri` command) is redirected to temp/output.txt.
! /tmp/bruingjde/teexgraph/teexgraph < temp/input.txt > temp/output.txt

*** Welcome to teexGraph ***
- Use standard input (cin) to give commands
- Read standard output (cout) to catch the result
- Observe standard log (clog) and (cerr) for status and error messages
- Graphs up to MAXN = 10000000 nodes are accepted
Input a command: Loading an undirected graph. Enter filename: 
Loading graph from temp/network.edges ...
   - 10000000 edges loaded so far...
   - 20000000 edges loaded so far...
- 29541284 edges added (m = 29541284) in total
- 0 edges skipped
- 1357766 self-edges added

Sorting edge list...
Sorting done.
Loading done.

Making graph undirected (m = 29541284)...
Sorting edge list...
Sorting done.
  Verify that the graph is actually undirected.
Undirected-making done (m = 57724802).
Loading file succeeded.
WCC computed.

> Computing distance distribution (based on a 100% sample of 2601977 nodes) with 256 CPUs...
 0% 0%  0 0  0% 0% 0%  0 0% 0% 0% 0%   0% 0%  0% 0% 0% 0% 0% 0%  0% 0 0%  0% 0% 0% 0% 0% 0% 0%  0%0%   0% 0   0%0  0  0%0   00% 0% 0  0% 0

In [None]:
# Summary statistics for Stackoverflow: giant-component share and density.
gc = giant_component(g)
e = g.number_of_edges()
n = g.number_of_nodes()
print(f'Number of edges (gc): {e} ({gc.number_of_edges() / e:.0%})')
# This line reports the node count; it used to carry the "edges" label.
print(f'Number of nodes (gc): {n} ({gc.number_of_nodes() / n:.0%})')
print(f'Density: {nx.density(g):.1e}')
# NOTE(review): the distance statistics are left disabled, presumably because
# the teexgraph run for this network had not finished. Confirm, then re-enable.
# print(f"Mean distance: {weighted_median(np.loadtxt('temp/output.txt')):.1f}")
# print(f"Diameter: {int(np.loadtxt('temp/output.txt')[-1,0])}")