In [None]:
import igraph as ig
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from collections import Counter
import plfit
from statsmodels.distributions.empirical_distribution import ECDF as ecdf

## root folder of the datasets; every file read below is located relative to it
datadir = '../Datasets/'

Just by looking at basic statistics on the degree distribution and shortest paths, we can identify huge differences between different types of graphs. Here we look at a social-type graph and a transportation-type network.

## GitHub Developers (undirected)

Description

A large social network of GitHub developers which was collected from the public API in June 2019. Nodes are developers who have starred at least 10 repositories and edges are mutual follower relationships between them. The vertex features are extracted based on the location, repositories starred, employer and e-mail address. The task related to the graph is binary node classification - one has to predict whether the GitHub user is a web or a machine learning developer. This target feature was derived from the job title of each user.

Properties

- Nodes: 37,700
- Edges: 289,003
- Transitivity: 0.013


## Europe Electric Grid

Details at:
https://zenodo.org/record/47317#.Xt6nzy3MxTY



In [None]:
## read the GitHub edge list into an undirected graph
D = pd.read_csv(datadir+'GitHubDeveloppers/musae_git_edges.csv')
tuples = [tuple(x) for x in D.values]
gh = ig.Graph.TupleList(tuples, directed = False)

## add some node features, here there are
## 2 classes of nodes, 0: web developer (grey), 1: ml developer (black)
X = pd.read_csv(datadir+'GitHubDeveloppers/musae_git_target.csv')

## 'Rank' is the position of each developer id in gh.vs, so that sorting
## on it puts the feature rows in the same order as the graph vertices
idx = [int(i) for i in gh.vs['name']]
sorterIndex = dict(zip(idx,range(len(idx))))
X['Rank'] = X['id'].map(sorterIndex)

## only discard the developers that are absent from the graph (no rank);
## a plain dropna() would also discard a vertex as soon as any unrelated
## column holds a missing value, shifting every attribute assigned below
X = X.dropna(subset=['Rank']).sort_values('Rank')
assert len(X) == gh.vcount(), 'node features do not line up with the graph vertices'

cls = ['grey','black']
lbl = ['web','ml']
target = list(X['ml_target'])
gh.vs['target'] = target
gh.vs['color'] = [cls[i] for i in target]
gh.vs['lbl'] = [lbl[i] for i in target]
gh.es['color'] = 'grey'

In [None]:
## read edge list for the grid network
gr = ig.Graph.Read_Ncol(datadir+'GridEurope/gridkit_europe-highvoltage.edges', directed=False)
gr = gr.simplify()

## read the vertices along with some attributes
X = pd.read_csv(datadir+'GridEurope/gridkit_europe-highvoltage.vertices')

## 'Rank' is the position of each vertex id in gr.vs, so that sorting
## on it puts the attribute rows in the same order as the graph vertices
idx = [int(i) for i in gr.vs['name']]
sorterIndex = dict(zip(idx,range(len(idx))))
X['Rank'] = X['v_id'].map(sorterIndex)

## only discard the vertices that are absent from the graph (no rank);
## a plain dropna() would also discard a vertex as soon as any unrelated
## column holds a missing value, shifting every attribute assigned below
X = X.dropna(subset=['Rank']).sort_values('Rank')
assert len(X) == gr.vcount(), 'vertex attributes do not line up with the graph vertices'

gr.vs['longitude'] = list(X['lon'])
gr.vs['latitude'] = list(X['lat'])
gr.vs['type'] = list(X['typ'])
## (longitude, latitude) pairs, used later as plotting coordinates
gr.vs['layout'] = list(zip(gr.vs['longitude'], gr.vs['latitude']))
gr.vs['size'] = 3
gr.es['color'] = 'grey'
gr.vs['color'] = 'black'

In [None]:
## for github, 9739 are ml developers; build one induced subgraph per
## developer type from the indices of the vertices carrying that label
ml_nodes = [v.index for v in gh.vs if v['lbl'] == 'ml']
web_nodes = [v.index for v in gh.vs if v['lbl'] == 'web']
gh_ml = gh.subgraph(ml_nodes)
gh_web = gh.subgraph(web_nodes)

In [None]:
## basic stats in a table
def _basic_stats(name, g):
    """Return one row of summary statistics for graph `g`, labelled `name`.

    The row holds, in this order: the label, the number of nodes, the number
    of edges, the minimum, mean, median, 99th percentile and maximum degree,
    the diameter, the number of connected components, the size of the
    largest component and the number of isolated (degree 0) nodes.
    """
    deg = g.degree()
    ## components are computed once and reused for both component statistics
    cc = g.clusters()
    return [name, g.vcount(), g.ecount(), np.min(deg), np.mean(deg), np.median(deg),
            np.quantile(deg,.99), np.max(deg), g.diameter(), len(cc),
            cc.giant().vcount(), deg.count(0)]

S = [_basic_stats(name, g) for name, g in
     [('GitHub',gh), ('GitHub (ml)',gh_ml), ('GitHub (web)',gh_web), ('Grid',gr)]]


In [None]:
## one row per graph; the degree statistics get LaTeX-style column labels
stat_names = ['graph', 'nodes', 'edges',
              r'$d_{min}$', r'$d_{mean}$', r'$d_{median}$', r'$d_{quant_{99}}$', r'$d_{max}$',
              'diameter', 'components', 'largest', 'isolates']
D = pd.DataFrame(S, columns=stat_names)

In [None]:
## swap rows and columns so that each graph becomes a column, then display
## NOTE: running this cell a second time swaps the table back
D = D.T
D

In [None]:
## To produce LaTeX from a DataFrame
#df = D.round(decimals=3)
#print(df.to_latex(index=True))

In [None]:
## count ml with connection to web only (and web with connection to ml only)
labels = gh.vs['lbl']
c_ml = c_web = 0
for v in gh.vs:
    ## distinct labels found among the neighbours; empty for an isolated node,
    ## which therefore is never counted
    nbr_labels = {labels[i] for i in gh.neighbors(v.index)}
    if labels[v.index] == 'ml':
        if nbr_labels == {'web'}:
            c_ml += 1
    elif nbr_labels == {'ml'}:
        c_web += 1
print(c_ml,'ml connected only to web and',c_web,'web connected only to ml')

In [None]:
## giant component of the Grid graph
sg = gr.clusters().giant()

## degree exponent: second item of the result of the power-law fit
degrees = sg.degree()
fit = plfit.plfit(degrees)
alpha = fit.plfit()[1]
print(alpha)

## transitivity
## NOTE(review): this is the global transitivity, whereas the GitHub cells
## below print the average local one -- confirm the difference is intended
transitivity = sg.transitivity_undirected()
print(transitivity)

In [None]:
## subgraph of Grid -- Iberian peninsula, selected with a bounding box on
## latitude (36, 44) and longitude (-10, 4), bounds excluded
iberia = [v for v in gr.vs if 36 < v['latitude'] < 44 and -10 < v['longitude'] < 4]
gr_spain = gr.subgraph(iberia)
print(gr_spain.vcount())

## plot with the (longitude, latitude) pairs as coordinates; the second
## coordinate is mirrored, presumably because the plot's vertical axis
## points downwards
ly = ig.Layout(gr_spain.vs['layout'])
ly.mirror(1)
ig.plot(gr_spain, 'grid_sg.eps', layout=ly, bbox=(0,0,300,300))

In [None]:
## giant component of github (ml), with the coordinates of an automatic
## layout stored as the vertex attributes 'x' and 'y'
sg = gh_ml.clusters().giant()
ly = sg.layout_auto()
sg.vs['x'] = [pt[0] for pt in ly]
sg.vs['y'] = [pt[1] for pt in ly]

## degree exponent: second item of the result of the power-law fit
degrees = sg.degree()
fit = plfit.plfit(degrees)
alpha = fit.plfit()[1]
print(alpha)

## transitivity (average of the local values)
transitivity = sg.transitivity_avglocal_undirected()
print(transitivity)


In [None]:
## keep the vertices whose stored coordinates fall strictly inside the
## square (-z, z) x (-z, z), then plot the giant component of what is left
z = 52
inside = [v for v in sg.vs if -z < v['x'] < z and -z < v['y'] < z]
ssg = sg.subgraph(inside).clusters().giant()
ssg.vs['size'] = 3
print(ssg.vcount())
ig.plot(ssg, 'github_ml_sg.eps', bbox=(0,0,300,300))

In [None]:
## giant component of github (web), with the coordinates of an automatic
## layout stored as the vertex attributes 'x' and 'y'
sg = gh_web.clusters().giant()
ly = sg.layout_auto()
sg.vs['x'] = [pt[0] for pt in ly]
sg.vs['y'] = [pt[1] for pt in ly]

## degree exponent: second item of the result of the power-law fit
degrees = sg.degree()
fit = plfit.plfit(degrees)
alpha = fit.plfit()[1]
print(alpha)

## transitivity (average of the local values)
transitivity = sg.transitivity_avglocal_undirected()
print(transitivity)

In [None]:
## keep the vertices whose stored coordinates fall strictly inside the
## square (-z, z) x (-z, z), then plot the giant component of what is left
z = 66
inside = [v for v in sg.vs if -z < v['x'] < z and -z < v['y'] < z]
ssg = sg.subgraph(inside).clusters().giant()
ssg.vs['size'] = 3
ssg.vs['color'] = 'black'
print(ssg.vcount())
ig.plot(ssg, 'github_web_sg.eps', bbox=(0,0,300,300))

In [None]:
## degree distribution - GitHub graph
deg = gh.degree()
e = ecdf(deg)
## evaluate the empirical cdf at every integer degree from 1 up to and
## including the maximum degree (the end of np.arange is exclusive), so
## that the curve reaches 1 at its right end
x = np.arange(1, max(deg)+1)
y = e(x)
plt.semilogx(x,y,'-',color='black',label='GitHub')
plt.xlabel('degree',fontsize=14)
plt.ylabel('empirical cdf',fontsize=14);
#plt.savefig('ecdf_gh.eps');


In [None]:
## degree distribution - Grid graph
## empirical cdf of the degrees, evaluated at the integers 1, 2, ..., 29
e = ecdf(gr.degree())
x = np.arange(1, 30)
y = e(x)
plt.semilogx(x, y, '-', color='black', label='Grid')
plt.xlabel('degree', fontsize=14)
plt.ylabel('empirical cdf', fontsize=14);
#plt.savefig('ecdf_gr.eps');


In [None]:
## shortest path lengths from a random sample of 100 nodes, GitHub graph
rng = np.random.default_rng(42)  ## fixed seed, so that the sample is reproducible
V = rng.choice(gh.vcount(),size=100,replace=False)
sp = []
for v in V:
    sp.extend(gh.shortest_paths(source=int(v))[0])

## igraph reports unreachable nodes with an infinite distance; leave those
## out, as they cannot be placed on the path length axis
c = Counter(d for d in sp if np.isfinite(d))
s = sorted(c.items())

fig, ax = plt.subplots()
x = [length for length, _ in s]
y = [count for _, count in s]
ax.bar(x, y, color='darkgrey')

ax.set_yscale('log')

ax.set_xlabel('path length',fontsize=14)
ax.set_ylabel('volume (log scale)',fontsize=14);
#plt.savefig('pathlen_github.eps');


In [None]:
## shortest path lengths from a random sample of 100 nodes, Grid network
rng = np.random.default_rng(42)  ## fixed seed, so that the sample is reproducible
V = rng.choice(gr.vcount(),size=100,replace=False)
sp = []
for v in V:
    sp.extend(gr.shortest_paths(source=int(v))[0])

## igraph reports unreachable nodes with an infinite distance; leave those
## out, as they cannot be placed on the path length axis
c = Counter(d for d in sp if np.isfinite(d))
s = sorted(c.items())


fig, ax = plt.subplots()
x = [length for length, _ in s]
y = [count for _, count in s]
ax.bar(x, y, color='darkgrey')

ax.set_xlabel('path length',fontsize=14)
ax.set_ylabel('volume',fontsize=14);
#plt.savefig('pathlen_grid.eps');

