In [None]:
## path to the datasets
## file names are appended to this string with `+` further down, so the
## trailing slash is required
datadir='./Datasets/'

## required packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import igraph as ig
import partition_igraph
from collections import Counter
from sklearn.metrics import adjusted_rand_score as ARI
from sklearn.metrics import adjusted_mutual_info_score as AMI
import random


# Part 2 - Clustering

First, let's build the airport graph as in Part 1.

In [None]:
## airport table; the columns used below are 'airport' (matched against the
## vertex names of the graph), 'lon', 'lat', 'state' and 'city'
airport_df = pd.read_csv(datadir + 'Airports/airports_loc.csv')

## edge table; every row becomes one tuple whose first two fields are the
## endpoints and whose next field is stored as the edge attribute 'weight'
df_edges = pd.read_csv(datadir + 'Airports/connections.csv')

## directed graph
tuple_list = [tuple(x) for x in df_edges.values]
g = ig.Graph.TupleList(tuple_list, directed=True, edge_attrs=['weight'])

## add vertex attributes
## `lookup` maps an airport code to its row *position* in airport_df, so the
## matching rows are fetched positionally (.iloc), in vertex order
lookup = {k:v for v,k in enumerate(airport_df['airport'])}
l = [lookup[x] for x in g.vs['name']]
rows = airport_df.iloc[l]
## layout is (longitude, -latitude): the latitude is stored negated
g.vs['layout'] = list(zip(rows['lon'], -rows['lat']))
g.vs['state'] = rows['state'].tolist()
g.vs['city'] = rows['city'].tolist()
g.vs['color'] = 'lightblue'

## undirected graph w/o loops; attributes of merged edges are summed
g_und = g.as_undirected(combine_edges=sum)
g_und = g_und.simplify(combine_edges=sum)


## 2.1 Triangles and Transitivity

Edges represent relations between entities (nodes) in graphs/networks.

The next step is to consider **triads** of nodes, i.e. sets of three nodes.

Fully connected triads form **triangles**

The presence of triangles is indicative of communities (dense subgraph(s) in a graph): ''the friend of my friend is my friend''.

Two fundamental measures of the presence of triangles in graphs are:

* **transitivity** (global clustering coefficient) measures the proportion of wedges (two-hop paths in an undirected graph) that form a triangle
* **local transitivity** (local clustering coefficient) for a node is the proportion of pairs of neighbours that form a triangle
  * nodes of degree less than 2 are either ignored or given value 0
  * **average local transitivity** is obtained by averaging local transitivity over all nodes

Those measures assume **undirected** graphs.

While we can define triangles and other motifs for directed graphs, clustering generally assumes undirected graphs.

We compute the above for the airport graph, as we look for nodes with high/low local transitivity.

In [None]:
## mode='zero': nodes with degree < 2 get 0 local transitivity
global_trans = g_und.transitivity_undirected()
avg_local_trans = g_und.transitivity_avglocal_undirected(mode='zero')
print('trans:', global_trans)
print('avg local trans:', avg_local_trans)

## store the per-node values as a vertex attribute and print their mean too
g_und.vs['trans'] = g_und.transitivity_local_undirected(mode='zero')
print('avg local trans:', np.mean(g_und.vs['trans']))

High transitivity example

In [None]:
## index of the first node attaining the highest local transitivity;
## np.argmax returns a NumPy integer, so convert it to a plain int before
## using it as an igraph vertex id
v = int(np.argmax(g_und.vs['trans']))
print('airport:',g_und.vs[v]['name'],', transitivity',g_und.vs[v]['trans'],', plotting its ego-net:')

## ego-net: the subgraph induced by the vertices in v's neighborhood
sg = g_und.subgraph(g_und.neighborhood(v))
ig.plot(sg,bbox=(400,300), vertex_label=sg.vs['name'], vertex_size=15, layout=sg.vs['layout'],
            vertex_label_size=6, margin=50)
## we see that all neighbour pairs are linked by an edge

### directed graphs

For directed graphs, there are 16 different possibilities for each **triad**

The **triad census** function counts all occurrences, and returns the counts in a specific ordering:
```
  - C{003} -- the empty graph
  - C{012} -- a graph with a single directed edge (C{A --> B, C})
  - C{102} -- a graph with a single mutual edge (C{A <-> B, C})
  - C{021D} -- the binary out-tree (C{A <-- B --> C})
  - C{021U} -- the binary in-tree (C{A --> B <-- C})
  - C{021C} -- the directed line (C{A --> B --> C})
  - C{111D} -- C{A <-> B <-- C}
  - C{111U} -- C{A <-> B --> C}
  - C{030T} -- C{A --> B <-- C, A --> C}
  - C{030C} -- C{A <-- B <-- C, A --> C}
  - C{201} -- C{A <-> B <-> C}
  - C{120D} -- C{A <-- B --> C, A <-> C}
  - C{120U} -- C{A --> B <-- C, A <-> C}
  - C{120C} -- C{A --> B --> C, A <-> C}
  - C{210C} -- C{A --> B <-> C, A <-> C}
  - C{300} -- the complete graph (C{A <-> B <-> C, A <-> C})
```

In [None]:
## triad census of the directed graph
tc = g.triad_census()

## pull out the counts of interest first, then report them
n_triads = np.sum(tc)
n_complete = tc.t300
n_cycles = tc.t030C
print('number of triads:', n_triads)
print('number of complete subgraphs (300):', n_complete)
print('number of 3-edge cycles (030C)', n_cycles)
print('complete list by type:', tuple(tc))


### Cliques

Triangles are also known as **3-cliques**

A **k-clique** is a fully connected subgraph with k nodes

The **clique number** is the size of the largest clique.


In [None]:
# Cliques in the Airport graph: count the cliques of exactly 3 and 4 nodes
for size, label in ((3, 'number of 3-cliques:'), (4, 'number of 4-cliques:')):
    print(label, len(g_und.cliques(min=size, max=size)))

# size of the largest clique
print('max clique size:', g_und.clique_number())

Print some of the airports in a maximum clique; these are all major hubs.

In [None]:
## ask the graph for its clique number instead of hard-coding the size, so
## the cell keeps listing a maximum clique if the data changes
max_cliques = g_und.cliques(min=g_und.clique_number())

## names of (up to) the first 10 airports in the first such clique
[g_und.vs[v]['name'] for v in max_cliques[0][:10]]

### Questions

#### 1. Find the node in the undirected airport graph with degree 5 or more having the lowest transitivity.


#### 2. Plot its ego-net, what do you observe?


#### 3. Compute its betweenness, compare with average betweenness for all nodes

### Possible Solutions

In [None]:
## node with low transitivity, degree at least 5
## build the list of eligible vertices once; x is the position (within that
## list) of the first vertex with the smallest local transitivity
candidates = [u for u in g_und.vs if g_und.degree(u)>=5]
x = np.argmin([u['trans'] for u in candidates])
v = candidates[x]
print('airport:',v['name'],', transitivity',v['trans'],', plotting its ego-net:')

## ego-net: the subgraph induced by the vertices in v's neighborhood
sg = g_und.subgraph(g_und.neighborhood(v))
ig.plot(sg,bbox=(400,300), vertex_label=sg.vs['name'], vertex_size=15,layout=sg.vs['layout'],
            vertex_label_size=6, vertex_color='lightblue', margin=50)

## We see that very few pairs of neighbours are connected


In [None]:
## compare its betweenness with the average node betweenness
## it has high betweenness ...
## with several neighbours not linked by an edge, this node is on several geodesics!
node_betweenness = g_und.betweenness(v)
mean_betweenness = np.mean(g_und.betweenness())
node_betweenness / mean_betweenness


In [None]:
## clean-up: drop the per-node transitivity attribute added earlier
del g_und.vs['trans']

## 2.2 Clustering

Graph clustering, a.k.a. **node partitioning**, is a very active research area, with dozens of algorithms.

Some good ones are:

* Louvain (multilevel): 
    * fast, but may return disconnected communities
    * unstable for graphs with homogeneous edge weights
* Leiden:
    * fast, connected communities
* ECG (ensemble clustering):
    * better stability for graphs with homogeneous edge weights

Measures of community strength include:
* modularity ("proportion of edges within communities" - "expected proportion under null model")
* comparing degree within and between communities


### Clusters in the airport graph

In [None]:
## Leiden algorithm is randomized - fix the seed to compare results
random.seed(31416)

## cluster on edge weights, optimizing modularity
cl = g_und.community_leiden(objective_function='modularity', weights='weight')

## remember each node's cluster id
g_und.vs['cl'] = cl.membership

## one palette entry per cluster id (ids run from 0 to the largest id),
## then color every node according to its cluster
n_colors = np.max(cl.membership)+1
pal = ig.ClusterColoringPalette(n=n_colors)
g_und.vs['color'] = list(map(pal.get, cl.membership))

In [None]:
## Compute the (weighted) modularity of the Leiden partition; modularity is <= 1
## and large positive values are indicative of community structure
leiden_membership = cl.membership
g_und.modularity(leiden_membership, weights='weight')

In [None]:
## we see 4 big clusters related to geographical locations
## the layout is read from g_und itself, i.e. from the graph being plotted
ig.plot(g_und, vertex_size=5, edge_color='grey', layout=g_und.vs['layout'], bbox=(500,400))

Size of each cluster found

In [None]:
## number of nodes in each cluster, in cluster-id order
[len(members) for members in cl]

Showing one (small) cluster

In [None]:
## NOTE(review): the cluster index 6 is hard-coded; which cluster it refers to
## presumably depends on the seeded Leiden run above -- confirm it is still a small one
sg = cl.subgraph(6)
ig.plot(
    sg,
    bbox=(400,300),
    vertex_label_size=8,
    vertex_label=sg.vs['name'],
    layout=sg.vs['layout'],
)

Collapse communities to show degree between and within communities

In [None]:
## every edge counts as 1, so summing 'label' over merged edges gives the
## number of edges between two communities
g_und.es['label'] = 1

## split the layout into its two coordinates so they can be averaged per
## community; layout is (longitude, -latitude), so 'lat' holds the negated latitude
g_und.vs['lon'] = [u['layout'][0] for u in g_und.vs]
g_und.vs['lat'] = [u['layout'][1] for u in g_und.vs]

## one node per community, placed at the mean position of its members
G = cl.cluster_graph(combine_vertices={'lon':np.mean,'lat':np.mean, 'color':'first'}, combine_edges={'label':sum})

## node label: twice the number of edges inside the community
## (i.e. the total within-community degree)
G.vs['label'] = [2*sub.ecount() for sub in cl.subgraphs()]
ly = [(u['lon'],u['lat']) for u in G.vs]
ig.plot(G, layout=ly, bbox=(450,350), vertex_label_size=8, edge_label_size=8)


In [None]:
## clean up: remove the temporary edge and vertex attributes
del g_und.es['label']
del g_und.vs['lat']
del g_und.vs['lon']

### Questions

#### 1. What are the most frequent states in each of the clusters we found?


#### 2. Compute the transitivity for each cluster and compare with the transitivity for the whole graph


### Possible Solutions

In [None]:
## Leiden clusters - most frequent states
## build the per-cluster subgraphs once and reuse them below
cluster_subgraphs = cl.subgraphs()
for sg in cluster_subgraphs:
    print('cluster:',sg.vcount(),'nodes, frequent states:',Counter(sg.vs['state']).most_common(5))

## Transitivity: whole graph first, then (size, transitivity) per community
print('\ntransitivity:',g_und.transitivity_undirected(mode='zero'))
print('\ntransitivity per community:')
[(sg.vcount(),sg.transitivity_undirected(mode='zero')) for sg in cluster_subgraphs]


In [None]:
## clean-up: drop the cluster-id vertex attribute
del g_und.vs['cl']


## 2.3 Random Graph Models

Random graph models are useful for various reasons:
* interpretation of results on real graphs (ex: is this value expected? high? low?)
* to compare algorithms (ex: clustering algorithms)
* to study theoretical properties

Usually, we fix some aspects of the graph, such as the **number of nodes**, and randomly sample.

There are many such models, including:
* Erdos-Renyi model: fix the number of nodes and randomly place edges
* Configuration model: as above, but given a degree distribution for the nodes
* ABCD model: power-law node degree distribution with ground-truth communities

Let's build an Erdos-Renyi (ER) graph with same number of nodes/edges as (undirected) airport graph

In [None]:
## random graph with the same number of nodes and edges as g_und
g_er = ig.Graph.Erdos_Renyi(n=g_und.vcount(), m=g_und.ecount())

## degree range of the random graph
er_degrees = g_er.degree()
print('min degree',np.min(er_degrees),'max degree',np.max(er_degrees))


Transitivity for the ER graph; recall we had 0.48/0.53 for the airport graph


In [None]:
## mode='zero' gives nodes of degree < 2 a local transitivity of 0 -- the same
## convention used for the airport graph, so the two averages are comparable
print('trans:',g_er.transitivity_undirected())
print('avg local trans:',g_er.transitivity_avglocal_undirected(mode='zero'))

Number of cliques in the ER graph and clique number

All values much smaller than for the airport graph!


In [None]:
## count the cliques of exactly 3 and 4 nodes in the ER graph
for size, label in ((3, 'number of 3-cliques:'), (4, 'number of 4-cliques:')):
    print(label, len(g_er.cliques(min=size, max=size)))

## size of the largest clique
print('max clique:', g_er.clique_number())

Modularity for Leiden communities; recall we had 0.23 for the airport graph

In [None]:
## Leiden on the (unweighted) ER graph, then the modularity of the partition found
er_cl = g_er.community_leiden(objective_function='modularity')
er_membership = er_cl.membership
g_er.modularity(er_membership)


# To go further

More topics can be found in:
* book: https://www.ryerson.ca/mining-complex-networks
* notebooks: https://github.com/ftheberge/GraphMiningNotebooks
    
including:   
* more centrality measures
* clustering: overlapping clusters, outliers    
* degree assortativity
* vertex and graph embedding
* hypergraphs
* network robustness
* road networks
