In [None]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import scipy
import math

In [None]:
# report the versions of the main libraries used in this notebook
for lib_name, lib in (("networkx", nx), ("numpy", np), ("pandas", pd), ("scipy", scipy)):
    # pad the name to 8 characters so that the "=" signs line up
    print(f"{lib_name:<8} = {lib.__version__}")

# Representing Networks 

## Python package and documentation
We will use the `networkx` package. Useful resources:
- installation instructions - https://networkx.org/documentation/stable/install.html
- quick tutorial - https://networkx.org/documentation/stable/tutorial.html
- examples - https://networkx.org/documentation/stable/auto_examples/index.html

## Examples
* $N=\{1,\dots,n\}$ nodes, vertices, agents, actors, players, ...
* edges, links, ties: connections between nodes
    - They may have intensity (weighted)
        - How many hours do two people spend together per week?
        - How much of one country's GDP is traded with another?
    - They may just be 0 or 1 (unweighted)
        - Have two researchers written an article together?
        - Are two people "friends" on some social platform?
    - They may be "undirected" or "directed"
        - coauthors, friends,..., relatives, spouses, ...., are mutual relationships
        - link from one web page to another, citations, following on social media..., one way

## Notation and definitions

- $N=\{1,\dots,n\}$ - nodes, vertices, players
- $g \in \{0,1\}^{n \times n}$ adjacency matrix (unweighted, possibly directed)
- $g_{ij} = 1$ indicates a link, tie, or edge between $i$ and $j$
- Alternative notation: $ij \in g$ a link between $i$ and $j$
- Network is $(N,g)$

## Undirected Graphs

#### Adjacency matrix
$g_{ij}=1$ iff $i$ and $j$ are linked. Links are undirected, so the matrix is symmetric.

In [None]:
# define symmetric adjacency matrix: list each undirected link once
# (upper triangle) and mirror it, which makes the result symmetric by construction
upper_triangle = np.array([
    [0, 1, 0, 1],
    [0, 0, 0, 1],
    [0, 0, 0, 1],
    [0, 0, 0, 0]
])
g_matrix = upper_triangle + upper_triangle.T
g_matrix

In [None]:
# build a graph from the adjacency matrix (default graph type) and plot it with node labels
g = nx.from_numpy_array(g_matrix,)
nx.draw(g, with_labels=True)

#### List of the links

In [None]:
# the same undirected network as the symmetric matrix above, written as a list of (node, node) links
g_edges = [(0, 1), (0, 3), (1, 3), (2, 3)]
g_edges

In [None]:
# build the graph from the list of links and plot it with node labels
g = nx.from_edgelist(g_edges)
nx.draw(g, with_labels=True)

In [None]:
# show the graph as a node list, an edge list and an adjacency matrix
print("nodes", g.nodes, sep="\n")
print("\nedge list", nx.to_edgelist(g), sep="\n")
print("\nadjacency matrix", nx.to_numpy_array(g), sep="\n")

### What will happen if matrix is not symmetric?

In [None]:
# a matrix that is NOT symmetric (e.g. entry [0][1] is 1 but [1][0] is 0)
g_matrix = np.array([
    [0, 1, 0, 1],
    [0, 0, 0, 1],
    [0, 0, 0, 0], 
    [1, 0, 1, 0]
])
# no create_using argument, i.e. the same call that built the undirected graph earlier
g = nx.from_numpy_array(g_matrix,)
# convert back to a matrix to compare the result with the input
print(nx.to_numpy_array(g))

## Directed Graphs

#### Adjacency matrix
$g_{ij}=1$ iff there is a link from $i$ to $j$. Links are directed, so the matrix need not be symmetric.

In [None]:
# adjacency matrix of a directed network: row i, column j holds the link from i to j
dg_matrix = np.array([
    [0, 1, 0, 1],
    [0, 0, 0, 1],
    [0, 0, 0, 0], 
    [1, 0, 1, 0]
])
# create_using=nx.DiGraph asks for a directed graph
dg = nx.from_numpy_array(dg_matrix, create_using=nx.DiGraph)
nx.draw(dg, with_labels=True)

#### List of the links

In [None]:
# directed graph built straight from its (source, target) links
dg = nx.DiGraph([(0, 1), (0, 3), (1, 3), (3, 0), (3, 2)])
nx.draw(dg, with_labels=True)

In [None]:
# show the directed graph as a node list, an edge list and an adjacency matrix
print("nodes", dg.nodes, sep="\n")
print("\nedge list", nx.to_edgelist(dg), sep="\n")
print("\nadjacency matrix", nx.to_numpy_array(dg), sep="\n")

## Weighted Directed Network

### Row stochastic example

In [None]:
# weighted adjacency matrix; every row sums to 1 (row stochastic)
dgrs_matrix = np.array([
    [1/3, 1/3, 1/3],
    [1/2, 1/2, 0],
    [0, 1/4, 3/4]
])
dgrs_matrix

In [None]:
# directed graph from the row-stochastic matrix; the edge list shows the data stored on each edge
dgrs = nx.from_numpy_array(dgrs_matrix,  create_using=nx.DiGraph)
nx.to_edgelist(dgrs)

In [None]:
# place the nodes on a circle
pos = nx.circular_layout(dgrs)

# nodes and their labels
nx.draw_networkx_nodes(dgrs, pos)
nx.draw_networkx_labels(dgrs, pos, font_size=10)
# edges drawn as arcs, see connection style documentation for more options
nx.draw_networkx_edges(
    dgrs,
    pos,
    edgelist=nx.to_edgelist(dgrs),
    width=1,
    connectionstyle="arc3,rad=-0.2",
)

# edge weights as text, keyed by (source, target)
edge_labels = {
    edge: f"{weight:.2g}"
    for edge, weight in nx.get_edge_attributes(dgrs, "weight").items()
}
# three label groups: self-loops, small -> big node id, big -> small node id
edge_labels_loop = {(u, v): text for (u, v), text in edge_labels.items() if u == v}
edge_labels_s2b = {(u, v): text for (u, v), text in edge_labels.items() if u < v}
edge_labels_b2s = {(u, v): text for (u, v), text in edge_labels.items() if u > v}

# each group is drawn against a manually shifted copy of the layout
label_groups = [
    ((0, -0.35), edge_labels_s2b),
    ((0, 0.35), edge_labels_b2s),
    ((0.11, 0.23), edge_labels_loop),
]
for (dx, dy), labels in label_groups:
    shifted_pos = {node: (xy[0] + dx, xy[1] + dy) for node, xy in pos.items()}
    nx.draw_networkx_edge_labels(dgrs, shifted_pos, labels)

# show plot without the surrounding frame
plt.box(False)
plt.show()


### Classical weight example

In [None]:
# weighted adjacency matrix with arbitrary (non-normalised) weights; 0 means no link
dgw_matrix = np.array([
    [0, 7, 2],
    [5, 0, 0],
    [0, 4, 0]
])
dgw_matrix

In [None]:
# directed graph from the weight matrix; the edge list shows the data stored on each edge
dgw = nx.from_numpy_array(dgw_matrix,  create_using=nx.DiGraph)
nx.to_edgelist(dgw)

In [None]:
# place the nodes on a circle
pos = nx.circular_layout(dgw)

# nodes and their labels
nx.draw_networkx_nodes(dgw, pos)
nx.draw_networkx_labels(dgw, pos, font_size=10)
# edges drawn as arcs, see connection style documentation for more options
nx.draw_networkx_edges(
    dgw,
    pos,
    edgelist=nx.to_edgelist(dgw),
    width=1,
    connectionstyle="arc3,rad=-0.2",
)

# edge weights as text, keyed by (source, target)
edge_labels = {
    edge: f"{weight:.2g}"
    for edge, weight in nx.get_edge_attributes(dgw, "weight").items()
}
# three label groups: self-loops, small -> big node id, big -> small node id
edge_labels_loop = {(u, v): text for (u, v), text in edge_labels.items() if u == v}
edge_labels_s2b = {(u, v): text for (u, v), text in edge_labels.items() if u < v}
edge_labels_b2s = {(u, v): text for (u, v), text in edge_labels.items() if u > v}

# each group is drawn against a manually shifted copy of the layout
label_groups = [
    ((0, -0.35), edge_labels_s2b),
    ((0, 0.35), edge_labels_b2s),
    ((0.11, 0.23), edge_labels_loop),
]
for (dx, dy), labels in label_groups:
    shifted_pos = {node: (xy[0] + dx, xy[1] + dy) for node, xy in pos.items()}
    nx.draw_networkx_edge_labels(dgw, shifted_pos, labels)

# show plot without the surrounding frame
plt.box(False)
plt.show()

## Walk, Path, Cycle

- Walk from $i_1$ to $i_K$: a sequence of nodes $(i_1,i_2,\dots, i_K)$ and sequence of links $(i_1i_2,i_2i_3,\dots,i_{K-1}i_K)$ such that
$i_{k-1}i_k \in g$ for each $k = 2,\dots,K$. Convenient to represent it as the corresponding
sequence of nodes $(i_1,i_2,\dots, i_K)$ such that $i_{k-1}i_k \in g$
for each $k = 2,\dots,K$.
- Path: a walk $(i_1,i_2,\dots, i_K)$ with each node $i_k$ distinct
- Cycle: a walk where $i_1 = i_K$
- Geodesic: a shortest path between two nodes

#### Illustrative network

In [None]:
# illustrative undirected network; every link stores its length in the "distance" attribute
graph = nx.Graph()
graph.add_weighted_edges_from(
    [
        (1, 2, 10),
        (1, 3, 30),
        (2, 3, 10),
        (3, 4, 10),
        (3, 5, 10),
        (4, 5, 10),
        (3, 7, 10),
        (7, 6, 10),
        (5, 6, 10),
    ],
    weight="distance",
)

In [None]:
pos = nx.spring_layout(graph, seed=7)  # positions for all nodes - seed for reproducibility

# draw nodes, node labels and edges of the illustrative network
nx.draw_networkx_nodes(graph, pos)
nx.draw_networkx_labels(graph, pos, font_size=10)
nx.draw_networkx_edges(graph, pos, edgelist=nx.to_edgelist(graph), width=2)
# hide the frame around the plot
plt.box(False)
plt.show()

#### Path and walk from 1 to 7
$(1, 2, 3, 4, 5, 6, 7)$

In [None]:
# the path as a node sequence; consecutive pairs are the links to highlight
path_nodes = [1, 2, 3, 4, 5, 6, 7]
path_17 = list(zip(path_nodes, path_nodes[1:]))

# whole network first, then the selected links on top in red
nx.draw_networkx_nodes(graph, pos)
nx.draw_networkx_labels(graph, pos, font_size=10)
nx.draw_networkx_edges(graph, pos, edgelist=nx.to_edgelist(graph), width=2)
nx.draw_networkx_edges(graph, pos, edgelist=path_17, width=2, edge_color="r")
plt.box(False)
plt.show()

#### Walk from 1 to 7, which is not a path
$(1, 2, 3, 4, 5, 3, 7)$

In [None]:
# the walk as a node sequence (node 3 is visited twice); consecutive pairs are the links
walk_nodes = [1, 2, 3, 4, 5, 3, 7]
walk_17 = list(zip(walk_nodes, walk_nodes[1:]))

# whole network first, then the selected links on top in red
nx.draw_networkx_nodes(graph, pos)
nx.draw_networkx_labels(graph, pos, font_size=10)
nx.draw_networkx_edges(graph, pos, edgelist=nx.to_edgelist(graph), width=2)
nx.draw_networkx_edges(graph, pos, edgelist=walk_17, width=2, edge_color="r")
plt.box(False)
plt.show()

#### Simple Cycle (and a walk) from 1 to 1: 
$(1, 2, 3, 1)$

In [None]:
# the cycle as a node sequence that starts and ends at node 1
cycle_nodes = [1, 2, 3, 1]
walk_seq = list(zip(cycle_nodes, cycle_nodes[1:]))

# whole network first, then the selected links on top in red
nx.draw_networkx_nodes(graph, pos)
nx.draw_networkx_labels(graph, pos, font_size=10)
nx.draw_networkx_edges(graph, pos, edgelist=nx.to_edgelist(graph), width=2)
nx.draw_networkx_edges(graph, pos, edgelist=walk_seq, width=2, edge_color="r")
plt.box(False)
plt.show()

In [None]:
# a cycle from 1 to 1 that passes through node 3 twice
cycle_nodes = [1, 2, 3, 4, 5, 3, 1]
walk_seq = list(zip(cycle_nodes, cycle_nodes[1:]))

# whole network first, then the selected links on top in red
nx.draw_networkx_nodes(graph, pos)
nx.draw_networkx_labels(graph, pos, font_size=10)
nx.draw_networkx_edges(graph, pos, edgelist=nx.to_edgelist(graph), width=2)
nx.draw_networkx_edges(graph, pos, edgelist=walk_seq, width=2, edge_color="r")
plt.box(False)
plt.show()

#### Python code examples for paths and cycle analysis

In [None]:
# all simple paths (no repeated node) from 1 to 7, each printed as a sequence of nodes
for x in nx.all_simple_paths(graph, 1, 7):
    print(x)

In [None]:
# the same paths from 1 to 7, each printed as a sequence of edges
for x in nx.all_simple_edge_paths(graph, 1, 7):
    print(x)

In [None]:
# all simple paths from 1 to 7, not just the shortest one;
# per the networkx docs they are generated starting from the shortest (here: fewest links)
for x in nx.shortest_simple_paths(graph, 1, 7):
    print(x)

In [None]:
# same listing, but the 'distance' edge attribute is passed as the weight used to measure path length
for x in nx.shortest_simple_paths(graph, 1, 7, "distance"):
    print(x)

In [None]:
# get a basis of the cycles of the illustrative network
nx.cycle_basis(graph)

## Component
- $(N,g)$ is connected if there is a path between every two nodes
- Component: maximal connected subgraph
    - $(N',g')$ is a subnetwork of $(N,g)$, i.e. $N' \subseteq N$ and $g' \subseteq g$
    - $(N',g')$ is connected
    - $i \in N'$ and $ij \in g$ implies $j \in N'$ and $ij \in g'$

In [None]:
# network with 4 components: {1, 3, 4, 5}, {6, 10}, {7, 8, 9} and the isolated node 2
graph_comp = nx.from_edgelist([(1,5), (3, 4), (3, 5), (4, 5), (6, 10), (7, 8), (7, 9), (8, 9)])
# node 2 has no links, so it must be added explicitly
graph_comp.add_node(2)
nx.draw(graph_comp, with_labels=True)

In [None]:
# count the components, then list the node set of each one
print("Number of components", nx.number_connected_components(graph_comp), sep="\n")
print("\nComponents", list(nx.connected_components(graph_comp)), sep="\n")

In [None]:
# get the component that contains a specific node (here node 4)
nx.node_connected_component(graph_comp, 4)

# Homework

Load the schedules.csv file (Blagoevgrad transport scheme) and create a network from it using the networkx package.

# Measuring Networks: Summary Statistics and Characteristics 

- Global patterns of networks
    - degree distributions
    - path lengths
    - ...
- Segregation Patterns
    - node types and homophily
- Local Patterns
    - Clustering
    - Transitivity
    - Support
    - ...
- Positions in networks
    - Neighborhoods
    - Centrality
    - Influence
    - ...

## Diameter and average path length

Questions of interest:
- How close are nodes to each other?
- How long does it take to reach an average node?
- How fast will information spread?
- How does it depend on network density?

Definitions:
- Diameter – largest geodesic (largest shortest path). If unconnected, of largest component.
- Average path length (less prone to outliers)

In [None]:
# diameter and average path length of the illustrative network
print("Diameter", nx.diameter(graph), sep="\n")
print("\nAverage path length", nx.average_shortest_path_length(graph), sep="\n")

#### Size of the network vs diameter

In [None]:
# balanced tree built with arguments 2 and 4 (presumably branching factor and height - see the networkx docs)
graph_btree = nx.balanced_tree(2, 4)
nx.draw(graph_btree)

In [None]:
# size, diameter and average path length of the tree
print("Number of nodes", graph_btree.number_of_nodes(), sep="\n")
print("\nDiameter", nx.diameter(graph_btree), sep="\n")
print("\nAverage path length", nx.average_shortest_path_length(graph_btree), sep="\n")

In [None]:
# circulant graph with 31 nodes and the single offset 1 (presumably a ring - see the networkx docs), drawn on a circle
graph_circle = nx.circulant_graph(31, [1])
nx.draw_circular(graph_circle)

In [None]:
# size, diameter and average path length of the circle, to compare with the tree
print("Number of nodes", graph_circle.number_of_nodes(), sep="\n")
print("\nDiameter", nx.diameter(graph_circle), sep="\n")
print("\nAverage path length", nx.average_shortest_path_length(graph_circle), sep="\n")

#### Small average path length and diameter
- Milgram (1967) letter experiments - https://en.wikipedia.org/wiki/Small-world_experiment
    - median 5 for the 25% that made it
- Co‐Authorship studies - https://sites.google.com/oakland.edu/grossman/home/the-erdoes-number-project/research-on-collaboration-in-research
    - Grossman (2002) Math mean 7.6, max 27,
    - Newman (2001) Physics mean 5.9, max 20
    - Goyal et al (2004) Economics mean 9.5, max 29
- WWW
    - Adamic, Pitkow (1999) – mean 3.1 (85.4% possible of 50M pages) - https://link.springer.com/chapter/10.1007/3-540-48155-9_27
- Facebook
    - Backstrom et al (2012) – mean 4.74 (721 million users) - https://arxiv.org/abs/1111.4570

## Neighborhood and degree
- Neighborhood: $N_i(g) = \{ j | ij \in g \}$ (usual convention $ii$ not in $g$ )
- Degree: $d_i = \# N_i(g)$

In [None]:
# neighbourhoods and degrees of the SAME network (the illustrative `graph`);
# the degrees were previously taken from `graph_comp`, a different network
print("Neighborhood per nodes")
print(list(graph.adjacency()))
print("\nDegrees per nodes")
print(graph.degree())

### Average Degree vs Degree distribution

In [None]:
def average_degree(graph):
    """Return the mean degree over all nodes of ``graph``.

    ``graph.degree()`` is expected to yield ``(node, degree)`` pairs.
    """
    degrees = [node_degree for _, node_degree in graph.degree()]
    return np.mean(degrees)

def plot_degree_distribution(graph):
    """Plot a histogram of the node degrees of ``graph``.

    ``graph.degree()`` is expected to yield ``(node, degree)`` pairs.
    Degrees are integers, so one bin is used per integer value (centred on it)
    instead of matplotlib's default of 10 equal-width bins, and the axes are
    labelled so the figure can be read on its own. Returns ``None``.
    """
    degrees = [node_degree for _, node_degree in graph.degree()]
    if not degrees:
        # empty graph: keep the original behaviour (an empty histogram)
        plt.hist(degrees)
        return

    lowest, highest = min(degrees), max(degrees)
    # bin edges at k - 0.5, so that each bar sits on an integer degree k
    bin_edges = np.arange(lowest - 0.5, highest + 1.5, 1)
    plt.hist(degrees, bins=bin_edges, rwidth=0.8)
    plt.xticks(range(lowest, highest + 1))
    plt.xlabel("degree")
    plt.ylabel("number of nodes")

In [None]:
# chain of 9 nodes: 1 - 2 - ... - 9
graph_d1 = nx.from_edgelist([(node, node + 1) for node in range(1, 9)])
nx.draw(graph_d1)

In [None]:
# star: node 1 is linked to each of the nodes 2..9
graph_d2 = nx.from_edgelist([(1, leaf) for leaf in range(2, 10)])
nx.draw(graph_d2)

In [None]:
# both graphs have 9 nodes and 8 links, so compare their average degrees
print(f"average degree graph_d1 = {average_degree(graph_d1)}")
print(f"average degree graph_d2 = {average_degree(graph_d2)}")

In [None]:
# degree histogram of the chain graph
plot_degree_distribution(graph_d1)

In [None]:
# degree histogram of the star graph
plot_degree_distribution(graph_d2)

## Clustering

- What fraction of my friends are friends?
$$Cl_i(g) = \frac{\#\{ kj \in g | k, j \in N_i(g)\}}{\#\{ kj | k, j \in N_i(g)\}}$$
- Average clustering: $$Cl_{avg}(g) =\frac{1}{n}\sum_{i} Cl_{i}(g)$$
- Overall clustering:
$$Cl(g) =\frac{\sum_{i} \#\{ kj \in g | k, j \in N_i(g)\}}{\sum_i \#\{ kj | k, j \in N_i(g)\}}$$

### Differences between average and overall clustering

In [None]:
# aux graph generator to support the demo
def create_cluster_example(n):
    """Return a graph made of hub node 0 plus ``n`` triangles.

    Triangle ``i`` consists of the nodes ``3*i + 1``, ``3*i + 2`` and ``3*i + 3``;
    each of them is linked to the hub and to the other two nodes of its triangle.
    """
    graph = nx.Graph()
    graph.add_node(0)

    for i in range(n):
        first, second, third = 3 * i + 1, 3 * i + 2, 3 * i + 3
        graph.add_edges_from([
            # spokes from the hub
            (0, first), (0, second), (0, third),
            # the triangle itself
            (first, second), (second, third), (third, first),
        ])

    return graph

# overall clustering, computed by hand to mirror the formula above
# (networkx provides the same ratio as nx.transitivity)
def overall_clustering(graph):
    """Return the overall clustering coefficient of ``graph``.

    Numerator: for every node, the number of linked pairs among its neighbours,
    which is the number of triangles through that node (``nx.triangles``).
    Denominator: for every node with degree ``d``, the number of neighbour
    pairs ``d * (d - 1) / 2``.

    Returns 0 when no node has two or more neighbours (the denominator is 0),
    instead of hiding every possible error behind a bare ``except``.
    """
    triangles = nx.triangles(graph)
    possible_pairs = sum(d * (d - 1) / 2 for _, d in nx.degree(graph))
    if possible_pairs == 0:
        # no pair of neighbours anywhere, so the ratio is undefined; report 0 as before
        return 0
    return sum(triangles.values()) / possible_pairs

In [None]:
# example with 5 triangles attached to the hub node 0
graph_cluster = create_cluster_example(5)
nx.draw(graph_cluster, with_labels=True)

In [None]:
# compare average and overall clustering as the number of triangles around the hub grows
for n in range(1, 10):
    size = n * 5
    # use a separate name: rebinding `graph` here would silently replace the
    # illustrative network that earlier cells rely on when they are re-run
    cluster_graph = create_cluster_example(size)
    avg_cluster = nx.average_clustering(cluster_graph)
    ovr_cluster = overall_clustering(cluster_graph)
    print(f"# clusters = {size}\t avg. clustering {avg_cluster:.4f}\t overall clustering {ovr_cluster:.4f}")
    print()

## Centrality measures

Four different things to measure:
- Degree - connectedness
- Closeness - ease of reaching other nodes
- Betweenness - role as an intermediary, connector
- Influence, Prestige, Eigenvectors "not what you know, but who you know.."

In [None]:
# graph used for illustrations: triangles 1-2-3 and 5-6-7, joined through node 4
edges_list = [(1, 2), (1, 3), (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 7)]
graph_centrality = nx.from_edgelist(edges_list)
nx.draw(graph_centrality, with_labels=True)

### Degree Centrality
- How 'connected' is a node?
- degree captures connectedness
- normalize by $n-1$ - the maximum possible degree

In [None]:
# degree centrality of every node of the illustration graph
nx.degree_centrality(graph_centrality)

### Closeness Centrality

- relative distances to other nodes
$$C_i = \frac{(n-1)}{\sum_{j} l(i,j)}$$
- scales directly with distance – twice as far is half as central.

In [None]:
# closeness centrality of every node of the illustration graph
nx.closeness_centrality(graph_centrality)

### Betweenness Centrality


$$C_{k} = \frac{2}{(n-1)(n-2)}\sum_{i,j\neq k} \frac{P_k(i,j)}{P(i,j)}$$
where:

- $P(i,j)$ number of geodesics between $i$ and $j$
- $P_k(i,j)$ number of geodesics between $i$ and $j$ that $k$ lies on

In [None]:
# betweenness centrality of every node of the illustration graph
nx.betweenness_centrality(graph_centrality)

### Eigenvector Centrality

- Centrality is proportional to the sum of neighbors' centralities
$$C_i = a\sum_j g_{ij} C_j$$

In [None]:
# eigenvector centrality of every node of the illustration graph
nx.eigenvector_centrality(graph_centrality)

In [None]:
# keep the centralities (keyed by node) to check the defining equation numerically
r = nx.eigenvector_centrality(graph_centrality)

In [None]:
# node 4's neighbours are 3 and 5, so with C_i = a * sum_j g_ij C_j this ratio should be close to 1/a
(r[3] +r[5])/r[4]

In [None]:
# node 1's neighbours are 2 and 3; the ratio should be close to the one computed for node 4
(r[2] + r[3])/r[1]

#### Concepts related to eigenvector centrality:
- Google Page rank: score of a page is proportional to the sum of the scores of pages
linked to it
- Random surfer model: start at some page on the
web, randomly pick a link, follow it, repeat...

# Homework

- Load the 26KeroNetwork undirected graph and calculate the following statistics:
    - Average degree
    - Diameter
    - Maximum Clustering
    - Minimum Clustering


- Load the imports_manufactures directed graph and find out which country has:
    - maximum "Closeness centrality"
    - maximum "Betweenness centrality"


# EU Centrality measures example

In the EU countries graph, two EU countries are connected by an edge if they share a common land border.

In [None]:
# one (country, country) pair per shared border; each pair is listed once,
# the strings are the node labels of the graph built from this list
edges = [
    ("Austria", "Germany"),
    ("Austria", "Czech"),
    ("Austria", "Slovakia"),
    ("Austria", "Hungary"),
    ("Austria", "Slovenia"),
    ("Austria", "Italy"),
    ("Belgium", "Netherland"),
    ("Belgium", "Germany"),
    ("Belgium", "Luxemburg"),
    ("Belgium", "France"),
    ("Bulgaria", "Romania"),
    ("Bulgaria", "Greece"),
    ("Croatia", "Slovenia"),
    ("Croatia", "Hungary"),
    ("Czech", "Germany"),
    ("Czech", "Poland"),
    ("Czech", "Slovakia"),
    ("Denmark", "Germany"),
    ("Denmark", "Sweden"),
    ("Estonia", "Latvia"),
    ("Finland", "Sweden"),
    ("France", "Luxemburg"),
    ("France", "Italy"),
    ("France", "Spain"),
    ("France", "Germany"),
    ("Germany", "Poland"),
    ("Germany", "Netherland"),
    ("Germany", "Luxemburg"),
    ("Hungary", "Slovenia"),
    ("Hungary", "Slovakia"),
    ("Hungary", "Romania"),
    ("Italy", "Slovenia"),
    ("Latvia", "Litva"),
    ("Litva", "Poland"),
    ("Poland", "Slovakia"),
    ("Portugal", "Spain"),
]

In [None]:
# undirected graph of the border pairs, plotted with country names as labels
gr = nx.from_edgelist(edges)
nx.draw(gr, with_labels=True)

In [None]:
# the four centrality measures, each as a {country: value} dict
centrality_dict = nx.degree_centrality(gr)
closeness_dict = nx.closeness_centrality(gr)
betweenness_dict = nx.betweenness_centrality(gr)
eigenvector_dict = nx.eigenvector_centrality(gr)

# the same values as Series indexed by country
centrality_series = pd.Series(centrality_dict)
closeness_series = pd.Series(closeness_dict)
betweenness_series = pd.Series(betweenness_dict)
eigenvector_series = pd.Series(eigenvector_dict)

# one table: a row per country, a column per measure
centralities_df = pd.DataFrame({
    "Degree Centrality": centrality_series,
    "Closeness Centrality": closeness_series,
    "Betweenness Centrality": betweenness_series,
    "Eigenvector Centrality": eigenvector_series,
})
# most central countries (by eigenvector centrality) first
centralities_df.sort_values(by="Eigenvector Centrality", ascending=False)
