# Networks and their Structure Assignment

## Network Science Topic 3

Note that the networks in this exercise are all undirected.

We recall some definitions and introduce some new ones.

Let $d_{ij}$ be the distance (the length of the shortest path) between vertices $i$ to $j$.

Then the *closeness centrality* of vertex $j$ is    $\displaystyle{ \mbox{CC}(j)= \frac{1}{\sum_i d_{ij}}}$.

The *nearness centrality* of vertex $j$ is $\displaystyle{ \mbox{NC}(j)= \sum_i  \frac{1}{d_{ij}}}$.  In both these definitions, the sums are over all vertices $i$, $i \neq j$, in the network.

The *degree centrality* of vertex $j$ is simply its degree (the number of neighbours it has) and is denoted $\mbox{DC}(j)$.

The *adjacency centrality* of vertex $j$ is $ \mbox{AC}(j)=\displaystyle{ \frac{1}{d_j} \sum_i \frac{d_j - d_i}{d_j+d_i} }$ where the sum is over all vertices $i$ that are adjacent to $j$ and $d_i$ denotes the degree of a vertex.  (So $\mbox{DC}(j)$ and $d_j$ are different notations for the same measure.)

1. [5 marks]  Calculate the values of the four centrality measures defined above on each vertex in the network below.  (The diagram and the dictionary are two representations of the same network.)   Present your answer as four lists --- one for each centrality measure --- that gives the vertices and the calculated values ordered by those values.


```python
network = {1:  [4],
           2:  [4],
           3:  [4],
           4:  [1, 2, 3, 5, 6],
           5:  [4],
           6:  [4, 7, 8, 9, 10, 11],
           7:  [6, 8, 11],
           8:  [6, 7, 9, 11],
           9:  [6, 8, 10],
           10: [6, 9, 11, 12],
           11: [6, 7, 8, 10],
           12: [10]}
           
```

<img src="example.jpg" width="400">


In [64]:
import itertools
from typing import Dict, Set, Optional, Tuple, List, Generator, Hashable, Literal

Node = Vertex = Hashable
Network = Graph = Dict[Node, Set[Node]]

In [65]:
def bfs(
        graph_: Graph,
        start_node_: Node,
        visited_: Optional[Set[Node]] = None
) -> Generator[Tuple[Node, int], None, None]:
    if visited_ is None:
        visited_ = set()

    queue_: List[Tuple[Node, int]] = [(start_node_, 0)]

    while queue_:
        node_, distance_ = queue_.pop(0)
        visited_.add(node_)

        yield node_, distance_

        for neighbour_ in graph_[node_]:
            if neighbour_ not in visited_:
                queue_.append((neighbour_, distance_ + 1))


def distance(graph_: Graph, start_node_: Node, end_node_: Node) -> int:
    for node_, distance_ in bfs(graph_, start_node_):
        if node_ == end_node_:
            return distance_
    return -1


def distances(graph_: Graph, start_node_: Node) -> Dict[int, int]:
    return {node_: distance_ for node_, distance_ in bfs(graph_, start_node_)}


def all_distances(graph_: Graph) -> Dict[Tuple[Node, Node], int]:
    return {(start_node_, end_node_): distance_
            for start_node_ in sorted(list(graph_))
            for end_node_, distance_ in sorted(list(distances(graph_, start_node_).items()))}


def closeness_centralities(
        graph_: Graph,
        graph_distances_: Optional[Dict[Tuple[Node, Node], int]] = None
) -> Dict[Node, float]:
    if graph_distances_ is None:
        graph_distances_ = all_distances(graph_)
    return {
        node_: 1.0 / sum([
            graph_distances_[(node_, other_node_)]
            for other_node_ in graph_
            if node_ != other_node_
        ])
        for node_ in graph_
    }


def nearness_centralities(
        graph_: Graph,
        graph_distances_: Optional[Dict[Tuple[Node, Node], int]] = None
) -> Dict[Node, float]:
    if graph_distances_ is None:
        graph_distances_ = all_distances(graph_)
    return {
        node_: sum([
            1.0 / graph_distances_[(node_, other_node_)]
            for other_node_ in graph_
            if node_ != other_node_
        ])
        for node_ in graph_
    }


def degree_centralities(graph_: Graph) -> Dict[Node, int]:
    return {
        node_: len(graph_[node_])
        for node_ in graph_
    }


def adjacency_centralities(
        graph_: Graph,
        degree_centralities_: Optional[Dict[Node, int]] = None
) -> Dict[Node, float]:
    if degree_centralities_ is None:
        degree_centralities_ = degree_centralities(graph_)
    return {
        node_: (1.0 / degree_centralities_[node_]) * sum([
            (degree_centralities_[node_] - degree_centralities_[adjacent_node_]) /
            (degree_centralities_[node_] + degree_centralities_[adjacent_node_])
            for adjacent_node_ in graph_[node_]
        ])
        for node_ in graph_
    }

In [66]:
network: Network = {
    1: {4},
    2: {4},
    3: {4},
    4: {1, 2, 3, 5, 6},
    5: {4},
    6: {4, 7, 8, 9, 10, 11},
    7: {6, 8, 11},
    8: {6, 7, 9, 11},
    9: {6, 8, 10},
    10: {6, 9, 11, 12},
    11: {6, 7, 8, 10},
    12: {10}
}

# Compute the distance from every node to every other node in the network
d = all_distances(network)

# Compute the four centrality measure values for each vertex
ccs = closeness_centralities(network, d)
ncs = nearness_centralities(network, d)
dcs = degree_centralities(network)
acs = adjacency_centralities(network, dcs)

# Print the computed values in a table
print(
    "j\t|\tCC(j)\tNC(j)\tDC(j)\tAC(j)\n" +
    "-" * 4 + "|" + "-" * 35 + "\n" +
    "\n".join([
        f"{j}\t|\t"
        f"{ccs[j]:.4f}\t"
        f"{ncs[j]:.4f}\t"
        f"  {dcs[j]}\t\t"
        f"{acs[j]:.4f}"
        for j in network
    ])
)

j	|	CC(j)	NC(j)	DC(j)	AC(j)
----|-----------------------------------
1	|	0.0303	4.5333	  1		-0.6667
2	|	0.0303	4.5333	  1		-0.6667
3	|	0.0303	4.5333	  1		-0.6667
4	|	0.0435	7.0833	  5		0.5152
5	|	0.0303	4.5333	  1		-0.6667
6	|	0.0476	6.3333	  6		0.2263
7	|	0.0303	4.4167	  3		-0.2063
8	|	0.0333	5.1667	  4		0.0214
9	|	0.0333	5.1667	  3		-0.2063
10	|	0.0345	5.6667	  4		0.1357
11	|	0.0345	5.3333	  4		-0.0143
12	|	0.0256	3.9667	  1		-0.6000


2. [20 marks] Obtain the three datasets in topic3networks.zip (under Topic 3 on Learn Ultra, see the descriptions below).  Load these networks.  Again, they are all undirected.  We wish also to work with connected graphs so find the largest connected component of each and discard other vertices.

 For each dataset, for each of the four centrality measures, list, in order, the 20 vertices with the highest values of that measure (include more if the values are tied).  Comment on whether you think, based on what you have found, that nearness centrality is a good alternative to closeness centrality and that adjacency centrality is a good alternative to degree centrality.

The datasets:
* london_transport_raw_edges.txt:  The network is of London rail and underground stations that are linked if they are adjacent on some line.  The second and third items on each line in the file are a pair of nodes that are joined by an edge (the first item describes how they are linked and can be ignored for this exercise).
* Roget.txt: This is a network of words that are linked if they appear together in a thesaurus.  At the start of the file is a list of words (the nodes) and their numeric identifiers.    Then there are lists (one per line) of words that appear together in the thesaurus.  There should be an edge between any pair of nodes that appear in the same list.  For example, the list 3 4 323 325 implies the existence of six edges: (3,4), (3, 323), (3, 325), (4, 323), (4, 325), (323, 325)
* CCSB-Y2H.txt: The network is of interactions amongst proteins in yeast (living cells can be considered as complex webs of macromolecular interactions known as interactome networks).  The first two items on each line are a pair of nodes joined by
an edge (the rest of the line can be ignored).

In [67]:
def load_london_transport(filename_: str) -> Network:
    network_: Network = {}
    with open(filename_, "rt") as file_:
        for line_ in file_:
            _, left_node_, right_node_ = line_.rstrip().split()
            if left_node_ not in network_:
                network_[left_node_] = set()
            if right_node_ not in network_:
                network_[right_node_] = set()
            if left_node_ != right_node_:
                network_[left_node_].add(right_node_)
                network_[right_node_].add(left_node_)
    return network_


london_transport_network: Network = load_london_transport("./topic3networks/london_transport_raw.edges.txt")

In [68]:
def load_roget(filename_: str) -> Network:
    network_: Network = {}
    with open(filename_, "rt") as file_:
        state_: Literal["vertex-count", "vertex-ids", "arcs-title", "arcs-list"] = "vertex-count"
        vertex_count_: int = 0
        vertex_names_: Dict[int, str] = {}
        for line_ in file_:
            line_: str = line_.rstrip()
            if state_ == "vertex-count":
                left_, right_ = line_.split()
                assert left_ == "*Vertices"
                vertex_count_ = int(right_)
                state_ = "vertex-ids"
            elif state_ == "vertex-ids":
                left_, *right_ = line_.split()
                vertex_name_ = " ".join(right_).replace("\"", "")
                vertex_id_: int = int(left_)
                vertex_names_[vertex_id_] = vertex_name_
                network_[vertex_name_] = set()
                if vertex_id_ == vertex_count_:
                    state_ = "arcs-title"
            elif state_ == "arcs-title":
                assert line_ == "*Arcslist"
                state_ = "arcs-list"
            elif state_ == "arcs-list":
                # noinspection PyTypeChecker
                nodes_ = map(vertex_names_.get, map(int, line_.split()))
                for left_node_, right_node_ in itertools.permutations(nodes_, 2):
                    if left_node_ != right_node_:
                        network_[left_node_].add(right_node_)
    return network_


roget_network: Network = load_roget("./topic3networks/Roget.txt")

In [70]:
def load_ccsb_y2h(filename_) -> Network:
    network_: Network = {}
    with open(filename_, "rt") as file_:
        state_: Literal["first-line", "edges"] = "first-line"
        for line_ in file_:
            line_ = line_.rstrip()
            if state_ == "first-line":
                assert hash(line_) == 7186263516352579066
                state_ = "edges"
            elif state_ == "edges":
                left_node_, right_node_, *_ = line_.split("\t")
                if left_node_ not in network_:
                    network_[left_node_] = set()
                if right_node_ not in network_:
                    network_[right_node_] = set()
                if left_node_ != right_node_:
                    network_[left_node_].add(right_node_)
                    network_[right_node_].add(left_node_)
    return network_


ccsb_y2h_network: Network = load_ccsb_y2h("./topic3networks/CCSB-Y2H.txt")