In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

# Input file

In [None]:
# Read the pre-processed collaboration table; the cells below build the graph
# from its "author_pid" and "coauthor_pid" columns.
df = pd.read_csv(filepath_or_buffer="./Input/ProcessedNetworkDataFrame.csv")
# Bare last expression so the notebook renders the frame.
df

# Generate network

1. zip(author_pid, coauthor_pid)
2. add edges

In [None]:
# Pull the two PID columns out as plain Python lists, then pair them row by
# row: each (author, co-author) tuple becomes one edge of the graph.
author_pid, coauthor_pid = (
    df[column].tolist() for column in ("author_pid", "coauthor_pid")
)
collaborations = [*zip(author_pid, coauthor_pid)]

In [None]:
# Undirected collaboration graph: one edge per (author, co-author) pair.
# Repeated pairs collapse into a single edge because nx.Graph is a simple graph.
G = nx.Graph()
for author, coauthor in collaborations:
    G.add_edge(author, coauthor)

In [None]:
# Quick visual sanity check: render the collaboration graph with nx.draw's
# default settings.
nx.draw(G)

### 1. What are the properties of the collaboration network?
a. Size: number of nodes and edges
<br>
b. Degree analysis
<br>
c. Average shortest path length
<br>
d. Clustering coefficient
<br>
e. Connectedness: GC/WCC/SCC
<br>
f. Centrality: Degree, Eigenvector, Betweenness, Closeness

In [None]:
# Size of the collaboration network.
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

print(f"Number of nodes: {num_nodes}")
print(f"Number of edges: {num_edges}")
# nx.info() was deprecated in NetworkX 2.7 and removed in 3.0, so calling it
# raises AttributeError on current versions. This prints the same one-line
# summary it produced, without depending on the installed version.
print(f"{type(G).__name__} with {num_nodes} nodes and {num_edges} edges")

In [None]:
# Degree of every node, largest first, plus the maximum degree.
degree_sequence = [degree for _, degree in G.degree()]
degree_sequence.sort(reverse=True)
dmax = max(degree_sequence)

# Count how many nodes share each distinct degree value.
degree_values, node_counts = np.unique(degree_sequence, return_counts=True)

# One bar per distinct degree; bar height = number of nodes with that degree.
fig, ax = plt.subplots()
ax.bar(degree_values, node_counts)
ax.set(title="Degree Histogram", xlabel="Degree", ylabel="Number of nodes")

plt.show()

In [None]:
# Average shortest path length and diameter are computed per connected
# component, so each call below only ever sees a connected graph.
# enumerate(..., start=1) replaces the hand-maintained counter.
for i, component_nodes in enumerate(nx.connected_components(G), start=1):
    # .copy() turns the subgraph view into an independent graph.
    C = G.subgraph(component_nodes).copy()
    print(f"Sub-graph {i}")
    print(f"Average shortest path length: {nx.average_shortest_path_length(C):.2f}")
    print(f"Diameter: {nx.diameter(C)}\n")

In [None]:
# Mean of the per-node clustering coefficients over the whole real network.
avg_clustering = nx.average_clustering(G)
print(f"Average clustering coefficient: {avg_clustering:.2f}")

In [None]:
def _top_node(scores):
    """Return the node (dict key) whose centrality score (dict value) is largest.

    A plain ``max(scores)`` iterates over the dict's keys and therefore returns
    the largest PID, not the most central node; the ``key`` argument makes the
    comparison use the scores instead.

    Args:
        scores: mapping of node -> centrality score, as returned by the
            networkx centrality functions used below.

    Returns:
        The key with the largest value. Ties go to the first such key in the
        dict's iteration order.

    Raises:
        ValueError: if ``scores`` is empty.
    """
    return max(scores, key=scores.get)


# For each centrality measure: compute the node -> score mapping, pick the node
# with the highest score, and report both.
degree_centrality = nx.degree_centrality(G)
max_degree_centrality = _top_node(degree_centrality)
print(f"PID with the highest degree centrality: {max_degree_centrality}, with score of {degree_centrality[max_degree_centrality]:.5f}\n")

eigen_centrality = nx.eigenvector_centrality(G)
max_eigen_centrality = _top_node(eigen_centrality)
print(f"PID with the highest eigenvector centrality: {max_eigen_centrality}, with score of {eigen_centrality[max_eigen_centrality]:.5f}\n")

betweenness_centrality = nx.betweenness_centrality(G)
max_betweenness_centrality = _top_node(betweenness_centrality)
print(f"PID with the highest betweenness centrality: {max_betweenness_centrality}, with score of {betweenness_centrality[max_betweenness_centrality]:.5f}\n")

closeness_centrality = nx.closeness_centrality(G)
max_closeness_centrality = _top_node(closeness_centrality)
print(f"PID with the highest closeness centrality: {max_closeness_centrality}, with score of {closeness_centrality[max_closeness_centrality]:.5f}\n")

# How have the collaboration network and its properties evolved over time?

# Assume that we create a random network from the set of individuals in the input file. How do the properties of this network differ from those of the real collaboration network?

In [None]:
# Random G(n, p) baseline over the same number of individuals as the real
# network. The node count is read from G instead of being hard-coded (the
# original note recorded 989 nodes), so the baseline stays in sync if the
# input data changes.
n_real_nodes = G.number_of_nodes()
# NOTE(review): p=0.03 is a fixed edge probability. It is not derived from the
# real network's density; confirm this is the intended comparison.
# The fixed seed keeps the generated graph reproducible across runs.
gnp = nx.fast_gnp_random_graph(n=n_real_nodes, p=0.03, seed=4071)

print(f"Number of nodes: {gnp.number_of_nodes()}")
print(f"Number of edges: {gnp.number_of_edges()}")

In [None]:
# Degree of every node in the random graph, largest first.
gnp_degree_sequence = [degree for _, degree in gnp.degree()]
gnp_degree_sequence.sort(reverse=True)

# Count how many nodes share each distinct degree value.
gnp_degree_values, gnp_node_counts = np.unique(
    gnp_degree_sequence, return_counts=True
)

# One bar per distinct degree; bar height = number of nodes with that degree.
fig, ax = plt.subplots()
ax.bar(gnp_degree_values, gnp_node_counts)
ax.set(title="Degree Histogram", xlabel="Degree", ylabel="Number of nodes")

plt.show()

In [None]:
# Computed on the whole random graph at once (no per-component split as for
# the real network above).
gnp_avg_path_length = nx.average_shortest_path_length(gnp)
print(f"Average shortest path length: {gnp_avg_path_length:.5f}")

In [None]:
# Mean of the per-node clustering coefficients over the random graph.
gnp_avg_clustering = nx.average_clustering(gnp)
print(f"Average clustering coefficient: {gnp_avg_clustering:.5f}")

### Observations
1. The degree distribution of the random network follows a Poisson distribution
<br>
2. Average shortest path length is similar to that of the real collaboration network
<br>
3. Average clustering coefficient is much lower than that of the real collaboration network