In [8]:
from build_graph import build_graph
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

In [9]:
# Crawl configuration for this run: the page the graph is built from and the
# depth passed to build_graph.
start_page = "Volsci" # Change this to the page you want to start from
depth = 2  # Adjust the depth as needed

# build_graph is imported from the project-local build_graph module and returns
# three values, unpacked here. Among the cells of this notebook only `graph`
# (and `start_page`) are read again; links_dict and categories_dict are unused.
# NOTE(review): the recorded output ("Graph not found. Building a new graph.")
# suggests build_graph caches its result between runs -- confirm in that module.
graph, links_dict, categories_dict = build_graph(start_page, depth, display=False)

Graph not found. Building a new graph.
Initial graph built.
Number of nodes: 8027 Number of edges: 11888
Number of categories: 32

Graph completed with new links between already present nodes.
Number of nodes: 7473 Number of edges: 872884


In [16]:
import numpy as np
from threadpoolctl import threadpool_limits
import time

def get_common_neighbors(adj_matrix, n_threads=10):
    """Square a dense adjacency matrix and zero the diagonal of the result.

    Entry (i, j) of the product is sum_k adj[i, k] * adj[k, j]. For a symmetric
    0/1 adjacency matrix that is the number of neighbors shared by nodes i and j.
    For an asymmetric (directed) matrix it instead counts the nodes k with an
    edge i -> k and an edge k -> j.

    Parameters
    ----------
    adj_matrix : square array-like
        Dense adjacency matrix. It is not modified; the product is a new array.
    n_threads : int, optional
        Forwarded unchanged as ``limits`` to ``threadpool_limits`` to cap the
        BLAS thread pool during the product. Defaults to 10, the value that was
        previously hard-coded.

    Returns
    -------
    The squared matrix with its diagonal set to 0, so that a node is never
    reported as sharing neighbors with itself.
    """
    # Cap the BLAS thread pool only for the duration of the matrix product.
    with threadpool_limits(limits=n_threads, user_api='blas'):
        adj_matrix_squared = np.dot(adj_matrix, adj_matrix)

    # The diagonal of the square holds per-node counts (the degree, for a
    # symmetric 0/1 matrix), not pairwise information, so clear it.
    np.fill_diagonal(adj_matrix_squared, 0)

    return adj_matrix_squared

# Dense adjacency matrix of the graph; every benchmark below starts from it.
adj_matrix = nx.to_numpy_array(graph)

# Benchmark 1: dense product through get_common_neighbors (which also zeroes
# the diagonal; the sparse products below leave their diagonal untouched).
# perf_counter is used because it is a monotonic clock meant for intervals.
t0 = time.perf_counter()
common_neighbors1 = get_common_neighbors(adj_matrix)
t1 = time.perf_counter()

print(f"Time taken: {t1-t0:.2f} seconds", flush=True)

# convert to sparse matrix
from scipy.sparse import csr_matrix, csc_matrix

# Benchmark 2: CSR x CSR. Like all sparse benchmarks here, the measured time
# includes converting the dense matrix to the sparse format.
t0 = time.perf_counter()
adj_matrix_sparse = csr_matrix(adj_matrix)
common_neighbors_sparse1 = adj_matrix_sparse.dot(adj_matrix_sparse)
t1 = time.perf_counter()

print(f"Time taken: {t1-t0:.2f} seconds", flush=True)

# Benchmark 3: CSC x CSC.
t0 = time.perf_counter()
adj_matrix_sparse = csc_matrix(adj_matrix)
common_neighbors_sparse2 = adj_matrix_sparse.dot(adj_matrix_sparse)
t1 = time.perf_counter()

print(f"Time taken: {t1-t0:.2f} seconds", flush=True)

# Benchmark 4: CSC x CSR.
t0 = time.perf_counter()
adj_matrix_sparse_c = csc_matrix(adj_matrix)
adj_matrix_sparse_r = csr_matrix(adj_matrix)
common_neighbors_sparse3 = adj_matrix_sparse_c.dot(adj_matrix_sparse_r)
t1 = time.perf_counter()

print(f"Time taken: {t1-t0:.2f} seconds", flush=True)

# Benchmark 5: CSR x CSC. Both matrices are rebuilt so that this timing includes
# the same conversion cost as benchmark 4.
t0 = time.perf_counter()
adj_matrix_sparse_c = csc_matrix(adj_matrix)
adj_matrix_sparse_r = csr_matrix(adj_matrix)
common_neighbors_sparse4 = adj_matrix_sparse_r.dot(adj_matrix_sparse_c)
t1 = time.perf_counter()

# This report was missing, so the last benchmark was measured but never shown.
print(f"Time taken: {t1-t0:.2f} seconds", flush=True)

Time taken: 4.96 seconds
Time taken: 1.44 seconds


In [None]:
# NOTE(review): this import rebinds get_common_neighbors, so from here on the
# name refers to the project-local neighbors module's version rather than the
# function defined in the benchmarking cell above.
from neighbors import get_common_neighbors, get_total_neighbors, get_jaccard_coefficient

# Dense adjacency matrix of the graph.
# NOTE(review): .todense() yields np.matrix when nx.adjacency_matrix returns a
# SciPy sparse *matrix* and np.ndarray when it returns a sparse *array*, which
# depends on the installed NetworkX/SciPy versions -- confirm which type the
# neighbors helpers and the later cells expect.
adjacency_matrix = nx.adjacency_matrix(graph).todense()
# Each helper consumes the result of the previous step; their exact definitions
# live in the neighbors module and are not visible from this notebook.
common_neighbors_matrix = get_common_neighbors(adjacency_matrix)
total_neighbors_matrix = get_total_neighbors(adjacency_matrix, common_neighbors_matrix)
jaccard_similarity_matrix = get_jaccard_coefficient(common_neighbors_matrix, total_neighbors_matrix)

In [None]:
from affinity_propagation import cluster

# Similarity fed to the clustering: direct links plus Jaccard similarity.
# The sum allocates a new matrix, so neither input is modified below.
clustering_matrix = adjacency_matrix + jaccard_similarity_matrix

# find the maximum value in the matrix
max_val = np.max(clustering_matrix)

# set the diagonal to the maximum value, i.e. every node is at least as similar
# to itself as any pair of nodes is to each other
np.fill_diagonal(clustering_matrix, max_val)

# cluster the nodes (project-local helper from the affinity_propagation module)
cluster_labels = cluster(similarity_arr=clustering_matrix, damping_start=0.75)

# Count the members of each distinct label. np.unique makes no assumption about
# the label values (they need not be the contiguous range 0..k-1) and also works
# if cluster() returns a plain list instead of a NumPy array.
unique_labels, label_counts = np.unique(cluster_labels, return_counts=True)
n_clusters = len(unique_labels)
cluster_sizes = label_counts.tolist()  # sizes in ascending label order
print(f"Number of clusters: {n_clusters}")
print(f"Cluster sizes: {cluster_sizes}")



KeyboardInterrupt: 

In [None]:
# Boolean mask of the node pairs that are already linked, excluding self-loops.
# The diagonal is cleared on the mask instead of on adjacency_matrix, so the
# shared matrix is left untouched and cells that read it (such as the clustering
# cell) give the same result no matter when they are re-run.
boolean_adjacency_matrix = adjacency_matrix > 0
np.fill_diagonal(boolean_adjacency_matrix, False)

# Jaccard similarities of the linked pairs only
masked_similarity_matrix = jaccard_similarity_matrix[boolean_adjacency_matrix]

# The threshold is the chosen quantile of the similarity among linked pairs.
quantile_threshold = 0.7
threshold = np.quantile(masked_similarity_matrix, quantile_threshold)
print(f'Similarity threshold: {threshold}')

Similarity threshold: 0.9340659340659341


In [None]:
# Project-local helpers; their behaviour is defined in the missing_links module.
from missing_links import find_missing_links, print_missing_links

# Candidate links across the whole graph, using the similarity matrix, the
# current cluster labels and the threshold computed in the previous cell.
missing_links = find_missing_links(graph, jaccard_similarity_matrix, cluster_labels, threshold)

print(f"Number of missing links: {len(missing_links)}")

# remove printing limit
# NOTE: set_printoptions changes NumPy's global print settings, so full-length
# array printing stays in effect for every later cell of this kernel session.
np.set_printoptions(threshold=np.inf)
# n is passed to print_missing_links here and reused by all later cells.
n = 20
print_missing_links(missing_links, n)

Number of missing links: 10
Top 10 missing links:
0: Guy Laroque <-- 1.0 --> Jean-Charles Rochet
1: Guy Laroque <-- 1.0 --> Jean-Michel Grandmont
2: Jan Eeckhout <-- 1.0 --> Louis Philps
3: Jan Eeckhout <-- 1.0 --> Maristella Botticini
4: Jean-Charles Rochet <-- 1.0 --> Jean-Michel Grandmont
5: Louis Philps <-- 1.0 --> Maristella Botticini
6: Guy Laroque <-- 0.9880952380952381 --> Herbert Scarf
7: Herbert Scarf <-- 0.9880952380952381 --> Jean-Charles Rochet
8: Herbert Scarf <-- 0.9880952380952381 --> Jean-Michel Grandmont
9: Nicholas Stern <-- 0.9767441860465116 --> Nicholas Stern, Baron Stern of Brentford



In [None]:
# find the cluster of the start page
# The label is looked up by the position of start_page in graph.nodes iteration
# order; list.index raises ValueError if start_page is not a node of the graph.
# NOTE(review): assumes cluster_labels is ordered like graph.nodes -- confirm.
start_page_cluster = cluster_labels[list(graph.nodes).index(start_page)]
print(f"Start page cluster: {start_page_cluster}")

# find missing links that are in the same cluster as the start page
same_cluster_missing_links  = find_missing_links(graph, jaccard_similarity_matrix, cluster_labels, threshold, specific_cluster=start_page_cluster)

# n comes from the earlier missing-links cell.
print_missing_links(same_cluster_missing_links, n)

Start page cluster: 0
Top 10 missing links:
0: Guy Laroque <-- 1.0 --> Jean-Charles Rochet
1: Guy Laroque <-- 1.0 --> Jean-Michel Grandmont
2: Jan Eeckhout <-- 1.0 --> Louis Philps
3: Jan Eeckhout <-- 1.0 --> Maristella Botticini
4: Jean-Charles Rochet <-- 1.0 --> Jean-Michel Grandmont
5: Louis Philps <-- 1.0 --> Maristella Botticini
6: Guy Laroque <-- 0.9880952380952381 --> Herbert Scarf
7: Herbert Scarf <-- 0.9880952380952381 --> Jean-Charles Rochet
8: Herbert Scarf <-- 0.9880952380952381 --> Jean-Michel Grandmont
9: Nicholas Stern <-- 0.9767441860465116 --> Nicholas Stern, Baron Stern of Brentford



In [None]:
# find missing links that involve the start page
# Same call as the graph-wide search, narrowed with specific_node=start_page.
start_page_missing_links = find_missing_links(graph, jaccard_similarity_matrix, cluster_labels, threshold, specific_node=start_page)

# n comes from the earlier missing-links cell.
print_missing_links(start_page_missing_links, n)

Top 10 missing links:



In [None]:
from sklearn.cluster import KMeans

# Target roughly one cluster per 100 nodes, with at least one cluster.
num_nodes = len(graph.nodes)
num_clusters = max(1, num_nodes // 100)

# Each row of clustering_matrix is used as the feature vector of a node.
# np.asarray is a no-op for an ndarray and guards against clustering_matrix
# being an np.matrix, which scikit-learn does not accept as input.
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(np.asarray(clustering_matrix))

# NOTE: this overwrites the cluster_labels produced by the affinity-propagation
# cell; every later cell that reads cluster_labels sees the KMeans labels.
cluster_labels = kmeans.labels_

# Count the members of each distinct label. np.unique does not assume that the
# labels that actually occur form the contiguous range 0..k-1.
unique_labels, label_counts = np.unique(cluster_labels, return_counts=True)
n_clusters = len(unique_labels)
cluster_sizes = label_counts.tolist()  # sizes in ascending label order
print(f"Number of clusters: {n_clusters}")
print(f"Cluster sizes: {cluster_sizes}")

Number of clusters: 1
Cluster sizes: [122]


In [None]:
# find missing links with the new clustering
# cluster_labels now holds the KMeans labels assigned in the previous cell;
# this rebinds missing_links, replacing the affinity-propagation result.
missing_links = find_missing_links(graph, jaccard_similarity_matrix, cluster_labels, threshold)
print(f"Number of missing links: {len(missing_links)}")

# n comes from the earlier missing-links cell.
print_missing_links(missing_links, n)

In [None]:
# find missing links that are in the same cluster as the start page
# Repeats the earlier same-cluster search with the KMeans labels. The label is
# looked up by the position of start_page in graph.nodes iteration order.
# NOTE(review): assumes cluster_labels is ordered like graph.nodes -- confirm.
start_page_cluster = cluster_labels[list(graph.nodes).index(start_page)]
same_cluster_missing_links  = find_missing_links(graph, jaccard_similarity_matrix, cluster_labels, threshold, specific_cluster=start_page_cluster)
print_missing_links(same_cluster_missing_links, n)

In [None]:
# find missing links that involve the start page
# Repeats the earlier start-page search with the KMeans labels now stored in
# cluster_labels; n comes from the earlier missing-links cell.
start_page_missing_links = find_missing_links(graph, jaccard_similarity_matrix, cluster_labels, threshold, specific_node=start_page)
print_missing_links(start_page_missing_links, n)