In [1]:
from build_graph import build_graph
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

In [2]:
start_page = "Cumulative distribution function" # Change this to the page you want to start from
depth = 2  # Adjust the depth as needed

graph, links_dict, categories_dict = build_graph(start_page, depth, display=False)

Graph found. Loading graph, links and categories.
Number of nodes: 3563 Number of edges: 328549


In [3]:
from neighbors import get_common_neighbors, get_total_neighbors, get_jaccard_coefficient

adjacency_matrix = nx.adjacency_matrix(graph).todense()
adjacency_matrix.astype(int)
common_neighbors_matrix = get_common_neighbors(adjacency_matrix)
total_neighbors_matrix = get_total_neighbors(adjacency_matrix, common_neighbors_matrix)
jaccard_similarity_matrix = get_jaccard_coefficient(common_neighbors_matrix, total_neighbors_matrix)

In [4]:
from dbscan import dbscan_from_similarity

# combine the adjacency matrix and the jaccard similarity matrix
clustering_matrix = adjacency_matrix + jaccard_similarity_matrix

# find the maximum value in the matrix
max_val = np.max(clustering_matrix)

# set the diagonal to the maximum value
np.fill_diagonal(clustering_matrix, max_val)

cluster_labels = dbscan_from_similarity(clustering_matrix)

Cluster Selection Epsilon: 0.1, Number of Clusters: 19, Silhouette Score: 0.15348920655917334
Cluster Selection Epsilon: 0.12105263157894737, Number of Clusters: 18, Silhouette Score: 0.2117328732730448
Cluster Selection Epsilon: 0.14210526315789473, Number of Clusters: 17, Silhouette Score: 0.3476969205979627
Cluster Selection Epsilon: 0.1631578947368421, Number of Clusters: 16, Silhouette Score: 0.376580518918584
Cluster Selection Epsilon: 0.1842105263157895, Number of Clusters: 16, Silhouette Score: 0.376580518918584
Cluster Selection Epsilon: 0.20526315789473684, Number of Clusters: 15, Silhouette Score: 0.3480369721769305
Cluster Selection Epsilon: 0.22631578947368422, Number of Clusters: 13, Silhouette Score: 0.29792921172265097
Silhouette Score: 0.376580518918584


In [5]:
# get the number of clusters and the number of nodes in each cluster
n_clusters = len(set(cluster_labels))
cluster_sizes = [np.sum(cluster_labels == i) for i in range(n_clusters)]
print(f"Number of clusters: {n_clusters}")
print(f"Cluster sizes: {cluster_sizes}")

Number of clusters: 16
Cluster sizes: [57, 74, 517, 67, 40, 37, 149, 127, 130, 43, 219, 420, 209, 146, 198, 0]


In [6]:
# set the diagonal of the adjacency matrix to 0
np.fill_diagonal(adjacency_matrix, 0)
boolean_adjacency_matrix = adjacency_matrix > 0
masked_similarity_matrix = jaccard_similarity_matrix[boolean_adjacency_matrix]

# find weak quantile threshold
weak_quantile_threshold = 0.7
weak_threshold = np.quantile(masked_similarity_matrix, weak_quantile_threshold)

# find strong quantile threshold
strong_quantile_threshold = 0.99
strong_threshold = np.quantile(masked_similarity_matrix, strong_quantile_threshold)

# print the thresholds
print(f'Similarity thresholds: ')
print(f'Weak quantile threshold: {weak_quantile_threshold}')
print(f'Strong quantile threshold: {strong_quantile_threshold}')

Similarity thresholds: 
Weak quantile threshold: 0.7
Strong quantile threshold: 0.99


In [7]:
from missing_links import find_missing_links, print_missing_links

missing_link_candidates = find_missing_links(graph, jaccard_similarity_matrix, cluster_labels, weak_threshold, strong_threshold)

print(f"Number of missing link candidates: {len(missing_link_candidates)}")

# remove printing limit
np.set_printoptions(threshold=np.inf)
n = 2000
print_missing_links(missing_link_candidates, n)

Number of missing links: 51417
Top 2000 missing links:
0: First derivative test <-- 1.0 --> Second derivative test
1: Geometric continuity <-- 1.0 --> Parametric continuity
2: Left limit <-- 1.0 --> Right limit
3: Cultured pearl <-- 1.0 --> Homesteading
4: Cultured pearl <-- 1.0 --> Maid service
5: Cultured pearl <-- 1.0 --> Optical manufacturing and testing
6: Cultured pearl <-- 1.0 --> Ready-mix concrete
7: Cultured pearl <-- 1.0 --> Security company
8: Geomatics engineering <-- 1.0 --> Survey engineering
9: Homesteading <-- 1.0 --> Maid service
10: Homesteading <-- 1.0 --> Optical manufacturing and testing
11: Homesteading <-- 1.0 --> Ready-mix concrete
12: Homesteading <-- 1.0 --> Security company
13: Maid service <-- 1.0 --> Optical manufacturing and testing
14: Maid service <-- 1.0 --> Ready-mix concrete
15: Maid service <-- 1.0 --> Security company
16: Optical manufacturing and testing <-- 1.0 --> Ready-mix concrete
17: Optical manufacturing and testing <-- 1.0 --> Security comp