In [1]:
from build_graph import build_graph
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

In [2]:
depth = 2
start_page = "Cumulative distribution function" 

graph, links_dict, categories_dict = build_graph(start_page, depth, display=False)

Graph found. Loading graph, links and categories.
Number of nodes: 3381 , Number of edges: 330293


In [3]:
from neighbors import get_common_neighbors, get_total_neighbors, get_jaccard_coefficient

adjacency_matrix = nx.adjacency_matrix(graph).todense()
adjacency_matrix.astype(int)
common_neighbors_matrix = get_common_neighbors(adjacency_matrix)
total_neighbors_matrix = get_total_neighbors(adjacency_matrix, common_neighbors_matrix)
jaccard_similarity_matrix = get_jaccard_coefficient(common_neighbors_matrix, total_neighbors_matrix)

In [5]:
from dbscan import dbscan_from_similarity

# combine the adjacency matrix and the jaccard similarity matrix
clustering_matrix = jaccard_similarity_matrix + adjacency_matrix

# find the maximum value in the matrix
max_val = np.max(clustering_matrix)

# set the diagonal to the maximum value
np.fill_diagonal(clustering_matrix, max_val)

cluster_labels = dbscan_from_similarity(clustering_matrix)

Epsilon: 0.01, N Clusters: 48, Silhouette Score: -0.171
Epsilon: 0.041, N Clusters: 51, Silhouette Score: -0.123
Epsilon: 0.071, N Clusters: 50, Silhouette Score: 0.065
Epsilon: 0.102, N Clusters: 48, Silhouette Score: 0.066
Epsilon: 0.133, N Clusters: 50, Silhouette Score: 0.063
Epsilon: 0.163, N Clusters: 42, Silhouette Score: 0.222
Epsilon: 0.194, N Clusters: 41, Silhouette Score: 0.265
Epsilon: 0.225, N Clusters: 41, Silhouette Score: 0.215
Epsilon: 0.256, N Clusters: 36, Silhouette Score: -0.107
Epsilon: 0.286, N Clusters: 35, Silhouette Score: -0.106
Epsilon: 0.317, N Clusters: 33, Silhouette Score: -0.112

Subclustering the noise cluster

Subclustering the noise cluster

Subclustering the noise cluster

Subclustering the noise cluster
--------------------------------------------------
Final Silhouette Score: 0.2796094146194735
--------------------------------------------------


In [6]:
# get the number of clusters and the number of nodes in each cluster
n_clusters = len(set(cluster_labels))
cluster_sizes = [np.sum(cluster_labels == i) for i in range(-1, n_clusters)]
print(f"Number of clusters: {n_clusters}")
print(f'Cluster sizes: {cluster_sizes}')

max_size = max(cluster_sizes)
min_size = min(cluster_sizes)

print(f"Max cluster size: {max_size}, Min cluster size: {min_size}")

Number of clusters: 74
Cluster sizes: [35, 6, 5, 7, 5, 6, 24, 5, 6, 5, 51, 25, 5, 9, 13, 21, 6, 5, 100, 18, 27, 6, 17, 30, 8, 10, 35, 8, 32, 78, 38, 9, 9, 80, 258, 184, 388, 170, 182, 123, 314, 5, 5, 5, 5, 5, 6, 9, 7, 56, 7, 44, 6, 7, 32, 466, 6, 7, 5, 10, 6, 13, 8, 5, 8, 6, 8, 11, 6, 6, 116, 5, 121, 6, 0]
Max cluster size: 466, Min cluster size: 0


In [7]:
# find the cluster of the start page
start_page_cluster = cluster_labels[0]

# find the nodes in the same cluster as the start page
start_page_cluster_nodes = [node for node, cluster in zip(graph.nodes, cluster_labels) if cluster == start_page_cluster]

# find the dimension of the start page cluster
start_page_cluster_dim = len(start_page_cluster_nodes)

print(f"{start_page_cluster_dim} nodes in cluster {start_page_cluster}, with the start page:")
for node in start_page_cluster_nodes:
    print(node)

116 nodes in cluster 69, with the start page:
Gaius Terentilius Harsa
Consul
Terentilia gens
Bürgerliches Gesetzbuch
Civil Code of Japan
Code of Hammurabi
Corpus Juris Civilis
Napoleonic Code
Philippine legal codes
United States Code
Acta Senatus
Andrew Lintott
Centuria
Equestrians
Gaius Licinius Stolo
James Hampton (priest)
Lex Hortensia
Lex Trebonia (448 BC)
Lily Ross Taylor
Lucius Sextius Lateranus
Lucius Sicinius Vellutus
Lucius Valerius Poplicola Potitus
Master of the Horse
Monte Sacro
Nobiles
Outline of political science
Plebeians
Quintus Hortensius (dictator)
Quintus Publilius Philo
Veto
Agen
Argos, Peloponnese
Consularis
Hypatos
Livadeia
Sufet
Titus Tatius
Annales maximi
Antony's Parthian War
Ariarathes V
Ascanius
Bacchanalia
Battle of Chaeronea (86 BC)
Battle of the Caudine Forks
Bituitus
Brennus (4th century BC)
Corsicans
Dardanians (Trojan)
Donations of Alexandria
Epitome
Gaius Antonius (brother of Mark Antony)
Gaius Scribonius Curio (consul)
Gaius Servilius Glaucia
Gnaeus G

In [8]:
# set the diagonal of the adjacency matrix to 0
np.fill_diagonal(adjacency_matrix, 0)
boolean_adjacency_matrix = adjacency_matrix > 0
masked_similarity_matrix = jaccard_similarity_matrix[boolean_adjacency_matrix]

# find weak quantile threshold
weak_quantile_threshold = 0.95
weak_threshold = np.quantile(masked_similarity_matrix, weak_quantile_threshold)

# find strong quantile threshold
strong_quantile_threshold = 0.99
strong_threshold = np.quantile(masked_similarity_matrix, strong_quantile_threshold)

# print the thresholds
print(f'Similarity thresholds: ')
print(f'Weak quantile threshold: {weak_threshold}')
print(f'Strong quantile threshold: {strong_threshold}')

Similarity thresholds: 
Weak quantile threshold: 0.9075144508670521
Strong quantile threshold: 0.9597069597069597


In [9]:
from missing_links import find_missing_link_candidates, print_missing_links_dict

missing_link_candidates, missing_link_candidates_matrix = find_missing_link_candidates(graph, jaccard_similarity_matrix, cluster_labels, weak_threshold, strong_threshold)

print(f"Number of missing link candidates: {len(missing_link_candidates)}")

Number of missing link candidates: 866


In [10]:
from build_dataset import build_train_dataset, build_missing_link_dataset

train_df = build_train_dataset(adjacency_matrix, jaccard_similarity_matrix, missing_link_candidates_matrix, 
                               common_neighbors_matrix, total_neighbors_matrix, graph, cluster_labels, categories_dict)

missing_link_df = build_missing_link_dataset(adjacency_matrix, jaccard_similarity_matrix, missing_link_candidates_matrix, 
                                            common_neighbors_matrix, total_neighbors_matrix, graph, cluster_labels, categories_dict)
train_df.head()

[20, 256, 447, 107, 235, 1290, 280, 1203, 704, 667, 744, 300, 277, 957, 578, 25, 375, 435, 84, 25, 43, 181, 143, 208, 207, 138, 212, 231, 177, 202, 135, 140, 222, 143, 314, 197, 204, 45, 209, 153, 204, 181, 201, 213, 200, 552, 229, 31, 251, 238, 248, 223, 232, 253, 156, 38, 71, 276, 209, 393, 37, 64, 13, 233, 214, 280, 228, 231, 238, 202, 341, 214, 305, 200, 155, 235, 216, 220, 71, 144, 221, 153, 209, 236, 43, 165, 227, 215, 351, 222, 157, 144, 143, 213, 512, 216, 28, 166, 270, 208, 515, 163, 337, 103, 202, 214, 213, 143, 208, 282, 134, 136, 207, 207, 202, 142, 168, 459, 411, 20, 15, 222, 200, 216, 205, 214, 180, 310, 323, 210, 251, 219, 242, 220, 208, 308, 215, 204, 138, 200, 202, 181, 154, 158, 298, 200, 175, 556, 181, 498, 233, 17, 256, 135, 240, 603, 239, 227, 208, 156, 211, 145, 137, 153, 254, 200, 209, 209, 203, 260, 234, 139, 206, 141, 138, 205, 222, 291, 163, 185, 11, 328, 253, 49, 205, 29, 95, 225, 208, 66, 131, 158, 646, 252, 139, 152, 201, 48, 43, 110, 19, 274, 288, 248, 205

TypeError: string operation on non-string array

In [None]:
from sklearn.model_selection import train_test_split

# split the dataset into train and test
X = train_df.drop(columns=['Link'])
y = train_df['Link']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from xgboost import XGBClassifier
from tune_model import tune
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score


# train the model
model = XGBClassifier()

# calculate scale_pos_weight
scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)

# set parameters for grid search
space = {
    'n_estimators': [100],
    'max_depth': [3],
    'scale_pos_weight': [scale_pos_weight],
    'objective': ['binary:logistic'],
    'alpha': [0.1]
}

scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

best_params, best_model = tune(X, y, space, scoring, 
                            model, modeltype='clf', search_type='grid', n_iter_random=100,
                            n_splits=3, n_repeats=1, random_state=1,
                            verbose=True, display_plots=0, refit='roc_auc')


Fitting 3 folds for each of 1 candidates, totalling 3 fits

Best Score (roc_auc): [1m0.9938086057268922[0m
accuracy: 0.9957497593022988
precision: 0.42631893748906485
recall: 0.9914266176126523
f1: 0.5379601558592585

Best Hyperparameters:
alpha: 0.1
max_depth: 3
n_estimators: 100
objective: binary:logistic
scale_pos_weight: 779.3897784601886

Optimal Threshold: 1.5000771e-32


In [None]:
# test the model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion matrix: ")
print(confusion)

Accuracy: 0.9969846525899861
Precision: 0.30828100470957615
Recall: 0.968557336621455
F1 score: 0.4676987198571003
ROC AUC: 0.9827904612434959
Confusion matrix: 
[[1180786    3525]
 [     51    1571]]


In [None]:
# save the columns 'node1', 'node2' into arrays
node1 = missing_link_df['node 1'].values
node2 = missing_link_df['node 2'].values

# remove the columns 'node1', 'node2' from the dataframe
missing_link_df = missing_link_df.drop(columns=['node 1', 'node 2'])

# predict link probabilities for the missing link candidates
link_probabilities = best_model.predict_proba(missing_link_df)[:, 1]

# add the link probabilities to a new column called 'Link Probability'
missing_link_df['Link Probability'] = link_probabilities
missing_link_df['node 1'] = node1
missing_link_df['node 2'] = node2

# keep only probabilities above the optimal threshold
missing_link_df = missing_link_df[missing_link_df['Link Probability'] > 0.5]

# sort the missing link candidates by link probability
missing_link_df = missing_link_df.sort_values(by='Link Probability', ascending=False)

for i in range(100):
    node1 = missing_link_df.iloc[i]['node 1']
    node2 = missing_link_df.iloc[i]['node 2']
    link_probability = missing_link_df.iloc[i]['Link Probability']

    print(f"{node1} <-- {link_probability:.3f} --> {node2}")

Gaius Terentilius Harsa <-- 1.000 --> 1917 code of canon law
Gaius Terentilius Harsa <-- 1.000 --> great seal of the united states
Gaius Terentilius Harsa <-- 1.000 --> early modern period
Gaius Terentilius Harsa <-- 1.000 --> epigraphy
Gaius Terentilius Harsa <-- 1.000 --> esse quam videri
Gaius Terentilius Harsa <-- 1.000 --> eton college
Gaius Terentilius Harsa <-- 1.000 --> etruscan alphabet
Gaius Terentilius Harsa <-- 1.000 --> etruscan language
Gaius Terentilius Harsa <-- 1.000 --> faliscan language
Gaius Terentilius Harsa <-- 1.000 --> floruit
Gaius Terentilius Harsa <-- 1.000 --> franks casket
Gaius Terentilius Harsa <-- 1.000 --> frederic m. wheelock
Gaius Terentilius Harsa <-- 1.000 --> fricative consonant
Gaius Terentilius Harsa <-- 1.000 --> fusional language
Gaius Terentilius Harsa <-- 1.000 --> gemination
Gaius Terentilius Harsa <-- 1.000 --> genitive case
Gaius Terentilius Harsa <-- 1.000 --> george buchanan
Gaius Terentilius Harsa <-- 1.000 --> germanic people
Gaius Ter

In [None]:
# find missing links where either node1 or node2 are in the same cluster as the start page
start_page_cluster = cluster_labels[0]
node1_cluster_mask = missing_link_df['cluster node 1'].values == start_page_cluster
node2_cluster_mask = missing_link_df['cluster node 2'].values == start_page_cluster
start_page_cluster_mask = node1_cluster_mask | node2_cluster_mask

start_page_cluster_missing_links = missing_link_df[start_page_cluster_mask]

iters = min(100, len(start_page_cluster_missing_links))

for i in range(iters):
    node1 = start_page_cluster_missing_links.iloc[i]['node 1']
    node2 = start_page_cluster_missing_links.iloc[i]['node 2']
    link_probability = start_page_cluster_missing_links.iloc[i]['Link Probability']

    print(f"{node1} <-- {link_probability:.3f} --> {node2}")

Gaius Terentilius Harsa <-- 1.000 --> 1917 code of canon law
Gaius Terentilius Harsa <-- 1.000 --> great seal of the united states
Gaius Terentilius Harsa <-- 1.000 --> early modern period
Gaius Terentilius Harsa <-- 1.000 --> epigraphy
Gaius Terentilius Harsa <-- 1.000 --> esse quam videri
Gaius Terentilius Harsa <-- 1.000 --> eton college
Gaius Terentilius Harsa <-- 1.000 --> etruscan alphabet
Gaius Terentilius Harsa <-- 1.000 --> etruscan language
Gaius Terentilius Harsa <-- 1.000 --> faliscan language
Gaius Terentilius Harsa <-- 1.000 --> floruit
Gaius Terentilius Harsa <-- 1.000 --> franks casket
Gaius Terentilius Harsa <-- 1.000 --> frederic m. wheelock
Gaius Terentilius Harsa <-- 1.000 --> fricative consonant
Gaius Terentilius Harsa <-- 1.000 --> fusional language
Gaius Terentilius Harsa <-- 1.000 --> gemination
Gaius Terentilius Harsa <-- 1.000 --> genitive case
Gaius Terentilius Harsa <-- 1.000 --> george buchanan
Gaius Terentilius Harsa <-- 1.000 --> germanic people
Gaius Ter

In [None]:
# find missing links that include the start page if node1 or node2 is the start page
df_missing_links_start_page = missing_link_df[(missing_link_df['node 1'] == start_page) | (missing_link_df['node 2'] == start_page)]

df_missing_links_start_page.head()

Unnamed: 0,Common Neighbors,Total Neighbors,Similarity,Common Categories,Total Categories,n_categories node 1,n_categories node 2,cluster node 1,cluster node 2,Link Probability,node 1,node 2
0,1.0,21.0,0.047619,0.0,4.0,4.0,0.0,0.0,0.0,1.0,Gaius Terentilius Harsa,1917 code of canon law
2098,1.0,21.0,0.047619,0.0,27.0,4.0,23.0,0.0,0.0,1.0,Gaius Terentilius Harsa,great seal of the united states
2076,1.0,21.0,0.047619,0.0,38.0,4.0,34.0,0.0,0.0,1.0,Gaius Terentilius Harsa,early modern period
2077,1.0,21.0,0.047619,0.0,32.0,4.0,28.0,0.0,0.0,1.0,Gaius Terentilius Harsa,epigraphy
2078,1.0,21.0,0.047619,0.0,20.0,4.0,16.0,0.0,0.0,1.0,Gaius Terentilius Harsa,esse quam videri


In [None]:
# use link dict to find the links of 'high-pressure cut-off switch
high_pressure_cut_off_switch_links = links_dict['high-pressure cut-off switch']

print(f"Links of 'high-pressure cut-off switch':")
for link in high_pressure_cut_off_switch_links:
    print(link)

KeyError: 'high-pressure cut-off switch'

In [None]:
blacklist = ['is', 'not']
categories = np.array(['I am you', 'I am not you', 'I is me', 'I am not me', 'I am him', 'I am not him', 'I am her', 'I am not her', 'I am it', 'I am not it'])

# use np.char.find to filter out the blacklist words

for word in blacklist:
    categories = categories[np.char.find(categories, word) == -1]