In [1]:
# import function from file in another directory
import sys
sys.path.append('functions/')

from build_graph import build_graph
import networkx as nx
import numpy as np
import pandas as pd

In [2]:
depth = 1
start_page = "DBSCAN" 

graph, links_dict, categories_dict = build_graph(start_page, depth, display=False)

Graph not found. Building a new graph.
Graph built. Completing the graph with missing links between existing nodes.
First round: Processing nodes to add missing links between existing nodes.
Graph completed with new links between already existing nodes.
Number of nodes: 137 , Number of edges: 7552 , Number of categories: 165


In [3]:
from neighbors import get_common_neighbors, get_total_neighbors, get_jaccard_coefficient

adjacency_matrix = nx.adjacency_matrix(graph).todense()
adjacency_matrix.astype(int)
common_neighbors_matrix = get_common_neighbors(adjacency_matrix)
total_neighbors_matrix = get_total_neighbors(adjacency_matrix, common_neighbors_matrix)
jaccard_similarity_matrix = get_jaccard_coefficient(common_neighbors_matrix, total_neighbors_matrix)

In [4]:
from dbscan import dbscan_from_similarity

# combine the adjacency matrix and the jaccard similarity matrix
clustering_matrix = jaccard_similarity_matrix + adjacency_matrix

# find the maximum value in the matrix
max_val = np.max(clustering_matrix)

# set the diagonal to the maximum value
np.fill_diagonal(clustering_matrix, max_val)

cluster_labels = dbscan_from_similarity(clustering_matrix)

Epsilon: 0.01, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.041, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.071, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.102, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.133, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.163, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.194, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.225, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.256, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.286, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.317, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.348, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.378, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.409, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.44, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.47, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.501, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.532, N Clusters: 3, Silhouette Score: 0.474
Epsilon: 0.56

In [5]:
# get the number of clusters and the number of nodes in each cluster
n_clusters = len(set(cluster_labels))
cluster_sizes = [np.sum(cluster_labels == i) for i in range(-1, n_clusters)]
print(f"Number of clusters: {n_clusters}")
print(f'Cluster sizes: {cluster_sizes}')

max_size = max(cluster_sizes)
min_size = min(cluster_sizes)

print(f"Max cluster size: {max_size}, Min cluster size: {min_size}")

Number of clusters: 3
Cluster sizes: [26, 90, 21, 0]
Max cluster size: 90, Min cluster size: 0


In [6]:
# find the cluster of the start page
start_page_cluster = cluster_labels[0]

# find the nodes in the same cluster as the start page
start_page_cluster_nodes = [node for node, cluster in zip(graph.nodes, cluster_labels) if cluster == start_page_cluster]

# find the dimension of the start page cluster
start_page_cluster_dim = len(start_page_cluster_nodes)

print(f"{start_page_cluster_dim} nodes in cluster {start_page_cluster}, with the start page:")
for node in start_page_cluster_nodes:
    print(node)

90 nodes in cluster 0, with the start page:
DBSCAN
Active learning (machine learning)
Anomaly detection
Artificial neural network
Association rule learning
Autoencoder
Automated machine learning
BIRCH
Batch learning
Bias–variance tradeoff
Boosting (machine learning)
Bootstrap aggregating
CURE algorithm
Canonical correlation
Cluster analysis
Computational learning theory
Conditional random field
Conference on Neural Information Processing Systems
Convolutional neural network
Curriculum learning
Data clustering
Data mining
Decision tree learning
DeepDream
Diffusion model
Empirical risk minimization
Ensemble learning
Feature engineering
Feature learning
Feedforward neural network
Fuzzy clustering
Gated recurrent unit
Generative adversarial network
Grammar induction
Graphical model
Hierarchical clustering
Human-in-the-loop
Independent component analysis
International Conference on Learning Representations
International Conference on Machine Learning
K-means algorithm
K-means clustering
Ker

In [7]:
# set the diagonal of the adjacency matrix to 0
np.fill_diagonal(adjacency_matrix, 0)
boolean_adjacency_matrix = adjacency_matrix > 0
masked_similarity_matrix = jaccard_similarity_matrix[boolean_adjacency_matrix]

# find weak quantile threshold
weak_quantile_threshold = 0.75
weak_threshold = np.quantile(masked_similarity_matrix, weak_quantile_threshold)

# find strong quantile threshold
strong_quantile_threshold = 0.95
strong_threshold = np.quantile(masked_similarity_matrix, strong_quantile_threshold)

# print the thresholds
print(f'Similarity thresholds: ')
print(f'Weak quantile threshold: {weak_threshold}')
print(f'Strong quantile threshold: {strong_threshold}')

Similarity thresholds: 
Weak quantile threshold: 0.984251968503937
Strong quantile threshold: 1.0


In [8]:
from missing_links import find_missing_links_multi_thread

missing_link_candidates, missing_link_candidates_matrix = find_missing_links_multi_thread(graph, jaccard_similarity_matrix, cluster_labels, weak_threshold, strong_threshold)

print(f"Number of missing link candidates: {len(missing_link_candidates)}")

Number of missing link candidates: 1


In [9]:
from build_dataset import build_dataset_multi_thread

train_df, filtered_categories_dict = build_dataset_multi_thread(adjacency_matrix, jaccard_similarity_matrix, missing_link_candidates_matrix, 
                                                    common_neighbors_matrix, total_neighbors_matrix, 
                                                    graph, cluster_labels, categories_dict, df_type='train')

missing_link_df, filtered_categories_dict = build_dataset_multi_thread(adjacency_matrix, jaccard_similarity_matrix, missing_link_candidates_matrix,
                                                            common_neighbors_matrix, total_neighbors_matrix,graph, cluster_labels, categories_dict,
                                                            df_type='missing links', filtered_categories_dict=filtered_categories_dict)

In [10]:
from sklearn.model_selection import train_test_split

# split the dataset into train and test
X = train_df.drop(columns=['node_1', 'node_2', 'link'])
y = train_df['link']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
from xgboost import XGBClassifier
from tune_model import tune
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score


# train the model
model = XGBClassifier()

# calculate scale_pos_weight
scale_pos_weight = np.sum(y_train == 0) / np.sum(y_train == 1)

# set parameters for grid search
space = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'scale_pos_weight': [scale_pos_weight],
    'objective': ['binary:logistic'],
    'alpha': [0, 0.1]
}

scoring = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1',
    'roc_auc': 'roc_auc'
}

best_params, best_model = tune(X, y, space, scoring, 
                            model, modeltype='clf', search_type='grid', n_iter_random=100,
                            n_splits=2, n_repeats=1, random_state=1,
                            verbose=True, display_plots=0, refit='roc_auc')


Fitting 2 folds for each of 12 candidates, totalling 24 fits

Best Score (roc_auc): [1m1.0[0m
accuracy: 1.0
precision: 1.0
recall: 1.0
f1: 1.0

Best Hyperparameters:
alpha: 0
max_depth: 5
n_estimators: 200
objective: binary:logistic
scale_pos_weight: 0.24354526089884007

Optimal Threshold: 0.7444357


In [12]:
# test the model
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 score: {f1}")
print(f"ROC AUC: {roc_auc}")
print(f"Confusion matrix: ")
print(confusion)

Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 score: 1.0
ROC AUC: 1.0
Confusion matrix: 
[[ 59309      0]
 [     0 240678]]


In [13]:
# save the columns 'node1', 'node2' into arrays
node1 = missing_link_df['node_1'].values
node2 = missing_link_df['node_2'].values

# remove the columns 'node1', 'node2' from the dataframe
missing_link_to_preidct_df = missing_link_df.drop(columns=['node_1', 'node_2'])

# predict link probabilities for the missing link candidates
link_probabilities = best_model.predict_proba(missing_link_to_preidct_df)[:, 1]

# create a new dataframe with the columns 'node1', 'node2', 'link_probability', 'similarity' and clusters of the nodes
missing_link_predictions_df = pd.DataFrame({
    'node_1': node1, 
    'node_2': node2, 
    'link_probability': link_probabilities, 
    'similarity': missing_link_df['similarity'].values, 
    'cluster_node_1': missing_link_df['cluster_node_1'].values, 
    'cluster_node_2': missing_link_df['cluster_node_2'].values})

# sort the dataframe by link_probability
missing_link_predictions_df = missing_link_predictions_df.sort_values(by='link_probability', ascending=False)

# keep only the values above 0.5 probability
missing_link_predictions_df = missing_link_predictions_df[missing_link_predictions_df['link_probability'] > 0.5]

# set pd printing limits to display all the rows
pd.set_option('display.max_rows', None)
columns_to_display = ['node_1', 'node_2', 'link_probability']
print("Missing link predictions: ")
missing_link_predictions_df[columns_to_display].head(100)

Missing link predictions: 


Unnamed: 0,node_1,node_2,link_probability


In [14]:
# find missing links where either node1 or node2 are in the same cluster as the start page
start_page_cluster = cluster_labels[0]
node1_cluster_mask = missing_link_predictions_df['cluster_node_1'].values == start_page_cluster
node2_cluster_mask = missing_link_predictions_df['cluster_node_2'].values == start_page_cluster
start_page_cluster_mask = node1_cluster_mask | node2_cluster_mask

start_page_cluster_missing_links = missing_link_predictions_df[start_page_cluster_mask]

print(f'Missing links where either node1 or node2 are in the same cluster as the start page:')
start_page_cluster_missing_links[columns_to_display].head(100)


Missing links where either node1 or node2 are in the same cluster as the start page:


Unnamed: 0,node_1,node_2,link_probability


In [15]:
# find missing links that include the start page if node1 or node2 is the start page
missing_links_start_page_df = missing_link_predictions_df[(missing_link_predictions_df['node_1'] == start_page) | (missing_link_predictions_df['node_2'] == start_page)]

print(f'Missing links that include the start page:')
missing_links_start_page_df[columns_to_display].head(100)

Missing links that include the start page:


Unnamed: 0,node_1,node_2,link_probability


In [16]:
# find missing links between nodes in different clusters
different_cluster_mask = missing_link_predictions_df['cluster_node_1'].values != missing_link_predictions_df['cluster_node_2'].values

different_cluster_missing_links_df = missing_link_predictions_df[different_cluster_mask]

print(f'Missing links between nodes in different clusters:')
different_cluster_missing_links_df[columns_to_display].head(100)

Missing links between nodes in different clusters:


Unnamed: 0,node_1,node_2,link_probability
