In [24]:
import sys
import os
sys.path.append('../')
import data_readers

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math

# networkx
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout

# For semantic similarity
from urllib.parse import unquote
from sentence_transformers import SentenceTransformer
import torch

# Python functions in .py file to read data
import machine_searchers
import time

import warnings
from tqdm import TqdmWarning
warnings.filterwarnings('ignore', category=TqdmWarning)

In [25]:
G = nx.read_edgelist('../datasets/wikispeedia_paths-and-graph/links.tsv',
                                         create_using=nx.DiGraph)
pagerank = nx.pagerank(G)

## need to define a correct f function

In [26]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to get embeddings using sentence transformer
def get_embedding(text):
    return model.encode(text, convert_to_tensor=True)

# Function to perform L2 normalization on the embeddings
def l2_normalize(tensor):
    return tensor / tensor.norm(p=2, dim=0, keepdim=True)

# Function to calculate semantic similarity between two pieces of text
def semantic_similarity(word1, word2):
    embedding1 = get_embedding(word1)
    embedding2 = get_embedding(word2)

    # L2 normalization of the embeddings (to make sure, although embedding should already be normalized)
    embedding1_normalized = l2_normalize(embedding1)
    embedding2_normalized = l2_normalize(embedding2)

    # Compute and return the similarity of normalized tensors
    return torch.dot(embedding1_normalized, embedding2_normalized).item()

def get_value(G, node_value, target_value):
    similarity = semantic_similarity(node_value, target_value)
    # get pagerank of node_value in G
    node_pagerank = pagerank.get(node_value, None)
    f = similarity * node_pagerank
    return f

In [27]:
def traverse_graph(graph, start_node, target_node):
    current_node = start_node
    visited = set()  # Set to keep track of visited nodes

    for len in range(20):  # Limit to 20 moves
        if current_node == target_node:
            print(f"Target node reached in {len} moves.")
            return len

        visited.add(current_node)  # Mark the current node as visited

        # Check if the current node has neighbors
        neighbors = list(graph.neighbors(current_node))
        unvisited_neighbors = [n for n in neighbors if n not in visited]

        if unvisited_neighbors:
            # Find the neighbor with the highest value by applying get_value
            # Only consider unvisited neighbors
            next_node = max(unvisited_neighbors, key=lambda n: get_value(G, n, target_node))
            current_node = next_node
            print(f"Moving to node: {current_node}")
        else:
            print("No more unvisited neighbors to move to.")
            return len

    print("Limit of 20 nodes reached.")
    return len

In [28]:
get_value(G, 'Egypt', 'Japan')

0.0010913111552156185

In [29]:
traverse_graph(G, 'Functional_programming', 'Judaism')

Moving to node: Calculus
Moving to node: Europe
Moving to node: Christianity
Moving to node: Islam
Moving to node: France
Moving to node: United_States
Moving to node: Germany
Moving to node: Japan
Moving to node: United_Kingdom
Moving to node: Judaism
Target node reached in 10 moves.


10

# Same code in networkit

I was not able to make it work

In [30]:
import networkit as nk

# Specify the path to your file
file_path = '../datasets/wikispeedia_paths-and-graph/links.tsv'

# Create an EdgeListReader instance for a TSV file
reader = nk.graphio.EdgeListReader(separator='\t', commentPrefix='#', firstNode=0)

# Read the graph as undirected
undirected_graph = reader.read(file_path)

# Convert the graph to a directed graph
directed_graph = undirected_graph.toDirected()


RuntimeError: Scanning node failed. The file may be corrupt.

In [None]:

# Read the graph using Networkit
G = nk.readGraph('../datasets/wikispeedia_paths-and-graph/links.tsv', nk.Format.EdgeListTabZero)

# Calculate PageRank using Networkit
pagerank = nk.centrality.PageRank(G, 0.85).run().scores()

def get_value(G, node_id, target_id):
    similarity = semantic_similarity(G.nodeName(node_id), G.nodeName(target_id))
    node_pagerank = pagerank[node_id]
    f = similarity * node_pagerank
    return f

def traverse_graph(graph, start_node_id, target_node_id):
    current_node_id = start_node_id
    visited = set()  # Set to keep track of visited nodes

    for steps in range(20):  # Limit to 20 moves
        if current_node_id == target_node_id:
            print(f"Target node reached in {steps} moves.")
            return steps

        visited.add(current_node_id)  # Mark the current node as visited

        # Check if the current node has neighbors
        neighbors = graph.iterNeighbors(current_node_id)
        unvisited_neighbors = [n for n in neighbors if n not in visited]

        if unvisited_neighbors:
            # Find the neighbor with the highest value by applying get_value
            next_node_id = max(unvisited_neighbors, key=lambda n: get_value(G, n, target_node_id))
            current_node_id = next_node_id
            print(f"Moving to node: {graph.nodeName(current_node_id)}")
        else:
            print("No more unvisited neighbors to move to.")
            return steps

    print("Limit of 20 nodes reached.")
    return steps


OSError: ../datasets/wikispeedia_paths-and-graph/links.tsv is not a valid Format.EdgeListTabZero file: Scanning node failed. The file may be corrupt.