In [1]:
import sys
import os
sys.path.append('../')
import data_readers

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math

# networkx
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout

# For semantic similarity
from urllib.parse import unquote
from sentence_transformers import SentenceTransformer
import torch

# Python functions in .py file to read data
import machine_searchers
import time

import warnings
from tqdm import TqdmWarning
warnings.filterwarnings('ignore', category=TqdmWarning)
from data_readers import *

In [2]:
# Build the Wikispeedia link graph (reader presumably comes from `from data_readers import *` — confirm).
G = read_wikispeedia_graph()
# Compute PageRank once for the whole graph; get_value() looks node scores up in this mapping.
pagerank = nx.pagerank(G)

In [10]:
# Load the finished games, then keep a single row per distinct
# (first_article, last_article) combination.
finished_paths = read_finished_paths()
unique_paths = finished_paths[['first_article', 'last_article']].drop_duplicates()

# Expose the two endpoint columns under their own names.
sources, targets = unique_paths['first_article'], unique_paths['last_article']

In [13]:
# Display the deduplicated (first_article, last_article) pairs.
unique_paths

Unnamed: 0,first_article,last_article
0,14th_century,African_slave_trade
3,14th_century,Greece
4,14th_century,John_F._Kennedy
6,14th_century,Fire
7,14th_century,Rainbow
...,...,...
51313,Yagan,Civilization
51314,Yagan,Fiction
51315,Yagan,U.S._Open_%28tennis%29
51316,Yarralumla%2C_Australian_Capital_Territory,Abraham_Lincoln


## TODO: define a correct scoring function `f` (see `get_value`)

In [15]:
# Load the 'all-MiniLM-L6-v2' sentence-transformer once; get_embedding() reuses this instance on every call.
model = SentenceTransformer('all-MiniLM-L6-v2')

def get_embedding(text):
    """Encode `text` with the module-level sentence-transformer `model`.

    The encoder is asked for a tensor (`convert_to_tensor=True`), and that
    result is returned unchanged.
    """
    embedding = model.encode(text, convert_to_tensor=True)
    return embedding

def l2_normalize(tensor):
    """Divide `tensor` by its L2 norm taken along dimension 0.

    The norm keeps its reduced dimension (`keepdim=True`) so the division
    broadcasts back over the input.
    """
    l2_norm = torch.norm(tensor, p=2, dim=0, keepdim=True)
    return torch.div(tensor, l2_norm)

def semantic_similarity(word1, word2):
    """Return the dot product of the L2-normalized embeddings of two texts.

    Both inputs are embedded with get_embedding(), rescaled with
    l2_normalize(), and the resulting dot product is returned as a plain
    Python number via `.item()`.
    """
    raw_embeddings = [get_embedding(text) for text in (word1, word2)]

    # Re-normalize defensively in case the encoder output is not already unit length.
    first_unit, second_unit = map(l2_normalize, raw_embeddings)

    return torch.dot(first_unit, second_unit).item()

def get_value(G, node_value, target_value):
    """Score how attractive `node_value` is as the next hop towards `target_value`.

    The score blends semantic similarity with the node's PageRank:

    * similarity < 0.1          -> the node's PageRank alone
    * 0.1 <= similarity <= 0.5  -> similarity * PageRank
    * similarity > 0.5          -> the similarity alone

    Parameters
    ----------
    G : unused; kept so existing call sites remain valid. PageRank is read
        from the module-level `pagerank` mapping instead.
    node_value : candidate node name.
    target_value : name of the node being searched for.

    Returns
    -------
    A numeric score (never None).
    """
    similarity = semantic_similarity(node_value, target_value)

    # A node missing from the PageRank table scores 0.0 rather than None:
    # None would raise on `similarity * None` and cannot be compared by max().
    node_pagerank = pagerank.get(node_value, 0.0)

    if similarity < 0.1:
        return node_pagerank
    if similarity <= 0.5:
        return similarity * node_pagerank
    return similarity

## For a single iteration, print all nodes visited

In [58]:
def traverse_graph(graph, start_node, target_node, max_moves=20, verbose=True):
    """Greedily walk `graph` from `start_node` towards `target_node`, printing each step.

    At every step the walk moves to the not-yet-visited neighbor with the
    highest `get_value(graph, neighbor, target_node)` score. It stops when the
    target is reached, when the current node has no unvisited neighbor, or
    after `max_moves` moves.

    Parameters
    ----------
    graph : object exposing `neighbors(node)` (e.g. a networkx graph).
    start_node : node the walk starts from.
    target_node : node the walk tries to reach.
    max_moves : maximum number of moves allowed (default 20).
    verbose : when True (default) progress messages are printed.

    Returns
    -------
    (length, visited, reached_target)
        length : moves made + 1 when the walk ends on the target or at a dead
            end; `max_moves` when the move limit stops the walk.
        visited : ordered list of every node on the walk, start node included.
        reached_target : True only if the walk ended on `target_node`.
    """
    log = print if verbose else (lambda *args, **kwargs: None)

    current_node = start_node
    # The path always contains the current node, so start == target works
    # and no exit path loses the last nodes of the walk.
    visited = [current_node]
    moves = 0
    log(f"Starting at node: {current_node}")

    # Checking the target in the loop condition means a hit on the very last
    # allowed move is still recognised as a success.
    while current_node != target_node:
        if moves == max_moves:
            log(f"Limit of {max_moves} nodes reached.")
            return max_moves, visited, False

        # `visited` already holds the current node, so self-loops are excluded too.
        unvisited_neighbors = [n for n in graph.neighbors(current_node) if n not in visited]
        if not unvisited_neighbors:
            log("No more unvisited neighbors to move to.")
            return moves + 1, visited, False

        # Score against the graph that was passed in, not a notebook global.
        current_node = max(unvisited_neighbors, key=lambda n: get_value(graph, n, target_node))
        visited.append(current_node)
        moves += 1
        log(f"Moving to node: {current_node}")

    log(f"Target node reached in {moves} moves.")
    return moves + 1, visited, True

In [59]:
# Demo run. Unpack into `length` rather than `len` so the builtin len() is not
# overwritten for every later cell in the kernel session.
length, visited, reached = traverse_graph(G, '14th_century', 'Fire')

Starting at node: 14th_century
Moving to node: Europe
Moving to node: Arabic_language
Moving to node: Portuguese_language
Moving to node: United_States
Moving to node: War
Moving to node: Weapon
Moving to node: Gunpowder
Moving to node: Fire
Target node reached in 8 moves.


## For many runs: same traversal, without printing anything

In [64]:
def traverse_graph_no_print(graph, start_node, target_node, max_moves=20):
    """Silently walk `graph` greedily from `start_node` towards `target_node`.

    At every step the walk moves to the not-yet-visited neighbor with the
    highest `get_value(graph, neighbor, target_node)` score. It stops when the
    target is reached, when the current node has no unvisited neighbor, or
    after `max_moves` moves. Nothing is printed.

    Parameters
    ----------
    graph : object exposing `neighbors(node)` (e.g. a networkx graph).
    start_node : node the walk starts from.
    target_node : node the walk tries to reach.
    max_moves : maximum number of moves allowed (default 20).

    Returns
    -------
    (length, visited, reached_target)
        length : moves made + 1 when the walk ends on the target or at a dead
            end; `max_moves` when the move limit stops the walk.
        visited : ordered list of every node on the walk, start node included.
        reached_target : True only if the walk ended on `target_node`.
    """
    current_node = start_node
    # The path always contains the current node, so start == target works
    # and no exit path loses the last nodes of the walk.
    visited = [current_node]
    moves = 0

    # Checking the target in the loop condition means a hit on the very last
    # allowed move is still recognised as a success.
    while current_node != target_node:
        if moves == max_moves:
            return max_moves, visited, False

        # `visited` already holds the current node, so self-loops are excluded too.
        unvisited_neighbors = [n for n in graph.neighbors(current_node) if n not in visited]
        if not unvisited_neighbors:
            return moves + 1, visited, False

        # Score against the graph that was passed in, not a notebook global.
        current_node = max(unvisited_neighbors, key=lambda n: get_value(graph, n, target_node))
        visited.append(current_node)
        moves += 1

    return moves + 1, visited, True

In [65]:
# Number of (source, target) pairs to simulate. Each pair is slow because every
# candidate neighbor is embedded, so keep this small while iterating.
# Set to None to process every pair in `unique_paths`.
MAX_PAIRS = 4

pairs_to_run = unique_paths if MAX_PAIRS is None else unique_paths.head(MAX_PAIRS)

# One dict per simulated pair; the DataFrame is built once after the loop.
results = []

# Record the start time
start_time = time.time()

for i, (source, target) in enumerate(
    zip(pairs_to_run['first_article'], pairs_to_run['last_article'])
):
    # Greedy machine search between the two endpoints of a human game.
    length, visited, reached = traverse_graph_no_print(G, source, target)

    results.append({
        'source': source,
        'target': target,
        'reached': reached,
        'length': length,
        'visited': visited,
    })

    # Progress indicator: index of the pair just processed.
    print(i)

# Calculate the end time and the duration (seconds of wall-clock time)
end_time = time.time()
duration = end_time - start_time

# Create a DataFrame from the list of dictionaries
result_df = pd.DataFrame(results)

0
1
2
3


In [63]:
# Inspect the node sequence recorded for the row labelled 3.
# NOTE(review): this cell's execution count (63) is lower than the cells that define
# traverse_graph_no_print (64) and build result_df (65), so the saved output
# may be stale — re-run after the loop cell.
result_df.loc[3, 'visited'] 

['14th_century',
 'Europe',
 'Arabic_language',
 'Portuguese_language',
 'United_States',
 'War',
 'War',
 'Weapon',
 'Gunpowder',
 'Fire']

## Export the dataframe

In [44]:
# Write the results to 'machine_paths.csv' in the working directory, without the index column.
# NOTE(review): 'visited' holds Python lists, which presumably end up as their string
# representation in the CSV — confirm that downstream readers parse them back.
result_df.to_csv('machine_paths.csv', index=False)