In [16]:
import importlib.util
if importlib.util.find_spec("sentence_transformers") is None:
    !pip install sentence-transformers

In [17]:
!pip install scipy==1.11.3
import scipy
print("SciPy version:", scipy.__version__)

SciPy version: 1.11.3


In [18]:
import scipy
import numpy as np
print("SciPy version:", scipy.__version__)
print("NumPy version:", np.__version__)

import sys
import os
import pandas as pd
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout
from urllib.parse import unquote
from sentence_transformers import SentenceTransformer
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import time
import warnings
from tqdm import TqdmWarning
warnings.filterwarnings('ignore', category=TqdmWarning)

SciPy version: 1.11.3
NumPy version: 1.24.3


In [19]:
sys.version

'3.10.12 | packaged by conda-forge | (main, Jun 23 2023, 22:40:32) [GCC 12.3.0]'

In [20]:
import logging

# Assuming the library uses the standard logging module
logger = logging.getLogger('sentence_transformers')  # Replace 'library_name' with the actual name
logger.setLevel(logging.ERROR)  # Set to ERROR or CRITICAL to reduce verbosity
logger2 = logging.getLogger('SentenceTransformer')  # Replace 'library_name' with the actual name
logger2.setLevel(logging.ERROR)  # Set to ERROR or CRITICAL to reduce verbosity


In [21]:
def read_wikispeedia_graph() -> nx.Graph:
    wikispeedia = nx.read_edgelist('../input/links.tsv',
                                         create_using=nx.DiGraph)
    return wikispeedia


def read_finished_paths() -> pd.DataFrame:
    paths_finished = pd.read_csv('../input/paths_finished.tsv', sep='\t', skiprows=15,
                                 names=['hashedIpAddress', 'timestamp', "durationInSec", 'path', "rating"])
    paths_finished['first_article'] = paths_finished['path'].apply(lambda x: x.split(';')[0])
    paths_finished['last_article'] = paths_finished['path'].apply(lambda x: x.split(';')[-1])
    paths_finished['path_length'] = paths_finished['path'].apply(lambda x: len(x.split(';')))
    paths_finished['date'] = pd.to_datetime(paths_finished['timestamp'], unit='s')
    return paths_finished

In [22]:
G = read_wikispeedia_graph()
pagerank = nx.pagerank(G)

In [23]:
finished_paths = read_finished_paths()
unique_paths = finished_paths[['first_article', 'last_article']].drop_duplicates()
sources = unique_paths['first_article']
targets = unique_paths['last_article']
unique_paths.sort_values(by=['first_article', 'last_article'], inplace=True)
unique_paths.reset_index(inplace=True, drop=True)

In [24]:
df = finished_paths[['first_article', 'last_article']].copy()
df['path'] = df['first_article'] + '_' + df['last_article']
df['path_count'] = df.groupby('path')['path'].transform('count')
df.drop_duplicates(subset='path', inplace=True)
df = df.sort_values('path_count', ascending = False)
df = df[df['path_count']>2][['first_article', 'last_article', 'path_count']]
df.reset_index(drop=True, inplace=True)
df.to_csv('paths_sample.csv')
df

Unnamed: 0,first_article,last_article,path_count
0,Asteroid,Viking,1043
1,Brain,Telephone,1040
2,Theatre,Zebra,905
3,Pyramid,Bean,642
4,Batman,Wood,148
...,...,...,...
3855,Tin,Political_philosophy,3
3856,Carcinus_maenas,Riyadh,3
3857,Afrikaans,Invertebrate,3
3858,Botswana,Duran_Duran,3


In [25]:
unique_paths.head()

Unnamed: 0,first_article,last_article
0,%E2%82%AC2_commemorative_coins,Irish_Sea
1,10th_century,11th_century
2,10th_century,Banknote
3,10th_century,Country
4,10th_century,Harlem_Globetrotters


In [26]:
paths_sample = unique_paths.sample(5000).reset_index(drop=True)
paths_sample.to_csv('paths_sample.csv')

In [27]:
paths_sample

Unnamed: 0,first_article,last_article
0,Lake_Tahoe,All_Blacks
1,Mercantilism,Linux
2,Where_Did_Our_Love_Go,Boundary_Waters_Canoe_Area_Wilderness
3,Electromagnetic_radiation,William_McKinley
4,16th_century,Adolf_Hitler
...,...,...
4995,Ragtime,German_language
4996,Miles_Davis,Impressionism
4997,Trapdoor_spider,Eris_%28dwarf_planet%29
4998,J._R._R._Tolkien,Star_Wars_Episode_IV__A_New_Hope


## need to define a correct f function

In [28]:
model = SentenceTransformer('all-MiniLM-L6-v2').to(device)

# Function to get embeddings using sentence transformer
def get_embedding(text):
    return model.encode(text, convert_to_tensor=True)

# Function to perform L2 normalization on the embeddings
def l2_normalize(tensor):
    return tensor / tensor.norm(p=2, dim=0, keepdim=True)

# Function to calculate semantic similarity between two pieces of text
def semantic_similarity(word1, word2):
    embedding1 = get_embedding(word1)
    embedding2 = get_embedding(word2)

    # L2 normalization of the embeddings (to make sure, although embedding should already be normalized)
    embedding1_normalized = l2_normalize(embedding1)
    embedding2_normalized = l2_normalize(embedding2)

    # Compute and return the similarity of normalized tensors
    return torch.dot(embedding1_normalized, embedding2_normalized).item()

def get_value(G, node_value, target_value):
    similarity = semantic_similarity(node_value, target_value)

    # get pagerank of node_value in G
    node_pagerank = pagerank.get(node_value, None)
    if similarity < 0.1:
        f = node_pagerank
    elif 0.1 <= similarity <= 0.5:
        f = similarity * node_pagerank
    else:
        f = similarity
    #print(node_value, target_value, f)
    return f

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## For a single iteration, print all nodes visited

In [29]:
def traverse_graph(graph, start_node, target_node):
    current_node = start_node
    visited = []  # List to keep track of visited nodes
    previous_node = start_node
    reached_target = False
    print(f"Starting at node: {current_node}")

    for length in range(20):  # Limit to 20 moves
        if current_node == target_node:
            print(f"Target node reached in {length} moves.")
            visited.append(previous_node)
            visited.append(current_node)
            reached_target = True
            return length+1, visited, reached_target

        if length!=0: visited.append(previous_node)  # Mark the previous node as visited
        previous_node = current_node  # We do it like this so we can return to the previous node

        # Check if the current node has neighbors
        neighbors = list(graph.neighbors(current_node))
        unvisited_neighbors = [n for n in neighbors if n not in visited and n != current_node]

        if unvisited_neighbors:
            # Find the neighbor with the highest value by applying get_value
            next_node = max(unvisited_neighbors, key=lambda n: get_value(G, n, target_node))
            current_node = next_node
            print(f"Moving to node: {current_node}")
        else:
            print("No more unvisited neighbors to move to.")
            return length+1, visited, reached_target

    print("Limit of 20 nodes reached.")
    return length+1, visited, reached_target

In [30]:
length, visited, reached =traverse_graph(G, '14th_century', 'Currency')

Starting at node: 14th_century
Moving to node: Europe
Moving to node: United_Kingdom
Moving to node: Currency
Target node reached in 3 moves.


## For more iterations, don't print anything

In [31]:
def traverse_graph_no_print(graph, start_node, target_node):
    current_node = start_node
    visited = []  # List to keep track of visited nodes
    previous_node = start_node
    reached_target = False
    #print(f"Starting at node: {current_node}")

    for length in range(20):  # Limit to 20 moves
        if current_node == target_node:
            #print(f"Target node reached in {len} moves.")
            visited.append(previous_node)
            visited.append(current_node)
            reached_target = True
            return length+1, visited, reached_target

        if length!=0: visited.append(previous_node)  # Mark the previous node as visited
        previous_node = current_node  # We do it like this so we can return to the previous node

        # Check if the current node has neighbors
        neighbors = list(graph.neighbors(current_node))
        unvisited_neighbors = [n for n in neighbors if n not in visited and n != current_node]

        if unvisited_neighbors:
            # Find the neighbor with the highest value by applying get_value
            next_node = max(unvisited_neighbors, key=lambda n: get_value(G, n, target_node))
            current_node = next_node
            #print(f"Moving to node: {current_node}")
        else:
            #print("No more unvisited neighbors to move to.")
            return length+1, visited, reached_target

    #print("Limit of 20 nodes reached.")
    return length+1, visited, reached_target

In [32]:
from itertools import islice

# Initialize empty lists to store results
results = []
i = 0

# Record the start time
start_time = time.time()

#for index, row in df.iterrows():
for index, row in islice(df.iterrows(), 1000, None):

    source = row['first_article']
    target = row['last_article']

    # Assuming you have the traverse_graph function as described earlier
    length, visited, reached = traverse_graph_no_print(G, source, target)

    # Create a dictionary for the current result
    result_dict = {
        'source': source,
        'target': target,
        'reached': reached,
        'length': length,
        'visited': visited
    }

    results.append(result_dict)

    print(index)
    i += 1
    if i > 1000:
        break


# Calculate the end time and the duration
end_time = time.time()
duration = end_time - start_time
print("Duration: ", duration)

# Create a DataFrame from the list of dictionaries
result_df = pd.DataFrame(results)

KeyboardInterrupt: 

In [None]:
result_df

## Export the dataframe

In [None]:
result_df.to_csv('machine_paths.csv', index=False)