# Landmark approach
The previous algorithms take a very long time to run, so we need simpler alternatives.

First idea is the landmark approach.

The gist: pick important nodes as landmarks and precompute, for every other node, the shortest distance to and from each landmark. For now, the landmarks are simply the nodes with the highest degree.

Then, to find a path from a source to a target, we add up, for each landmark, the source-to-landmark and landmark-to-target distances and pick the landmark with the smallest sum.

A final step stitches the two stored paths together to recover the actual path, and we're done.

In [None]:
import sys
import os
sys.path.append('../')
import data_readers

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math

# networkx
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout

# For semantic similarity
from urllib.parse import unquote
from sentence_transformers import SentenceTransformer
import torch

# Python functions in .py file to read data
import machine_searchers
import time

import warnings
from tqdm import TqdmWarning
# Suppress every warning whose category is TqdmWarning.
warnings.filterwarnings(action='ignore', category=TqdmWarning)

# Load the Wikispeedia link list as a directed graph (one edge per link).
wikispeedia = nx.read_edgelist(
    '../datasets/wikispeedia_paths-and-graph/links.tsv',
    create_using=nx.DiGraph,
)

In [None]:
class LandmarkSearch:
    """Approximate path finder that routes every query through a landmark.

    The highest-degree nodes of the graph are used as landmarks. For each
    landmark, the shortest path from every node to it and from it to every
    node is precomputed. A query ``source -> target`` picks the landmark that
    minimises ``len(source -> landmark) + len(landmark -> target)`` and
    concatenates the two stored paths.

    The returned path is a valid walk in the graph but not necessarily a
    shortest path, and it may visit a node more than once when the two legs
    overlap.

    Attributes:
        landmark_num: Number of landmarks requested at construction.
        landmark_node_list: Chosen landmark nodes, highest degree first.
        shortest_paths_to_node: ``{landmark: {node: path}}`` where ``path``
            runs from ``node`` to ``landmark``.
        shortest_paths_from_node: ``{landmark: {node: path}}`` where ``path``
            runs from ``landmark`` to ``node``.
        fro_df: Landmark-by-node table; ``fro_df.loc[a, b]`` is the length of
            the stored path from node ``b`` to landmark ``a``.
        to_df: Landmark-by-node table; ``to_df.loc[a, b]`` is the length of
            the stored path from landmark ``a`` to node ``b``.

    Lengths in both tables are node counts (``len(path)``), and pairs with no
    connecting path hold ``inf``.
    """

    # Length stored for a (landmark, node) pair with no connecting path.
    # Infinity can never be beaten by, or confused with, a real path length.
    _UNREACHABLE = float("inf")

    def __init__(self, graph: "nx.DiGraph", landmark_num: int = 50):
        """Pick the landmarks of ``graph`` and precompute their paths.

        Args:
            graph: Directed graph to search in.
            landmark_num: How many of the highest-degree nodes to use as
                landmarks.
        """
        # TODO: the default should be a function of the size of the graph.
        self.landmark_num = landmark_num

        self.landmark_node_list = None

        self.shortest_paths_to_node = {}
        self.shortest_paths_from_node = {}

        self.fro_df = None
        self.to_df = None

        self.get_landmark_info(graph, landmark_num)

    def get_landmark_info(self, graph: "nx.DiGraph", landmark_num: int):
        """Select landmarks and (re)build every lookup structure.

        Args:
            graph: Directed graph to analyse.
            landmark_num: Number of highest-degree nodes to keep as landmarks.
        """
        by_degree = sorted(graph.degree, key=lambda pair: pair[1], reverse=True)
        self.landmark_node_list = [node for node, _ in by_degree[:landmark_num]]

        # Rebuilt from scratch so a second call never keeps stale landmarks.
        self.shortest_paths_to_node = {
            landmark: nx.single_target_shortest_path(graph, landmark)
            for landmark in self.landmark_node_list
        }
        self.shortest_paths_from_node = {
            landmark: nx.single_source_shortest_path(graph, landmark)
            for landmark in self.landmark_node_list
        }

        nodes = list(graph.nodes)
        # "fro" = from a node towards the landmark, "to" = from the landmark
        # to a node; hence the apparently swapped source dictionaries.
        self.fro_df = self._length_table(self.shortest_paths_to_node, nodes)
        self.to_df = self._length_table(self.shortest_paths_from_node, nodes)

    @classmethod
    def _length_table(cls, paths_by_landmark, nodes):
        """Return a landmark-by-node DataFrame of stored path lengths."""
        return pd.DataFrame({
            node: {
                landmark: len(paths[node]) if node in paths else cls._UNREACHABLE
                for landmark, paths in paths_by_landmark.items()
            }
            for node in nodes
        })

    def find_shortest_path(self, source, target):
        """Return a path from ``source`` to ``target`` through the best landmark.

        Args:
            source: Start node.
            target: End node.

        Returns:
            List of nodes starting at ``source`` and ending at ``target``.

        Raises:
            KeyError: If either node is unknown, or if no landmark is
                reachable from ``source`` while also reaching ``target``.
        """
        # The column lookups double as validation: unknown nodes fail here.
        to_landmark = self.fro_df.loc[:, source]
        from_landmark = self.to_df.loc[:, target]

        if source == target:
            return [source]

        totals = to_landmark + from_landmark
        if totals.empty or not totals.min() < self._UNREACHABLE:
            raise KeyError(f"no landmark connects {source!r} to {target!r}")

        # The landmark is the middle point of the route.
        landmark = totals.idxmin()

        # Drop the landmark from the first leg; the second leg starts with it.
        start_path = self.shortest_paths_to_node[landmark][source][:-1]
        end_path = self.shortest_paths_from_node[landmark][target]

        return start_path + end_path

In [None]:
# Smoke test: build the searcher with just two landmarks.
landmark_search = LandmarkSearch(graph=wikispeedia, landmark_num=2)

In [None]:
# Example query between two articles.
landmark_search.find_shortest_path(source='DVD', target='Compact_Disc')

Okay, it runs and the result makes some sense!

The example also shows that it's not perfect: the path can loop back on itself, because the two legs (source to landmark, landmark to target) are computed independently and may share nodes. That's acceptable for now.

Re-reading the paper, what they actually do is use the landmarks as reference points for A* search...

Never mind, that makes this a new idea. I'm also pursuing it because, frustratingly, my other ideas turned out not to be usable.

Now, we'll do two things:
- Number of landmarks is arbitrary, I'll just pick a fraction of the total number of nodes
- Run the tests and get the data for the existing info

There are better ways of picking landmarks, but this is good enough as a starting point. I can also segue this into the existing results, so I'll go with it.

In [None]:
def decode_word(word):
    """Return ``word`` with underscores turned into spaces, then URL-unquoted."""
    spaced = word.replace('_', ' ')
    readable = unquote(spaced)
    return readable

# Rebuild the link graph with decoded node labels: first every node (so
# isolated nodes are kept), then every edge with both endpoints decoded.
decoded_wikispeedia = nx.DiGraph()
decoded_wikispeedia.add_nodes_from(
    decode_word(raw_label) for raw_label in wikispeedia.nodes()
)
decoded_wikispeedia.add_edges_from(
    (decode_word(tail), decode_word(head)) for tail, head in wikispeedia.edges()
)

In [None]:
# How many articles does the decoded graph contain?
decoded_wikispeedia.number_of_nodes()

I found no good justification for using more or fewer landmarks, so I'll take 46, simply because that's around 1% of the nodes.

In [None]:
# Rebuild the searcher with 46 landmarks for the actual experiment.
landmark_search = LandmarkSearch(graph=wikispeedia, landmark_num=46)

In [None]:
# Load the article pairs to evaluate. The cells below read the
# 'first_article' and 'last_article' columns of this file.
finished_paths = pd.read_csv('../paths_sample.csv')

finished_paths.head()

In [None]:
def apply_machine_landmark(row) -> list:
    """Run the landmark search for one row of article pairs.

    Reads ``row['first_article']`` and ``row['last_article']``, queries the
    module-level ``landmark_search`` and returns
    ``[source, target, number of hops, path]``.
    """
    start, goal = row['first_article'], row['last_article']
    path = landmark_search.find_shortest_path(start, goal)
    # A path of k nodes has k - 1 hops.
    hops = len(path) - 1
    return [start, goal, hops, path]

# One result row per article pair; 'expand' spreads the returned list
# across four columns, which are then given their names.
landmark_df = finished_paths.apply(
    apply_machine_landmark,
    axis='columns',
    result_type='expand',
)
landmark_df.columns = [
    'first_article',
    'last_article',
    'path_count',
    'path',
]

landmark_df.head()

In [None]:
# First node of the path stored in the row labelled 0.
landmark_df.loc[0, 'path'][0]

In [None]:
# Persist the results (the index is written as the first column).
landmark_df.to_csv(path_or_buf='landmark_method_results.csv')

Reading in the data properly is a pain. Writing this to figure out how to do it and get this organized!

In [None]:
# Read the results back. The first CSV column is the index that to_csv wrote,
# so restore it as the index instead of leaving an 'Unnamed: 0' column.
read_landmark = pd.read_csv('landmark_method_results.csv', index_col=0)

# 'path' was saved as the text of a Python list, e.g. "['A', 'B']". Drop the
# brackets and split on ', ' (comma + space) so no element keeps a leading
# space; each element is still wrapped in its quote characters.
read_landmark['path'] = read_landmark['path'].str.strip('[]').str.split(', ')
read_landmark.head()

In [None]:
# First path element of the row labelled 0, minus its surrounding quote characters.
read_landmark.loc[0, 'path'][0][1:-1]