In [65]:
import pandas as pd
import numpy as np
import math
import csv
import random

# networkx
import networkx as nx

# transformers
from sentence_transformers import SentenceTransformer
import torch


# For semantic similarity
from urllib.parse import unquote
from sklearn.metrics.pairwise import cosine_similarity

# Python functions in .py file to read data
import data_readers
import machine_searchers

In [45]:
# The links and edges
wikispeedia = data_readers.read_wikispeedia_graph()

# The finished paths
finished_paths = data_readers.read_finished_paths()

# The unfinished paths
unfinished_paths = data_readers.read_unfinished_paths()

# DF of all articles
articles = data_readers.read_articles()

# DF of all articles and their categories
categories = data_readers.read_categories()

In [46]:
def read_finished_paths() -> pd.DataFrame:
    paths_finished = pd.read_csv('paths_sample.csv', sep=',', 
                                 names=['first_article','last_article','path_count'])
    return paths_finished

In [47]:
finished_paths = read_finished_paths()
unique_paths = finished_paths[['first_article', 'last_article']].drop_duplicates()
sources = unique_paths['first_article']
targets = unique_paths['last_article']
unique_paths.sort_values(by=['first_article', 'last_article'], inplace=True)
unique_paths.reset_index(inplace=True, drop=True)

In [57]:
len(finished_paths)

3861

In [48]:
len(unique_paths)

3861

In [59]:
unique_paths_573 = unique_paths[573:]

In [60]:
unique_paths_573

Unnamed: 0,first_article,last_article
573,Boeing_747,Hawk-Eye
574,Bolivia,Henry_IV_of_England
575,Bolivia,Pac-Man
576,Bolivia,Platinum
577,Bolivia,Universe
...,...,...
3856,Zinc,American_Revolutionary_War
3857,Zinc_chloride,Age_of_Enlightenment
3858,Zulu,Aluminium
3859,Zulu,Arctic_Circle


In [49]:
print("Dataset has", len(wikispeedia.nodes), "nodes (articles)")
print("Dataset has", len(wikispeedia.edges), "edges (links between articles)")

dic = nx.pagerank(wikispeedia)
print(dic)

for node in [nodo for nodo in wikispeedia.nodes()]:
    wikispeedia.nodes[node]['pagerank'] = dic[node]
print(wikispeedia.nodes(data=True))

Dataset has 4592 nodes (articles)
Dataset has 119882 edges (links between articles)
{'%C3%81ed%C3%A1n_mac_Gabr%C3%A1in': 3.2710390395592254e-05, 'Bede': 0.00021938161316650256, 'Columba': 0.00012116881897593145, 'D%C3%A1l_Riata': 0.000105560154678038, 'Great_Britain': 0.0015061292462843427, 'Ireland': 0.001899965414851622, 'Isle_of_Man': 0.00046032088563309313, 'Monarchy': 0.0008133221072287704, 'Orkney': 0.00028994588516574057, 'Picts': 0.00019759121101826146, 'Scotland': 0.0021425304277849183, 'Wales': 0.001099763207428743, '%C3%85land': 3.2710390395592254e-05, '20th_century': 0.0023615390664036124, 'Baltic_Sea': 0.0006497932466591777, 'Crimean_War': 0.0002201292664638352, 'Currency': 0.0032371559196565525, 'Euro': 0.0012495337101871686, 'European_Union': 0.002318582455038707, 'Finland': 0.001009621435681574, 'League_of_Nations': 0.0005769703027683626, 'List_of_countries_by_system_of_government': 0.0028333266533963103, 'Nationality': 0.00022617861790487603, 'Parliamentary_system': 0.

In [50]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to get embeddings using sentence transformer
def get_embedding(text):
    return model.encode(text, convert_to_tensor=True)

# Function to perform L2 normalization on the embeddings
def l2_normalize(tensor):
    return tensor / tensor.norm(p=2, dim=0, keepdim=True)

# Function to calculate semantic similarity between two pieces of text
def semantic_similarity(word1, word2):
    embedding1 = get_embedding(word1)
    embedding2 = get_embedding(word2)

    # L2 normalization of the embeddings (to make sure, although embedding should already be normalized)
    embedding1_normalized = l2_normalize(embedding1)
    embedding2_normalized = l2_normalize(embedding2)

    # Compute and return the similarity of normalized tensors
    return torch.dot(embedding1_normalized, embedding2_normalized).item()

In [51]:
semantic_similarity("Japan","Asia")

0.752111554145813

In [66]:
# si funciona, podemos hacer sample para mirar resultados de ir buscando cambiando la ref_similarity

def degree_and_sem(G: nx.Graph, source: str, target: str, ref_similarity=0.3):
 # ref_similarity should be the avg sem dist

   visited = set([])
   current_children = []
   sem_sim_childr = {}
   max_page_childr = {}
   path = []

   current_node = source 

   found = False

   while(not found):
      visited.add(current_node)
      path.append(current_node)
      
      if len(path) >= 25:
         return source, target, found, len(path), path

      if current_node == target:
        found = True
        return source, target, found, len(path), path,

      current_children = list(G.successors(current_node))
         
      sem_sim_childr = {}
      max_page_childr = {}
      # store in a dic each child and its sem sim
      for c in current_children:
         if c == target:
            found = True
            visited.add(c)
            path.append(c)
            return source, target, found, len(path), path,
         # compute semantic similarity
         elif c in visited:
            current_children.remove(c)
         else:
            semsim = semantic_similarity(c,target)
            #store it with ID and sem sim
            sem_sim_childr[c] = semsim
            # compute semantic similarity
            pagerank = G.nodes[c]['pagerank']
            #store it with ID and sem sim
            max_page_childr[c] = pagerank
      if sem_sim_childr:
         max_node = max(sem_sim_childr, key=sem_sim_childr.get)
         max_sim = sem_sim_childr[max_node]
         if max_sim >= ref_similarity:
            #se va al de mayor similarity
               current_node = max_node
         else:
            # se va al de mayor degree
            max_node = max(max_page_childr, key=max_page_childr.get)
            current_node = max_node
      else:
         current_children = list(G.successors(current_node))
         current_node = random.choice(current_children)

In [67]:
degree_and_sem(wikispeedia,'Boeing_747','Hawk-Eye')

('Boeing_747',
 'Hawk-Eye',
 False,
 25,
 ['Boeing_747',
  'United_States',
  'Bald_Eagle',
  'Eagle',
  'Bird_of_prey',
  'Hawk',
  'Bird',
  'Falcon',
  'Peregrine_Falcon',
  'Crow',
  'Dove',
  'Animal',
  'Spider',
  'Beetle',
  'Wasp',
  'Insect',
  'Cretaceous',
  'Shark',
  'Eye',
  'Eyelid',
  'Eye',
  'Retina',
  'Evolution',
  'Stephen_Jay_Gould',
  'United_Kingdom'])

In [68]:
with open('carol573.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['source',	'target',	'reached',	'length',	'visited']) 

    for index, row in unique_paths.iterrows():
        source = row['first_article']
        target = row['last_article']
        
        machine_result = degree_and_sem(wikispeedia, source, target)
        writer.writerow(machine_result) 

NetworkXError: The node first_article is not in the digraph.

In [17]:
#df = pd.read_csv('carol.csv')


In [18]:
#df

Unnamed: 0,source,target,reached,length,visited
0,%E2%82%AC2_commemorative_coins,Irish_Sea,True,3,"['%E2%82%AC2_commemorative_coins', 'Atlantic_O..."
1,10th_century,11th_century,True,2,"['10th_century', '11th_century']"


In [12]:
#degree_and_sem(wikispeedia,"Quito","Water")

('Quito',
 'Water',
 True,
 7,
 ['Quito', 'Basketball', 'Sport', 'Football', 'Sphere', 'Earth', 'Water'])

In [13]:
#machine_searchers.modded_astar_path(wikispeedia,"Quito","Water")

(['Quito', 'Louisville%2C_Kentucky', 'Water'],
 {'Quito': None,
  'Andes': 'Quito',
  'Basketball': 'Quito',
  'Bolivia': 'Quito',
  'Canada': 'Quito',
  'Capital': 'Quito',
  'Cotopaxi': 'Quito',
  'Earthquake': 'Quito',
  'Ecuador': 'Quito',
  'Europe': 'Quito',
  'Florida': 'Quito',
  'Football_%28soccer%29': 'Quito',
  'Francis_of_Assisi': 'Quito',
  'Gold': 'Quito',
  'Inca_Empire': 'Quito',
  'Lima': 'Quito',
  'Louisville%2C_Kentucky': 'Quito',
  'Water': 'Louisville%2C_Kentucky'})

In [14]:
#degree_and_sem(wikispeedia,"Switzerland","Ant")

('Switzerland',
 'Ant',
 True,
 18,
 ['Switzerland',
  'Latin',
  'Latin_alphabet',
  'Alphabet',
  'Science',
  'Atom',
  'Rubidium',
  'Electron',
  'Amber',
  'Aristotle',
  'Moon',
  'Apollo_11',
  'Apollo_8',
  'Santa_Claus',
  'Toy',
  'Animal',
  'Insect',
  'Ant'])

In [178]:
#machine_searchers.modded_astar_path(wikispeedia,"Switzerland","Ant")

(['Switzerland', 'Brussels', 'Iron', 'Organism', 'Ant'],
 {'Switzerland': None,
  'Austria': 'Switzerland',
  'Brussels': 'Switzerland',
  'Cheese': 'Switzerland',
  'Climate': 'Switzerland',
  'Conflict': 'Switzerland',
  'Currency': 'Switzerland',
  'Denmark': 'Switzerland',
  'Energy': 'Switzerland',
  'English_language': 'Switzerland',
  'Europe': 'Switzerland',
  'European_Union': 'Switzerland',
  'Finance': 'Switzerland',
  'France': 'Switzerland',
  'French_Revolution': 'Switzerland',
  'French_language': 'Switzerland',
  'German_language': 'Switzerland',
  'Germany': 'Switzerland',
  'Glacier': 'Switzerland',
  'Holy_Roman_Empire': 'Switzerland',
  'Iceland': 'Switzerland',
  'Industrial_Revolution': 'Switzerland',
  'Islam': 'Switzerland',
  'Italy': 'Switzerland',
  'Japan': 'Switzerland',
  'Latin': 'Switzerland',
  'League_of_Nations': 'Switzerland',
  'Liechtenstein': 'Switzerland',
  'List_of_countries_by_system_of_government': 'Switzerland',
  'Norway': 'Switzerland',
  

In [173]:
#degree_and_sem(wikispeedia,"Space_Shuttle_Columbia_disaster","Indus_Valley_Civilization")

(['Space_Shuttle_Columbia_disaster', 'India', 'Indus_Valley_Civilization'],
 {'India', 'Indus_Valley_Civilization', 'Space_Shuttle_Columbia_disaster'})

In [179]:
#machine_searchers.modded_astar_path(wikispeedia,"Space_Shuttle_Columbia_disaster","Indus_Valley_Civilization")

(['Space_Shuttle_Columbia_disaster', 'India', 'Indus_Valley_Civilization'],
 {'Space_Shuttle_Columbia_disaster': None,
  'Amarillo%2C_Texas': 'Space_Shuttle_Columbia_disaster',
  'Asteroid': 'Space_Shuttle_Columbia_disaster',
  'Earth%27s_atmosphere': 'Space_Shuttle_Columbia_disaster',
  'George_W._Bush': 'Space_Shuttle_Columbia_disaster',
  'Gravitation': 'Space_Shuttle_Columbia_disaster',
  'Hurricane_Katrina': 'Space_Shuttle_Columbia_disaster',
  'India': 'Space_Shuttle_Columbia_disaster',
  'Indus_Valley_Civilization': 'India'})

In [174]:
#degree_and_sem(wikispeedia,"Western_Roman_Empire","Alcohol")

(['Western_Roman_Empire',
  'Cologne',
  'Bridge',
  'Steel',
  'Water',
  'Ethanol',
  'Alcohol'],
 {'Alcohol',
  'Bridge',
  'Cologne',
  'Ethanol',
  'Steel',
  'Water',
  'Western_Roman_Empire'})

In [181]:
#degree_and_sem(wikispeedia,'14th_century', 'Fire')

(['14th_century',
  'Time',
  'Light',
  'Heat',
  'Energy',
  'Rain',
  'Sun',
  'Gas',
  'Phase_%28matter%29',
  'Metal',
  'Iron',
  'Color',
  'Eye',
  'Oxygen',
  'Fire'],
 {'14th_century',
  'Color',
  'Energy',
  'Eye',
  'Fire',
  'Gas',
  'Heat',
  'Iron',
  'Light',
  'Metal',
  'Oxygen',
  'Phase_%28matter%29',
  'Rain',
  'Sun',
  'Time'})

In [180]:
#machine_searchers.modded_astar_path(wikispeedia,'14th_century', 'Fire')

(['14th_century', 'China', 'Gunpowder', 'Fire'],
 {'14th_century': None,
  '13th_century': '14th_century',
  '15th_century': '14th_century',
  'Abacus': '14th_century',
  'Aztec': '14th_century',
  'Black_Death': '14th_century',
  'Buddha': '14th_century',
  'China': '14th_century',
  'Christianity': '14th_century',
  'Dante_Alighieri': '14th_century',
  'Dark_Ages': '14th_century',
  'Edward_III_of_England': '14th_century',
  'England': '14th_century',
  'English_peasants%27_revolt_of_1381': '14th_century',
  'Europe': '14th_century',
  'France': '14th_century',
  'Hundred_Years%27_War': '14th_century',
  'Ibn_Battuta': '14th_century',
  'India': '14th_century',
  'Islam': '14th_century',
  'Italy': '14th_century',
  'Lithuania': '14th_century',
  'Ming_Dynasty': '14th_century',
  'Niger': '14th_century',
  'Ottoman_Empire': '14th_century',
  'Poland': '14th_century',
  'Pope': '14th_century',
  'Renaissance': '14th_century',
  'Singapore': '14th_century',
  'Time': '14th_century',
  