In [None]:
import pandas as pd
import numpy as np
import math
import csv
import random

# networkx
import networkx as nx

# transformers
from sentence_transformers import SentenceTransformer
import torch


# For semantic similarity
from urllib.parse import unquote
from sklearn.metrics.pairwise import cosine_similarity

# Python functions in .py file to read data
import data_readers
import machine_searchers

In [None]:
# Article link graph (later cells read its .nodes / .edges and call successors on it)
wikispeedia = data_readers.read_wikispeedia_graph()

# Finished paths as returned by data_readers.
# NOTE(review): `finished_paths` is reassigned further down by the notebook-local
# read_finished_paths(), so this value is replaced once that cell runs.
finished_paths = data_readers.read_finished_paths()

# Unfinished paths as returned by data_readers
unfinished_paths = data_readers.read_unfinished_paths()

# All articles (a DataFrame, per the original note)
articles = data_readers.read_articles()

# All articles with their categories (a DataFrame, per the original note)
categories = data_readers.read_categories()

In [None]:
def read_finished_paths(path: str = 'paths_sample.csv') -> pd.DataFrame:
    """Load the sampled (first_article, last_article, path_count) table.

    This is the notebook-local loader; it is distinct from
    ``data_readers.read_finished_paths``.

    Parameters
    ----------
    path : str, optional
        Location of the comma-separated sample file. Defaults to
        'paths_sample.csv' in the working directory, so existing zero-argument
        calls keep working.

    Returns
    -------
    pd.DataFrame
        One row per line of the file, with columns ``first_article``,
        ``last_article`` and ``path_count``.
    """
    # Column names are supplied explicitly, so pandas treats every line of the
    # file (including the first) as data rather than as a header.
    column_names = ['first_article', 'last_article', 'path_count']
    return pd.read_csv(path, sep=',', names=column_names)

In [None]:
# Load the sampled (first_article, last_article, path_count) rows.
# This replaces any `finished_paths` loaded earlier from data_readers.
finished_paths = read_finished_paths()

# One row per distinct (first_article, last_article) pair, sorted and re-indexed
# from 0. Built as a single chain (no inplace=True) so re-running the cell always
# yields the same frame.
unique_paths = (
    finished_paths[['first_article', 'last_article']]
    .drop_duplicates()
    .sort_values(by=['first_article', 'last_article'])
    .reset_index(drop=True)
)

# Taken from the final frame so that sources[i] / targets[i] refer to the same
# pair as unique_paths row i (previously they kept the pre-sort order and index).
sources = unique_paths['first_article']
targets = unique_paths['last_article']

In [None]:
# Number of rows in the loaded finished-paths table
len(finished_paths)

In [None]:
# Number of distinct (first_article, last_article) pairs
len(unique_paths)

In [None]:
# Rows of unique_paths from positional index 573 onward.
# NOTE(review): 573 is a hard-coded offset; presumably the resume point of a
# partially completed batch run — TODO confirm and promote to a named constant.
unique_paths_573 = unique_paths[573:]

In [None]:
# Display the sliced pairs for a visual check
unique_paths_573

In [None]:
print("Dataset has", len(wikispeedia.nodes), "nodes (articles)")
print("Dataset has", len(wikispeedia.edges), "edges (links between articles)")

# PageRank score for every article in the link graph (node -> score)
dic = nx.pagerank(wikispeedia)

# Store each score on its node under the 'pagerank' attribute, which
# degree_and_sem reads via G.nodes[node]['pagerank'].
nx.set_node_attributes(wikispeedia, dic, 'pagerank')

# Show only the highest-ranked articles; printing the full dict and the full
# node view floods the notebook output with one entry per article.
top_pagerank = sorted(dic.items(), key=lambda item: item[1], reverse=True)[:10]
print(top_pagerank)

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Embeddings already computed by `model` in this session, keyed by input string.
# The graph search scores every child against the same target over and over, so
# without this cache the same strings are re-encoded many times.
# Re-running this cell reloads the model and clears the cache together.
_embedding_cache = {}

# Function to get embeddings using sentence transformer
def get_embedding(text):
    """Return the sentence-transformer embedding of `text` as a tensor.

    String inputs are memoised in ``_embedding_cache``: the first call encodes
    the string with ``model.encode(..., convert_to_tensor=True)`` and later
    calls return the stored tensor. Any non-string input is passed straight to
    ``model.encode`` without caching.

    The returned tensor for a string is shared between calls, so callers must
    not modify it in place.
    """
    if not isinstance(text, str):
        # Non-string inputs (e.g. lists) are not usable as dict keys; encode directly.
        return model.encode(text, convert_to_tensor=True)

    cached = _embedding_cache.get(text)
    if cached is None:
        cached = model.encode(text, convert_to_tensor=True)
        _embedding_cache[text] = cached
    return cached

# Function to perform L2 normalization on the embeddings
def l2_normalize(tensor):
    """Scale `tensor` to unit L2 norm along dimension 0.

    Parameters
    ----------
    tensor : torch.Tensor
        Floating-point tensor; the norm is taken over ``dim=0``.

    Returns
    -------
    torch.Tensor
        ``tensor`` divided by its L2 norm. An all-zero input is returned as
        zeros instead of NaN.
    """
    norm = tensor.norm(p=2, dim=0, keepdim=True)
    # Clamp the divisor: a zero vector would otherwise produce 0/0 = NaN, which
    # then silently poisons every similarity computed from it.
    return tensor / norm.clamp_min(1e-12)

# Similarity score between two pieces of text, based on their embeddings
def semantic_similarity(word1, word2):
    """Return the dot product of the L2-normalized embeddings of two texts.

    Each text is embedded with ``get_embedding`` and passed through
    ``l2_normalize`` (a safeguard; the embeddings are expected to be normalized
    already). The dot product of the two resulting vectors is returned as a
    plain Python number.
    """
    first_unit, second_unit = (
        l2_normalize(get_embedding(text)) for text in (word1, word2)
    )
    score = torch.dot(first_unit, second_unit)
    return score.item()

In [None]:
# Quick check of the similarity function on one pair of terms
semantic_similarity("Japan","Asia")

In [None]:
# If this works, we can run it on a sample of pairs to see how the search
# results change as ref_similarity varies.

def degree_and_sem(G: "nx.DiGraph", source: str, target: str, ref_similarity=0.3,
                   max_path_length=25):
    """Greedy walk from `source` towards `target` using similarity and PageRank.

    At each step the walk looks at the successors of the current node:

    * if `target` is among them, it is appended to the path and the search ends;
    * otherwise, among the successors not yet visited, the one whose name is
      most semantically similar to `target` is chosen, provided that similarity
      is at least `ref_similarity`;
    * if no unvisited successor reaches `ref_similarity`, the unvisited
      successor with the highest 'pagerank' node attribute is chosen;
    * if every successor has already been visited, a successor is picked at
      random.

    Parameters
    ----------
    G : nx.DiGraph
        Directed link graph. Must provide ``successors`` and a 'pagerank'
        attribute on every node that may be chosen via the PageRank fallback.
    source, target : str
        Start and goal node names.
    ref_similarity : float, optional
        Similarity threshold for following the most similar successor. The
        original author's note suggests using the average semantic similarity.
    max_path_length : int, optional
        The search gives up once the path holds this many nodes (default 25).

    Returns
    -------
    tuple
        ``(source, target, found, len(path), path)`` where `path` lists the
        nodes in visiting order. `found` is False when the length limit is hit
        or the walk reaches a node with no successors.
    """
    visited = set()
    path = []
    current_node = source

    while True:
        visited.add(current_node)
        path.append(current_node)

        # Check success before the length cap so a path that ends on the target
        # is never reported as a failure.
        if current_node == target:
            return source, target, True, len(path), path

        if len(path) >= max_path_length:
            return source, target, False, len(path), path

        children = list(G.successors(current_node))

        # Dead end: nowhere to go from here (random.choice would raise on []).
        if not children:
            return source, target, False, len(path), path

        # A direct link to the target always wins.
        if target in children:
            visited.add(target)
            path.append(target)
            return source, target, True, len(path), path

        # Build the candidate list without mutating `children` while scanning
        # it; removing during iteration skipped the element after each visited
        # child.
        candidates = [child for child in children if child not in visited]

        if not candidates:
            # Every successor was already visited: take a random hop to escape.
            current_node = random.choice(children)
            continue

        # Semantic similarity of each candidate to the target
        similarity = {child: semantic_similarity(child, target) for child in candidates}
        best_by_similarity = max(similarity, key=similarity.get)

        if similarity[best_by_similarity] >= ref_similarity:
            # Follow the most similar successor
            current_node = best_by_similarity
        else:
            # Nothing similar enough: follow the successor with the highest PageRank
            current_node = max(candidates, key=lambda child: G.nodes[child]['pagerank'])

In [None]:
# Try the search on a single source/target pair
degree_and_sem(wikispeedia,'Boeing_747','Hawk-Eye')

In [None]:
# Batch run: search every pair in unique_paths_573 and write one CSV row per pair.
# The output is named 'carol573.csv' and unique_paths_573 (rows from index 573 on)
# was prepared above but never used, while the loop walked all of unique_paths;
# iterate the slice so the file content matches its name.
# NOTE(review): switch back to unique_paths if a full run was actually intended.
with open('carol573.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['source', 'target', 'reached', 'length', 'visited'])

    # itertuples avoids building a Series per row, unlike iterrows
    for pair in unique_paths_573.itertuples(index=False):
        # degree_and_sem returns (source, target, found, length, path), which
        # lines up with the header columns above.
        machine_result = degree_and_sem(wikispeedia, pair.first_article, pair.last_article)
        writer.writerow(machine_result)

In [None]:
#df = pd.read_csv('carol.csv')


In [None]:
#df

In [None]:
#degree_and_sem(wikispeedia,"Quito","Water")

In [None]:
#machine_searchers.modded_astar_path(wikispeedia,"Quito","Water")

In [None]:
#degree_and_sem(wikispeedia,"Switzerland","Ant")

In [None]:
#machine_searchers.modded_astar_path(wikispeedia,"Switzerland","Ant")

In [None]:
#degree_and_sem(wikispeedia,"Space_Shuttle_Columbia_disaster","Indus_Valley_Civilization")

In [None]:
#machine_searchers.modded_astar_path(wikispeedia,"Space_Shuttle_Columbia_disaster","Indus_Valley_Civilization")

In [None]:
#degree_and_sem(wikispeedia,"Western_Roman_Empire","Alcohol")

In [None]:
#degree_and_sem(wikispeedia,'14th_century', 'Fire')

In [None]:
#machine_searchers.modded_astar_path(wikispeedia,'14th_century', 'Fire')