# Why?
Idea of this notebook is to give the template that will be run on how to get all of the machine data.

Namely, finding the paths and explorations that the two models take. 
Ideally it'll be upgraded to work with different models too, but that's for later

In [1]:
import sys
import os
sys.path.append('../')
import data_readers

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math

# networkx
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout

# For semantic similarity
from urllib.parse import unquote
from sentence_transformers import SentenceTransformer
import torch

# Python functions in .py file to read data
import machine_searchers
import time

import warnings
from tqdm import TqdmWarning
warnings.filterwarnings('ignore', category=TqdmWarning)

In [2]:
finished_paths = pd.read_csv('../datasets/wikispeedia_paths-and-graph/paths_finished.tsv', sep='\t', skiprows=15,
                                 names=['hashedIpAddress', 'timestamp', "durationInSec", 'path', "rating"])
finished_paths['first_article'] = finished_paths['path'].apply(lambda x: x.split(';')[0])
finished_paths['last_article'] = finished_paths['path'].apply(lambda x: x.split(';')[-1])
finished_paths['path_length'] = finished_paths['path'].apply(lambda x: len(x.split(';')))
finished_paths['date'] = pd.to_datetime(finished_paths['timestamp'], unit='s')

In [3]:
# How many each pair of articles has been visited
article_combinations_count = finished_paths.groupby(['first_article', 'last_article']).size().reset_index(name='count')

# The mean and std of the path length for each pair of articles
article_combinations_stats = finished_paths.groupby(['first_article', 'last_article'])['path_length'].agg(['mean', 'std']).reset_index()
article_combinations_stats['std'] = article_combinations_stats['std'].fillna(0)
article_combinations_stats.rename(columns={'mean': 'mean_length', 'std': 'std_length'}, inplace=True)

# The mean and std of the rating for each pair of articles. 
# Note that mean and std may be nan if there are nan ratings. We purposely leave them as nan, as we don't want to fill them with 0s or 1s.
# Depending on the application, we could change this in the future if neeeded.
rating_combinations_stats_rating = finished_paths.groupby(['first_article', 'last_article'])['rating'].agg(['mean', 'std']).reset_index()
#rating_combinations_stats_rating['std'] = rating_combinations_stats_rating['std'].fillna(0)
mask = rating_combinations_stats_rating['mean'].notnull()
rating_combinations_stats_rating.loc[mask, 'std'] = rating_combinations_stats_rating.loc[mask, 'std'].fillna(0)
rating_combinations_stats_rating.rename(columns={'mean': 'mean_rating', 'std': 'std_rating'}, inplace=True)

# The mean and std of the time for each pair of articles.
rating_combinations_stats_time = finished_paths.groupby(['first_article', 'last_article'])['durationInSec'].agg(['mean', 'std']).reset_index()
rating_combinations_stats_time['std'] = rating_combinations_stats_time['std'].fillna(0)
rating_combinations_stats_time.rename(columns={'mean': 'mean_durationInSec', 'std': 'std_durationInSec'}, inplace=True)

# Merging all the dataframes
article_combinations = pd.merge(article_combinations_count, article_combinations_stats, on=['first_article', 'last_article'])
article_combinations = pd.merge(article_combinations, rating_combinations_stats_rating, on=['first_article', 'last_article'])
article_combinations = pd.merge(article_combinations, rating_combinations_stats_time, on=['first_article', 'last_article'])

# The number of unique sources and targets
unique_sources = finished_paths['first_article'].value_counts().reset_index()
unique_targets = finished_paths['last_article'].value_counts().reset_index()

In [4]:
article_combinations.head()

Unnamed: 0,first_article,last_article,count,mean_length,std_length,mean_rating,std_rating,mean_durationInSec,std_durationInSec
0,%E2%82%AC2_commemorative_coins,Irish_Sea,1,3.0,0.0,1.0,0.0,15.0,0.0
1,10th_century,11th_century,3,2.0,0.0,2.333333,2.309401,4.333333,1.527525
2,10th_century,Banknote,1,5.0,0.0,3.0,0.0,48.0,0.0
3,10th_century,Country,1,3.0,0.0,1.0,0.0,15.0,0.0
4,10th_century,Harlem_Globetrotters,2,4.5,0.707107,2.0,0.0,75.0,24.041631


In [5]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to get embeddings using sentence transformer
def get_embedding(text):
    return model.encode(text, convert_to_tensor=True)

# Function to perform L2 normalization on the embeddings
def l2_normalize(tensor):
    return tensor / tensor.norm(p=2, dim=0, keepdim=True)

# Function to calculate semantic similarity between two pieces of text
def semantic_similarity(word1, word2):
    embedding1 = get_embedding(word1)
    embedding2 = get_embedding(word2)

    # L2 normalization of the embeddings (to make sure, although embedding should already be normalized)
    embedding1_normalized = l2_normalize(embedding1)
    embedding2_normalized = l2_normalize(embedding2)

    # Compute and return the similarity of normalized tensors
    return torch.dot(embedding1_normalized, embedding2_normalized).item()


In [6]:
wikispeedia= nx.read_edgelist('../datasets/wikispeedia_paths-and-graph/links.tsv',
                              create_using=nx.DiGraph)

def decode_word(word):
    word = word.replace('_', ' ')
    return unquote(word)

# Create a new graph with decoded node labels
decoded_wikispeedia = nx.DiGraph()

for node in wikispeedia.nodes():
    decoded_node = decode_word(node)
    decoded_wikispeedia.add_node(decoded_node)

# Copy the edges from the original graph to the new graph with decoded node labels
for edge in wikispeedia.edges():
    decoded_edge = tuple(decode_word(node) for node in edge)
    decoded_wikispeedia.add_edge(*decoded_edge)

start_time = time.time()
lib_path_1, lib_explore_1 = machine_searchers.modded_astar_path(wikispeedia, 'Actor', 'Japan', heuristic=semantic_similarity)
end_time = time.time()

# It's len - 1 because the target node is also included, and that node wasn't explored
print("Using the modded a star that returns explored nodes:")
print(" Found solution for Actor to Japan exploring the following number of nodes:", len(lib_explore_1)-1)
print(" Found it in:", end_time-start_time)

start_time = time.time()
lib_path_2, lib_explore_2 = machine_searchers.only_depth_first_astar_path(wikispeedia, 'Actor', 'Japan', heuristic=semantic_similarity)
end_time = time.time()

print("Using depth first only A star that returns explored nodes:")
print(" Found solution for Actor to Japan exploring the following number of nodes:", len(lib_explore_2)-1)
print(" Found it in:", end_time-start_time)

Using the modded a star that returns explored nodes:
 Found solution for Actor to Japan exploring the following number of nodes: 1
 Found it in: 0.09830498695373535
Using depth first only A star that returns explored nodes:
 Found solution for Actor to Japan exploring the following number of nodes: 1
 Found it in: 0.09151577949523926


In [7]:
decoded_articles = article_combinations.copy()
decoded_articles[['first_article', 'last_article']] = article_combinations[['first_article', 'last_article']].apply(lambda col: col.apply(decode_word))
decoded_articles.head(5)

Unnamed: 0,first_article,last_article,count,mean_length,std_length,mean_rating,std_rating,mean_durationInSec,std_durationInSec
0,€2 commemorative coins,Irish Sea,1,3.0,0.0,1.0,0.0,15.0,0.0
1,10th century,11th century,3,2.0,0.0,2.333333,2.309401,4.333333,1.527525
2,10th century,Banknote,1,5.0,0.0,3.0,0.0,48.0,0.0
3,10th century,Country,1,3.0,0.0,1.0,0.0,15.0,0.0
4,10th century,Harlem Globetrotters,2,4.5,0.707107,2.0,0.0,75.0,24.041631


In [16]:
decoded_articles.shape

(28718, 9)

In [8]:
decoded_articles['last_article'][2]

'Banknote'

In [9]:
temp_path, temp_explore = machine_searchers.modded_astar_path(decoded_wikispeedia, decoded_articles['first_article'][0], decoded_articles['last_article'][0], heuristic=semantic_similarity)
temp_path

['€2 commemorative coins', 'United Kingdom', 'Irish Sea']

In [10]:
temp_path, temp_explore = machine_searchers.only_depth_first_astar_path(decoded_wikispeedia, decoded_articles['first_article'][0], decoded_articles['last_article'][0], heuristic=semantic_similarity)
temp_path

Node Irish Sea not reachable from €2 commemorative coins in depth first version


[]

In [17]:
temp_path

['10th century', 'Scotland', 'Banknote']

In [13]:
def apply_machine_first(row) -> list:
    source = row['first_article']
    target = row['last_article']

    lib_path_1, lib_explore_1 = machine_searchers.modded_astar_path(decoded_wikispeedia, source, target, heuristic=semantic_similarity)
    #lib_path_2, lib_explore_2 = machine_searchers.only_depth_first_astar_path(decoded_wikispeedia, source, target, heuristic=semantic_similarity)

    return [source, target, len(lib_explore_1)-1, lib_path_1, lib_explore_1]

In [14]:
def apply_machine_second(row) -> list:
    source = row['first_article']
    target = row['last_article']

    #lib_path_1, lib_explore_1 = machine_searchers.modded_astar_path(decoded_wikispeedia, source, target, heuristic=semantic_similarity)
    lib_path_2, lib_explore_2 = machine_searchers.only_depth_first_astar_path(decoded_wikispeedia, source, target, heuristic=semantic_similarity)

    return [source, target, len(lib_explore_2)-1, lib_path_2, lib_explore_2]

In [None]:
# garbage = decoded_articles[:25]
# 
# start_time = time.time()
# temp_df = garbage.apply(apply_machine_first, axis=1, result_type='expand')
# end_time = time.time()
# print("Finished first")
# print(" Found it in:", end_time-start_time)
# 
# 
# temp_2_df = garbage.apply(apply_machine_second, axis=1, result_type='expand')

Finished first
 Found it in: 845.3341653347015
Node Irish Sea not reachable from €2 commemorative coins in depth first version
Node Banknote not reachable from 10th century in depth first version
Node Country not reachable from 10th century in depth first version
Node Harlem Globetrotters not reachable from 10th century in depth first version
Node History of democracy not reachable from 10th century in depth first version
Node Marco Polo not reachable from 10th century in depth first version
Node Dimetrodon not reachable from 11th century in depth first version
Node Education in the United States not reachable from 11th century in depth first version
Node Hurricane Alex (2004) not reachable from 11th century in depth first version
Node John Adams not reachable from 11th century in depth first version
Node Lhasa not reachable from 11th century in depth first version
Node Plum not reachable from 11th century in depth first version
Node Taiwan not reachable from 11th century in depth firs

In [28]:
splits = [(0, 6000), (6000, 12000), (12000, 18000), (18000, 24000), (24000, 28718)]

In [29]:
result_dict = {}

chosen_split = splits[0]

to_process = decoded_articles[chosen_split[0]: chosen_split[1]]

for index, row in to_process.iterrows():
    source = row['first_article']
    target = row['last_article']

    try:
        lib_path_1, lib_explore_1 = machine_searchers.modded_astar_path(decoded_wikispeedia, source, target, heuristic=semantic_similarity)
        lib_path_2, lib_explore_2 = machine_searchers.only_depth_first_astar_path(decoded_wikispeedia, source, target, heuristic=semantic_similarity)

        # Now storing the index, to easily know what's not stored
        result_dict[index] = [index, source, target, len(lib_explore_1)-1, lib_path_1, lib_explore_1, len(lib_explore_2)-1, lib_path_2, lib_explore_2]

    except Exception as e:
        # Handle the exception (e.g., log it, assign None or np.nan, etc.)
        result_dict[index] = [source, target, np.nan, np.nan, np.nan]

KeyboardInterrupt: 

In [22]:
resulting_df = pd.DataFrame.from_dict(result_dict, orient='index', 
                                      columns=['Source', 'Target', 'Path_1', 'Explored_1', 'Path_2', 'Explored_2'])

resulting_df.head()

ValueError: 6 columns passed, passed data had 8 columns

In [34]:
name = 'machine_data_runs_' + str(chosen_split[0]) + '_' + str(chosen_split[1]) + '.csv'
resulting_df.to_csv(name, encoding='utf-8')

'machine_data_runs_0_6000.csv'

In [None]:
test_read = pd.read_csv('machine_data_runs.csv')
test_read