# Why?
The idea of this notebook is to provide the template that will be run to generate all of the machine data.

Namely, it finds the paths and the explored nodes that the two models produce.
Ideally it will be upgraded to work with different models too, but that's for later.

In [None]:
import sys
import os
sys.path.append('../')
import data_readers

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import math

# networkx
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout


# For semantic similarity
from urllib.parse import unquote
from transformers import BertTokenizer
from transformers import BertModel
from sklearn.metrics.pairwise import cosine_similarity

# Python functions in .py file to read data
import machine_searchers

import warnings
from tqdm import TqdmWarning
warnings.filterwarnings('ignore', category=TqdmWarning)



In [None]:
# Load the finished navigation paths. The first 15 lines of the TSV are skipped
# (presumably the file's header) and the column names are supplied explicitly.
finished_paths = pd.read_csv(
    '../datasets/wikispeedia_paths-and-graph/paths_finished.tsv',
    sep='\t',
    skiprows=15,
    names=['hashedIpAddress', 'timestamp', "durationInSec", 'path', "rating"],
).assign(
    # A path is a ';'-separated sequence of entries; keep its two endpoints and its size.
    first_article=lambda df: df['path'].map(lambda p: p.split(';')[0]),
    last_article=lambda df: df['path'].map(lambda p: p.split(';')[-1]),
    path_length=lambda df: df['path'].map(lambda p: len(p.split(';'))),
    # The raw timestamp is interpreted as a number of seconds.
    date=lambda df: pd.to_datetime(df['timestamp'], unit='s'),
)

In [None]:
# Every statistic below is computed per (first_article, last_article) pair.
_pair_keys = ['first_article', 'last_article']
_by_pair = finished_paths.groupby(_pair_keys)


def _mean_std_per_pair(column):
    """Return a frame with the pair keys plus the 'mean' and 'std' of `column` per pair."""
    return _by_pair[column].agg(['mean', 'std']).reset_index()


# How many times each pair of articles has been played.
article_combinations_count = _by_pair.size().reset_index(name='count')

# Mean and std of the path length per pair; an undefined std is reported as 0.
article_combinations_stats = (
    _mean_std_per_pair('path_length')
    .fillna({'std': 0})
    .rename(columns={'mean': 'mean_length', 'std': 'std_length'})
)

# Mean and std of the rating per pair.
# Pairs whose mean rating is NaN are deliberately left as NaN (both mean and std):
# we do not want to invent 0s or 1s for them. Only where a mean exists is an
# undefined std replaced by 0. This could be revisited if an application needs it.
rating_combinations_stats_rating = _mean_std_per_pair('rating')
mask = rating_combinations_stats_rating['mean'].notnull()
rating_combinations_stats_rating.loc[mask, 'std'] = (
    rating_combinations_stats_rating.loc[mask, 'std'].fillna(0)
)
rating_combinations_stats_rating = rating_combinations_stats_rating.rename(
    columns={'mean': 'mean_rating', 'std': 'std_rating'}
)

# Mean and std of the play time per pair; an undefined std is reported as 0.
rating_combinations_stats_time = (
    _mean_std_per_pair('durationInSec')
    .fillna({'std': 0})
    .rename(columns={'mean': 'mean_durationInSec', 'std': 'std_durationInSec'})
)

# Join the count and the three statistics tables on the pair keys.
article_combinations = (
    article_combinations_count
    .merge(article_combinations_stats, on=_pair_keys)
    .merge(rating_combinations_stats_rating, on=_pair_keys)
    .merge(rating_combinations_stats_time, on=_pair_keys)
)

# How often each article appears as a source / as a target.
unique_sources = finished_paths['first_article'].value_counts().reset_index()
unique_targets = finished_paths['last_article'].value_counts().reset_index()

In [None]:
# Preview the first rows of the per-pair summary table.
article_combinations.head()

In [None]:
# Load the pretrained BERT model and its tokenizer once, outside the embedding function.
# modded_get_embedding reads `model` and `tokenizer` as globals, so this cell must be
# run before that function is called.
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
import machine_searchers
import time

def modded_get_embedding(text: str):
    """Embed an article name with the notebook-level BERT `model` and `tokenizer`.

    Underscores are replaced by spaces and percent-escapes are decoded before
    tokenizing, so that the dataset's article identifiers read as plain text.

    Args:
        text: Article name as it appears in the dataset.

    Returns:
        The model's last hidden state averaged over dim 1 (the token dimension
        of BERT's output), as a tensor that carries no autograd graph.
    """
    # Local import: torch is not imported at the top of this notebook.
    import torch

    temp_str = text.replace('_', ' ')
    temp_str = unquote(temp_str)
    inputs = tokenizer(temp_str, return_tensors="pt")
    # This is inference only and runs for every heuristic evaluation of the search,
    # so skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1)

def distance_two_words(w1: str, w2: str):
    """Return a heuristic distance between two article names from the wikispeedia dataset.

    Both names are embedded with modded_get_embedding and compared by cosine
    similarity; the result is ``1 - |cosine| + 1``, so semantically closer names
    get a smaller distance.
    """
    vec_a = modded_get_embedding(w1).detach().numpy()
    vec_b = modded_get_embedding(w2).detach().numpy()
    cosine = cosine_similarity(vec_a, vec_b)[0][0]
    # The absolute value is taken just in case the cosine is negative.
    # `1 - |cosine|` turns the similarity into a distance (closer words -> smaller value).
    # The trailing `+ 1` is an extra cost for exploring: without it the search often
    # treats nodes as being only ~0.5 away.
    return 1 - abs(cosine) + 1

In [None]:
# Build the directed article graph from the links edge list.
wikispeedia = nx.read_edgelist('../datasets/wikispeedia_paths-and-graph/links.tsv',
                               create_using=nx.DiGraph)

# Time both searchers on one example pair. perf_counter is used instead of time.time
# because it is a monotonic, high-resolution clock meant for measuring elapsed time.
start_time = time.perf_counter()
lib_path_1, lib_explore_1 = machine_searchers.modded_astar_path(wikispeedia, 'Actor', 'Japan', heuristic=distance_two_words)
end_time = time.perf_counter()

# It's len - 1 because the target node is also included, and that node wasn't explored
print("Using the modded a star that returns explored nodes:")
print(" Found solution for Actor to Japan exploring the following number of nodes:", len(lib_explore_1)-1)
print(" Found it in:", end_time-start_time)

start_time = time.perf_counter()
lib_path_2, lib_explore_2 = machine_searchers.only_depth_first_astar_path(wikispeedia, 'Actor', 'Japan', heuristic=distance_two_words)
end_time = time.perf_counter()

print("Using depth first only A star that returns explored nodes:")
print(" Found solution for Actor to Japan exploring the following number of nodes:", len(lib_explore_2)-1)
print(" Found it in:", end_time-start_time)

In [None]:
# Run both searchers on every (source, target) pair of the summary table and keep,
# per table index, the endpoints plus the path and explored nodes each searcher returned.
result_dict = {}

for index, source, target in article_combinations[['first_article', 'last_article']].itertuples():
    lib_path_1, lib_explore_1 = machine_searchers.modded_astar_path(
        wikispeedia, source, target, heuristic=distance_two_words)
    lib_path_2, lib_explore_2 = machine_searchers.only_depth_first_astar_path(
        wikispeedia, source, target, heuristic=distance_two_words)

    result_dict[index] = [source, target, lib_path_1, lib_explore_1, lib_path_2, lib_explore_2]

    # For a quick smoke test, stop after a few pairs, e.g.:
    # if index >= 3:
    #     break

In [None]:
# One row per entry of result_dict (its keys become the index); the six list
# positions are named in the order they were stored.
result_columns = ['Source', 'Target', 'Path_1', 'Explored_1', 'Path_2', 'Explored_2']
resulting_df = pd.DataFrame.from_dict(result_dict, orient='index', columns=result_columns)

resulting_df

In [None]:
# Persist the machine runs next to the notebook; the index is written as the first column.
# NOTE(review): the Path_*/Explored_* cells presumably hold Python containers, which a CSV
# stores as text - confirm before relying on reading them back as lists.
resulting_df.to_csv('machine_data_runs.csv', encoding='utf-8')

In [None]:
# Read the saved file back as a sanity check. The CSV was written with its index as
# the first column, so restore it as the index; otherwise it comes back as an extra
# 'Unnamed: 0' data column.
test_read = pd.read_csv('machine_data_runs.csv', index_col=0)
test_read