## 1. Data loading and pre-processing

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from src.utils.plot import plot_normalized_positions
from src.utils.paths import paths_no_backtrack
from src.utils.probs import posterior_probabilites, entropies_prior_posterior, information_gain
from src.utils.distances import compute_distances
from src.utils.load_data import load_data

In [None]:
# Unpack the seven objects returned by load_data(), in the order it returns them
(
    articles,
    categories,
    links,
    paths_finished,
    paths_finished_llm,
    paths_unfinished,
    paths_unfinished_llm_df,
) = load_data()

### Handling Missing Values

We found missing values in the ratings of finished paths and decided not to impute them, as replacing a missing rating with the median rating would not make sense. Whenever an analysis relies on the rating, we will use only the finished paths that have a human rating and discard all other finished paths.

### Handling the data size

Our pipeline can handle the data size: we do not need to process datasets larger than the ones we have already run through this pipeline.

In [None]:
# Descriptive statistics for the key variables of the finished paths

# Length of every finished path (len() applied to each entry of the 'path' column)
paths_finished_len = paths_finished['path'].apply(len)
length_summary = paths_finished_len.describe()
print(length_summary)

# Summary of the 'rating' column
rating_summary = paths_finished['rating'].describe()
print(rating_summary)

### Exploring correlations

Here we evaluate the correlation between the difficulty rating and the path length for human games.
We observe that the rating is positively correlated with the path length.
The $R^2$ value shows that our model explains some of the variance of the data.
The p-value is close to 0, which shows that the association between the rating and the path length is statistically significant, so the rating is a useful predictor of the path length.

In [None]:
# Store the path length next to the rating so the formula can reference both columns
paths_finished['path_length'] = paths_finished_len

# Ordinary least squares regression of the path length on the rating
mod = smf.ols('path_length ~ rating', data=paths_finished)
res = mod.fit()

# Last expression of the cell: the notebook renders the regression summary
res.summary()

## 2. Compute probabilities and entropies

### 2.A. Prior click probability

![Formula for the prior click probability](./images/probs_prior.png)

The prior click probability only depends on the number of outlinks of each article.

In [None]:
# Group the link table once by source article and reuse the grouping below
links_by_source = links.groupby('linkSource')

# Out-degree: number of out-links listed for each source article
out_degree = links_by_source.size()
# Prior click probability: one over the out-degree of the article
probs_prior = 1 / out_degree

# Series indexed by 'linkSource' whose values are the lists of that source's link targets
out_links = links_by_source['linkTarget'].agg(list)

### 2.B. Posterior click probability

![Formula for the posterior click probability](./images/probs_posterior.png)

In [None]:
# `paths_finished` is rebound here from the loaded DataFrame to the output of
# paths_no_backtrack(). Guard the conversion so the cell can be re-run: once the name
# no longer refers to the DataFrame, `paths_finished['path']` would fail.
if isinstance(paths_finished, pd.DataFrame):
    paths_finished = paths_no_backtrack(paths_finished['path'])

# Posterior click probabilities estimated from the processed finished paths
probs_posterior = posterior_probabilites(paths_finished, out_links, out_degree)

### 2.C. Prior and posterior entropies

Compute the prior and posterior entropy at each article along the path, except for the goal, because entropy is 0 once we have reached the goal.

In [None]:
# Prior and posterior entropies computed from the finished paths
entropies_prior, entropies_posterior = entropies_prior_posterior(
    paths_finished,
    probs_prior,
    probs_posterior,
    out_degree,
)

In [None]:
# Plot the prior and the posterior entropies, one call per series, for comparison
# with Fig. 2 of the Wikispeedia paper
plot_normalized_positions(entropies_prior, 'prior entropy')
plot_normalized_positions(entropies_posterior, 'posterior entropy')

## 3. Split the paths

### 3.A. Compute “information gain”

This represents how much information we gain by looking at the click distribution, as described in section 4 and figure 2 of the paper.

In [None]:
# Information gain derived from the prior and posterior entropies
info_gain = information_gain(entropies_prior, entropies_posterior)
# Plot it with the same helper that is used for the entropy plots
plot_normalized_positions(info_gain, 'information gain')

This is similar to what the paper gets:

![information gain graph from the paper](./images/fig_2_paper.png)

### 3.B. Get the splitting point and split the paths

In [None]:
# Keep only the "homing in" part of every finished path: cut each path at the
# position where its information gain is lowest and keep everything from there on.

# Position of the minimum information gain within each path
argmin_info_gain = info_gain.apply(np.argmin)

# Slice every path from its split position to the end
homing_in_parts = []
for path_id, full_path in paths_finished.items():
    split_at = argmin_info_gain[path_id]
    homing_in_parts.append(full_path[split_at:])
paths_homing_in = pd.Series(homing_in_parts)

## 4. Compute embedding distances

We now need to get a list of all the article titles whose embedding we need to compute, as well as the list of pairs of articles between which we need to compute the distance.

In [None]:
# Collect every article that appears in paths_homing_in, together with every
# (article, goal) pair, where the goal is the last article of the path it appears in
all_distance_pairs = set()
all_articles = set()
for homing_path in paths_homing_in:
    target = homing_path[-1]
    all_articles.update(homing_path)
    all_distance_pairs.update((article, target) for article in homing_path)

We use the sentence-transformers library to load the pre-trained BERT model and compute embeddings for each of our article titles. Then, we compute distances between these embeddings, using both cosine similarity and Euclidean distance, which we return in a dataframe `similarities`.

In [None]:
from src.scripts.generate_embeddings_distances import get_embeddings_distances

# Embedding-based distances for the collected articles and (article, goal) pairs.
# The CSV path is handed to the helper (presumably where results are stored; TODO confirm).
similarities = get_embeddings_distances(
    all_articles,
    all_distance_pairs,
    'data/article_similarities.csv',
)

## 5. Compute distances

In [None]:
# Distances for human games, computed from the "homing in" parts of the paths
distances_humans = compute_distances(
    links,
    probs_posterior,
    paths_homing_in,
    'data/distances_humans.pkl',
)

## 6. Get wikispeedia distances from LLM games

First, we need to verify that the Wikispeedia distance can be computed in the same way for LLMs. We check whether the LLMs adopt the same strategy of first getting out and then homing in, which is the basis of our method.

In [None]:
# Question 2

# Compute posterior probabilities from the finished LLM paths
probs_posterior_llm = posterior_probabilites(paths_finished_llm, out_links, out_degree)

# Compute the LLM entropies
entropies_prior_llm, entropies_posterior_llm = entropies_prior_posterior(
    paths_finished_llm, probs_prior, probs_posterior_llm, out_degree
)
# Plot the LLM series computed just above (not the human `entropies_prior` /
# `entropies_posterior`), so that the figures describe the LLM games
plot_normalized_positions(entropies_prior_llm, 'prior entropy')
plot_normalized_positions(entropies_posterior_llm, 'posterior entropy')

# Get the LLM information gain
info_gain_llm = information_gain(entropies_prior_llm, entropies_posterior_llm)
plot_normalized_positions(info_gain_llm, 'information gain')

# Split the paths

# Get the index of the point of lowest information gain for each path
argmin_info_gain_llm = info_gain_llm.apply(np.argmin)
# Split the path at this point and keep only the second part, corresponding to the "homing in" phase
paths_homing_in_llm = pd.Series([path[argmin_info_gain_llm[i]:] for i, path in paths_finished_llm.items()])

In the following code, we extract the Wikispeedia semantic distances from the corresponding finished paths.

In [None]:
import seaborn as sns

# Compute the LLM distances from the "homing in" parts of the LLM paths, the same way
# `distances_humans` is computed from `paths_homing_in`, so both sets are comparable.
# NOTE(review): if compute_distances reuses an existing 'data/distances_llm.pkl',
# remove the stale file so the distances are recomputed (TODO confirm).
distances_llm = compute_distances(links, probs_posterior_llm, paths_homing_in_llm, 'data/distances_llm.pkl')

# Pairs that have a distance in both the human and the LLM results
common_distance_pairs = set(distances_llm.keys()) & set(distances_humans.keys())

# Human distance minus LLM distance for every shared pair
distance_differences = {
    pair: distances_humans[pair] - distances_llm[pair]
    for pair in common_distance_pairs
}

# Pass a plain list rather than a dict view so seaborn receives a flat vector of values
sns.histplot(list(distance_differences.values()))

# Both series iterate over the same set object, so their entries are aligned pair by pair
common_distances_humans = pd.Series([distances_humans[pair] for pair in common_distance_pairs])
common_distances_llm = pd.Series([distances_llm[pair] for pair in common_distance_pairs])

In [None]:
# Summary statistics of the distances restricted to the pairs shared by humans and LLMs
humans_summary = common_distances_humans.describe()
print(humans_summary)
llm_summary = common_distances_llm.describe()
print(llm_summary)

In [None]:
def _distances_to_frame(distances):
    """Return a two-column DataFrame ('pair', 'distance') built from the items of `distances`."""
    return pd.DataFrame(list(distances.items()), columns=['pair', 'distance'])


# Series views of the two distance dictionaries
series_distances_humans = pd.Series(distances_humans)
series_distances_llm = pd.Series(distances_llm)

# Write the human distances to CSV, without the row index
distances_humans_df = _distances_to_frame(distances_humans)
distances_humans_df.to_csv('data/distances_humans.csv', index=False)

# Write the LLM distances to CSV, without the row index
distances_llm_df = _distances_to_frame(distances_llm)
distances_llm_df.to_csv('data/distances_llm.csv', index=False)