In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [None]:
# Load the article/category table: tab-separated, no header row, '#' starts a
# comment, blank lines are skipped.
categories_path = os.path.join("Data", "wikispeedia_paths-and-graph", "categories.tsv")
categories = pd.read_csv(
    categories_path,
    sep="\t",
    names=["article", "category"],
    header=None,
    comment="#",
    skip_blank_lines=True,
    encoding="UTF-8",
)

categories.head()

In [None]:
# Load the list of article names (one per row). The default integer index that
# pandas assigns is what later cells use as the article id.
articles_path = os.path.join("Data", "wikispeedia_paths-and-graph", "articles.tsv")
articles = pd.read_csv(
    articles_path,
    sep="\t",
    names=["name"],
    header=None,
    comment="#",
    skip_blank_lines=True,
    encoding="UTF-8",
)

articles.head()

In [None]:
# Load the directed link table: one "from" article and one "to" article per row.
links = pd.read_csv(
    os.path.join("Data", "wikispeedia_paths-and-graph", "links.tsv"),
    delimiter="\t",
    header=None,
    names=["from", "to"],
    skip_blank_lines=True,
    comment="#",
    encoding="UTF-8"
)

# Drop self-links. The explicit .copy() makes the filtered frame independent, so
# the column assignments below do not trigger SettingWithCopyWarning.
links = links[links["from"] != links["to"]].copy()
display(links.head())

# Translate article names into their index label in `articles`. Series.map is a
# direct dictionary lookup (much faster than Series.replace with a large dict)
# and yields NaN for unknown names instead of silently keeping the string.
name_to_index = dict(zip(articles["name"], articles.index))
links["from_index"] = links["from"].map(name_to_index)
links["to_index"] = links["to"].map(name_to_index)

# Fail early and clearly: later cells use these columns as integer positions.
unmatched = links.loc[
    links["from_index"].isna() | links["to_index"].isna(), ["from", "to"]
]
if not unmatched.empty:
    raise ValueError(
        f"{len(unmatched)} links reference articles missing from articles.tsv, "
        f"e.g. {unmatched.head().values.tolist()}"
    )

display(links.head())

# For every link target, the list of article indices that link to it
# (i.e. the incoming neighbours of each article).
adjacency_list = links.groupby("to_index")["from_index"].agg(list).sort_index()
display(adjacency_list.head())


In [None]:
# Parse the shortest-path distance matrix: every non-comment, non-blank line is
# one row and every character of that line is one entry. "_" is stored as -1,
# any other character is parsed as a single-digit integer.
matrix_path = os.path.join("Data", "wikispeedia_paths-and-graph", "shortest-path-distance-matrix.txt")

distance_rows = []
with open(matrix_path) as handle:
    for raw_line in handle:
        row_text = raw_line.strip()
        if not row_text or row_text.startswith("#"):
            continue

        distance_rows.append([-1 if char == "_" else int(char) for char in row_text])

matrix = np.array(distance_rows)

print(matrix[:10, :10])
    

In [None]:
# Display names of the anchor articles and their index labels in `articles`.
names = ["Europe", "North America", "South America", "Africa", "Asia", "Australia", "Antarctica", "Middle East"]

# Article names use underscores where the display names have spaces.
ids = []
for article_name in (display_name.replace(" ", "_") for display_name in names):
    matching_labels = articles.index[articles["name"] == article_name].tolist()
    # First match wins; raises IndexError if the article does not exist.
    ids.append(matching_labels[0])

print(ids)

In [None]:
# Distance-matrix rows of the anchor articles, one row per entry of `names`.
# NOTE(review): presumably row i holds the distances from article i to every
# other article - confirm against the dataset description.
# NOTE(review): the variable name is misspelled ("continet"); kept unchanged
# because it is a notebook-level global.
continet_rows = matrix[ids]

# Transpose so that each row is an article and each column an anchor article.
article_distances = pd.DataFrame(data=continet_rows.T, index=articles.name, columns=names)

# -1 is the sentinel written for "_" entries when the matrix was parsed; an
# article is flagged as soon as any of its columns carries that sentinel.
non_reachable_articles = article_distances.eq(-1).any(axis="columns")
non_reachable_count = non_reachable_articles.sum()
print("Number of non-reachable articles:", non_reachable_count)
print("Number of reachable articles:", len(non_reachable_articles) - non_reachable_count)

reachable_articles = article_distances.loc[~non_reachable_articles]
reachable_articles.head()


In [None]:
# For every reachable article, count how many anchor columns attain the row's
# minimum distance (1 means a unique closest anchor, >1 means a tie).
# The row minimum is computed once and compared against all columns in a single
# vectorised step instead of being recomputed for each continent.
row_minimum = reachable_articles.min(axis="columns")
possible_best = reachable_articles[names].eq(row_minimum, axis="index").sum(axis="columns")
possible_best.value_counts()

In [None]:
# Print, per anchor column, how often each distance value occurs among the
# reachable articles.
for name in names:
    distance_counts = reachable_articles[name].value_counts()
    print(distance_counts)

In [None]:
# Keep only articles with exactly one minimum-distance column, then count how
# many articles have each column as that unique minimum.
uniquely_assigned = reachable_articles.loc[possible_best == 1]
closest_column = uniquely_assigned.idxmin(axis="columns")
closest_column.value_counts()

In [None]:
def softmax(x, axis=-1):
    """Return the softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating, which keeps
    ``np.exp`` from overflowing on large inputs without changing the result.

    Parameters
    ----------
    x : array-like
        Input scores.
    axis : int, optional
        Axis along which the values are normalised to sum to 1. Defaults to the
        last axis, which matches the previous hard-coded behaviour.

    Returns
    -------
    Array of the same shape as ``x`` whose entries along ``axis`` sum to 1.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

In [None]:
# Number of synchronous propagation rounds.
N_PROPAGATION_ROUNDS = 100

# One row per article, one column per anchor article in `names`.
article_weights = np.zeros((len(articles), len(names)))

# Seed: every anchor article puts all of its weight on its own column.
for index, id_ in enumerate(ids):
    article_weights[id_, index] = 1

# Resolve the (article, incoming neighbours) pairs once instead of doing a
# Series lookup for every article in every round. Anchor articles are skipped
# so that their seed weights are never overwritten.
anchor_ids = set(ids)
incoming_neighbours = [
    (idx, sources)
    for idx, sources in adjacency_list.items()
    if idx not in anchor_ids
]

for _ in range(N_PROPAGATION_ROUNDS):
    # Each round reads only the previous round's weights and writes into a fresh
    # copy, so the visiting order cannot influence the result. The former random
    # permutation of the order was therefore dropped; it only consumed unseeded
    # global RNG state.
    new_weights = article_weights.copy()
    for idx, sources in incoming_neighbours:
        weight = article_weights[sources].sum(axis=0)
        # Leave articles untouched until at least one neighbour carries weight.
        if weight.sum() > 0:
            new_weights[idx] = softmax(weight)
    article_weights = new_weights

In [None]:
# Project the per-article weight vectors onto their first two principal
# components. PCA is imported in the notebook's top import cell.
pca = PCA(n_components=2)
x = pca.fit_transform(article_weights)

fig, ax = plt.subplots()
ax.scatter(x[:, 0], x[:, 1], alpha=0.5)
ax.set(
    title="Article weights projected onto the first two principal components",
    xlabel="Principal component 1",
    ylabel="Principal component 2",
)
plt.show()

In [None]:
# Draw one histogram per column of article_weights, each as its own figure with
# the corresponding entry of `names` in the legend.
for column, name in enumerate(names):
    column_weights = article_weights[:, column]
    plt.hist(column_weights, label=name)
    plt.legend()
    plt.show()