In [34]:
import pandas as pd
import numpy as np
from urllib.parse import unquote

In [35]:
articles = pd.read_csv('./data/paths/articles.tsv', comment='#', names=["article"])

names_articles = articles["article"].apply(unquote).replace('_', ' ', regex=True)

# Matrix

In [36]:
# Read the file line by line
with open('./data/paths/shortest-path-distance-matrix.txt', 'r') as file:
    lines = file.readlines()

# Process each line to convert it into a list of distances
data = []
for line in lines:
    stripped_line = line.strip()
    if not stripped_line or stripped_line.startswith('#'):
        continue  # Skip comment lines and empty lines
    distances = [int(char) if char != '_' else np.nan for char in line.strip()]
    data.append(distances)

matrix = pd.DataFrame(data)
# Optionally, read the articles.tsv file to use as column headers & index
#matrix.columns = names_articles
#matrix.index = names_articles
matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4594,4595,4596,4597,4598,4599,4600,4601,4602,4603
0,0.0,,,,,,3.0,3.0,3.0,3.0,...,4.0,3.0,3.0,4.0,4.0,3.0,4.0,4.0,4.0,2.0
1,,0.0,,,,,2.0,2.0,2.0,2.0,...,4.0,2.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0
2,,,0.0,,,,3.0,3.0,2.0,2.0,...,4.0,3.0,2.0,3.0,4.0,3.0,4.0,3.0,3.0,3.0
3,,,,0.0,,,3.0,3.0,3.0,3.0,...,4.0,2.0,2.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0
4,,,,,0.0,,2.0,2.0,3.0,2.0,...,4.0,2.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4599,,,,,,,2.0,2.0,2.0,2.0,...,3.0,2.0,2.0,3.0,3.0,0.0,3.0,3.0,3.0,2.0
4600,,,,,,,3.0,3.0,3.0,3.0,...,3.0,3.0,2.0,2.0,3.0,3.0,0.0,3.0,4.0,3.0
4601,,,,,,,2.0,2.0,2.0,2.0,...,3.0,2.0,2.0,3.0,4.0,3.0,3.0,0.0,3.0,3.0
4602,,,,,,,3.0,3.0,3.0,3.0,...,4.0,3.0,3.0,4.0,4.0,3.0,5.0,4.0,0.0,3.0


In [37]:
# Define distances to analyze
distance_thresholds = range(1, 10)  # Distances to compute related articles for

# Initialize a dictionary to store counts for each article
related_articles_count = {article: {distance: 0 for distance in distance_thresholds} for article in matrix.index}

# Count related articles within each distance threshold
for article in matrix.index:
    for distance in distance_thresholds:
        count = matrix.loc[article].apply(lambda x: 1 if x == distance else 0).sum()
        related_articles_count[article][distance] = count

# Convert to a DataFrame for readability
related_articles_df = pd.DataFrame(related_articles_count).T
related_articles_df.index = names_articles


print("Number of articles at a given distance from the optimal path to the Article")
related_articles_df

Number of articles at a given distance from the optimal path to the Article


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9
article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Áedán mac Gabráin,11,331,2066,1382,222,38,5,0,0
Åland,19,754,2192,903,152,33,2,0,0
Édouard Manet,20,610,2284,968,141,29,3,0,0
Éire,8,398,2173,1250,192,31,3,0,0
Óengus I of the Picts,10,354,2063,1355,232,37,4,0,0
...,...,...,...,...,...,...,...,...,...
Zionism,74,1426,1952,497,84,20,1,0,0
Zirconium,42,994,2171,724,111,11,1,0,0
Zoroaster,28,666,2209,967,159,23,2,0,0
Zuid-Gelders,5,49,1182,2148,572,76,22,0,0


In [38]:
# Indegree of the target node
print("The number of pages that have the target node as a link.")
related_articles_df[1]

The number of pages that have the target node as a link.


article
Áedán mac Gabráin        11
Åland                    19
Édouard Manet            20
Éire                      8
Óengus I of the Picts    10
                         ..
Zionism                  74
Zirconium                42
Zoroaster                28
Zuid-Gelders              5
Zulu                     15
Name: 1, Length: 4604, dtype: int64

# Links

In [39]:
links = pd.read_csv('./data/paths/links.tsv', sep='\t', comment='#', names=["linkSource", "linkTarget"])
links["linkSource"] = links["linkSource"].apply(unquote).replace('_', ' ', regex=True)
links["linkTarget"] = links["linkTarget"].apply(unquote).replace('_', ' ', regex=True)
links

Unnamed: 0,linkSource,linkTarget
0,Áedán mac Gabráin,Bede
1,Áedán mac Gabráin,Columba
2,Áedán mac Gabráin,Dál Riata
3,Áedán mac Gabráin,Great Britain
4,Áedán mac Gabráin,Ireland
...,...,...
119877,Zulu,South Africa
119878,Zulu,Swaziland
119879,Zulu,United Kingdom
119880,Zulu,Zambia


## Link Density (Unique links)
May be more links per Article, if the same link repeat again througth the artcile, need to confirm by looking at the text or HTML

In [40]:
# Link Density
# Group by linkSource and count the number of links to each target
link_density = links.groupby('linkSource').count()
link_density.columns = ['linkDensity']
link_density

Unnamed: 0_level_0,linkDensity
linkSource,Unnamed: 1_level_1
1 Ceres,32
10th century,26
11th century,48
12th century,45
13th century,34
...,...
Åland,19
Édouard Manet,20
Éire,8
Óengus I of the Picts,10


In [41]:
# I want to check if by grouping the links by the source, each target is unique
# If the number of links is equal to the number of unique targets, then each target is unique
link_density['uniqueTargets'] = links.groupby('linkSource')['linkTarget'].nunique()

# Check if the two columns are equal
link_density['linkDensity'].equals(link_density['uniqueTargets'])

True