In [34]:
import pandas as pd
import numpy as np
from urllib.parse import unquote

# Load files (Matrix, Links & Articles)

In [77]:
# Load the article list (one name per line; '#' starts a comment).
# The file is a .tsv and the links.tsv loader uses sep='\t', so the separator is
# stated explicitly here too: with read_csv's default (comma) a name containing a
# literal ',' would be split into several fields and silently misparsed.
articles = pd.read_csv('./data/paths/articles.tsv', sep='\t', comment='#', names=["article"])
# Names are percent-decoded and underscores become spaces, the same cleaning that
# is applied to the linkSource/linkTarget columns of `links`.
names_articles = articles["article"].apply(unquote).replace('_', ' ', regex=True)
len(names_articles)

4604

In [78]:
# Load the link list: one (source, target) pair per row, '#' starts a comment.
links = pd.read_csv(
    './data/paths/links.tsv',
    sep='\t',
    comment='#',
    names=["linkSource", "linkTarget"],
)
# Clean both endpoint columns identically: percent-decode the name, then turn
# underscores into spaces.
for endpoint in ("linkSource", "linkTarget"):
    links[endpoint] = links[endpoint].map(unquote).str.replace('_', ' ', regex=False)
links

Unnamed: 0,linkSource,linkTarget
0,Áedán mac Gabráin,Bede
1,Áedán mac Gabráin,Columba
2,Áedán mac Gabráin,Dál Riata
3,Áedán mac Gabráin,Great Britain
4,Áedán mac Gabráin,Ireland
...,...,...
119877,Zulu,South Africa
119878,Zulu,Swaziland
119879,Zulu,United Kingdom
119880,Zulu,Zambia


In [79]:
# Load the shortest-path distance matrix as raw text lines.
with open('./data/paths/shortest-path-distance-matrix.txt', 'r') as file:
    lines = file.readlines()

# Skip blank and '#'-prefixed lines, then decode every remaining row one
# character at a time: a digit is converted to an int, '_' becomes NaN
# (presumably "no path between the two articles" -- confirm against the
# dataset description).
data = [
    [np.nan if symbol == '_' else int(symbol) for symbol in row]
    for row in (raw.strip() for raw in lines)
    if row and not row.startswith('#')
]

matrix = pd.DataFrame(data)
# Rows and columns stay as positional integers; uncomment the two lines below
# to label them with the article names instead.
#matrix.columns = names_articles
#matrix.index = names_articles

print("The rows are the source articles and the columns are the destination articles")
matrix

The rows are the source articles and the columns are the destination articles


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4594,4595,4596,4597,4598,4599,4600,4601,4602,4603
0,0.0,,,,,,3.0,3.0,3.0,3.0,...,4.0,3.0,3.0,4.0,4.0,3.0,4.0,4.0,4.0,2.0
1,,0.0,,,,,2.0,2.0,2.0,2.0,...,4.0,2.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0
2,,,0.0,,,,3.0,3.0,2.0,2.0,...,4.0,3.0,2.0,3.0,4.0,3.0,4.0,3.0,3.0,3.0
3,,,,0.0,,,3.0,3.0,3.0,3.0,...,4.0,2.0,2.0,3.0,4.0,3.0,4.0,4.0,3.0,3.0
4,,,,,0.0,,2.0,2.0,3.0,2.0,...,4.0,2.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4599,,,,,,,2.0,2.0,2.0,2.0,...,3.0,2.0,2.0,3.0,3.0,0.0,3.0,3.0,3.0,2.0
4600,,,,,,,3.0,3.0,3.0,3.0,...,3.0,3.0,2.0,2.0,3.0,3.0,0.0,3.0,4.0,3.0
4601,,,,,,,2.0,2.0,2.0,2.0,...,3.0,2.0,2.0,3.0,4.0,3.0,3.0,0.0,3.0,3.0
4602,,,,,,,3.0,3.0,3.0,3.0,...,4.0,3.0,3.0,4.0,4.0,3.0,5.0,4.0,0.0,3.0


# InDegree

## Option A: With links (faster & safer)
- Number of articles != number of articles with at least one **link** pointing to them

In [75]:
# In-degree per target article: how many rows of `links` point at it.
# Only targets that appear in `links` get a row in the result.
link_density = links.groupby('linkTarget').agg(indegree=('linkSource', 'count'))
link_density

Unnamed: 0_level_0,indegree
linkTarget,Unnamed: 1_level_1
1 Ceres,12
10th century,65
11th century,57
12th century,85
13th century,84
...,...
Zirconium,15
Zoroaster,9
Zuid-Gelders,5
Zulu,14


## Option B: With the Matrix (slower & need to carefully distinguish between Source & Target)
- Links are not **bidirectional**!
- But it also gives more information about the best possible paths

In [86]:
# Define distances to analyze
distance_thresholds = range(1, 10)  # Shortest-path lengths to tabulate (1..9)

# Rows of `matrix` are sources and columns are targets, so summing a boolean
# mask down each column (axis=0) counts, per target article, how many source
# articles lie at exactly that distance. NaN never compares equal, so NaN
# cells are never counted.
# One vectorized comparison per distance replaces the previous nested Python
# loop, which re-transposed the whole matrix and ran an element-wise lambda
# for every (article, distance) pair.
related_articles_df = pd.DataFrame(
    {distance: matrix.eq(distance).sum(axis=0) for distance in distance_thresholds}
)

# Same {article: {distance: count}} mapping the loop used to build, kept in
# case another cell still reads it (keyed by the matrix column labels).
related_articles_count = related_articles_df.T.to_dict()

# Label the rows with the readable article names (same order as the matrix columns).
related_articles_df.index = names_articles


print("Number of articles at a given distance with the optimal path from the Source to the Target article")
related_articles_df

Number of articles at a given distance with the optimal path from the Source to the Target article


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9
article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Áedán mac Gabráin,0,0,0,0,0,0,0,0,0
Åland,0,0,0,0,0,0,0,0,0
Édouard Manet,0,0,0,0,0,0,0,0,0
Éire,0,0,0,0,0,0,0,0,0
Óengus I of the Picts,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
Zionism,25,1020,3406,133,0,0,0,0,0
Zirconium,15,253,1636,2624,55,1,0,0,0
Zoroaster,9,492,3539,541,3,0,0,0,0
Zuid-Gelders,5,98,2723,1737,20,1,0,0,0


In [87]:
# In-degree of each target article: the distance-1 column counts the source
# articles whose shortest path to the target is a single link.
print("The number of pages that have the target node as a link.")
related_articles_df.loc[:, 1]

The number of pages that have the target node as a link.


article
Áedán mac Gabráin         0
Åland                     0
Édouard Manet             0
Éire                      0
Óengus I of the Picts     0
                         ..
Zionism                  25
Zirconium                15
Zoroaster                 9
Zuid-Gelders              5
Zulu                     14
Name: 1, Length: 4604, dtype: int64

# Link Density (Unique links)
- There may be more links per article if the same link is repeated throughout the article; this needs to be confirmed by looking at the text or HTML
- Number of articles != number of articles with at least one **link** to another article in the corpus

In [88]:
# Link density per source article: number of rows in `links` that start from it.
link_density = (
    links.groupby('linkSource')
    .count()
    .rename(columns={"linkTarget": "linkDensity"})
)
link_density

Unnamed: 0_level_0,linkDensity
linkSource,Unnamed: 1_level_1
1 Ceres,32
10th century,26
11th century,48
12th century,45
13th century,34
...,...
Åland,19
Édouard Manet,20
Éire,8
Óengus I of the Picts,10


In [89]:
# Sanity check: does any source article list the same target more than once?
# Store the number of distinct targets per source next to the raw link count.
unique_targets_per_source = links.groupby('linkSource')['linkTarget'].nunique()
link_density['uniqueTargets'] = unique_targets_per_source

# If both columns match for every source, every (source, target) pair is unique.
per_source_total = link_density['linkDensity']
per_source_unique = link_density['uniqueTargets']
per_source_total.equals(per_source_unique)

True