In [8]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from urllib.parse import urlencode
from urllib.request import urlopen, Request
from pathlib import Path
import re

# Build network based on nodes and edges from dataset

In [9]:
# Raw inputs: the node table uses the default comma separator,
# while the edge list is semicolon-separated.
nodes_df = pd.read_csv('Nodes.csv')
edges_df = pd.read_csv('Edges.csv', sep=';')

In [10]:
# Directed graph that the node and edge tables are loaded into
network = nx.DiGraph()

# Columns of the node table that are copied onto each graph node as attributes
features = [
    'Name',
    'NameURL',
    'Link',
    'birthcity',
    'countryName',
    'continentName',
    'birthyear',
    'deathyear',
    'gender',
    'occupation',
    'industry',
]

In [11]:
# Load nodes: each row becomes a node keyed by its 'Id', carrying the selected features
for row_index, node_row in nodes_df.iterrows():
    node_attributes = node_row[features].to_dict()
    network.add_node(node_row['Id'], **node_attributes)

# Some ids in the edge list are written in scientific notation with a decimal comma;
# this table translates those spellings back to the plain integer id.
number_formatted_nodes = {
    "1,00E+03": "1000",
    "2,00E+03": "2000",
    "3,00E+03": "3000",
    "4,00E+03": "4000",
    "5,00E+03": "5000",
    "6,00E+03": "6000",
    "7,00E+03": "7000",
    "8,00E+03": "8000",
    "9,00E+03": "9000",
    "1,00E+04": "10000",
    "1,10E+04": "11000",
}

# Load edges: rows whose endpoints cannot be converted to int are reported and skipped
for row_index, edge_row in edges_df.iterrows():
    try:
        raw_source = edge_row['Source']
        raw_target = edge_row['Target']
        # Fall back to the raw value when it is not one of the known odd spellings
        source_id = number_formatted_nodes.get(raw_source, raw_source)
        target_id = number_formatted_nodes.get(raw_target, raw_target)
        network.add_edge(int(source_id), int(target_id))
    except Exception as e:
        print(f"Error adding edge from {edge_row['Source']} to {edge_row['Target']}: {e}")

In [12]:
def get_wikitext(title, timeout=30):
    """Fetch the raw wikitext of an English Wikipedia page.

    Queries the MediaWiki API for the content of the latest revision
    (main slot) of the page named ``title``.

    Parameters
    ----------
    title : str
        Page title as used in the page URL (e.g. ``"Marquis_de_Sade"``).
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up, so a stalled
        connection cannot block a long download loop forever.

    Returns
    -------
    str
        The wikitext of the page.

    Raises
    ------
    KeyError
        If the API response contains no revision content for ``title``
        (for example because the page does not exist).
    urllib.error.URLError
        If the HTTP request fails or times out.
    """
    baseurl = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "format": "json"
    }
    query = f"{baseurl}?{urlencode(params)}"
    request = Request(query, headers={'User-Agent': 'Mozilla/5.0'})
    # The context manager closes the HTTP response even when reading/decoding fails
    with urlopen(request, timeout=timeout) as response:
        payload = json.loads(response.read().decode("utf-8"))

    # The API keys the result by page id; a single title yields a single entry
    pages = payload["query"]["pages"]
    page = next(iter(pages.values()), None)
    if page is None or "revisions" not in page:
        raise KeyError(f"No page content returned for title {title!r}")
    return page["revisions"][0]["slots"]["main"]["*"]

In [None]:
# Download the wikitext of every node's page into wikipages/<NameURL>.txt.
# Pages that cannot be fetched are reported and skipped; they are handled manually later.
pages_dir = Path("wikipages")
# Without this the first write fails with FileNotFoundError on a fresh checkout
pages_dir.mkdir(parents=True, exist_ok=True)

progress = 0
milestone = 10 # percent
total = network.number_of_nodes()
for node in network.nodes(data=True):
    nameURL = node[1]['NameURL']
    try:
        wikitext = get_wikitext(nameURL)
    except Exception as e:
        print(f"Error fetching wikitext for {nameURL}: {e}")
        continue
    with open(pages_dir / f"{nameURL}.txt", 'w', encoding='utf-8') as f:
        f.write(wikitext)
    # Only successfully saved pages count towards the progress report
    progress += 1
    if (progress / total) * 100 >= milestone:
        print(f"Progress: {progress}/{total} ({(progress / total) * 100:.2f}%)")
        milestone += 10

In [14]:
# Report every node whose page file is missing from wikipages/
for node_id, attrs in network.nodes(data=True):
    nameURL = attrs['NameURL']
    file_path = Path(f"wikipages/{nameURL}.txt")
    if file_path.exists():
        continue
    print(f"\n❌ Failure: '{file_path}' was NOT found.")


❌ Failure: 'wikipages/Valdemar_IV_Atterdag_of_Denmark.txt' was NOT found.

❌ Failure: 'wikipages/Nicolas_de_Caritat,_marquis_de_Condorcet.txt' was NOT found.

❌ Failure: 'wikipages/Donatien_Alphonse_François_de_Sade,_Marquis_de_Sade.txt' was NOT found.

❌ Failure: 'wikipages/Geoffrey_V_Plantagenet,_Count_of_Anjou.txt' was NOT found.

❌ Failure: 'wikipages/Charles_the_Bald,_Holy_Roman_Emperor.txt' was NOT found.

❌ Failure: 'wikipages/Clemens_Maria_Wenzeslaus_von_Brentano.txt' was NOT found.

❌ Failure: 'wikipages/Dimitris_Kraniotris.txt' was NOT found.

❌ Failure: 'wikipages/Edgar_the_Peaceful_of_England.txt' was NOT found.

❌ Failure: 'wikipages/Arnulf_of_Carinthia,_Holy_Roman_Emperor.txt' was NOT found.


In [None]:
# Pages that could not be fetched under their dataset name, mapped by hand to the
# title under which the page is fetched instead.
old_to_new_nameURL = {
    "Valdemar_IV_Atterdag_of_Denmark": "Valdemar_IV_of_Denmark",
    "Nicolas_de_Caritat,_marquis_de_Condorcet": "Marquis_de_Condorcet",
    "Donatien_Alphonse_François_de_Sade,_Marquis_de_Sade": "Marquis_de_Sade",
    "Geoffrey_V_Plantagenet,_Count_of_Anjou": "Geoffrey_Plantagenet,_Count_of_Anjou",
    "Charles_the_Bald,_Holy_Roman_Emperor": "Charles_the_Bald",
    "Clemens_Maria_Wenzeslaus_von_Brentano" : "Clemens_Brentano",
    "Dimitris_Kraniotris": "Dimitris_P._Kraniotis",
    "Edgar_the_Peaceful_of_England": "Edgar,_King_of_England",
    "Arnulf_of_Carinthia,_Holy_Roman_Emperor": "Arnulf_of_Carinthia"
}

# Index the graph once instead of rescanning it for every entry.
# setdefault keeps the first node when a NameURL occurs more than once.
node_id_by_nameURL = {}
for n, attr in network.nodes(data=True):
    if 'NameURL' in attr:
        node_id_by_nameURL.setdefault(attr['NameURL'], n)

for nameURL, newNameURL in old_to_new_nameURL.items():
    node_id = node_id_by_nameURL.get(nameURL)
    if node_id is None:
        # Previously this case crashed with KeyError on network.nodes[None]
        print(f"No node with NameURL {nameURL} found in the network; skipping")
        continue
    # Remember the replacement title so later cells can locate the saved page
    network.nodes[node_id]['redirectNameURL'] = newNameURL
    try:
        wikitext = get_wikitext(newNameURL)
        with open(f'wikipages/{newNameURL}.txt', 'w+', encoding='utf-8') as f:
            f.write(wikitext)
    except Exception as e:
        print(f"Error fetching wikitext for {nameURL}: {e}")
        continue

Handle duplicated nodes in nodes_df based on 'NameURL' column

* For most duplicates, remove the first occurrence, because it doesn't have any edges pointing to it

* "James_Connolly" is a special case because the name refers to two different people (one from the UK and one from the US). Since this is ambiguous, we remove both nodes

* "Matthew_Perry" is another special case, because the second node is the correct one

In [24]:
# Rows whose NameURL already appeared earlier in the table (first occurrences are not flagged)
duplicate_mask = nodes_df.duplicated(subset=['NameURL'])
duplicated_index = nodes_df.index[duplicate_mask]
nodes_df.loc[duplicated_index]

Unnamed: 0.1,Unnamed: 0,Id,en_curid,Name,NameURL,Link,birthcity,countryName,countryCode_alpha2,countryCode_alpha3,...,harmonicClosenessCentrality,betweenessCentrality,authority,hub,indegree,outdegree,degree,clustering,eigenCentrality,BCI
2025,2026,2025,157272,Alexandre Dumas,Alexandre_Dumas,https://en.wikipedia.org/wiki/Alexandre_Dumas,VILLERS-COTTERÌ_TS,FRANCE,FR,FRA,...,0.239137,99812950000.0,0.000224,0.00011,33,15,48,0.040703,0.055658,0.035416
8039,8040,8039,190391,Dido,Dido,https://en.wikipedia.org/wiki/Dido,LONDON,UNITED KINGDOM,GB,GBR,...,0.237527,0.0,0.0,0.000174,0,14,14,0.076923,0.0,0.001153
8234,8235,8234,46402,Jane Seymour,Jane_Seymour,https://en.wikipedia.org/wiki/Jane_Seymour,LONDON,UNITED KINGDOM,GB,GBR,...,0.213809,5191401000.0,4.6e-05,2.8e-05,14,9,23,0.400735,0.0305,0.007975
9842,9843,9842,2293356,Ben Foster,Ben_Foster,https://en.wikipedia.org/wiki/Ben_Foster,ROYAL LEAMINGTON SPA,UNITED KINGDOM,GB,GBR,...,0.235723,24515050000.0,9.5e-05,0.000108,7,8,15,0.030303,0.011723,0.00257
9880,9881,9880,168309,James Connolly,James_Connolly,https://en.wikipedia.org/wiki/James_Connolly,COWGATE,UNITED KINGDOM,GB,GBR,...,0.232704,4995274000.0,3.1e-05,0.000106,6,5,11,0.088889,0.006121,0.00361
9900,9901,9900,233338,Matthew Perry,Matthew_Perry,https://en.wikipedia.org/wiki/Matthew_Perry,WILLIAMSTOWN,UNITED STATES,US,USA,...,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.001205
10099,10100,10099,1624,Andrew Johnson,Andrew_Johnson,https://en.wikipedia.org/wiki/Andrew_Johnson,RALEIGH,UNITED STATES,US,USA,...,0.229379,27288450000.0,0.000212,0.000132,28,20,48,0.206456,0.082999,0.066761


In [25]:
# Ids of the duplicate entries that are dropped from the graph
remove_duplicate_node_id = [2024, 4987, 7669, 7921, 9900, 7888]

# Remove both James Connolly
james_id = [7656, 9880]

# Sanity check before removal: each dropped duplicate has no outgoing edges or no
# incoming edges in the raw edge list (ids are compared as strings there).
# NOTE(review): the `or` accepts a node that still has edges in one direction —
# confirm this is intended, given the note above about "no edges pointing to it".
for node_id in remove_duplicate_node_id:
    node_key = str(node_id)
    has_no_outgoing = len(edges_df[edges_df['Source'] == node_key]) == 0
    assert has_no_outgoing or len(edges_df[edges_df['Target'] == node_key]) == 0

nodes_to_drop = remove_duplicate_node_id + james_id
network.remove_nodes_from(nodes_to_drop)

In [26]:
# Every remaining node must have a distinct NameURL
nodes_nameURL_list = [attrs['NameURL'] for node_id, attrs in network.nodes(data=True)]
unique_nameURLs = set(nodes_nameURL_list)
assert len(nodes_nameURL_list) == len(unique_nameURLs), "There are still duplicate NameURL entries!"

TODO: 
* Get pages with #REDIRECT

* For every page, get redirect link names and store them as an attribute to get correct links later
{{Redirect|AR Rahman|the surah of the Quran|Ar-Rahman|other uses|Al rahman (disambiguation)}}

* Handle not found pages manually and find new wiki pages

* Create new attributes, newName and newLink for this

* Update edges, check all redirect link names and new names

In [31]:
# Matches a wikitext redirect stub such as "#REDIRECT [[Target]]".
# group(1) is the text between the double brackets (non-greedy), exactly as written.
regex = r"#REDIRECT\s*\[\[(.*?)\]\]"


In [None]:
# Replace saved redirect stubs with the content of the page they point to.
# The redirect target is stored on the node as 'redirectNameURL'.
for node_id, attrs in list(network.nodes(data=True)):
    nameURL = attrs['NameURL']
    file_path = Path(f"wikipages/{nameURL}.txt")
    if not file_path.exists():
        continue

    # Read and close the file first, so it is not held open during the
    # network request or while it is being rewritten below.
    wikitext = file_path.read_text(encoding='utf-8')
    match = re.search(regex, wikitext, re.IGNORECASE)
    if not match:
        continue

    redirect_title = match.group(1)
    network.nodes[node_id]['redirectNameURL'] = redirect_title
    try:
        wikitext = get_wikitext(redirect_title)
    except Exception as e:
        print(f"Error fetching wikitext for {nameURL}: {e}")
        continue
    # The resolved content is stored under the node's original NameURL
    file_path.write_text(wikitext, encoding='utf-8')

In [34]:
# Check that all redirects are resolved: no saved page may still be a redirect stub
for node_id, attrs in network.nodes(data=True):
    nameURL = attrs['NameURL']
    file_path = Path(f"wikipages/{nameURL}.txt")
    if file_path.exists():
        wikitext = file_path.read_text(encoding='utf-8')
        match = re.search(regex, wikitext, re.IGNORECASE)
        assert not match, f"Redirect still found in {nameURL}"

In [78]:
# Attach the saved page text to each node as the 'wikicontent' attribute.
# The file named after NameURL is preferred; otherwise the one named after
# 'redirectNameURL' is used. Nodes with neither are reported and left without content.
for node_id, attrs in network.nodes(data=True):
    file_path = Path(f"wikipages/{attrs['NameURL']}.txt")
    if not file_path.exists():
        redirect_nameURL = attrs.get('redirectNameURL', None)
        if redirect_nameURL is None:
            print(f"\n❌ Failure: '{file_path}' was NOT found as an attribute.")
            continue
        file_path = Path(f"wikipages/{redirect_nameURL}.txt")
        if not file_path.exists():
            print(f"\n❌ Failure: '{file_path}' was NOT found.")
            continue
    network.nodes[node_id]['wikicontent'] = file_path.read_text(encoding='utf-8')

Handle cases like Aaron Johnson or Fabio, where several people share the same name but have different Wikipedia pages

* Also check if there are pages with short length, maybe a problem there too

In [79]:
# this is just for testing
# Lists every downloaded page with fewer than `min_word_count` words and dumps
# the text of those pages into one report file for manual inspection.

directory_path = Path("wikipages")
# Sorted so that the report and the printed warnings have a stable order
files_found_path = sorted(directory_path.glob("*.txt"))
output_path = Path("wikipages_less300.txt")
min_word_count = 300
# Built outside the f-string: nesting the same quote type inside an f-string
# is a syntax error before Python 3.12
separator = "+" * 300
ambiguation_list_by_word = []
# Opened once in write mode so that re-running the cell replaces the report
# instead of appending a second copy of every page to it
with output_path.open("w", encoding="utf-8") as output_file:
    for file_path in files_found_path:
        try:
            content = file_path.read_text(encoding='utf-8', errors='replace')
            word_count = len(content.split())
            if word_count < min_word_count:
                ambiguation_list_by_word.append(file_path)
                output_file.write(f"{content}\n{separator}\n{separator}\n")
                print(f"❌ Warning: '{file_path}' has only {word_count} words.")
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
print(f"\nTotal files with less than {min_word_count} words: {len(ambiguation_list_by_word)}")


Total files with less than 300 words: 42


In [80]:
# Matches a wikitext template call "{{...}}" whose name either ends in "disambiguation"
# or is one of: hndis, given name, surname, disambig, hndisambig, hndab.
# The name may be followed by "|arguments" before the closing "}}".
disambiguation_regex = r"\{\{\s*(?:[^|}]*disambiguation|hndis|given name|surname|disambig|hndisambig|hndab)(?:\|.*?)?\}\}"

In [81]:
# Lists every downloaded page that contains a disambiguation-style template and
# dumps the text of those pages into one report file for manual inspection.
directory_path = Path("wikipages")
# Sorted so that the report and the printed warnings have a stable order
files_found_path = sorted(directory_path.glob("*.txt"))
output_path = Path("wikipages_disambiguatiion_template.txt")
# Built outside the f-string: nesting the same quote type inside an f-string
# is a syntax error before Python 3.12
separator = "+" * 300
ambiguation_list_by_template = []
# Opened once in write mode so that re-running the cell replaces the report
# instead of appending a second copy of every page to it
with output_path.open("w", encoding="utf-8") as output_file:
    for file_path in files_found_path:
        try:
            content = file_path.read_text(encoding='utf-8', errors='replace')
            word_count = len(content.split())
            if re.search(disambiguation_regex, content, re.IGNORECASE):
                ambiguation_list_by_template.append(file_path)
                output_file.write(f"{content}\n{separator}\n{separator}\n")
                print(f"❌ Warning: '{file_path}' contains a disambiguation template. Word count: {word_count}")
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            continue
print(f"\nTotal files with disambiguation templates: {len(ambiguation_list_by_template)}")


Total files with disambiguation templates: 46


In [82]:
# Number of nodes before the disambiguation pages are removed
network.number_of_nodes()

11332

In [None]:
# this is just for testing
# Short pages that were NOT flagged by the disambiguation template search
short_files = set(ambiguation_list_by_word)
template_files = set(ambiguation_list_by_template)
diff = short_files - template_files
print(f"\nFiles with less than 300 words but no disambiguation template: {len(diff)}")
for short_file in diff:
    print(f" - {short_file}")


Files with less than 300 words but no disambiguation template: 4
 - wikipages/Theuderic_IV.txt
 - wikipages/Kallikrates.txt
 - wikipages/Benjamin_Angus_Wright.txt
 - wikipages/Klas_Pontus_Arnoldson.txt


On inspection, these pages are correct; they are just short.

In [86]:
# Drop every node whose page text contains a disambiguation-style template.
# The node list is snapshotted first because nodes are removed while looping.
removed_nodes = []
nodes = list(network.nodes(data=True))
for node_id, attrs in nodes:
    content = attrs.get('wikicontent', '')
    if re.search(disambiguation_regex, content, re.IGNORECASE) is None:
        continue
    word_count = len(content.split())
    nameURL = attrs['NameURL']
    print(f"Node ID {node_id} with NameURL {nameURL} word count {word_count} contains a disambiguation template.")
    removed_nodes.append((node_id, nameURL))
    network.remove_node(node_id)
print(f"\nTotal nodes removed due to disambiguation templates: {len(removed_nodes)}")

Node ID 80 with NameURL Abdul_Qadir word count 1459 contains a disambiguation template.
Node ID 95 with NameURL Baldwin_Spencer word count 26 contains a disambiguation template.
Node ID 126 with NameURL Lisandro_López word count 21 contains a disambiguation template.
Node ID 277 with NameURL Mark_Webber word count 48 contains a disambiguation template.
Node ID 699 with NameURL Daniel_Alves word count 34 contains a disambiguation template.
Node ID 714 with NameURL Fabio word count 725 contains a disambiguation template.
Node ID 791 with NameURL Zico word count 160 contains a disambiguation template.
Node ID 967 with NameURL Nani word count 166 contains a disambiguation template.
Node ID 1174 with NameURL Nikola_Kalinić word count 35 contains a disambiguation template.
Node ID 1381 with NameURL Frederik,_Crown_Prince_of_Denmark word count 124 contains a disambiguation template.
Node ID 1481 with NameURL Omar_Suleiman word count 50 contains a disambiguation template.
Node ID 2549 with Nam

In [87]:
# Number of nodes left after the disambiguation pages were removed
network.number_of_nodes()

11286

# Save network

In [88]:
# Write the cleaned network to a GraphML file in the working directory
nx.write_graphml(network, "celebrities_clean.graphml")

In [89]:
# Copy of the network with the 'wikicontent' attribute removed from every node
network_without_content = network.copy()
for node_id, attrs in network_without_content.nodes(data=True):
    attrs.pop('wikicontent', None)

In [92]:
# Write the copy without page text to its own GraphML file
nx.write_graphml(network_without_content, "celebrities_clean_without_content.graphml")

# Update edges based on new wikipages