In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from urllib.parse import urlencode
from urllib.request import urlopen, Request
from pathlib import Path
import re

In [2]:
# Node table: one row per node, default comma-separated layout.
nodes_df = pd.read_csv('Nodes.csv')

# Edge table is semicolon-separated. 'Source' and 'Target' are pinned to str
# because some endpoints are written like "1,00E+03": leaving the dtype to
# inference can produce a column that mixes ints and strings, while the code
# that consumes these columns looks values up and compares them as strings.
edges_df = pd.read_csv('Edges.csv', delimiter=';', dtype={'Source': str, 'Target': str})

In [3]:
# Directed graph that the node and edge tables are loaded into.
network = nx.DiGraph()

# Columns of the node table that are copied onto each graph node as attributes.
features = [
    'Name',
    'NameURL',
    'Link',
    'birthcity',
    'countryName',
    'continentName',
    'birthyear',
    'deathyear',
    'gender',
    'occupation',
    'industry',
]

In [4]:
# Load nodes: every row becomes a node keyed by its 'Id', carrying the
# selected feature columns as node attributes.
network.add_nodes_from(
    (record['Id'], record[features].to_dict())
    for _, record in nodes_df.iterrows()
)

# Handling weird numberings: some edge endpoints are stored as comma-decimal
# scientific notation strings instead of plain integers. This table maps each
# such string to the plain integer string it stands for; the edge-loading loop
# consults it before converting the endpoint with int().
number_formatted_nodes = {
    "1,00E+03": "1000",
    "2,00E+03": "2000",
    "3,00E+03": "3000",
    "4,00E+03": "4000",
    "5,00E+03": "5000",
    "6,00E+03": "6000",
    "7,00E+03": "7000",
    "8,00E+03": "8000",
    "9,00E+03": "9000",
    "1,00E+04": "10000",
    "1,10E+04": "11000"
}

# Load edges: translate endpoints written in scientific notation, then add the
# directed edge using integer node IDs. A row that cannot be converted is
# reported and skipped rather than aborting the whole load.
for _, row in edges_df.iterrows():
    try:
        # Fall back to the raw value when it is not one of the mangled IDs.
        source_id = number_formatted_nodes.get(row['Source'], row['Source'])
        target_id = number_formatted_nodes.get(row['Target'], row['Target'])
        network.add_edge(int(source_id), int(target_id))
    except Exception as e:
        print(f"Error adding edge from {row['Source']} to {row['Target']}: {e}")

In [5]:
def get_wikitext(title, timeout=30):
    """Fetch the raw wikitext of an English Wikipedia page via the MediaWiki API.

    Args:
        title: Page title to look up (sent as the ``titles`` query parameter).
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The main-slot content of the revision returned for the page, as a string.

    Raises:
        KeyError: If the response carries no revision content for ``title``
            (for example when the page does not exist).
        OSError: If the request fails or times out (``urllib.error.URLError``
            is a subclass).
    """
    baseurl = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
        "format": "json"
    }
    query = f"{baseurl}?{urlencode(params)}"
    request = Request(query, headers={'User-Agent': 'Mozilla/5.0'})

    # Context manager so the connection is released even if reading fails.
    with urlopen(request, timeout=timeout) as response:
        payload = json.loads(response.read().decode("utf-8"))

    # Only one title is requested, so the first entry is the page we asked for.
    pages = payload["query"]["pages"]
    page = next(iter(pages.values()), {})
    try:
        return page['revisions'][0]['slots']['main']['*']
    except (KeyError, IndexError) as exc:
        # Name the page in the error instead of surfacing a bare 'revisions' key.
        raise KeyError(f"No wikitext found for page {title!r}") from exc

In [6]:
# NOTE: disabled one-off download loop, kept for reference only.
# When enabled it fetches the wikitext for every node's 'NameURL' with
# get_wikitext, stores it on the node as 'content', and writes it to
# wikipages/<NameURL>.txt, printing progress at 10% milestones.
# The `progress < 8500` guard skips the first 8500 nodes - presumably to resume
# an interrupted run; confirm (and reset it) before re-enabling.
# progress = 0
# milestone = 10 # percent
# milestone = 70 # delete later
# total = network.number_of_nodes()
# for node in network.nodes(data=True):
#     if progress < 8500:
#         progress += 1
#         continue
#     title = node[1]['NameURL']
#     try:
#         wikitext = get_wikitext(title)
#     except Exception as e:
#         print(f"Error fetching wikitext for {title}: {e}")
#         continue
#     network.nodes[node[0]]['content'] = wikitext
#     with open(f'wikipages/{node[1]["NameURL"]}.txt', 'w+', encoding='utf-8') as f:
#         f.write(wikitext)
#     progress += 1
#     if (progress / total) * 100 >= milestone:
#         print(f"Progress: {progress}/{total} ({(progress / total) * 100:.2f}%)")
#         milestone += 10

In [7]:
# Report every node whose wikitext file is missing from wikipages/
for _, attrs in network.nodes(data=True):
    title = attrs['NameURL']
    file_path = Path(f"wikipages/{title}.txt")
    if file_path.exists():
        continue
    print(f"\n❌ Failure: '{file_path}' was NOT found.")


❌ Failure: 'wikipages/Milo_Đukanović.txt' was NOT found.

❌ Failure: 'wikipages/Mirko_Vučinić.txt' was NOT found.

❌ Failure: 'wikipages/Mobutu_Sese_Seko.txt' was NOT found.

❌ Failure: 'wikipages/Mustafa_III.txt' was NOT found.

❌ Failure: 'wikipages/Ne_Win.txt' was NOT found.

❌ Failure: 'wikipages/Nick_Drake.txt' was NOT found.

❌ Failure: 'wikipages/Niels_Ryberg_Finsen.txt' was NOT found.

❌ Failure: 'wikipages/Nikephoros_I.txt' was NOT found.

❌ Failure: 'wikipages/Omar_Bongo.txt' was NOT found.

❌ Failure: 'wikipages/Predrag_Mijatović.txt' was NOT found.

❌ Failure: 'wikipages/Radovan_Karadžić.txt' was NOT found.

❌ Failure: 'wikipages/Raymond_Barre.txt' was NOT found.

❌ Failure: 'wikipages/Roland_Georges_Garros.txt' was NOT found.

❌ Failure: 'wikipages/Stefan_Nemanja.txt' was NOT found.

❌ Failure: 'wikipages/Stevan_Jovetić.txt' was NOT found.

❌ Failure: 'wikipages/Steve_Mandanda.txt' was NOT found.

❌ Failure: 'wikipages/Than_Shwe.txt' was NOT found.

❌ Failure: 'wikipages/

Handle duplicated nodes in `nodes_df`, identified by repeated values in the `NameURL` column

* For most duplicates, remove the first occurrence, because it doesn't have any edges pointing to it

* "James_Connolly" is a special case because there is a UK one and a US one sharing the same NameURL. This is ambiguous, so we remove both

* "Matthew_Perry" is another special case, because the second node is the correct one

In [8]:
# Flag rows whose NameURL already appeared earlier in the table
# (the first occurrence of each NameURL is not flagged).
is_repeat = nodes_df.duplicated(subset=['NameURL'], keep='first')
duplicated_index = nodes_df.index[is_repeat]
nodes_df.loc[duplicated_index]

Unnamed: 0.1,Unnamed: 0,Id,en_curid,Name,NameURL,Link,birthcity,countryName,countryCode_alpha2,countryCode_alpha3,...,harmonicClosenessCentrality,betweenessCentrality,authority,hub,indegree,outdegree,degree,clustering,eigenCentrality,BCI
2025,2026,2025,157272,Alexandre Dumas,Alexandre_Dumas,https://en.wikipedia.org/wiki/Alexandre_Dumas,VILLERS-COTTERÌ_TS,FRANCE,FR,FRA,...,0.239137,99812950000.0,0.000224,0.00011,33,15,48,0.040703,0.055658,0.035416
8039,8040,8039,190391,Dido,Dido,https://en.wikipedia.org/wiki/Dido,LONDON,UNITED KINGDOM,GB,GBR,...,0.237527,0.0,0.0,0.000174,0,14,14,0.076923,0.0,0.001153
8234,8235,8234,46402,Jane Seymour,Jane_Seymour,https://en.wikipedia.org/wiki/Jane_Seymour,LONDON,UNITED KINGDOM,GB,GBR,...,0.213809,5191401000.0,4.6e-05,2.8e-05,14,9,23,0.400735,0.0305,0.007975
9842,9843,9842,2293356,Ben Foster,Ben_Foster,https://en.wikipedia.org/wiki/Ben_Foster,ROYAL LEAMINGTON SPA,UNITED KINGDOM,GB,GBR,...,0.235723,24515050000.0,9.5e-05,0.000108,7,8,15,0.030303,0.011723,0.00257
9880,9881,9880,168309,James Connolly,James_Connolly,https://en.wikipedia.org/wiki/James_Connolly,COWGATE,UNITED KINGDOM,GB,GBR,...,0.232704,4995274000.0,3.1e-05,0.000106,6,5,11,0.088889,0.006121,0.00361
9900,9901,9900,233338,Matthew Perry,Matthew_Perry,https://en.wikipedia.org/wiki/Matthew_Perry,WILLIAMSTOWN,UNITED STATES,US,USA,...,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0,0.001205
10099,10100,10099,1624,Andrew Johnson,Andrew_Johnson,https://en.wikipedia.org/wiki/Andrew_Johnson,RALEIGH,UNITED STATES,US,USA,...,0.229379,27288450000.0,0.000212,0.000132,28,20,48,0.206456,0.082999,0.066761


In [9]:
# Node IDs to drop for NameURLs that occur more than once in the node table.
# NOTE(review): 9900 appears to be the *second* Matthew_Perry row in the
# duplicate listing, while the accompanying notes call the second one the
# correct one - confirm that this is the ID meant to be removed.
remove_duplicate_node_id = [2024, 4987, 7669, 7921, 9900, 7888]

# Remove both James Connolly
james_id = [7656, 9880]

# Compare endpoints as strings regardless of how the columns were parsed, so
# the sanity check below cannot pass vacuously on integer-typed values.
edge_sources = edges_df['Source'].astype(str)
edge_targets = edges_df['Target'].astype(str)

for node_id in remove_duplicate_node_id:
    out_edges = int((edge_sources == str(node_id)).sum())
    in_edges = int((edge_targets == str(node_id)).sum())
    # NOTE(review): this passes when the node lacks outgoing OR incoming edges;
    # the notes only mention "no edges pointing to it" - confirm the intent.
    assert out_edges == 0 or in_edges == 0, (
        f"Node {node_id} has {out_edges} outgoing and {in_edges} incoming edges; "
        "expected at least one direction to be empty before removal"
    )

# remove_nodes_from ignores IDs that are already gone, so re-running is safe.
network.remove_nodes_from(remove_duplicate_node_id + james_id)

In [10]:
# Every node left in the graph must carry a distinct NameURL
nodes_nameURL_list = [attrs['NameURL'] for _, attrs in network.nodes(data=True)]
unique_name_urls = set(nodes_nameURL_list)
assert len(unique_name_urls) == len(nodes_nameURL_list), "There are still duplicate NameURL entries!"

TODO: 
* Get pages with #REDIRECT

* For every page, get redirect link names and store them as an attribute to get correct links later
{{Redirect|AR Rahman|the surah of the Quran|Ar-Rahman|other uses|Al rahman (disambiguation)}}

* Handle not found pages manually and find new wiki pages

* Create new attributes, newName and newLink for this

* Update edges, check all redirect link names and new names

In [11]:
# Redirect stub pattern. MediaWiki accepts "#REDIRECT[[Target]]" and
# "#REDIRECT: [[Target]]" as well as the spaced form, so the whitespace and the
# colon after the keyword are optional. Group 1 is the raw link text and may
# still include a "#section" or "|label" part.
regex = r"#REDIRECT\s*:?\s*\[\[(.*?)\]\]"

# Snapshot the node view so attributes can be written while iterating.
for node_id, attrs in list(network.nodes(data=True)):
    title = attrs['NameURL']
    file_path = Path(f"wikipages/{title}.txt")
    if not file_path.exists():
        # No cached page for this node; nothing to inspect.
        continue
    wikitext = file_path.read_text(encoding='utf-8')
    match = re.search(regex, wikitext, re.IGNORECASE)
    if match:
        # Remember where the cached page redirects to.
        redirect_title = match.group(1)
        network.nodes[node_id]['redirectTitle'] = redirect_title
        # try:
        #     wikitext = get_wikitext(redirect_title)
        # except Exception as e:
        #     print(f"Error fetching wikitext for {title}: {e}")
        #     continue
        # with open(f'wikipages/{title}.txt', 'w', encoding='utf-8') as f:
        #     f.write(wikitext)

In [14]:
# Check if all redirects are resolved: no cached page may still match the
# redirect pattern (`regex`, defined in the redirect-detection cell).
for _, attrs in network.nodes(data=True):
    title = attrs['NameURL']
    file_path = Path(f"wikipages/{title}.txt")
    if file_path.exists():
        wikitext = file_path.read_text(encoding='utf-8')
        match = re.search(regex, wikitext, re.IGNORECASE)
        assert match is None, f"Redirect still found in {title}"

<bound method Graph.number_of_nodes of <networkx.classes.digraph.DiGraph object at 0x15a851e80>>