In [11]:
import os
import glob
import pandas as pd

# Tab-separated link table; loaded below with pandas as (linkSource, linkTarget) pairs.
links_path = "./Data/links.tsv"
# Root directory of the article pages; the checks below look for files at
# <articles_path>/<subfolder>/<article>.htm.
articles_path = "./Data/wp/"

In [12]:
# Page names that are excluded from the link table, whether they appear as the
# source or the target of a link.
# NOTE(review): the original list contained "Sponsorship_Directdebit" twice; the
# redundant entry is removed here. If a different page name was intended for the
# second entry, add it to this list.
non_wiki_articles = [
    "Friend_Directdebit",
    "Sponsorship_Directdebit",
]

# The file has no header row, so the two column names are supplied explicitly.
links_raw = pd.read_csv(
    links_path, sep="\t", header=None, names=["linkSource", "linkTarget"]
)

# A link is excluded when either of its endpoints is one of the non-wiki pages.
touches_non_wiki = links_raw["linkSource"].isin(non_wiki_articles) | links_raw[
    "linkTarget"
].isin(non_wiki_articles)

# Filter and drop incomplete rows in one non-mutating chain (no inplace=True on a
# filtered slice). Rows with a missing source or target are removed; presumably
# these are the file's leading non-data lines -- confirm against links.tsv.
df = links_raw[~touches_non_wiki].dropna()
df

Unnamed: 0,linkSource,linkTarget
11,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Bede
12,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Columba
13,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,D%C3%A1l_Riata
14,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Great_Britain
15,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Ireland
...,...,...
119888,Zulu,South_Africa
119889,Zulu,Swaziland
119890,Zulu,United_Kingdom
119891,Zulu,Zambia


In [13]:
# Every article name referenced by the link table, as either endpoint of a link.
articles = set(df["linkSource"]) | set(df["linkTarget"])


def _article_htm_path(article):
    """Return the expected page path for an article.

    The path is <articles_path>/<lower-cased first character of the name>/<name>.htm.
    """
    return os.path.join(articles_path, article[0].lower(), article + ".htm")


# Articles that have no .htm file with the same name. The names come from a set,
# whose iteration order for strings is not stable across kernel restarts, so the
# result is sorted to keep the printed report reproducible.
missing_articles = sorted(
    article for article in articles if not os.path.exists(_article_htm_path(article))
)

if missing_articles:
    print("The following articles are missing:")
    print(missing_articles)
else:
    print("All articles are present.")

All articles are present.


In [7]:
# get all .htm files one level below the articles_path directory
htm_files = glob.glob(os.path.join(articles_path, "*", "*.htm"))

# Extract the article name (file name without extension) from every path and,
# while the directory part is still available, remember which names live in the
# "index" subfolder. The original check tested `article.startswith("index/")` on
# names that had already been reduced with os.path.basename, so it could never
# match and the resulting list was always empty.
articles_in_folders = set()
articles_in_index = set()
for htm_file in htm_files:
    folder, file_name = os.path.split(htm_file)
    article_name = os.path.splitext(file_name)[0]
    articles_in_folders.add(article_name)
    if os.path.basename(folder) == "index":
        articles_in_index.add(article_name)

# find articles that are in the folders but not in the dataset
linked_articles = set(df["linkSource"]) | set(df["linkTarget"])
missing_articles = articles_in_folders - linked_articles

if not missing_articles:
    print("All articles are present.")
else:
    print(f"{len(missing_articles)} articles are missing:")
    print(missing_articles)

    # get articles that are missing and in the index subfolder
    # (sorted so the list is reproducible across runs)
    missing_articles_in_index = sorted(missing_articles & articles_in_index)

873 articles are missing:
{'j-list', 'Winter2005_Letter', 'Child_Photo_41', 'Pakistan_Earthquake_Third_Report', 'Why_I_Will_Remember_This_Charity', 'subject.Everyday_life.Cartoons', 'Women_Lives_India', 'Peru_A', 'Aids_Zimbabwe_Africa', 'Fa_Lesotho', 'Prediction_Game', 'subject.Science.Chemistry.General_Chemistry', 'subject.Science.Biology.Mammals', 'Ethiopia_Flooding_031106', 'Europe_A', 'Mozambique_Cyclone_230207', 'Panama_A', 'Autumn_06_Letter', 'Schatzki_ring', 'Directdebit_B', 'Asia_A', 't', 'Justgiving', 'Demographics_of_Egypt', 'Child_Photo_37', 'Annual_Review_2005_Page_5', 'Demographics_of_Gabon', 'Kosovo_050207', 'Sos_Children', 'q-list', 'q', 'Tbilisi', 'Wowpurchase_B', 'Demographics_of_Rwanda', 'Yugoslavia_A', 'Annual_Review_2005_Page_7', 'subject.History.World_War_II', 'subject.Religion.Philosophy', 'Honduras_A', 'Aids_Somalia_Africa', 'Swaziland_A', 'Tsunami_One_Year_On_India_2', 'Classicresponse', 'Six_Villages_Ku_World_Record', 'Lebanon_News_240706', 'Demographics_of_Syr

In [14]:
from helpers import get_order


def _order_of_link(link):
    """Return get_order's result for one row of the link table.

    The row's source name, its target name and the articles directory are
    forwarded unchanged.
    NOTE(review): presumably the value is the position of the target link inside
    the source article -- confirm against helpers.get_order.
    """
    return get_order(link["linkSource"], link["linkTarget"], articles_path)


# Evaluate the helper once per row (axis=1) and store the results as a new column.
link_orders = df.apply(_order_of_link, axis=1)
df["order"] = link_orders

# Display the updated DataFrame
print(df)

%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%85land
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89douard_Manet
%C3%89ire
%C3%89ire
%C3%89ire
%C3%89ire
%C3%89ir

KeyboardInterrupt: 

In [19]:
# Keep the annotated table under its own name, as an independent copy of df.
df_with_order = df.copy()

# Persist it as tab-separated text; the row index is not written to the file.
order_output_path = "./Data/links_with_order.tsv"
df_with_order.to_csv(order_output_path, index=False, sep="\t")

In [24]:
# Links whose "order" value is missing (NaN).
order_is_missing = df_with_order["order"].isna()
df_with_order[order_is_missing]

Unnamed: 0,linkSource,linkTarget,order
42608,Genghis_Khan,Genghis_Khan,
64657,List_of_European_Union_member_states_by_accession,Germany,


In [26]:
# Spot-check two sources: one of the excluded non-wiki pages and a regular article.
# A cell only renders its final expression, so the first lookup was previously
# computed and silently discarded; it is now shown explicitly with display(),
# which IPython makes available in notebook sessions without an import.
display(df_with_order[df_with_order["linkSource"] == "Friend_Directdebit"])
df_with_order[df_with_order["linkSource"] == "Zulu"]

Unnamed: 0,linkSource,linkTarget,order
119878,Zulu,AK-47,15.0
119879,Zulu,Bantu,4.0
119880,Zulu,Cape_Town,14.0
119881,Zulu,Christianity,3.0
119882,Zulu,English_language,1.0
119883,Zulu,History_of_South_Africa,9.0
119884,Zulu,Ladysmith_Black_Mambazo,16.0
119885,Zulu,Mozambique,8.0
119886,Zulu,Portuguese_language,2.0
119887,Zulu,Shaka,10.0
