In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
def load_data(file_path):
    columns = None
    data = []
    with open(file_path) as file:
        while line := file.readline():
            line = line.rstrip()
            if line.startswith("# FORMAT:") and not columns:
                data_str = line.split("# FORMAT:   ")[1] if line.startswith("# FORMAT:   ") else "value"
                columns = data_str.split("   ")
            elif line.startswith("# FORMAT:"):
                raise RuntimeError()
            elif len(line) > 0 and not line.startswith("#"):
                data_line = line.split("	")
                assert len(data_line) == len(columns)
                data.append(data_line)
    
    df = pd.DataFrame(data, columns=columns)
    
    return df

In [3]:
paths_and_graph = {}

with os.scandir("data/wikispeedia_paths-and-graph") as it:
    for entry in it:
        if (entry.name.endswith(".tsv") or entry.name.endswith(".txt")) and entry.is_file():
            key = entry.name.split(".")[0]
            paths_and_graph[key] = load_data(entry.path)
            
paths_and_graph

{'shortest-path-distance-matrix':                                                   value
 0     0_____33333325634333435_2433544334_3_422343544...
 1     _0____22222325623232424_2422544324_3_312242544...
 2     __0___33222425623232324_2333444433_3_422343434...
 3     ___0__33333325634233334_2433434333_2_423343433...
 4     ____0_22323335633332435_2433545434_3_423343544...
 ...                                                 ...
 4599  ______22222325622231424_1322544334_3_422232544...
 4600  ______33333434523232434_2332544324_3_323333544...
 4601  ______22222424522231434_2322545434_3_422232544...
 4602  ______33333436733342435_2433545444_3_523353544...
 4603  ______22233325623232434_2423544334_3_423343544...
 
 [4604 rows x 1 columns],
 'paths_finished':         hashedIpAddress   timestamp durationInSec  \
 0      6a3701d319fc3754  1297740409           166   
 1      3824310e536af032  1344753412            88   
 2      415612e93584d30e  1349298640           138   
 3      64dd5cd342e37

In [4]:
from urllib.parse import unquote

paths_and_graph["articles"]["article_decoded"] = paths_and_graph["articles"]["article"].apply(unquote)

paths_and_graph["categories"]["article_decoded"] = paths_and_graph["categories"]["article"].apply(unquote)

paths_and_graph["links"]["linkSource_decoded"] = paths_and_graph["links"]["linkSource"].apply(unquote)
paths_and_graph["links"]["linkTarget_decoded"] = paths_and_graph["links"]["linkTarget"].apply(unquote)

In [5]:
paths_and_graph["shortest-path-distance-matrix"] = np.array(list(map(
        lambda s: np.array(list(map(lambda e: np.NaN if e == "_" else int(e), list(s)))),
        paths_and_graph["shortest-path-distance-matrix"].value.values
    ))
)

paths_and_graph["shortest-path-distance-matrix"]

array([[ 0., nan, nan, ...,  4.,  4.,  2.],
       [nan,  0., nan, ...,  3.,  3.,  3.],
       [nan, nan,  0., ...,  3.,  3.,  3.],
       ...,
       [nan, nan, nan, ...,  0.,  3.,  3.],
       [nan, nan, nan, ...,  4.,  0.,  3.],
       [nan, nan, nan, ...,  3.,  3.,  0.]])

In [6]:
paths_and_graph["shortest-path-distance-matrix"].shape

(4604, 4604)

In [7]:
paths_and_graph["paths_finished"]

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3
...,...,...,...,...,...
51313,15a13a1d66ef5456,1349231015,66,Yagan;Ancient_Egypt;Civilization,
51314,2ef7ac844cefda58,1300254138,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,3
51315,12863abb7887f890,1385095372,228,Yagan;Australia;England;France;United_States;T...,
51316,19f8284371753362,1298792567,56,Yarralumla%2C_Australian_Capital_Territory;Aus...,1


In [8]:
pd.DataFrame(
    data={
        "key": list(paths_and_graph.keys()),
        "shape": list(map(lambda d: d.shape, paths_and_graph.values()))
    }
)

Unnamed: 0,key,shape
0,shortest-path-distance-matrix,"(4604, 4604)"
1,paths_finished,"(51318, 5)"
2,articles,"(4604, 2)"
3,paths_unfinished,"(24875, 6)"
4,links,"(119882, 4)"
5,categories,"(5204, 3)"


In [9]:
paths_and_graph["links"]

Unnamed: 0,linkSource,linkTarget,linkSource_decoded,linkTarget_decoded
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Bede,Áedán_mac_Gabráin,Bede
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Columba,Áedán_mac_Gabráin,Columba
2,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,D%C3%A1l_Riata,Áedán_mac_Gabráin,Dál_Riata
3,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Great_Britain,Áedán_mac_Gabráin,Great_Britain
4,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Ireland,Áedán_mac_Gabráin,Ireland
...,...,...,...,...
119877,Zulu,South_Africa,Zulu,South_Africa
119878,Zulu,Swaziland,Zulu,Swaziland
119879,Zulu,United_Kingdom,Zulu,United_Kingdom
119880,Zulu,Zambia,Zulu,Zambia


In [10]:
from lxml.html import parse

def extract_links(page_path):
    doc = parse(page_path).getroot()
    
    return [
        (a.text_content(), href.split("/")[-1].split(".htm")[0], unquote(href.split("/")[-1].split(".htm")[0])) 
        for a in doc.cssselect('a') 
        if (href := a.get('href'))
    ]

In [11]:
# traverse root directory, and list directories as dirs and files as files
articles_data = {}
for root, dirs, files in os.walk("data/wpcd/wp"):
    path = root.split(os.sep)
    for file in files:
        if file.endswith(".htm"):
            article_name = file.split(".")[0]
            article_links = extract_links(os.path.join(root, file))
            
            articles_data[article_name] = {
                "links": article_links
            }

In [12]:
articles_data = pd.DataFrame.from_dict(articles_data, orient="index")

In [13]:
paths_and_graph["articles"] = paths_and_graph["articles"].merge(articles_data, left_on="article", right_index=True)
paths_and_graph["articles"]["num_links"] = paths_and_graph["articles"]["links"].apply(lambda l: len(l))

paths_and_graph["articles"]

Unnamed: 0,article,article_decoded,links,num_links
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Áedán_mac_Gabráin,"[(2007 Schools Wikipedia Selection, index, ind...",21
1,%C3%85land,Åland,"[(2007 Schools Wikipedia Selection, index, ind...",51
2,%C3%89douard_Manet,Édouard_Manet,"[(2007 Schools Wikipedia Selection, index, ind...",57
3,%C3%89ire,Éire,"[(2007 Schools Wikipedia Selection, index, ind...",20
4,%C3%93engus_I_of_the_Picts,Óengus_I_of_the_Picts,"[(2007 Schools Wikipedia Selection, index, ind...",24
...,...,...,...,...
4599,Zionism,Zionism,"[(2007 Schools Wikipedia Selection, index, ind...",129
4600,Zirconium,Zirconium,"[(2007 Schools Wikipedia Selection, index, ind...",68
4601,Zoroaster,Zoroaster,"[(2007 Schools Wikipedia Selection, index, ind...",37
4602,Zuid-Gelders,Zuid-Gelders,"[(2007 Schools Wikipedia Selection, index, ind...",14


In [14]:
articles = paths_and_graph["articles"]
articles.sort_values(by="num_links", ascending=False)

Unnamed: 0,article,article_decoded,links,num_links
3920,Sudan,Sudan,"[(2007 Schools Wikipedia Selection, index, ind...",606
1433,Europe,Europe,"[(2007 Schools Wikipedia Selection, index, ind...",558
2504,List_of_countries,List_of_countries,"[(2007 Schools Wikipedia Selection, index, ind...",558
1694,Germany,Germany,"[(2007 Schools Wikipedia Selection, index, ind...",552
2863,Mozambique,Mozambique,"[(2007 Schools Wikipedia Selection, index, ind...",550
...,...,...,...,...
2351,Klinefelter%27s_syndrome,Klinefelter's_syndrome,"[(2007 Schools Wikipedia Selection, index, ind...",4
4545,Wowpurchase,Wowpurchase,"[(place your order by post, Contact, Contact),...",3
1600,Friend_Directdebit,Friend_Directdebit,[(use this link to making a smaller regular do...,3
1210,Directdebit,Directdebit,"[(children charity, Children_Charity, Children...",2


In [15]:
relations = pd.concat(
    list(map(lambda row: pd.DataFrame(
        data={
            "from": [row.article_decoded]*len(row.links),
            "to": [link[2] for link in row.links]
        }
    ), articles.itertuples(index=False)))
)

relations.drop_duplicates(inplace=True)
relations

Unnamed: 0,from,to
0,Áedán_mac_Gabráin,index
1,Áedán_mac_Gabráin,subject.History.British_History.British_Histor...
2,Áedán_mac_Gabráin,subject.People.Historical_figures
3,Áedán_mac_Gabráin,4250.png
5,Áedán_mac_Gabráin,D%C3%A1l_Riata
...,...,...
16,Zulu,Cape_Town
17,Zulu,AK-47
18,Zulu,Ladysmith_Black_Mambazo
23,Zulu,Wikipedia_Text_of_the_GNU_Free_Documentation_L...


In [16]:
relations.groupby(by="from").count().sort_values("to", ascending=False).head(5)

Unnamed: 0_level_0,to
from,Unnamed: 1_level_1
List_of_countries,491
List_of_circulating_currencies,471
List_of_sovereign_states,423
United_States,334
Africa,288


In [17]:
relations.groupby(by="to").count().sort_values("from", ascending=False).head(15)

Unnamed: 0_level_0,from
to,Unnamed: 1_level_1
index,4536
Wikipedia_Text_of_the_GNU_Free_Documentation_License,4536
disclaimer,4536
United_States,1519
United_Kingdom,962
France,947
Europe,919
England,734
Germany,733
World_War_II,733


In [18]:
relations.query("`from` == `to`")

Unnamed: 0,from,to
24,American_Revolutionary_War,American_Revolutionary_War
10,Athens,Athens
63,Autostereogram,Autostereogram
42,Baltic_Sea,Baltic_Sea
13,Beijing,Beijing
...,...,...
10,War,War
4,Wikimedia_Foundation,Wikimedia_Foundation
3,Wikipedia_Text_of_the_GNU_Free_Documentation_L...,Wikipedia_Text_of_the_GNU_Free_Documentation_L...
84,World_Trade_Organization,World_Trade_Organization
