In [1]:
%load_ext autoreload
%autoreload 2

import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm

from utils.loading import load_paths_and_graph

In [2]:
paths_and_graph = load_paths_and_graph()

loading raw data from tsv files...
formatting data...
extracting additional information from source code...


In [3]:
articles = paths_and_graph["articles"]
articles.sort_values(by="num_links", ascending=False)

Unnamed: 0,article,article_decoded,links,num_links
3920,Sudan,Sudan,"[(2007 Schools Wikipedia Selection, index, ind...",606
1433,Europe,Europe,"[(2007 Schools Wikipedia Selection, index, ind...",558
2504,List_of_countries,List_of_countries,"[(2007 Schools Wikipedia Selection, index, ind...",552
1694,Germany,Germany,"[(2007 Schools Wikipedia Selection, index, ind...",551
2863,Mozambique,Mozambique,"[(2007 Schools Wikipedia Selection, index, ind...",550
...,...,...,...,...
2351,Klinefelter%27s_syndrome,Klinefelter's_syndrome,"[(2007 Schools Wikipedia Selection, index, ind...",4
4545,Wowpurchase,Wowpurchase,"[(place your order by post, Contact, Contact),...",3
1600,Friend_Directdebit,Friend_Directdebit,[(use this link to making a smaller regular do...,3
1231,Donation,Donation,"[(children charity, Children_Charity, Children...",2


In [4]:
hyperlinks = pd.concat(
    list(map(
        lambda row: pd.DataFrame(
            data={"from": [row.article_decoded]*row.num_links, 
            "to": [link[2] for link in row.links]}
        ) if row.num_links > 0 else pd.DataFrame(), 
        articles.itertuples(index=False)
    ))
)

hyperlinks.drop_duplicates(inplace=True)
hyperlinks

Unnamed: 0,from,to
0,Áedán_mac_Gabráin,index
1,Áedán_mac_Gabráin,subject.History.British_History.British_Histor...
2,Áedán_mac_Gabráin,subject.People.Historical_figures
3,Áedán_mac_Gabráin,4250.png
5,Áedán_mac_Gabráin,Dál_Riata
...,...,...
16,Zulu,Cape_Town
17,Zulu,AK-47
18,Zulu,Ladysmith_Black_Mambazo
23,Zulu,Wikipedia_Text_of_the_GNU_Free_Documentation_L...


In [5]:
hyperlinks.groupby(by="from").count().sort_values("to", ascending=False).head(5)

Unnamed: 0_level_0,to
from,Unnamed: 1_level_1
List_of_countries,486
List_of_circulating_currencies,471
List_of_sovereign_states,422
United_States,334
Africa,288


In [6]:
hyperlinks.groupby(by="to").count().sort_values("from", ascending=False).head(15)

Unnamed: 0_level_0,from
to,Unnamed: 1_level_1
index,4599
disclaimer,4599
Wikipedia_Text_of_the_GNU_Free_Documentation_License,4599
United_States,1551
United_Kingdom,972
France,959
Europe,933
England,751
World_War_II,751
Germany,743


In [7]:
outer_join = articles.merge(hyperlinks, left_on="article_decoded", right_on="to", how="outer")

not_reachable_articles = outer_join[outer_join.to.isna()][articles.columns]
no_matched_article = outer_join[outer_join.article_decoded.isna()][hyperlinks.columns]

display(not_reachable_articles.article_decoded)

# remove not interesting pages
no_matched_article = no_matched_article[~no_matched_article.to.str.endswith(".jpg")]
no_matched_article = no_matched_article[~no_matched_article.to.str.endswith(".png")]
no_matched_article = no_matched_article[~no_matched_article.to.str.endswith(".gif")]
no_matched_article = no_matched_article[~no_matched_article.to.str.endswith(".svg")]
no_matched_article = no_matched_article[~no_matched_article.to.str.endswith(".jpeg")]

no_matched_article = no_matched_article[~no_matched_article.to.str.endswith("_A")]
no_matched_article = no_matched_article[~no_matched_article.to.str.endswith("_B")]

no_matched_article = no_matched_article[~no_matched_article.to.str.startswith("Demographics_of_")]

no_matched_article = no_matched_article[~no_matched_article.to.str.startswith("subject.")]

display(no_matched_article.drop_duplicates("to", keep="first"))

13395     2005_Hertfordshire_Oil_Storage_Terminal_fire
13401                  2005_Lake_Tanganyika_earthquake
35319                                 A._E._J._Collins
35322                                            AC_DC
35433                                 ATLAS_experiment
                              ...                     
157722                                  Yotsuya_Kaidan
157723                            You're_Still_the_One
157806                                    Yungay,_Peru
157933                                      Zara_Yaqob
172687                          €2_commemorative_coins
Name: article_decoded, Length: 463, dtype: object

Unnamed: 0,from,to
37207,AIDS,Aids_Africa
37208,Lusaka,Aids_Zambia_Africa
49873,Bosnia_and_Herzegovina,Bosnia
56610,Directdebit,Children_Charity
60832,Democratic_Republic_of_the_Congo,Congo
60965,Wowpurchase,Contact
61538,Abidjan,Cote_Divoire
63190,Czech_Republic,Czech
73776,Abuja,Fifa_Nigeria
80304,Accra,Ghana_Cv_Tema


In [8]:
wikispeedia_graph = nx.DiGraph()

wikispeedia_graph.add_nodes_from(articles.article_decoded)
wikispeedia_graph.add_edges_from(hyperlinks.values)

In [9]:
# sanity check the graph

distance_matrix = paths_and_graph["shortest-path-distance-matrix"]
valid_indices = [(i, j) for i in range(len(distance_matrix)) for j in range(len(distance_matrix))]

arr = set()
for source_index, target_index in tqdm(valid_indices):
    best_distance = distance_matrix[(source_index, target_index)]
    if best_distance > 1:
        continue
    
    source = articles.article_decoded.iloc[source_index]
    target = articles.article_decoded.iloc[target_index]
    
    if (source, target) in arr:
        continue

    found_shortest_length = float("inf")
    try:
        shortest_path = nx.shortest_path(wikispeedia_graph, source=source, target=target)
        found_shortest_length = len(shortest_path) - 1
        
        if best_distance == np.nan:
            print(source, "->", target, "should not exist")
            arr.add((source, target))
            
        if found_shortest_length > best_distance:
            print(source, "->", target, "should exist")
            arr.add((source, target))
    except:
        if best_distance < found_shortest_length:
            print(source, "->", target, "should exist")
            arr.add((source, target))

100%|██████████| 21196816/21196816 [00:40<00:00, 526344.99it/s]
