In [1]:
# Use cuda backend for networkx
# Requires to run this first:
# pip install nx-cugraph-cu12 --extra-index-url https://pypi.nvidia.com
# %env NX_CUGRAPH_AUTOCONFIG=True

In [2]:
import pandas as pd
import networkx as nx
import seaborn as sns

from src.data.some_dataloader import *
from articles_clicks_links import click_count_in_paths

In [3]:
links_df = load_links_dataframe()

links_df.head(10)

Unnamed: 0,linkSource,linkTarget
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Bede
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Columba
2,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,D%C3%A1l_Riata
3,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Great_Britain
4,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Ireland
5,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Isle_of_Man
6,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Monarchy
7,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Orkney
8,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Picts
9,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Scotland


In [4]:
len(links_df)

119882

In [5]:
edges = [(row['linkSource'], row['linkTarget']) for index, row in links_df.iterrows()]
G = nx.DiGraph(edges)

In [6]:
truncated_edges = [(row['linkSource'], row['linkTarget']) for index, row in links_df.head(1000).iterrows()]
truncated_G = nx.DiGraph(truncated_edges)

In [7]:
truncated_pagerank = nx.pagerank(truncated_G)
truncated_pagerank

{'%C3%81ed%C3%A1n_mac_Gabr%C3%A1in': 0.0016279605036263532,
 'Bede': 0.0017537671871455235,
 'Columba': 0.0017537671871455235,
 'D%C3%A1l_Riata': 0.001892154539016611,
 'Great_Britain': 0.001892154539016611,
 'Ireland': 0.002222859509574436,
 'Isle_of_Man': 0.0017995018573894964,
 'Monarchy': 0.0018119473074529487,
 'Orkney': 0.0017537671871455235,
 'Picts': 0.0017537671871455235,
 'Scotland': 0.0020290711594543815,
 'Wales': 0.0017537671871455235,
 '%C3%85land': 0.0016279605036263532,
 '20th_century': 0.0018529914034421094,
 'Baltic_Sea': 0.001700795951979557,
 'Crimean_War': 0.0017108238955587302,
 'Currency': 0.001700795951979557,
 'Euro': 0.001700795951979557,
 'European_Union': 0.0017302400693989374,
 'Finland': 0.0018010175411465796,
 'League_of_Nations': 0.001700795951979557,
 'List_of_countries_by_system_of_government': 0.001700795951979557,
 'Nationality': 0.001700795951979557,
 'Parliamentary_system': 0.001700795951979557,
 'Police': 0.001700795951979557,
 'Russia': 0.0017709

In [8]:
pagerank = nx.pagerank(G)
pagerank

{'%C3%81ed%C3%A1n_mac_Gabr%C3%A1in': 3.2710390395592254e-05,
 'Bede': 0.00021938161316650256,
 'Columba': 0.00012116881897593145,
 'D%C3%A1l_Riata': 0.00010556015467803797,
 'Great_Britain': 0.0015061292462843425,
 'Ireland': 0.0018999654148516224,
 'Isle_of_Man': 0.00046032088563309313,
 'Monarchy': 0.0008133221072287704,
 'Orkney': 0.00028994588516574057,
 'Picts': 0.00019759121101826143,
 'Scotland': 0.0021425304277849183,
 'Wales': 0.0010997632074287427,
 '%C3%85land': 3.2710390395592254e-05,
 '20th_century': 0.002361539066403613,
 'Baltic_Sea': 0.0006497932466591777,
 'Crimean_War': 0.00022012926646383522,
 'Currency': 0.003237155919656553,
 'Euro': 0.001249533710187169,
 'European_Union': 0.002318582455038707,
 'Finland': 0.001009621435681574,
 'League_of_Nations': 0.0005769703027683626,
 'List_of_countries_by_system_of_government': 0.0028333266533963103,
 'Nationality': 0.00022617861790487603,
 'Parliamentary_system': 0.000721430277649281,
 'Police': 0.00025282362854375833,
 'Ru

In [9]:
pagerank_df = pd.DataFrame({
    'article_name': pagerank.keys(),
    'rank': pagerank.values() # Order will correspond to keys
})
pagerank_df.sort_values(by='rank', ascending=False, inplace=True, ignore_index=True)
pagerank_df.head(10)

Unnamed: 0,article_name,rank
0,United_States,0.009566
1,France,0.006423
2,Europe,0.00634
3,United_Kingdom,0.006235
4,English_language,0.004865
5,Germany,0.004824
6,World_War_II,0.004725
7,England,0.004474
8,Latin,0.004424
9,India,0.004036


In [10]:
articles = load_articles_dataframe()
articles.head(10)

Unnamed: 0,articles
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
1,%C3%85land
2,%C3%89douard_Manet
3,%C3%89ire
4,%C3%93engus_I_of_the_Picts
5,%E2%82%AC2_commemorative_coins
6,10th_century
7,11th_century
8,12th_century
9,13th_century


In [11]:
finished_paths = load_path_finished_dataframe()
unfinished_paths = load_path_unfinished_distance_dataframe()
paths_merged = pd.concat([finished_paths["path"], unfinished_paths["path"]])
paths_merged = paths_merged.apply(lambda row: row.split(';'))

paths_merged.head(10)

0    [14th_century, 15th_century, 16th_century, Pac...
1    [14th_century, Europe, Africa, Atlantic_slave_...
2    [14th_century, Niger, Nigeria, British_Empire,...
3    [14th_century, Renaissance, Ancient_Greece, Gr...
4    [14th_century, Italy, Roman_Catholic_Church, H...
5    [14th_century, Europe, North_America, United_S...
6               [14th_century, China, Gunpowder, Fire]
7    [14th_century, Time, Isaac_Newton, Light, Colo...
8                 [14th_century, Time, Light, Rainbow]
9    [14th_century, 15th_century, Plato, Nature, Ul...
Name: path, dtype: object

In [12]:
df_clicks = click_count_in_paths(articles, paths_merged)
df_clicks.head(10)

there are 476073 clicks in the whole whikispeedia dataset (both finished and unfinished paths)


Unnamed: 0,click_count
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,0
%C3%85land,4
%C3%89douard_Manet,7
%C3%89ire,13
%C3%93engus_I_of_the_Picts,0
%E2%82%AC2_commemorative_coins,1
10th_century,150
11th_century,141
12th_century,169
13th_century,175


In [13]:
df_player_frequencies = df_clicks.copy()
df_player_frequencies.click_count = df_player_frequencies.click_count / df_player_frequencies.click_count.sum()
df_player_frequencies.rename(columns={'click_count': 'rank'}, inplace=True)
df_player_frequencies.index.name = 'article_name'
df_player_frequencies.reset_index(inplace=True)
df_player_frequencies.sort_values(by='rank', ascending=False, inplace=True, ignore_index=True)

df_player_frequencies.head(10)

Unnamed: 0,article_name,rank
0,United_States,0.028127
1,Europe,0.012677
2,United_Kingdom,0.012038
3,England,0.010298
4,Earth,0.009525
5,Africa,0.008111
6,World_War_II,0.006622
7,North_America,0.005861
8,Animal,0.005583
9,Brain,0.005477


In [14]:
df_player_frequencies['type'] = 'Player'
pagerank_df['type'] = 'Pagerank'
rank_v_freq = pd.concat([df_player_frequencies, pagerank_df], ignore_index=True)
rank_v_freq.sort_values(by='rank', ignore_index=True, inplace=True, ascending=False)
rank_v_freq.head(10)

Unnamed: 0,article_name,rank,type
0,United_States,0.028127,Player
1,Europe,0.012677,Player
2,United_Kingdom,0.012038,Player
3,England,0.010298,Player
4,United_States,0.009566,Pagerank
5,Earth,0.009525,Player
6,Africa,0.008111,Player
7,World_War_II,0.006622,Player
8,France,0.006423,Pagerank
9,Europe,0.00634,Pagerank


In [18]:
draw_first_k = 5
first_k_article_names = pagerank_df.head(draw_first_k).article_name
truncated_r_v_f = rank_v_freq[rank_v_freq.article_name.isin(first_k_article_names.values)].reset_index(drop=True)
truncated_r_v_f

Unnamed: 0,article_name,rank,type
0,United_States,0.028127,Player
1,Europe,0.012677,Player
2,United_Kingdom,0.012038,Player
3,United_States,0.009566,Pagerank
4,France,0.006423,Pagerank
5,Europe,0.00634,Pagerank
6,United_Kingdom,0.006235,Pagerank
7,France,0.005079,Player
8,English_language,0.004865,Pagerank
9,English_language,0.004596,Player


In [None]:
sns.barplot(truncated_r_v_f, x='article_name', y='rank', hue='type')