In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [16]:
import html
import urllib

Reading data
------------

In [34]:
graphs_path = './data/wikispeedia_paths-and-graph/'

In [35]:
!ls data/wikispeedia_paths-and-graph/

articles.tsv	links.tsv	    paths_unfinished.tsv
categories.tsv	paths_finished.tsv  shortest-path-distance-matrix.txt


In [25]:
!head './data/wikispeedia_paths-and-graph/categories.tsv'

# Hierarchical categories of all articles.
# Many articles have more than one category. Some articles have no category.
# Article names are URL-encoded; e.g., in Java they can be decoded using java.net.URLDecoder.decode(articleName, "UTF-8").
# FORMAT:   article   category
#
# When publishing on this data set, please cite:
# (1) Robert West and Jure Leskovec:
#     Human Wayfinding in Information Networks.
#     21st International World Wide Web Conference (WWW), 2012.
# (2) Robert West, Joelle Pineau, and Doina Precup:


In [26]:
def read_tsv(file_name, names=None):
    
    file_path = graphs_path + f'{file_name}.tsv'
    
    if not names:
        with open(file_path, 'r') as file:
            for line in file:
                if 'FORMAT' in line:
                    format_line = line
                    break

        names = format_line.split()[2:]
    
    return pd.read_csv(file_path,
                           delimiter='\t', comment='#', header=0, names=names)

In [27]:
link_network = read_tsv('links', names=['source', 'target'])
link_network

# link_network = link_network.applymap(urllib.parse.unquote_plus).copy()

Unnamed: 0,source,target
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Columba
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,D%C3%A1l_Riata
2,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Great_Britain
3,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Ireland
4,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Isle_of_Man
...,...,...
119876,Zulu,South_Africa
119877,Zulu,Swaziland
119878,Zulu,United_Kingdom
119879,Zulu,Zambia


In [28]:
categories = read_tsv('categories')
categories

Unnamed: 0,article,category
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures
1,%C3%85land,subject.Countries
2,%C3%85land,subject.Geography.European_Geography.European_...
3,%C3%89douard_Manet,subject.People.Artists
4,%C3%89ire,subject.Countries
...,...,...
5198,Zirconium,subject.Science.Chemistry.Chemical_elements
5199,Zoroaster,subject.People.Religious_figures_and_leaders
5200,Zuid-Gelders,subject.Geography.European_Geography
5201,Zuid-Gelders,subject.Language_and_literature.Languages


In [32]:
finished_paths = read_tsv('paths_finished')
finished_paths

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
1,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
2,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
3,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0
4,5295bca242be81fe,1372890414,110,14th_century;Europe;North_America;United_State...,
...,...,...,...,...,...
51312,15a13a1d66ef5456,1349231015,66,Yagan;Ancient_Egypt;Civilization,
51313,2ef7ac844cefda58,1300254138,165,Yagan;Folklore;Brothers_Grimm;<;19th_century;C...,3.0
51314,12863abb7887f890,1385095372,228,Yagan;Australia;England;France;United_States;T...,
51315,19f8284371753362,1298792567,56,Yarralumla%2C_Australian_Capital_Territory;Aus...,1.0


In [36]:
unfinished_paths = read_tsv('paths_unfinished')
unfinished_paths

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,target,type
0,26141fd878806294,1297055651,1805,Julius_Caesar,Caracas,timeout
1,2b015fb8181c48f2,1297090819,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,timeout
2,53a53bc244e08a6a,1297094761,49,Paraguay,Mount_St._Helens,restart
3,53a53bc244e08a6a,1297099105,1808,Paraguay;Bolivia,Mount_St._Helens,timeout
4,131600803df4895e,1297100557,2009,Agriculture;History_of_the_world;China;Yangtze...,Grand_Canal_of_China,timeout
...,...,...,...,...,...,...
24869,109ed71f571d86e9,1389787605,180,Franz_Kafka;Tuberculosis;World_Health_Organiza...,Cholera,restart
24870,232f992e57d43e8d,1389787697,6,Modern_history,Hollandic,restart
24871,2e09a7224600a7cd,1389798400,1900,Computer_programming;Linguistics;Culture;Popul...,The_Beatles,timeout
24872,60af9e2138051b96,1389799481,1903,Jamaica;United_Kingdom;World_War_II;Battle_of_...,Alan_Turing,timeout


In [44]:
articles = read_tsv('articles')
articles

Unnamed: 0,article
0,%C3%85land
1,%C3%89douard_Manet
2,%C3%89ire
3,%C3%93engus_I_of_the_Picts
4,%E2%82%AC2_commemorative_coins
...,...
4598,Zionism
4599,Zirconium
4600,Zoroaster
4601,Zuid-Gelders


Pre-processing
--------------

In [49]:
# count number of incoming and outgoing links

source_counts = link_network['source'].value_counts().rename_axis('article').reset_index(name='source_count')
target_counts = link_network['target'].value_counts().rename_axis('article').reset_index(name='target_count')

link_counts = pd.merge(source_counts, target_counts, how='outer')
link_counts.fillna(0, inplace=True)

In [64]:
link_counts['target_source_ratio'] = link_counts['target_count'] / link_counts['source_count']
link_counts['target_source_ratio'].replace([np.inf], np.nan, inplace=True)

link_counts

Unnamed: 0,article,source_count,target_count,target_source_ratio
0,United_States,294.0,1551.0,5.275510
1,Driving_on_the_left_or_right,255.0,0.0,0.000000
2,List_of_countries,244.0,63.0,0.258197
3,List_of_circulating_currencies,236.0,8.0,0.033898
4,List_of_sovereign_states,216.0,43.0,0.199074
...,...,...,...,...
4587,Osteomalacia,0.0,3.0,
4588,Directdebit,0.0,2.0,
4589,Duchenne_muscular_dystrophy,0.0,1.0,
4590,Klinefelter%27s_syndrome,0.0,1.0,


In [71]:
import uuid
import os
import urllib.parse
from bs4 import BeautifulSoup, Comment

def gen_uniq_str(str_):
    return uuid.uuid4().hex[:len(str_)+1]

def find_html_position(source, targets):
    article_quote = source
    with open('data/wpcd/wp/{}/{}.htm'.format(article_quote[0].lower(), article_quote)) as f:
        try:
            art_html = f.read()
        except:
            print(source)
            return -1

    soup = BeautifulSoup(art_html, features="html.parser")
    for script in soup(["script", "style", "head"]):
        script.extract()    # rip it out

    comments = soup.findAll(text=lambda text:isinstance(text, Comment))
    for comment in comments:
        comment.extract()
        
    locators = []
    
    for tgt in targets:
        try:
            tgt = urllib.parse.unquote_plus(tgt)
            locator = gen_uniq_str(tgt)
            locators.append(locator)
            soup.find('a', attrs={'title': tgt.replace("_", " ")}).replace_with(locator)
        except:
            pass
    
    text = " ".join(soup.text.split())
    
    pos = {}
    for iloc, loc in enumerate(locators):
        pos[targets[iloc]] = text.find(loc)/len(text)
        
    return pos


In [None]:
# Add the 'html_position' column to the DataFrame
link_network['html_position'] = link_network.apply(
    lambda row: find_html_position(row['source'], row['target']), axis=1
)


  comments = soup.findAll(text=lambda text:isinstance(text, Comment))
