# Imports

In [1]:
import os
import re
import pandas as pd
from scipy.special import softmax

# Data loading

In [2]:
# Article -> category assignments. The TSV has no header row and uses '#'
# to introduce comments, so column names are supplied explicitly.
categories_path = os.path.join("Data", "wikispeedia_paths-and-graph", "categories.tsv")
df_categories = pd.read_csv(
    categories_path,
    names=["name", "category"],
    header=None,
    sep="\t",
    comment="#",
    skip_blank_lines=True,
)

df_categories.head()

Unnamed: 0,name,category
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures
2,%C3%85land,subject.Countries
3,%C3%85land,subject.Geography.European_Geography.European_...
4,%C3%89douard_Manet,subject.People.Artists


In [3]:
# Finished game paths. Each `path` is a ';'-separated list of visited articles.
paths_finished_path = os.path.join("Data", "wikispeedia_paths-and-graph", "paths_finished.tsv")
df_paths_finished = pd.read_csv(
    paths_finished_path,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
    header=None,
    index_col=False,
    sep="\t",
    comment="#",
    skip_blank_lines=True,
)

# First and last entries of the path are the start and target articles.
df_paths_finished["start"] = df_paths_finished["path"].map(lambda p: p.split(";", 1)[0])
df_paths_finished["target"] = df_paths_finished["path"].map(lambda p: p.rsplit(";", 1)[-1])

df_paths_finished.head()

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,start,target
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,,14th_century,African_slave_trade
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,14th_century,African_slave_trade
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,14th_century,African_slave_trade
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,,14th_century,Greece
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,14th_century,John_F._Kennedy


In [4]:
# Unfinished game paths. The target is stored in its own column, so only the
# start article has to be derived from the ';'-separated `path`.
paths_unfinished_path = os.path.join("Data", "wikispeedia_paths-and-graph", "paths_unfinished.tsv")
df_paths_unfinished = pd.read_csv(
    paths_unfinished_path,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"],
    header=None,
    index_col=False,
    sep="\t",
    comment="#",
    skip_blank_lines=True,
)

df_paths_unfinished["start"] = df_paths_unfinished["path"].map(lambda p: p.split(";", 1)[0])

df_paths_unfinished.head()

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,target,type,start
0,2426091a53125110,1297054935,1804,Obi-Wan_Kenobi,Microsoft,timeout,Obi-Wan_Kenobi
1,26141fd878806294,1297055651,1805,Julius_Caesar,Caracas,timeout,Julius_Caesar
2,2b015fb8181c48f2,1297090819,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,timeout,Malawi
3,53a53bc244e08a6a,1297094761,49,Paraguay,Mount_St._Helens,restart,Paraguay
4,53a53bc244e08a6a,1297099105,1808,Paraguay;Bolivia,Mount_St._Helens,timeout,Paraguay


In [5]:
# Link table (linkSource, linkTarget, order), kept sorted by the `order` column.
links_path = os.path.join("Data", "links_with_order.tsv")
df_links = pd.read_csv(links_path, sep="\t")
df_links = df_links.sort_values(by="order")

df_links.head()

Unnamed: 0,linkSource,linkTarget,order
60904,Khazars,History_of_Russia,1.0
70072,Marseille,List_of_countries,1.0
70094,Marshall_Islands,English_language,1.0
31458,Domestic_AC_power_plugs_and_sockets,Alternating_current,1.0
4285,Akkadian_Empire,Mesopotamia,1.0


In [6]:
# Page rank per article, highest rank first.
page_rank_path = os.path.join("Data", "page_rank.csv")
df_page_rank = pd.read_csv(page_rank_path)
df_page_rank = df_page_rank.sort_values(by="PageRank", ascending=False)

df_page_rank.head()

Unnamed: 0,Article,PageRank
0,United_States,0.014263
1,United_Kingdom,0.007679
2,Scientific_classification,0.007209
3,Europe,0.007043
4,England,0.006815


# Continent categories

In [7]:
categories = df_categories.category.unique()

# Continent -> set of dataset categories whose name mentions that continent.
subcategories = {}

# Seed keywords per continent group. A keyword counts as a hit only when it
# directly follows a '.' or '_' in the lower-cased category string.
init_key_words = {
    "Europe": [
        "europe",
        "britain",
        "british"
    ],
    "North America": [
        "north_americ",
        "usa",
        "canadian",
    ],
    "Asia": [
        "asia",
    ],
    "Africa": [
        "africa"
    ],
    "South America": [
        "south_americ"
    ],
    "Australia": [
        "australasia"
    ],
    "Antarctica": [
        "antarctica"
    ],
    "Middle East": [
        "middle_east"
    ],
}

for category in categories:
    category_lower = category.lower()

    # Each category is assigned to the first continent (in dict order) that
    # has any matching keyword.
    for key, values in init_key_words.items():
        # Raw f-string: a plain "\." is an invalid string escape and triggers
        # a warning on recent Python versions. Inside a character class the
        # dot is literal, so "[._]" matches the same text as "[\._]".
        if any(re.search(rf"[._]{value}", category_lower) for value in values):
            subcategories.setdefault(key, set()).add(category)
            break


In [8]:
# Continent -> lower-cased names of articles that carry at least one
# continent-related category. These names are reused later as keywords.
key_words = {}

# One pass over the frame: group the categories by article instead of
# re-filtering the whole frame for every article name. `sort=False` keeps the
# articles in order of first appearance, like `unique()` did.
for name, name_category in df_categories.groupby("name", sort=False).category:
    lowered = [(category, category.lower()) for category in name_category]

    # First hit wins, scanning continents, then their keywords, then the
    # article's categories -- the same priority as nested loops with breaks.
    # Raw f-string avoids the invalid "\." string escape.
    first_hit = next(
        (
            (key, category)
            for key, values in init_key_words.items()
            for value in values
            for category, category_lower in lowered
            if re.search(rf"[._]{value}", category_lower)
        ),
        None,
    )
    if first_hit is None:
        continue

    key, category = first_hit
    key_words.setdefault(key, []).append(name.lower())
    # setdefault guards against a continent that has no entry yet.
    subcategories.setdefault(key, set()).add(category)

In [9]:
# List every continent group with its matched categories, largest group first.
print("Categories and their Subcategories:")
keys_by_size = sorted(subcategories, key=lambda x: len(subcategories[x]), reverse=True)
for key in keys_by_size:
    members = subcategories[key]
    print(f"Category: {key}, Subcategories count: {len(members)}")
    for value in members:
        print("--->", value)
    print()

Categories and their Subcategories:
Category: Europe, Subcategories count: 9
---> subject.People.Monarchs_of_Great_Britain
---> subject.History.British_History
---> subject.Geography.Geography_of_Great_Britain
---> subject.Geography.European_Geography.European_Countries
---> subject.Geography.European_Geography
---> subject.History.British_History.British_History_Post_1900
---> subject.History.British_History.British_History_1500_and_before_including_Roman_Britain
---> subject.History.British_History.British_History_17501900
---> subject.History.British_History.British_History_15001750

Category: North America, Subcategories count: 3
---> subject.People.USA_Presidents
---> subject.History.North_American_History
---> subject.Geography.North_American_Geography

Category: Middle East, Subcategories count: 2
---> subject.Geography.Geography_of_the_Middle_East
---> subject.Geography.Geography_of_the_Middle_East.Middle_Eastern_Countries

Category: Africa, Subcategories count: 2
---> subject.

In [10]:
# Show the article-name keywords collected for each continent group.
print("Categories Key Words:")
for key in key_words:
    joined = ", ".join(key_words[key])
    print(f"Category: {key}, Key words: {joined}")

Categories Key Words:
Category: Europe, Key words: %c3%81ed%c3%a1n_mac_gabr%c3%a1in, %c3%85land, %c3%89ire, %c3%93engus_i_of_the_picts, aachen, aarhus, abbey, abbot, aberdeen, aberystwyth, acts_of_union_1707, akrotiri_and_dhekelia, albania, alfred_the_great, amsterdam, andorra, anglo-saxon_literature, anne_of_great_britain, antwerp, armenia, arthur_sullivan, arthur_wellesley%2c_1st_duke_of_wellington, athens, atlantic_slave_trade, austria, avon_gorge, azerbaijan, barcelona, basel, bath%2c_somerset, battle_of_britain, battle_of_hastings, bede, belarus, belfast, belgium, ben_jonson, ben_nevis, benjamin_disraeli%2c_1st_earl_of_beaconsfield, berlin, birmingham, bj%c3%b8rn%c3%b8ya, blackpool, bodyline, bologna, boltysh_crater, bosnia_and_herzegovina, boudica, boyle_roche, bratislava, brighton, bristol, british_east_india_company, british_empire, british_isles, british_isles_%28terminology%29, brussels, bucharest, budapest, bulgaria, cambridge, canterbury, canute_the_great, caratacus, cardif

In [11]:
# Flat list of every category already attributed to a continent group.
all_categories = []
for values in subcategories.values():
    all_categories.extend(values)

# Rows whose category is not continent-related; candidates for name matching.
df_rest = df_categories[~df_categories.category.isin(all_categories)]

# Article name -> continent group. The first assignment wins throughout.
article_labeled = {}

# 1) Articles that directly carry a continent-related category.
for key, values in subcategories.items():
    for value in values:
        for name in df_categories[df_categories.category == value].name.unique():
            article_labeled.setdefault(name, key)

# The lower-cased names do not depend on the keyword, so compute them once
# instead of once per keyword.
rest_names = df_rest.name.drop_duplicates()
rest_names_lower = rest_names.str.lower()

# 2) Remaining articles whose name contains a seed keyword, then
# 3) remaining articles whose name contains an already-labelled article name.
for keyword_groups in (init_key_words, key_words):
    for key, values in keyword_groups.items():
        for value in values:
            # Keywords are literal substrings, not patterns: regex=False stops
            # characters such as '.' in article names acting as wildcards.
            hits = rest_names[rest_names_lower.str.contains(value, regex=False)]
            for name in hits:
                article_labeled.setdefault(name, key)

print(f"Number of articles with assigned continent category: {len(article_labeled)}")

Number of articles with assigned continent category: 1318


## Number of articles per continent group

In [12]:
# Text table: number of labelled articles per continent group.
col1_size = 13
count_header = "Articles count"
header_row = f"| {'Continent' : <{col1_size}} | {count_header} |"
# Derive the rule width from the header so the two cannot drift apart.
line_size = len(header_row)

df_article_labeled = pd.DataFrame(list(article_labeled.items()), columns=["name", "category"])

print("-" * line_size)
print(header_row)
print("-" * line_size)
for name, count in df_article_labeled.category.value_counts().items():
    print(f"| {name : <{col1_size}} | {count : <{len(count_header)}} |")
print("-" * line_size)



---------------------------------
| Continent     | Aricles count |
---------------------------------
| Europe        | 568           |
| North America | 211           |
| Africa        | 167           |
| Asia          | 147           |
| South America | 93            |
| Australia     | 68            |
| Middle East   | 56            |
| Antarctica    | 8             |
---------------------------------


In [13]:
# Text table: number of labelled articles per coarse group
# (Europe / North America / Other).
col1_size = 13
count_header = "Articles count"
header_row = f"| {'Continent' : <{col1_size}} | {count_header} |"
# Derive the rule width from the header so the two cannot drift apart.
line_size = len(header_row)

# Collapse every continent except Europe and North America into "Other".
map_ = {
    "Europe": "Europe",
    "North America": "North America",
    "Asia": "Other",
    "Africa": "Other",
    "South America": "Other",
    "Australia": "Other",
    "Antarctica": "Other",
    "Middle East": "Other",
}

df_article_labeled = pd.DataFrame(list(article_labeled.items()), columns=["name", "category"])
df_article_labeled["categories_grouped"] = df_article_labeled.category.map(map_)

print("-" * line_size)
print(header_row)
print("-" * line_size)
for name, count in df_article_labeled.categories_grouped.value_counts().items():
    print(f"| {name : <{col1_size}} | {count : <{len(count_header)}} |")
print("-" * line_size)

---------------------------------
| Continent     | Aricles count |
---------------------------------
| Europe        | 568           |
| Other         | 539           |
| North America | 211           |
---------------------------------


## Number of paths with start/target article labeled with continent category

In [14]:
# Text table: for each continent group, how many finished / unfinished paths
# start at (first section) or aim for (second section) one of its articles.
line_size = 41
col1_size = 13
col2_size = 8
col3_size = 10

for section_title, endpoint in (("Start", "start"), ("Target", "target")):
    print("-" * line_size)
    print(f"| {section_title : <{col1_size}} | {'Finished' : <{col2_size}} | {'Unfinished' : <{col3_size}} |")
    print("-" * line_size)
    for key in subcategories.keys():
        names = df_article_labeled[df_article_labeled.category == key].name.values
        finished_count = int(df_paths_finished[endpoint].isin(names).sum())
        unfinished_count = int(df_paths_unfinished[endpoint].isin(names).sum())
        print(f"| {key : <{col1_size}} | {finished_count : <{col2_size}} | {unfinished_count : <{col3_size}} |")
print("-" * line_size)

-----------------------------------------
| Start         | Finished | Unfinished |
-----------------------------------------
| Europe        | 5120     | 2325       |
| Middle East   | 584      | 240        |
| Africa        | 1562     | 611        |
| Asia          | 1464     | 658        |
| North America | 1920     | 973        |
| Australia     | 567      | 266        |
| South America | 958      | 460        |
| Antarctica    | 73       | 24         |
-----------------------------------------
| Target        | Finished | Unfinished |
-----------------------------------------
| Europe        | 6512     | 2398       |
| Middle East   | 872      | 153        |
| Africa        | 1731     | 398        |
| Asia          | 1859     | 580        |
| North America | 2693     | 799        |
| Australia     | 490      | 211        |
| South America | 1165     | 320        |
| Antarctica    | 69       | 30         |
-----------------------------------------


In [15]:
# Same start/target table as for the individual continents, but using the
# coarse groups stored in `categories_grouped`.
line_size = 41
col1_size = 13
col2_size = 8
col3_size = 10

for section_title, endpoint in (("Start", "start"), ("Target", "target")):
    print("-" * line_size)
    print(f"| {section_title : <{col1_size}} | {'Finished' : <{col2_size}} | {'Unfinished' : <{col3_size}} |")
    print("-" * line_size)
    for key in df_article_labeled.categories_grouped.unique():
        names = df_article_labeled[df_article_labeled.categories_grouped == key].name.values
        finished_count = int(df_paths_finished[endpoint].isin(names).sum())
        unfinished_count = int(df_paths_unfinished[endpoint].isin(names).sum())
        print(f"| {key : <{col1_size}} | {finished_count : <{col2_size}} | {unfinished_count : <{col3_size}} |")
print("-" * line_size)

-----------------------------------------
| Start         | Finished | Unfinished |
-----------------------------------------
| Europe        | 5120     | 2325       |
| Other         | 5208     | 2259       |
| North America | 1920     | 973        |
-----------------------------------------
| Target        | Finished | Unfinished |
-----------------------------------------
| Europe        | 6512     | 2398       |
| Other         | 6186     | 1692       |
| North America | 2693     | 799        |
-----------------------------------------


# Continent category assignment based on a weighted combination of outgoing links

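Each outgoing link is weighted by ```1/i```, where ```i``` is the position of the link on the article page (1 for the first link). The weights are summed per continent, and the continent with the largest total is assigned to the article.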

In [16]:
def get_category_by_links(article_name, df_links, article_labeled):
    """Guess an article's continent from the continents of its outgoing links.

    The i-th outgoing link (1-based) contributes a weight of ``1 / i`` to the
    continent of the article it points to; links to unlabelled articles
    contribute nothing but still occupy a position. The continent with the
    largest total weight is returned.

    Parameters
    ----------
    article_name : str
        Value to look up in ``df_links.linkSource``.
    df_links : pandas.DataFrame
        Must have ``linkSource`` and ``linkTarget`` columns. If it also has an
        ``order`` column, the article's links are ranked by it; otherwise the
        existing row order is used.
    article_labeled : dict
        Maps article name -> continent label.

    Returns
    -------
    The winning continent label, or ``"Unknown"`` when none of the outgoing
    links points to a labelled article (including when there are no links).
    On an exact tie, the continent encountered first wins.
    """
    outgoing = df_links[df_links.linkSource == article_name]

    # Do not rely on the caller having pre-sorted the whole frame: rank this
    # article's links by their recorded order. A stable sort leaves an
    # already-sorted frame (and rows with equal order) exactly as they were.
    if "order" in outgoing.columns:
        outgoing = outgoing.sort_values(by="order", kind="mergesort")

    weights = {}
    for position, link in enumerate(outgoing.linkTarget.values, 1):
        if link in article_labeled:
            continent = article_labeled[link]
            weights[continent] = weights.get(continent, 0.0) + 1.0 / position

    if not weights:
        return "Unknown"

    # Softmax is strictly increasing, so the arg-max of the raw weights is the
    # same as the arg-max of their softmax; no normalisation is needed.
    return max(weights, key=weights.get)

In [17]:

# Apply the link-based assignment to every article in a "subject.People" category.
print("Articles with continent category assigned based on outgoing links:")
is_person = df_categories.category.str.startswith("subject.People")
for article in df_categories[is_person].name.unique():
    assigned = get_category_by_links(article, df_links, article_labeled)
    print(f"{article :.<50}{assigned}")

Articles with continent category assigned based on outgoing links:
%C3%81ed%C3%A1n_mac_Gabr%C3%A1in..................Europe
%C3%89douard_Manet................................Europe
%C3%93engus_I_of_the_Picts........................Europe
A._E._J._Collins..................................Europe
Abbas_I_of_Persia.................................Europe
Abel_Tasman.......................................Australia
Abraham_Goldfaden.................................Europe
Abraham_Lincoln...................................North America
Adam_Smith........................................Europe
Adi_Shankara......................................Asia
Adolf_Hitler......................................Europe
Adriaen_van_der_Donck.............................North America
Agamemnon.........................................Europe
Akbar.............................................Asia
Akhenaten.........................................Africa
Akira_Kurosawa....................................Asia
Alan_Turin

Albert_Einstein...................................Europe
Alberto_Santos-Dumont.............................South America
Albrecht_D%C3%BCrer...............................Europe
Albrecht_Rodenbach................................Europe
Alcibiades........................................Europe
Aleksandr_Pushkin.................................Europe
Aleksandr_Vasilevsky..............................Europe
Aleksey_Arakcheyev................................Europe
Alexander_Graham_Bell.............................Europe
Alexander_Hamilton................................North America
Alexander_the_Great...............................Europe
Alfred_Hitchcock..................................Europe
Alfred_Nobel......................................Europe
Alfred_the_Great..................................Europe
Amda_Seyon_I......................................Africa
Andr%C3%A9s_Nocioni...............................South America
Andrew_Carnegie...................................Europe
Andrew_Dic

# Article/Page rank

We have computed the PageRank of each article. It can give interesting insight into the articles, or be used as a feature for matching in an observational study.

In [18]:
# Print the first ten rows of the page-rank table (already sorted, highest first).
print(f"{'Article' : <37} Page Rank")
for index in range(10):
    row = df_page_rank.iloc[index]
    print(f"{row.Article :.<40}{row.PageRank :.5f}")

Article                               Page Rank
United_States...........................0.01426
United_Kingdom..........................0.00768
Scientific_classification...............0.00721
Europe..................................0.00704
England.................................0.00682
France..................................0.00670
Animal..................................0.00659
World_War_II............................0.00554
English_language........................0.00496
London..................................0.00496
