In [2]:
import pandas as pd
import os

In [6]:
# Load the article -> continent mapping.
continents_path = os.path.join("data", "continents.csv")
continents = pd.read_csv(continents_path)

# Normalise the labels: underscores become spaces, and "Oceania" is
# renamed to "Australia".
continents["continent"] = (
    continents["continent"]
    .str.replace("_", " ")
    .replace({"Oceania": "Australia"})
)

display(continents)

Unnamed: 0,article,continent
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Europe
1,%C3%85land,Europe
2,%C3%89douard_Manet,Europe
3,%C3%89ire,Europe
4,%C3%93engus_I_of_the_Picts,Europe
...,...,...
4599,Zionism,Asia
4600,Zirconium,International
4601,Zoroaster,Asia
4602,Zuid-Gelders,Europe


In [4]:
# Load the article -> category table: a tab-separated file without a header
# row, in which '#' starts a comment.
categories_path = os.path.join("data", "wikispeedia_paths-and-graph", "categories.tsv")
categories = pd.read_csv(
    categories_path,
    sep="\t",
    header=None,
    names=["article", "category"],
    comment="#",
    skip_blank_lines=True,
    encoding="UTF-8",
)

categories.head()

Unnamed: 0,article,category
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures
2,%C3%85land,subject.Countries
3,%C3%85land,subject.Geography.European_Geography.European_...
4,%C3%89douard_Manet,subject.People.Artists


## Changing continents based on categories seen

In [67]:
def change_continent(article_name, continent):
    """Overwrite the continent of one article in the global ``continents`` frame.

    Parameters
    ----------
    article_name : str
        Value to look up in ``continents["article"]``.
    continent : str
        New value written to the ``continent`` column of the matching row.

    Raises
    ------
    ValueError
        If ``article_name`` does not match exactly one row. The message names
        the article, so a typo in a long list of overrides is easy to locate.
    """
    matches = continents.index[continents["article"] == article_name]
    if len(matches) != 1:
        raise ValueError(
            f"expected exactly one row for article {article_name!r} "
            f"in continents, found {len(matches)}"
        )
    # Mutates the shared frame in place; nothing is returned.
    continents.loc[matches[0], "continent"] = continent

In [68]:
# Space: manual continent overrides, applied in the order listed.
space_overrides = [
    ('Apollo_11', 'North America'),
    ('Apollo_8', 'North America'),
    ('Moon_landing', 'North America'),
    ('Phoenix_%28spacecraft%29', 'North America'),
    ('Sputnik_1', 'Europe'),
    ('Sputnik_2', 'Europe'),
    ('Sputnik_program', 'Europe'),
    ('Caloris_Basin', 'International'),
]
for article_name, new_continent in space_overrides:
    change_continent(article_name, new_continent)

In [113]:
# Articles relabelled as 'International', grouped by topic.
international_by_topic = {
    "Science": [
        'Atom',
    ],
    "Animals": [
        'Box_jellyfish',
        'Basking_shark',
        'Bass_%28fish%29',
        'Brown_trout',
        'Cane_Toad',
        'Goldfish',
        'Marginated_Tortoise',
        'Badger',
    ],
    "Birds": [
        'Archaeopteryx',
        'Arctic_Tern',
        'Barn_Owl',
        'Blackbird',
        'Coal_Tit',
        'Common_Cuckoo',
        'Common_Eider',
        'Common_Tern',
        'Fieldfare',
    ],
}
for topic_articles in international_by_topic.values():
    for article_name in topic_articles:
        change_continent(article_name, 'International')


In [122]:
# Religion: manual continent overrides.
religion_overrides = [
    ('Allegory_in_the_Middle_Ages', 'International'),
    ('Pentateuch', 'International'),
    ('Qur%27an', 'International'),
    ('Vinaya', 'Asia'),
]
for article_name, new_continent in religion_overrides:
    change_continent(article_name, new_continent)


In [123]:
# Medicine: every article listed here is relabelled as 'International'.
medicine_articles = ('Achilles_tendon', 'Lyme_disease', 'The_Origin_of_Species')
for article_name in medicine_articles:
    change_continent(article_name, 'International')

In [131]:
# People: manual continent overrides.
people_overrides = [
    ('David_Heymann', 'North America'),
    ('Edmund_Hillary', 'Australia'),
    ('Euclid', 'Europe'),
    ('Ferdinand_Magellan', 'Europe'),
    ('H.D.', 'North America'),
    ('Helen', 'Europe'),
    ('Rebecca_Helferich_Clarke', 'Europe'),
    ('Stephen_Trigg', 'North America'),
    ('Vasco_da_Gama', 'Europe'),
]
for article_name, new_continent in people_overrides:
    change_continent(article_name, new_continent)



In [140]:
# Music: manual continent overrides.
change_continent('AC_DC', 'Australia')
change_continent('Iron_Maiden', 'Europe')
change_continent('U2', 'Europe')
# Continent labels are stored with spaces (underscores are replaced when the
# frame is loaded), so the label must be 'South America', not 'South_America';
# the underscore form would create a separate, stray category.
# NOTE(review): Mexico is geographically in North America - confirm that
# 'South America' is the intended grouping for the Mexican national anthem.
change_continent('Himno_Nacional_Mexicano', 'South America')
change_continent('Bassoon', 'International')

In [170]:
# IT and Food: every article listed here is relabelled as 'International'.
it_articles = ['ZX_Spectrum', 'BASIC', 'Forth']
food_articles = ['Paprika']
for article_name in it_articles + food_articles:
    change_continent(article_name, 'International')

In [173]:
# Transport and Rights: both articles are relabelled as 'International'.
transport_and_rights_articles = (
    'Automatic_number_plate_recognition',  # Transport
    'Civil_rights',                        # Rights
)
for article_name in transport_and_rights_articles:
    change_continent(article_name, 'International')

In [157]:
# Plants: every article in the Plants category becomes 'International',
# except 'Wasabi', which keeps its current continent.
is_plant = categories["category"] == "subject.Science.Biology.Plants"
for article in categories.loc[is_plant, "article"]:
    if article == 'Wasabi':
        continue
    change_continent(article, 'International')



In [164]:
# Dinosaurs and Recreation: every article in either category becomes
# 'International' (Dinosaurs first, then Recreation).
international_categories = (
    "subject.Science.Biology.Dinosaurs",
    "subject.Everyday_life.Recreation",
)
for category_name in international_categories:
    in_category = categories["category"] == category_name
    for article in categories.loc[in_category, "article"]:
        change_continent(article, 'International')


In [165]:
# Attach each article's continent to its category rows. The inner join keeps
# only articles present in both tables; missing values become empty strings.
articles = categories.merge(continents, on="article", how="inner").fillna("")
articles.head()

Unnamed: 0,article,category,continent
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...,Europe
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures,Europe
2,%C3%85land,subject.Countries,Europe
3,%C3%85land,subject.Geography.European_Geography.European_...,Europe
4,%C3%89douard_Manet,subject.People.Artists,Europe


In [166]:
# Number of rows per category in `categories`, stored in a "total" column.
# NOTE: the rename relies on value_counts() naming its counts column "count"
# (the pandas >= 2.0 naming); otherwise it is a silent no-op.
category_counts = (
    categories["category"]
    .value_counts()
    .reset_index()
    .rename(columns={"count": "total"})
)

In [5]:
# Restore pandas' default value for the "display.max_rows" display option.
pd.reset_option('display.max_rows')

In [168]:
# For every continent, count its articles per category, then attach each
# category's overall row count ("total") from `category_counts`.
continents_categories = (
    articles.groupby("continent")
    .value_counts(["category"])
    .reset_index()
    .merge(category_counts, on="category", how="left")
)

# Share of each category's rows that fall in the given continent.
continents_categories["percentage"] = (
    continents_categories["count"] / continents_categories["total"]
)

# To inspect the result:
#display(continents_categories.sort_values("percentage", ascending=False))

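- The single non-International article in Astronomy is Edmond Halley (Europe).
- The single non-International article in Physics is the Schrödinger equation (Europe).
- The single non-International article in Materials science is Ivory (Africa), which makes sense.
- The single non-International article in Chemical elements is Californium (North America).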

In [169]:
# For every category, count its articles per continent and keep at most the
# first three rows of each category (value_counts sorts by frequency by
# default, so these should be the most common continents - confirm), then
# attach each category's overall row count ("total").
categories_continents = (
    articles.groupby("category")
    .value_counts(["continent"])
    .groupby(level=0)
    .head(3)
    .reset_index()
    .merge(category_counts, on="category", how="left")
)

# Share of each category's rows that fall in the given continent.
categories_continents["percentage"] = (
    categories_continents["count"] / categories_continents["total"]
)

categories_continents.sort_values(["category", "percentage"], ascending=False)

Unnamed: 0,category,continent,count,total,percentage
313,subject.Science.Physics.The_Planets,International,13,13,1.0
310,subject.Science.Physics.Space_transport,International,20,33,0.606061
311,subject.Science.Physics.Space_transport,Europe,6,33,0.181818
312,subject.Science.Physics.Space_transport,North America,5,33,0.151515
308,subject.Science.Physics.Space_Astronomy,International,104,105,0.990476
309,subject.Science.Physics.Space_Astronomy,Europe,1,105,0.009524
306,subject.Science.Physics.General_Physics,International,54,55,0.981818
307,subject.Science.Physics.General_Physics,Europe,1,55,0.018182
305,subject.Science.Physics.Electricity_and_Electr...,International,20,20,1.0
303,subject.Science.Chemistry.Materials_science,International,9,10,0.9


In [24]:
# Variant of the override helper used while fixing articles one at a time:
# it prints the updated row after each change.
# NOTE: this redefines `change_continent`, which is already defined earlier
# in the notebook; whichever definition ran last is the one in effect.
def change_continent(article_name, continent):
    """Overwrite the continent of one article in ``continents`` and print the row.

    Parameters
    ----------
    article_name : str
        Value to look up in ``continents["article"]``.
    continent : str
        New value written to the ``continent`` column of the matching row.

    Raises
    ------
    ValueError
        If ``article_name`` does not match exactly one row. The message names
        the article, so a typo in a long list of overrides is easy to locate.
    """
    matches = continents.index[continents["article"] == article_name]
    if len(matches) != 1:
        raise ValueError(
            f"expected exactly one row for article {article_name!r} "
            f"in continents, found {len(matches)}"
        )
    # Mutates the shared frame in place; nothing is returned.
    continents.loc[matches[0], "continent"] = continent
    # Show the updated row so each manual change can be checked by eye.
    print(continents[continents.article == article_name])

       article      continent
302  Apollo_11  North America
      article      continent
303  Apollo_8  North America
           article      continent
2833  Moon_landing  North America
