# Project (P2)

## Getting familiar with the data

In [125]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
%matplotlib inline

# Root data folder and the two sub-folders the notebook reads from.
DATA_FOLDER = 'data/'
WIKISPEEDIA_PATHS = f"{DATA_FOLDER}wikispeedia_paths-and-graph/"
WIKIDATA_PATH = f"{DATA_FOLDER}wikidata/"

# Wikispeedia files (tab-separated).
ARTICLES = f"{WIKISPEEDIA_PATHS}articles.tsv"
CATEGORIES = f"{WIKISPEEDIA_PATHS}categories.tsv"
LINKS = f"{WIKISPEEDIA_PATHS}links.tsv"
PATHS_FINISHED = f"{WIKISPEEDIA_PATHS}paths_finished.tsv"
PATHS_UNFINISHED = f"{WIKISPEEDIA_PATHS}paths_unfinished.tsv"

# Wikidata exports (comma-separated) listing women by occupation.
FEMALE_ACTORS = f"{WIKIDATA_PATH}female_actors.csv"
FEMALE_CELEB = f"{WIKIDATA_PATH}female_celeb.csv"
FEMALE_MODEL = f"{WIKIDATA_PATH}female_model.csv"

In [126]:
# Load the five Wikispeedia tables. The files carry no usable header row, so
# column names are supplied here. The number of leading lines skipped differs
# per file (12, 13, 12, 15, 17) -- presumably a preamble of varying length;
# confirm against the raw files if they are ever regenerated.
articles = pd.read_csv(
    ARTICLES,
    sep='\t',
    skiprows=12,
    names=["article"],
)
categories = pd.read_csv(
    CATEGORIES,
    sep='\t',
    skiprows=13,
    names=["article", "category"],
)
links = pd.read_csv(
    LINKS,
    sep='\t',
    skiprows=12,
    names=["link_source", "link_target"],
)
paths_finished = pd.read_csv(
    PATHS_FINISHED,
    sep='\t',
    skiprows=15,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "rating"],
)
paths_unfinished = pd.read_csv(
    PATHS_UNFINISHED,
    sep='\t',
    skiprows=17,
    names=["hashedIpAddress", "timestamp", "durationInSec", "path", "target", "type"],
)

In [127]:
# Load the three Wikidata exports. The first line of each file is skipped and
# the two columns are named `women` and `women_label`.
female_actors = pd.read_csv(
    FEMALE_ACTORS, sep=',', skiprows=1, names=["women", "women_label"]
)
female_celeb = pd.read_csv(
    FEMALE_CELEB, sep=',', skiprows=1, names=["women", "women_label"]
)
female_model = pd.read_csv(
    FEMALE_MODEL, sep=',', skiprows=1, names=["women", "women_label"]
)

### **Articles**
Contains the names of the articles, one per row.

In [18]:
# Report the table size, then preview its first ten rows.
print(f"Number of articles in the dataset: {articles.shape[0]}")
articles.iloc[:10]

Number of articles in the dataset: 4604


Unnamed: 0,article
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
1,%C3%85land
2,%C3%89douard_Manet
3,%C3%89ire
4,%C3%93engus_I_of_the_Picts
5,%E2%82%AC2_commemorative_coins
6,10th_century
7,11th_century
8,12th_century
9,13th_century


### Categories

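Each row pairs an article with one of its categories. An article that belongs to several categories appears on several rows, which is why this table has more rows than there are articles.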

In [19]:
# Report the number of (article, category) rows, then preview the first five.
print(f"Number of rows in the dataset: {categories.shape[0]}")
categories.head(5)

Number of rows in the dataset: 5204


Unnamed: 0,article,category
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.History.British_History.British_Histor...
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures
2,%C3%85land,subject.Countries
3,%C3%85land,subject.Geography.European_Geography.European_...
4,%C3%89douard_Manet,subject.People.Artists


In [32]:
# Count the distinct articles filed under each category; the result is indexed
# by category, so its length is the number of categories.
n_artcat = categories.groupby("category").nunique()

print(f"The number of categories is {n_artcat.shape[0]}")
n_artcat

The number of categories is 129


Unnamed: 0_level_0,article
category,Unnamed: 1_level_1
subject.Art.Art,36
subject.Art.Artists,2
subject.Business_Studies.Business,28
subject.Business_Studies.Companies,18
subject.Business_Studies.Currency,14
...,...
subject.Science.Physics.Electricity_and_Electronics,20
subject.Science.Physics.General_Physics,55
subject.Science.Physics.Space_Astronomy,105
subject.Science.Physics.Space_transport,33


In [43]:
# Temporarily lift pandas' row/column display limits so the whole per-category
# table is printed; the options revert when the `with` block exits.
with pd.option_context(
    "display.max_rows", None,
    "display.max_columns", None,
    "display.precision", 3,
):
    print(n_artcat)

                                                    article
category                                                   
subject.Art.Art                                          36
subject.Art.Artists                                       2
subject.Business_Studies.Business                        28
subject.Business_Studies.Companies                       18
subject.Business_Studies.Currency                        14
subject.Business_Studies.Economics                       28
subject.Citizenship.Animal_and_Human_Rights              17
subject.Citizenship.Community_organisations              11
subject.Citizenship.Conflict_and_Peace                   10
subject.Citizenship.Culture_and_Diversity                16
subject.Citizenship.Education                            17
subject.Citizenship.Environment                          26
subject.Citizenship.Law                                  19
subject.Citizenship.Media                                11
subject.Citizenship.Politics_and_governm

**Actors_models_and_celebrities**

In [130]:
# Replace the spaces in each label with underscores (giving e.g.
# "Katie_Couric") so the labels can be matched against the underscore-separated
# article names of the Wikispeedia tables.
# The vectorised `.str.replace` leaves missing labels as NaN instead of raising
# AttributeError the way calling `x.replace` on a float NaN would, and
# `regex=False` makes the space a literal match.
for women_df in (female_actors, female_celeb, female_model):
    women_df["women_label"] = women_df["women_label"].str.replace(
        " ", "_", regex=False
    )

In [134]:
# Preview the first rows of the actresses table.
female_actors.head()

Unnamed: 0,women,women_label
0,http://www.wikidata.org/entity/Q230739,Katie_Couric
1,http://www.wikidata.org/entity/Q449014,Christine_Chubbuck
2,http://www.wikidata.org/entity/Q494466,Andrea_Mitchell
3,http://www.wikidata.org/entity/Q153576,Linh_Nga
4,http://www.wikidata.org/entity/Q79845,Madhu_Shalini


In [95]:
#female_actors = female_actors["women_label"]

In [114]:
# Start with a single People sub-category: keep only the rows of `categories`
# filed under subject.People.Actors_models_and_celebrities.
wikispeedia_actors = categories.loc[
    categories["category"].eq("subject.People.Actors_models_and_celebrities")
]
wikispeedia_actors.head()

Unnamed: 0,article,category
276,Andrew_Robinson,subject.People.Actors_models_and_celebrities
638,Bette_Davis,subject.People.Actors_models_and_celebrities
760,Brandon_Routh,subject.People.Actors_models_and_celebrities
787,Britney_Spears,subject.People.Actors_models_and_celebrities
793,Bruce_Lee,subject.People.Actors_models_and_celebrities


In [135]:
# Inner-join the Wikispeedia actor/model/celebrity articles with each Wikidata
# list on the article name, keeping only the articles that appear in the list.
act = pd.merge(
    wikispeedia_actors, female_actors,
    how="inner", left_on="article", right_on="women_label",
)
cele = pd.merge(
    wikispeedia_actors, female_celeb,
    how="inner", left_on="article", right_on="women_label",
)
mod = pd.merge(
    wikispeedia_actors, female_model,
    how="inner", left_on="article", right_on="women_label",
)

In [136]:
# Articles in the category that matched a label in `female_actors`.
act

Unnamed: 0,article,category,women,women_label
0,Bette_Davis,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q71206,Bette_Davis
1,Emma_Roberts,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q228598,Emma_Roberts
2,Evan_Rachel_Wood,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q229230,Evan_Rachel_Wood
3,Jane_Fonda,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q41142,Jane_Fonda
4,Keira_Knightley,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q42581,Keira_Knightley
5,Miranda_Otto,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q294975,Miranda_Otto
6,Natalie_Portman,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q37876,Natalie_Portman
7,Sharon_Tate,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q228699,Sharon_Tate
8,Vivien_Leigh,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q129429,Vivien_Leigh


In [137]:
# Articles in the category that matched a label in `female_celeb`.
cele

Unnamed: 0,article,category,women,women_label
0,Britney_Spears,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q11975,Britney_Spears


In [138]:
# Articles in the category that matched a label in `female_model`.
mod

Unnamed: 0,article,category,women,women_label
0,Britney_Spears,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q11975,Britney_Spears
1,Emma_Roberts,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q228598,Emma_Roberts
2,Evan_Rachel_Wood,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q229230,Evan_Rachel_Wood
3,Sharon_Tate,subject.People.Actors_models_and_celebrities,http://www.wikidata.org/entity/Q228699,Sharon_Tate


In [117]:
# Show every article in the Actors_models_and_celebrities category, for
# comparison with the matches found above.
wikispeedia_actors

Unnamed: 0,article,category
276,Andrew_Robinson,subject.People.Actors_models_and_celebrities
638,Bette_Davis,subject.People.Actors_models_and_celebrities
760,Brandon_Routh,subject.People.Actors_models_and_celebrities
787,Britney_Spears,subject.People.Actors_models_and_celebrities
793,Bruce_Lee,subject.People.Actors_models_and_celebrities
971,Charlie_Chaplin,subject.People.Actors_models_and_celebrities
1017,Christian_Bale,subject.People.Actors_models_and_celebrities
1278,Daniel_Day-Lewis,subject.People.Actors_models_and_celebrities
1344,Diane_Keaton,subject.People.Actors_models_and_celebrities
1547,Emma_Roberts,subject.People.Actors_models_and_celebrities


**Tackle all People categories (`subject.People.*`)**

In [62]:
# Flag every row whose category sits under "subject.People.".
# `assign` builds a new frame, so `categories` itself is left untouched. A
# plain `df = categories` is only an alias, and adding the column through it
# would also change `categories` and the output of earlier cells on re-run.
# `na=False` makes a missing category count as "not a People category".
df = categories.assign(
    categories_people=categories["category"].str.startswith(
        "subject.People.", na=False
    )
)

In [65]:
# Keep only the rows flagged as belonging to a People category; the boolean
# column is used directly as the row mask.
ca = df.loc[df["categories_people"]]
ca

Unnamed: 0,article,category,categories_people
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,subject.People.Historical_figures,True
4,%C3%89douard_Manet,subject.People.Artists,True
8,%C3%93engus_I_of_the_Picts,subject.People.Historical_figures,True
67,A._E._J._Collins,subject.People.Sports_and_games_people,True
88,Abbas_I_of_Persia,subject.People.Historical_figures,True
...,...,...,...
5188,Zhang_Qian,subject.People.Historical_figures,True
5190,Zheng_He,subject.People.Historical_figures,True
5191,Ziad_Jarrah,subject.People.Historical_figures,True
5197,Zionism,subject.People.Political_People,True


### Links

Each row is a link between two articles of the dataset: the source article and the target article it links to.

In [34]:
# Report the number of (source, target) link rows, then preview the first five.
print(f"Number of combinations of source and target articles in the dataset: {links.shape[0]}")
links.head(5)

Number of combinations of source and target articles in the dataset: 119882


Unnamed: 0,link_source,link_target
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Bede
1,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Columba
2,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,D%C3%A1l_Riata
3,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Great_Britain
4,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,Ireland


### Finished paths

In [36]:
# Report how many finished paths were loaded, then preview the first five
# (`head()` defaults to five rows).
print(f"Number of finished paths: {paths_finished.shape[0]}")
paths_finished.head()

Number of finished paths: 51318


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0


### Unfinished paths

In [38]:
# Report how many unfinished paths were loaded, then preview the first five.
# The label previously read "finished paths" although the count is taken from
# `paths_unfinished`.
print(f"Number of unfinished paths: {len(paths_unfinished)}")
paths_unfinished.head(5)

Number of finished paths: 24875


Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,target,type
0,2426091a53125110,1297054935,1804,Obi-Wan_Kenobi,Microsoft,timeout
1,26141fd878806294,1297055651,1805,Julius_Caesar,Caracas,timeout
2,2b015fb8181c48f2,1297090819,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,timeout
3,53a53bc244e08a6a,1297094761,49,Paraguay,Mount_St._Helens,restart
4,53a53bc244e08a6a,1297099105,1808,Paraguay;Bolivia,Mount_St._Helens,timeout
