# Entities Analysis

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Raw redirect dump; read below as space-separated triples (subject, predicate, object, ".").
REDIRECTS_FILEPATH = "data/raw_data/redirects_lang=en.ttl"
# NOTE(review): not referenced anywhere in the visible part of this notebook — confirm it is still needed.
REDIRECTS_TRANSITIVE_FILEPATH = "data/raw_data/redirects_lang=en_transitive.ttl"
# CSV inputs; the "exported_data" folder name suggests they come from earlier processing steps — TODO confirm.
REMAINING_TRIPLES_FILEPATH = "data/exported_data/remaining_triples.csv"
ENTITIES_SAMPLE_FILEPATH = "data/exported_data/entities_sample.csv"
# Mutual-wikilink property triples, split into "both sides" (BS) and "one side" (OS) files.
MW_PROPS_BS_FILEPATH = "data/exported_data/mutual_wikilinks_properties_both_sides.csv"
MW_PROPS_OS_FILEPATH = "data/exported_data/mutual_wikilinks_properties_one_side.csv"

In [3]:
# Redirect triples: one triple per line, terms separated by single spaces and the
# line closed by a lone "." which lands in a throw-away fourth column.
redirects = (
    pd.read_csv(
        REDIRECTS_FILEPATH,
        sep=" ",
        header=None,
        names=["subject", "predicate", "object", "."],
    )
    .drop(columns=".")
    # every term is wrapped in "<" and ">"; cut off the first and last character
    .apply(lambda term_column: term_column.str[1:-1])
)

entities_sample = pd.read_csv(ENTITIES_SAMPLE_FILEPATH)

# Mutual-wikilink property triples are stored in two files; stack them into one frame.
mw_props = pd.concat([pd.read_csv(path) for path in (MW_PROPS_BS_FILEPATH, MW_PROPS_OS_FILEPATH)])

remaining_triples = pd.read_csv(REMAINING_TRIPLES_FILEPATH)

In [4]:
# Preview the first five redirect triples to check that the angle brackets were stripped.
redirects.head(n=5)

Unnamed: 0,subject,predicate,object
0,http://dbpedia.org/resource/!!!!!!!,http://dbpedia.org/ontology/wikiPageRedirects,http://dbpedia.org/resource/When_We_All_Fall_A...
1,http://dbpedia.org/resource/!!!Fuck_You!!!,http://dbpedia.org/ontology/wikiPageRedirects,http://dbpedia.org/resource/Fuck_You_(EP)
2,http://dbpedia.org/resource/!!!Fuck_You!!!_And...,http://dbpedia.org/ontology/wikiPageRedirects,http://dbpedia.org/resource/Fuck_You_and_Then_...
3,http://dbpedia.org/resource/!!!Fuck_You!!!_and...,http://dbpedia.org/ontology/wikiPageRedirects,http://dbpedia.org/resource/Fuck_You_and_Then_...
4,http://dbpedia.org/resource/!!!_(!!!_album),http://dbpedia.org/ontology/wikiPageRedirects,http://dbpedia.org/resource/!!!_(album)


In [5]:
# Preview the first five sampled entities and their degree statistics.
entities_sample.head(n=5)

Unnamed: 0,subject,in_degree_wikilinks,out_degree_wikilinks,in_degree_all_except_wikilinks,out_degree_all_except_wikilinks,n_types,n_mutual_wikilinks
0,http://dbpedia.org/resource/Tamarixia_actis,1.0,0.0,0.0,0.0,0,0.0
1,http://dbpedia.org/resource/Scott_Rothkopf,6.0,21.0,0.0,0.0,0,1.0
2,http://dbpedia.org/resource/Sanvidhan_Divas,0.0,1.0,0.0,0.0,0,0.0
3,http://dbpedia.org/resource/History_of_La_Harp...,0.0,1.0,0.0,0.0,0,0.0
4,http://dbpedia.org/resource/H17_(disambiguation),0.0,1.0,0.0,0.0,0,0.0


In [6]:
# Preview the first five remaining triples.
remaining_triples.head(n=5)

Unnamed: 0,subject,predicate,object
0,http://dbpedia.org/resource/1910–11_Hibernian_...,http://dbpedia.org/ontology/team,http://dbpedia.org/resource/Hibernian_F.C.
1,http://dbpedia.org/resource/1910–11_Hibernian_...,http://dbpedia.org/ontology/position,http://dbpedia.org/resource/1910–11_Scottish_F...
2,http://dbpedia.org/resource/1910–11_Hibernian_...,http://dbpedia.org/ontology/league,http://dbpedia.org/resource/Scottish_First_Div...
3,http://dbpedia.org/resource/1910–11_Hibernian_...,http://dbpedia.org/ontology/manager,http://dbpedia.org/resource/Dan_McMichael
4,http://dbpedia.org/resource/10th_Minnesota_Inf...,http://dbpedia.org/ontology/country,http://dbpedia.org/resource/Union_(American_Ci...


In [7]:
# Preview the first five mutual-wikilink property triples.
mw_props.head(n=5)

Unnamed: 0,object,predicate,subject
0,http://dbpedia.org/resource/1911–12_Hibernian_...,http://dbpedia.org/ontology/predecessor,http://dbpedia.org/resource/1910–11_Hibernian_...
1,http://dbpedia.org/resource/1909–10_Hibernian_...,http://dbpedia.org/ontology/successor,http://dbpedia.org/resource/1910–11_Hibernian_...
2,http://dbpedia.org/resource/Natural_Disaster_(...,http://dbpedia.org/ontology/subsequentWork,"http://dbpedia.org/resource/1,_2,_3,_4_(Plain_..."
3,http://dbpedia.org/resource/Rhythm_of_Love_(Pl...,http://dbpedia.org/ontology/previousWork,"http://dbpedia.org/resource/1,_2,_3,_4_(Plain_..."
4,http://dbpedia.org/resource/1908_FA_Cup_Final,http://dbpedia.org/ontology/followingEvent,http://dbpedia.org/resource/1909_FA_Cup_Final


There are more entities in the graph than there are pages in the English Wikipedia. Reasons for that are:

- File and Category Entities
- Entities outside of DBpedia (and thus outside of Wikipedia) like linked homepages
- Redirected Wikipedia pages
- Placeholder Wikipedia pages (pages that are linked, but don't contain anything yet)
- Career Stations
- Entities similar to career stations

## Examples of File and Category Entities

File and Category Entities can be recognized by their URI which starts with `http://dbpedia.org/resource/File:` or `http://dbpedia.org/resource/Category:`.

In [8]:
# file entities: flagged by URI prefix, then ten of them are printed (fixed seed)
entities_sample["file"] = entities_sample["subject"].str.startswith(
    "http://dbpedia.org/resource/File:", na=False
)
file_uris = entities_sample.loc[entities_sample["file"], "subject"]
for uri in file_uris.sample(10, random_state=42):
    print(f"- {uri}")

- http://dbpedia.org/resource/File:STAS_Spencer_Smythe.jpg
- http://dbpedia.org/resource/File:Twikker_1995.jpg
- http://dbpedia.org/resource/File:Nagaoka_Athletic_Stadium.jpg
- http://dbpedia.org/resource/File:Freydal_02.jpg
- http://dbpedia.org/resource/File:Matilde_Pretel2.png
- http://dbpedia.org/resource/File:Hinged_Door.png
- http://dbpedia.org/resource/File:(Mrs._Josefa_R._(del_Valle)_Forster,_Ignacio_del_Valle,_Jr.,_Mrs._Ysabel_(de_Valle)_Cram,_Reginaldo_F._del_Valle,_and_Ulpiano_F._del_Valle)_(12911554543).jpg
- http://dbpedia.org/resource/File:Southern_Methodist_vs._North_Texas_football_2018_03_(Green_Brigade_Marching_Band).jpg
- http://dbpedia.org/resource/File:Диплом_о_среднем_профобразовании._Россия._2021_год.jpg
- http://dbpedia.org/resource/File:102007-USC-ND-shtd.jpg


In [9]:
# category entities: flagged by URI prefix, then ten of them are printed (fixed seed)
entities_sample["category"] = entities_sample["subject"].str.startswith(
    "http://dbpedia.org/resource/Category:", na=False
)
category_uris = entities_sample.loc[entities_sample["category"], "subject"]
for uri in category_uris.sample(10, random_state=42):
    print(f"- {uri}")

- http://dbpedia.org/resource/Category:Buildings_and_structures_in_Washington_County,_New_York
- http://dbpedia.org/resource/Category:Ubon_United_F.C._players
- http://dbpedia.org/resource/Category:Members_of_the_Supreme_Soviet_of_the_Russian_Soviet_Federative_Socialist_Republic,_1947–1951
- http://dbpedia.org/resource/Category:1604_in_the_British_Empire
- http://dbpedia.org/resource/Category:Trotskyist_organisations_in_Italy
- http://dbpedia.org/resource/Category:Research_at_the_University_of_Montana
- http://dbpedia.org/resource/Category:Breton_bishops
- http://dbpedia.org/resource/Category:Iranian_television_shows
- http://dbpedia.org/resource/Category:2021–22_Basketball_Champions_League
- http://dbpedia.org/resource/Category:Macedonian_expatriates_in_the_United_States


## Examples of Entities Outside of DBpedia

Entities outside of DBpedia can also be recognized by their URI which does not start with `http://dbpedia.org/resource/`.

In [10]:
# entities outside of DBpedia: everything whose URI lacks the DBpedia resource prefix
# (na=False followed by the negation keeps missing URIs flagged as outside)
entities_sample["outside"] = ~entities_sample["subject"].str.startswith(
    "http://dbpedia.org/resource/", na=False
)
outside_uris = entities_sample.loc[entities_sample["outside"], "subject"]
for uri in outside_uris.sample(10, random_state=42):
    print(f"- {uri}")

- http://www.genoa-il.com
- http://www.city.fujisawa.kanagawa.jp/hospital/index.html
- http://www.eastboston.com/
- https://www.bahnhof.de/bahnhof-de/Dortmund-Derne-1033478
- http://www.penny.de
- http://www.bahnhof.de/bahnhof-de/Zeulenroda_unt_Bf
- https://cbvrce.ca/rv/
- http://vi.potsdam.ny.us/content
- http://www.tekever.com/
- https://www.iheart.com/live/q1041-1625/


## Examples of Redirected Wikipedia Pages

Redirected URIs are those that appear as the subject of a triple in the redirects dataset.

In [11]:
# redirected entities
# An entity counts as redirected when its URI is the subject of a redirect triple.
# A membership test replaces the former left merge because it
#   * keeps exactly one row per sampled entity even if a subject occurred more than
#     once in the redirects data (a merge would duplicate rows and skew the shares
#     computed in the summary),
#   * leaves the `redirects` frame unmodified,
#   * produces a genuine boolean column without a fillna() step, and
#   * can be re-run safely (a repeated merge yields redirected_x / redirected_y columns
#     and then fails with a KeyError).
entities_sample["redirected"] = entities_sample["subject"].isin(redirects["subject"])
for val in entities_sample.loc[entities_sample["redirected"], "subject"].sample(10, random_state=42):
    print(f"- {val}")

- http://dbpedia.org/resource/Brazilian_legislative_election,_1958
- http://dbpedia.org/resource/Philadelphia_Athletic_Club
- http://dbpedia.org/resource/1998–99_Carlisle_United_F._C._season
- http://dbpedia.org/resource/Harpers_(disambiguation)
- http://dbpedia.org/resource/Sir_John_Wriothesley_Russell
- http://dbpedia.org/resource/Bombay(film)
- http://dbpedia.org/resource/Maanasthambam
- http://dbpedia.org/resource/For_Esmé_—_with_Love_and_Squalor
- http://dbpedia.org/resource/Extended_Tektronix_HEX_Format
- http://dbpedia.org/resource/Heywood_&_Royton


## Examples of Placeholder Wikipedia Pages

Placeholder pages cannot be recognized by their URI, but they only seem to have incoming Wikilinks as properties connecting them to other entities in the DBpedia knowledge graph. The Wikipedia page of an entity that links to a placeholder entity contains at least one link to the placeholder page (indicated by the red lettering). The link to the placeholder page can be followed, but there is no content except for a text explaining that the Wikipedia page does not exist yet. A lot of these placeholder pages seem to correspond to plant or animal subspecies.

- http://dbpedia.org/resource/Tamarixia_actis
- http://dbpedia.org/resource/Pietra_Groana
- http://dbpedia.org/resource/Sphagnum_triporosum
- http://dbpedia.org/resource/Bay_cardinalfish
- http://dbpedia.org/resource/ʼM
- http://dbpedia.org/resource/Victor_Kremidas
- http://dbpedia.org/resource/Ozarba_rougeoti
- http://dbpedia.org/resource/Reichenbachia_hirsuta
- http://dbpedia.org/resource/Janusz_Warmiński
- http://dbpedia.org/resource/Glen_Allred

In [12]:
# Disabled exploratory query: lists the subjects of the first 50 sampled entities that carry
# none of the file / category / outside / redirected flags.
# NOTE(review): presumably how the placeholder examples listed above were hand-picked — confirm,
# or remove this cell.
#entities_sample[entities_sample[["file", "category", "outside", "redirected"]].sum(axis=1) == 0].head(50)["subject"]

## Examples of Career Stations

Career station entities can be recognized by their URI which contains the substring `__CareerStation__`.

In [13]:
# career station entities: flagged by the marker substring in the URI,
# then ten of them are printed (fixed seed)
entities_sample["career_station"] = entities_sample["subject"].str.contains("__CareerStation__")
career_station_uris = entities_sample.loc[entities_sample["career_station"], "subject"]
for uri in career_station_uris.sample(10, random_state=42):
    print(f"- {uri}")

- http://dbpedia.org/resource/David_Thompson_(footballer,_born_1962)__CareerStation__3
- http://dbpedia.org/resource/Gylfi_Sigurðsson__CareerStation__10
- http://dbpedia.org/resource/Julie_Ertz__CareerStation__4
- http://dbpedia.org/resource/Sergio_Torres_(footballer,_born_1984)__CareerStation__19
- http://dbpedia.org/resource/Vasco_Varão__CareerStation__15
- http://dbpedia.org/resource/Tommy_Cain__CareerStation__3
- http://dbpedia.org/resource/Eugenio_Pizzuto__CareerStation__3
- http://dbpedia.org/resource/Mohammed_Tabra__CareerStation__4
- http://dbpedia.org/resource/Ángel_Morales__Ángel_Morales__1__CareerStation__11
- http://dbpedia.org/resource/Andrea_Mei__CareerStation__2


## Entities Similar to Career Stations

There are other entities that are similar to career stations. For example, there are entities of the type military service, which connect a person and a time period to countries or battles. In addition, there are entities with similar constructs that are not easily recognized by their URI. They seem to contain double underscores in their URI, but there is no guarantee that such a URI does not actually belong to a "normal" entity. The examples below are mixed: some of them are just "normal" entities with double underscores in their URI.

In [14]:
# URIs with a double underscore that are not career stations; print ten of them (fixed seed)
has_double_underscore = entities_sample["subject"].str.contains("__")
is_career_station = entities_sample["subject"].str.contains("__CareerStation__")
similar_uris = entities_sample.loc[has_double_underscore & ~is_career_station, "subject"]
for uri in similar_uris.sample(10, random_state=42):
    print(f"- {uri}")

- http://dbpedia.org/resource/Omar_Bongo__MilitaryService__1
- http://dbpedia.org/resource/BMW_B47__D20__7
- http://dbpedia.org/resource/Hitomi_discography__Title__1
- http://dbpedia.org/resource/1998–99_Celtic_F.C._season__Phil_O'Donnell__1
- http://dbpedia.org/resource/John_D._Vanderhoof__MilitaryService__1
- http://dbpedia.org/resource/San_Diego_Sockers_(2009)__Mitchell_Cardenas__1
- http://dbpedia.org/resource/George_Forbes,_3rd_Earl_of_Granard__Tenure__1
- http://dbpedia.org/resource/Modafen_F.K.__Muhammed_Sura_Cetin__1
- http://dbpedia.org/resource/María_Sánchez_(footballer)__María_Sánchez__1
- http://dbpedia.org/resource/Henry_Salwey__Tenure__1


## Summary

In [15]:
# Share of sampled entities carrying each flag (flags are not mutually exclusive).
sample_flag_columns = ["file", "category", "outside", "redirected", "career_station"]
entities_sample[sample_flag_columns].sum().div(len(entities_sample))

file              0.09886
category          0.04854
outside           0.01242
redirected        0.32218
career_station    0.05302
dtype: float64

## Relevance to the Training Data

In [16]:
def _distinct_entities(triples):
    """Return a one-column frame ("entity") with the sorted distinct values found in
    the "subject" and "object" columns of `triples`."""
    subjects = triples["subject"].unique()
    objects = triples["object"].unique()
    return pd.DataFrame({"entity": np.union1d(subjects, objects)})


mw_entities = _distinct_entities(mw_props)
remaining_triples_entities = _distinct_entities(remaining_triples)

In [17]:
def flag_entity_kinds(entities, redirect_subjects):
    """Return a copy of `entities` with one boolean column per special entity kind.

    Parameters
    ----------
    entities : pd.DataFrame
        Frame with an "entity" column holding entity URIs.
    redirect_subjects : pd.Series
        URIs that occur as the subject of a redirect triple.

    Returns
    -------
    pd.DataFrame
        `entities` plus the columns "file", "category", "outside", "redirected"
        and "career_station" (existing columns of the same name are overwritten,
        so the cell can be re-run safely).
    """
    uris = entities["entity"]
    return entities.assign(
        # file entities
        file=uris.str.startswith("http://dbpedia.org/resource/File:", na=False),
        # category entities
        category=uris.str.startswith("http://dbpedia.org/resource/Category:", na=False),
        # entities outside of DBpedia (a missing URI counts as outside, as before)
        outside=~uris.str.startswith("http://dbpedia.org/resource/", na=False),
        # redirected entities: a membership test instead of a left merge, so rows are
        # never duplicated when a subject occurs more than once in the redirects data,
        # `redirects` is not modified, and no fillna() on an object column is needed
        redirected=uris.isin(redirect_subjects),
        # career station entities (literal substring match, missing URIs count as False)
        career_station=uris.str.contains("__CareerStation__", regex=False, na=False),
    )


mw_entities = flag_entity_kinds(mw_entities, redirects["subject"])
remaining_triples_entities = flag_entity_kinds(remaining_triples_entities, redirects["subject"])

In [18]:
# Share of entities from the mutual-wikilink property triples carrying each flag.
mw_flag_columns = ["file", "category", "outside", "redirected", "career_station"]
mw_entities[mw_flag_columns].sum().div(len(mw_entities))

file              0.000000
category          0.000000
outside           0.000000
redirected        0.003405
career_station    0.000000
dtype: float64

In [19]:
# Share of entities from the remaining triples carrying each flag.
remaining_flag_columns = ["file", "category", "outside", "redirected", "career_station"]
remaining_triples_entities[remaining_flag_columns].sum().div(len(remaining_triples_entities))

file              0.000000
category          0.000000
outside           0.049170
redirected        0.049152
career_station    0.221722
dtype: float64