# Extract information from public endpoints

In [20]:
import pandas as pd, json

import geovdata.sparql as sparql
import geovdata.kit as kit

from gmpykit.ipython import infos

## Religions

### Wikidata

In [21]:
# Select the Wikidata endpoint, then fetch every item that is an instance of
# "religion" (Q9174) together with its English label.
sparql.connect_well_known('wikidata')
religions_raw = sparql.query("""
    select ?religionLabel ?religion
    where {
        ?religion wdt:P31 wd:Q9174 .

        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
""")

# Filter out those with no label on Wikidata: a row is treated as unlabeled when
# its label text is contained in the item URI (presumably the label service
# fell back to the bare Q-id -- confirm against the label service docs).
# The mask is built as an indexed boolean Series so that an empty result still
# keeps its columns instead of being interpreted as an empty column selection.
has_label = pd.Series(
    [
        label not in uri
        for label, uri in zip(religions_raw['religionLabel'], religions_raw['religion'])
    ],
    index=religions_raw.index,
    dtype=bool,
)

# Formatting: lower-case the labels on a fresh frame (no write into a filtered
# slice, and `religions_raw` stays available if the cell is re-run).
religions = religions_raw.loc[has_label].assign(
    religionLabel=lambda frame: frame['religionLabel'].str.lower()
)

# The JSON-encoded list of names is the cell's displayed value.
religions_names = religions['religionLabel'].tolist()
json.dumps(religions_names)

>> SPARQL endpoint of Wikidata connected.


'["islam", "buddhism", "discordianism", "hinduism", "jainism", "judaism", "confucianism", "taoism", "zoroastrianism", "bah\\u00e1\\u02bc\\u00ed faith", "jewish renewal", "shamanism", "animism", "rastafari", "unitarianism", "traditional anglican communion", "scientology", "atenism", "catharism", "modern paganism", "historical vedic religion", "b\\u00e1bism", "modern asatru", "laveyan satanism", "sumerian religion", "heathenry", "haitian vodou", "ancient roman religion", "ngor", "australian aboriginal mythology", "gallo-roman religion", "traditional african religion", "hanif", "religion in pre-islamic arabia", "religion of ancient egypt", "midewiwin", "anglo-catholicism", "yuzu nembutsu", "kadam", "mun", "soka gakkai", "jeung san do", "nauruan indigenous religion", "theophilanthropy", "gallicanism", "thelema", "candombl\\u00e9", "aztec religion", "drukpa lineage", "mari religion", "babylonian religion", "nichiren sh\\u014dsh\\u016b", "tengrism", "badimo", "brahmanism", "ryukyuan religion

# Occupations

## 1./ Get all occupations/professions from Wikidata

In [2]:
# Point the sparql helper at the Wikidata endpoint before the occupation queries below.
sparql.connect_well_known('wikidata')

>> SPARQL endpoint of Wikidata connected.


### 1.a/ Professions

In [3]:
# Every item that is an instance of Q28640, with its French label and
# description (the label service is used with explicit bindings).
professions_labels = sparql.query("""
    select (?item as ?wikidata_uri) (?itemLabel as ?wikidata_label) (?itemDescription as ?wikidata_description)
    where {
        ?item wdt:P31 wd:Q28640 .

        SERVICE wikibase:label {
            bd:serviceParam wikibase:language "fr".
            ?item rdfs:label ?itemLabel .
            ?item schema:description ?itemDescription .
        }
    }
""")

# Number of entities whose P106 value is each of those items,
# kept sorted by ascending count as before.
professions_count = sparql.query("""
    select (?item as ?wikidata_uri) (count(?entity) as ?count)
    where {
        ?item wdt:P31 wd:Q28640 .
        ?entity wdt:P106 ?item .
    }
    group by ?item
""").sort_values('count')

# Left join on the explicit key so items that are never used keep a row
# (with a missing count); the nullable Int64 dtype keeps counts integral
# despite those gaps. Built as one non-mutating chain so the cell is idempotent.
professions = (
    professions_labels
    .merge(professions_count, how='left', on='wikidata_uri')
    .astype({'count': pd.Int64Dtype()})
    .sort_values(by='count', ascending=False)
)

# Display
infos(professions)

Shape:  (9173, 4) - extract:


Unnamed: 0,wikidata_uri,wikidata_label,wikidata_description,count
41,http://www.wikidata.org/entity/Q82955,personnalité politique,personne impliquée dans la vie politique,823083
15,http://www.wikidata.org/entity/Q36180,écrivain ou écrivaine,personne qui rédige des ouvrages littéraires o...,361064
550,http://www.wikidata.org/entity/Q937857,footballeur,sportif ou sportive pratiquant le football (so...,358453
14,http://www.wikidata.org/entity/Q33999,acteur ou actrice,artiste qui prête son physique ou sa voix à un...,325937
2449,http://www.wikidata.org/entity/Q3665646,basketteur,pratiquant de basket-ball,171935


### 1.b/ Occupations

In [4]:
# Every item that is an instance of Q12737077, with its French label and
# description (the label service is used with explicit bindings).
occupations_labels = sparql.query("""
    select (?item as ?wikidata_uri) (?itemLabel as ?wikidata_label) (?itemDescription as ?wikidata_description)
    where {
        ?item wdt:P31 wd:Q12737077 .

        SERVICE wikibase:label {
            bd:serviceParam wikibase:language "fr".
            ?item rdfs:label ?itemLabel .
            ?item schema:description ?itemDescription .
        }
    }
""")

# Number of entities whose P106 value is each of those items,
# kept sorted by ascending count as before.
occupations_count = sparql.query("""
    select (?item as ?wikidata_uri) (count(?entity) as ?count)
    where {
        ?item wdt:P31 wd:Q12737077 .
        ?entity wdt:P106 ?item .
    }
    group by ?item
""").sort_values('count')

# Left join on the explicit key so items that are never used keep a row
# (with a missing count); the nullable Int64 dtype keeps counts integral
# despite those gaps. Built as one non-mutating chain so the cell is idempotent.
occupations = (
    occupations_labels
    .merge(occupations_count, how='left', on='wikidata_uri')
    .astype({'count': pd.Int64Dtype()})
    .sort_values(by='count', ascending=False)
)

# Display
infos(occupations)

Shape:  (4966, 4) - extract:


Unnamed: 0,wikidata_uri,wikidata_label,wikidata_description,count
522,http://www.wikidata.org/entity/Q1650915,chercheur ou chercheuse,personne qui se consacre à la recherche scient...,1952237
15,http://www.wikidata.org/entity/Q36180,écrivain ou écrivaine,personne qui rédige des ouvrages littéraires o...,360969
314,http://www.wikidata.org/entity/Q937857,footballeur,sportif ou sportive pratiquant le football (so...,358336
13,http://www.wikidata.org/entity/Q33999,acteur ou actrice,artiste qui prête son physique ou sa voix à un...,325829
329,http://www.wikidata.org/entity/Q1028181,peintre,"personne pratiquant la peinture, comme discipl...",201438


## 2./ Add info from DBpedia

In [5]:
# NOTE(review): this connection is commented out, yet the output recorded for
# this cell shows DBpedia being connected and the final cell queries
# dbpedia.org resources -- confirm which endpoint is active before running
# the cells that follow.
# sparql.connect_well_known('dbpedia')

>> SPARQL endpoint of DBpedia connected.


In [6]:
# NOTE(review): disabled draft that looks up, for each occupation, the DBpedia
# resource linked through owl:sameAs. It reads row['uri'], but the occupations
# query aliases that column to 'wikidata_uri', which matches the
# "KeyError: 'uri'" recorded as this cell's output.
# for i, row in occupations.iterrows():
#     response = sparql.query("""
#     select
#         ?dbpedia_uri ?dbpedia_label ?dbpedia_description (lang(?dbpedia_label) as ?label_lang) (lang(?dbpedia_description) as ?descr_lang)
#     where {
#         ?dbpedia_uri owl:sameAs <""" + row['uri'] + """> .
#         optional{ ?dbpedia_uri rdfs:label ?dbpedia_label .}
#         optional{ ?dbpedia_uri dbo:abstract ?dbpedia_description .}
#     }
#     """)
    
#     display(response)
    
#     if len(response) == 0: continue

    

#     break

KeyError: 'uri'

In [None]:
# This query targets dbpedia.org resources, so select the DBpedia endpoint
# here instead of relying on a connection made by some earlier cell.
sparql.connect_well_known('dbpedia')

# Label and abstract of the DBpedia "Researcher" resource, English only.
# The original filter called lang() without an argument, which is not valid
# SPARQL; it is completed here as a language test on ?label
# (NOTE(review): confirm this was the intended second condition).
sparql.query("""
    select *
    where {
        <http://dbpedia.org/resource/Researcher> rdfs:label ?label .
        <http://dbpedia.org/resource/Researcher> dbo:abstract ?abstract .

        filter(lang(?abstract) = 'en' && lang(?label) = 'en')
    }
""").head(50)