# Toponyms and their Countries

By Eduardo Graells-Garrido.

In this notebook we generate a dictionary of {place_uri => country_uri} so in other notebooks, when we need to determine the country of origin of a biography, we do not have to parse the entire database again looking for this information.

The result is one dictionary per each language. Equivalences between languages have not been resolved yet.

In [1]:
from __future__ import print_function, unicode_literals
import gzip
import dbpedia_config
import ujson as json
from dbpedia_utils import iter_entities_from

In [2]:
data_folder = dbpedia_config.DATA_FOLDER
target_folder = dbpedia_config.TARGET_FOLDER
languages = dbpedia_config.LANGUAGES

In [3]:
def create_country_dictionary(language):
    instance_types = '{1}/instance_types_{0}.nt.bz2'.format(language, data_folder)
    properties = '{1}/mappingbased_properties_{0}.nt.bz2'.format(language, data_folder)

    print(instance_types)
    country_names = set()

    for i, ent in enumerate(iter_entities_from(instance_types)):
        if 'http://dbpedia.org/ontology/Country' in ent['22-rdf-syntax-ns#type']:
            country_names.add(ent['resource'])

    print(len(country_names))

    places = set()

    for i, ent in enumerate(iter_entities_from(instance_types)):
        if 'http://dbpedia.org/ontology/Place' in ent['22-rdf-syntax-ns#type']:
            places.add(ent['resource'])

    print(len(places))

    countries = {}

    for i, ent in enumerate(iter_entities_from(properties)):
        resource = ent['resource']

        if resource not in places:
            continue

        if 'country' not in ent:
            continue
        
        try:
            c = next((ent['country'] & country_names).__iter__())
        except StopIteration:
            # doesn't have a valid country URI attribute
            continue

        countries[resource] = c

    print(len(countries))

    with gzip.open('{0}/countries_{1}.json.gz'.format(target_folder, language), 'wt') as f:
        json.dump(countries, f)

In [4]:
import os

In [6]:
for lang in languages:
    if not os.path.exists('{0}/countries_{1}.json.gz'.format(target_folder, lang)):
        create_country_dictionary(lang)
    else:
        print(lang, 'already exists')

en already exists
/media/egraells/113A88F901102CA6/data/dbpedia//instance_types_bg.nt.bz2
reading /media/egraells/113A88F901102CA6/data/dbpedia//instance_types_bg.nt.bz2
279
reading /media/egraells/113A88F901102CA6/data/dbpedia//instance_types_bg.nt.bz2
18659
reading /media/egraells/113A88F901102CA6/data/dbpedia//mappingbased_properties_bg.nt.bz2
0
/media/egraells/113A88F901102CA6/data/dbpedia//instance_types_ca.nt.bz2
reading /media/egraells/113A88F901102CA6/data/dbpedia//instance_types_ca.nt.bz2
0
reading /media/egraells/113A88F901102CA6/data/dbpedia//instance_types_ca.nt.bz2
80017
reading /media/egraells/113A88F901102CA6/data/dbpedia//mappingbased_properties_ca.nt.bz2
0
/media/egraells/113A88F901102CA6/data/dbpedia//instance_types_cs.nt.bz2
reading /media/egraells/113A88F901102CA6/data/dbpedia//instance_types_cs.nt.bz2
300
reading /media/egraells/113A88F901102CA6/data/dbpedia//instance_types_cs.nt.bz2
27944
reading /media/egraells/113A88F901102CA6/data/dbpedia//mappingbased_properti