In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import sys

from Code.UtilityFunctions.get_data_path import get_path

In [3]:
def wikidata_query(sparql_query: str):
    """
    Run a SPARQL query against the Wikidata Query Service.

    Adapted from
    https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples#Cats

    :param sparql_query: text of the SPARQL query to execute
    :return: DataFrame built by flattening the ``results.bindings`` list of the JSON response
    """
    # The agent string is suffixed with the major.minor version of the running Python interpreter.
    client = SPARQLWrapper(
        "https://query.wikidata.org/sparql",
        agent=f"Location Information - USA & Canada/{sys.version_info[0]}.{sys.version_info[1]}",
    )
    client.setQuery(sparql_query)
    client.setReturnFormat(JSON)

    response = client.query().convert()
    bindings = response['results']['bindings']
    return pd.json_normalize(bindings)

# Cities in the USA

In [4]:
# Wikidata identifiers used in the query below
# P17 country
# P31 instance of
# P131 located in the administrative territorial entity
# P279 subclass of
# P585 point in time
# P1082 population
# Q30 United States of America
# Q515 city
# Q532 village

# Selects every city or village (including subclasses) whose country is Q30,
# optionally with the entity it is located in, and keeps only the population
# statement with the most recent point in time (the FILTER NOT EXISTS clause
# discards any statement for which a later-dated one exists).
query_us_cities = """
SELECT DISTINCT ?city ?cityLabel ?county ?population
WHERE {
    {?city wdt:P31/wdt:P279* wd:Q532}
    UNION
    {?city wdt:P31/wdt:P279* wd:Q515}
	?city wdt:P17 wd:Q30 .
	OPTIONAL { ?city wdt:P131 ?county . }
	?city p:P1082 ?statement .
	?statement ps:P1082 ?population .
	?statement pq:P585 ?date .
	FILTER NOT EXISTS {
		?city p:P1082/pq:P585 ?date2 .
		FILTER(?date2 > ?date)
	}
	SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . }
}
"""

# Maps the flattened response columns to the CSV column names; its keys also
# define which columns are kept and in what order.
us_city_renames = {"city.value": "city_iri", "cityLabel.value": "city", "county.value": "county_iri", "population.value": "population"}

# rename() returns a new frame, so nothing is mutated in place on a column subset.
result_us_cities = (
    wikidata_query(query_us_cities)
    .loc[:, list(us_city_renames)]
    .rename(columns=us_city_renames)
)
result_us_cities.to_csv(path_or_buf=get_path("USA_Cities.csv"), index=False)

# Counties in the USA

In [5]:
# Wikidata identifiers used in the query below
# P31 instance of
# P131 located in the administrative territorial entity
# P279 subclass of
# Q47168 county of the United States

# Selects every entity that is an instance of Q47168 (or of one of its
# subclasses) together with the entity it is located in, bound to ?state.
query_us_counties = """
SELECT DISTINCT ?county ?countyLabel ?state
WHERE {
	?county wdt:P31/wdt:P279* wd:Q47168 .
	?county wdt:P131 ?state .
    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . }
}
"""

# Maps the flattened response columns to the CSV column names; its keys also
# define which columns are kept and in what order.
us_county_renames = {"county.value": "county_iri", "countyLabel.value": "county", "state.value": "state_iri"}

# rename() returns a new frame, so nothing is mutated in place on a column subset.
result_us_counties = (
    wikidata_query(query_us_counties)
    .loc[:, list(us_county_renames)]
    .rename(columns=us_county_renames)
)
result_us_counties.to_csv(path_or_buf=get_path("USA_Counties.csv"), index=False)

# States in the USA

In [6]:
# States in US
# P17 country
# P31 instance of
# Q35657 U.S. state

# Selects every direct instance of Q35657 together with its country.
query_all_us_states = """
SELECT DISTINCT ?state ?stateLabel ?country
WHERE {
  ?state wdt:P31 wd:Q35657 .
  ?state wdt:P17 ?country .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . }
}
"""

# Maps the flattened response columns to the CSV column names; its keys also
# define which columns are kept and in what order.
us_state_renames = {"state.value": "state_iri", "stateLabel.value": "state", "country.value": "country_iri"}

# rename() returns a new frame, so nothing is mutated in place on a column subset.
result_us_states = (
    wikidata_query(query_all_us_states)
    .loc[:, list(us_state_renames)]
    .rename(columns=us_state_renames)
)
result_us_states.to_csv(path_or_buf=get_path("USA_States.csv"), index=False)

# Cities in Canada

In [54]:
def get_canada_city(location: str) -> str:
    """
    Build a SPARQL query for Canadian cities and hamlets matching ``location``.

    An entity matches when any of its properties holds ``location`` as an
    English-tagged literal (the predicate is left unbound in the query, so the
    match is not restricted to the main label). Each match is returned with the
    entity it is located in (``?province``) and only its most recently dated
    population statement.

    Wikidata identifiers used:
    P31 instance of; P279 subclass of; Q515 city; Q17366755 hamlet;
    P17 country; Q16 Canada; P1082 population; P585 point in time;
    P131 located in the administrative territorial entity

    :param location: string label of the location
    :return: string representation of the query
    """
    # Escape the characters that would otherwise terminate or corrupt the
    # double-quoted SPARQL literal. Backslashes go first so the escapes added
    # afterwards are not themselves doubled.
    escaped_location = (
        location.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query_canada_cities = f"""
        SELECT DISTINCT ?city ?cityLabel ?province ?population
        WHERE{{
            ?city ?label "{escaped_location}"@en.
            {{
                ?city wdt:P17 wd:Q16 .
                ?city wdt:P31/wdt:P279* wd:Q515 .
            }}
            UNION
            {{
                ?city wdt:P17 wd:Q16 .
                ?city wdt:P31/wdt:P279* wd:Q17366755 .
            }}
            ?city p:P1082 ?statement .
            ?statement ps:P1082 ?population .
            ?statement pq:P585 ?date .
            FILTER NOT EXISTS {{
                ?city p:P1082/pq:P585 ?date2 .
                FILTER(?date2 > ?date)
            }}
            ?city wdt:P131 ?province .
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
    """
    return query_canada_cities

# Location labels to look up on Wikidata, one query per label.
canada_locs = ["Edmonton", "Sherwood Park",
                "St. Albert", "Beaumont",
                "Birch Hill Park", "ECN"]

# Collect every response first and concatenate once, instead of re-copying a
# growing frame on each iteration.
canada_responses = [
    wikidata_query(get_canada_city(location=location))
    for location in canada_locs
]
canada_cities = pd.concat(canada_responses, axis=0)

# Maps the flattened response columns to the CSV column names; its keys also
# define which columns are kept and in what order.
canada_city_renames = {"city.value": "city_iri", "cityLabel.value": "city", "province.value": "state_iri", 'population.value': 'population'}
result_canada_cities = (
    canada_cities
    .loc[:, list(canada_city_renames)]
    .rename(columns=canada_city_renames)
)
result_canada_cities.to_csv(path_or_buf=get_path("Canada_Cities.csv"), index=False)

# Provinces of Canada

In [7]:
# Provinces of Canada
# P17 country
# P31 instance of
# Q11828004 province of Canada

# Selects every direct instance of Q11828004 together with its country.
query_all_canada_provinces = """
SELECT DISTINCT ?province ?provinceLabel ?country
WHERE {
  ?province wdt:P31 wd:Q11828004 .
  ?province wdt:P17 ?country .
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" . }
}
"""

# Maps the flattened response columns to the CSV column names (provinces share
# the "state" column names used by the US files); its keys also define which
# columns are kept and in what order.
can_province_renames = {"province.value": "state_iri", "provinceLabel.value": "state", "country.value": "country_iri"}

# rename() returns a new frame, so nothing is mutated in place on a column subset.
result_can_provinces = (
    wikidata_query(query_all_canada_provinces)
    .loc[:, list(can_province_renames)]
    .rename(columns=can_province_renames)
)
result_can_provinces.to_csv(path_or_buf=get_path("Canada_States.csv"), index=False)