In [20]:
from Code.UtilityFunctions.run_query import run_query
from SPARQLWrapper import SPARQLWrapper, JSON
from Code.UtilityFunctions.wikidata_functions import wikidata_query
from Code.UtilityFunctions.get_data_path import get_path
import sys
import pandas as pd

In [7]:

def dbpedia_query(sparql_query: str):
    """
    It takes a SPARQL query as a string, and returns a pandas dataframe of the results
    
    :param sparql_query: the query you want to run
    :type sparql_query: str
    :return: The query returns the wikidata item id, the wikidata item label, the wikidata item
    description, and the wikidata item category.
    """
    user_agent = "Yelp knowledge graph mapping/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper("https://dbpedia.org/sparql", agent=user_agent)
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    results_df = pd.json_normalize(results['results']['bindings'])
    return results_df

### How many people live in the city of top ten most prevelent cities in Yelp?

### Wikidata

In [21]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

In [83]:
biz[['city', 'state']].value_counts().head(10)

city           state
Philadelphia   PA       14567
Tucson         AZ        9249
Tampa          FL        9048
Indianapolis   IN        7540
Nashville      TN        6968
New Orleans    LA        6208
Reno           NV        5932
Edmonton       AB        5054
Saint Louis    MO        4827
Santa Barbara  CA        3829
dtype: int64

In [32]:
wikidata_city_dict = {
    'Philadelphia': 'Q1345',
    'Tucson': 'Q18575',
    'Tampa': 'Q49255',
    'Indianapolis': 'Q6346',
    'Nashville': 'Q23197',
    'New Orleans': 'Q34404',
    'Reno': 'Q49225',
    'Edmonton': 'Q2096',
    'Saint Louis': 'Q38022',
    'Santa Barbara': 'Q159288'
    }

{'Philadelphia': 'Q1345',
 'Tucson': 'Q18575',
 'Tampa': 'Q49255',
 'Indianapolis': 'Q6346',
 'Nashville': 'Q23197',
 'New Orleans': 'Q34404',
 'Reno': 'Q49225',
 'Edmonton': 'Q2096',
 'Saint Louis': 'Q38022',
 'Santa Barbara': 'Q159288'}

In [71]:
list_of_cities = "wd:Q1345 wd:Q18575 wd:Q49255 wd:Q6346 wd:Q23197 wd:Q34404 wd:Q49225 wd:Q2096 wd:Q38022 wd:Q159288"
sparql_query = f"""
SELECT ?city ?population ?cityLabel 
WHERE {{
  ?city p:P1082 ?statement .
  ?statement ps:P1082 ?population .
  ?statement pq:P585 ?date .
  FILTER NOT EXISTS {{
    ?city p:P1082/pq:P585 ?date2 .
    FILTER(?date2 > ?date)
  }}
  VALUES ?city {{{list_of_cities}}} .
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" . }}
}}
"""

wikidata_pop = wikidata_query(sparql_query=sparql_query)

### DBpedia

In [67]:
dbpedia_city_dict = {
    'Philidelphia': '"Philadelphia"',
    'Tucson': '"Tucson, Arizona"',
    'Tampa': '"Tampa, Florida"',
    'Indianapolis': '"Indianapolis"',
    'Nashville': '"Nashville, Tennessee"',
    'New Orleans': '"New Orleans"',
    'Reno': '"Reno, Nevada"',
    'Edmonton': '"Edmonton"',
    'Saint Louis': '"St. Louis"',
    'Santa Barbara': '"Santa Barbara, California"@en'}

In [68]:
"@en ".join([i for i in dbpedia_city_dict.values()])

'"Philadelphia"@en "Tucson, Arizona"@en "Tampa, Florida"@en "Indianapolis"@en "Nashville, Tennessee"@en "New Orleans"@en "Reno, Nevada"@en "Edmonton"@en "St. Louis"@en "Santa Barbara, California"@en'

In [70]:
# list_of_cities = '"Philadelphia"@en "Tucson, Arizona"@en "Tampa"@en "Indianapolis"@en "Nashville"@en "New Orleans"@en "Reno"@en "Edmonton"@en "Saint Louis"@en "Santa Barbara"@en'
list_of_cities = "@en ".join([i for i in dbpedia_city_dict.values()])
query = f"""
SELECT DISTINCT ?city ?population ?cityname
WHERE {{
    ?city a dbo:City .
    ?city dbo:populationTotal ?population .
    ?city rdfs:label ?cityname .
    VALUES ?cityname{{{list_of_cities}}}
}}
"""
dbpedia_pop = dbpedia_query(query)

### compare WikiData and DBpedia

In [81]:
wikidata_pop[['cityLabel.value', 'population.value']].sort_values(by='population.value', ascending=False)

Unnamed: 0,cityLabel.value,population.value
3,Indianapolis,887642
0,Santa Barbara,88665
5,Nashville,684410
4,Tucson,542629
9,Tampa,384959
6,New Orleans,383997
7,St. Louis,301578
8,Reno,264165
1,Philadelphia,1603797
2,Edmonton,1010899


In [82]:
dbpedia_pop[['cityname.value', 'population.value']].sort_values(by='population.value', ascending=False)

Unnamed: 0,cityname.value,population.value
9,Indianapolis,887642
0,"Santa Barbara, California",88665
3,"Nashville, Tennessee",715884
4,"Tucson, Arizona",542629
1,"Tampa, Florida",384959
5,New Orleans,383997
8,St. Louis,301578
2,"Reno, Nevada",264165
6,Philadelphia,1603797
7,Edmonton,1010899


### How many people live in the state of Maryland?

In [None]:
sparql_query = """

SELECT ?population ?date
WHERE {
    wdt:Q1391 wdt:P1082 ?population . #maryland population
    ?population wdt:P585 ?date # point in time
}
SORT BY DESC(?date)
LIMIT 1
"""

wikidata_query(query=sparql_query, as_dataframe=True)

### What drinks exist if I want a hot beverage?

In [None]:
sparql_query = """

SELECT ?drink
WHERE {
    ?drink wdt:P279 wdt:Q19359564 # subclass of hot beverage
}
"""

wikidata_query(query=sparql_query, as_dataframe=True)

### How many cities are there in California?

In [None]:
sparql_query = """

SELECT DISTINCT COUNT(?city)
WHERE {
    ?city wdt:P131 wdt:Q99 . # located in the administrative territorial entity California
}
"""

wikidata_query(query=sparql_query, as_dataframe=True)

### How many cities are not part of the USA?

In [None]:
sparql_query = """

SELECT DISTINCT COUNT(?city)
WHERE {
    ?city wdt:P17 ?country . # country
    FILTER NOT EXISTS {?city wdt:P17 wdt:Q30 } #country USA
}
"""

wikidata_query(query=sparql_query, as_dataframe=True)