In [47]:
import sys
sys.path.append(sys.path[0][:sys.path[0].find('DVML-P7') + len('DVML-P7')])
from Code.UtilityFunctions.run_query import run_query
from SPARQLWrapper import SPARQLWrapper, JSON
from Code.UtilityFunctions.wikidata_functions import wikidata_query
from Code.UtilityFunctions.get_data_path import get_path
import pandas as pd

In [20]:
def dbpedia_query(sparql_query: str):
    """
    It takes a SPARQL query as a string, and returns a pandas dataframe of the results
    
    :param sparql_query: the query you want to run
    :type sparql_query: str
    :return: The query returns the wikidata item id, the wikidata item label, the wikidata item
    description, and the wikidata item category.
    """
    user_agent = "Yelp knowledge graph mapping/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper("https://dbpedia.org/sparql", agent=user_agent)
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    results_df = pd.json_normalize(results['results']['bindings'])
    return results_df

### How many people live in the city of top ten most prevelent cities in Yelp?

### Wikidata

In [21]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

In [22]:
biz[['city', 'state']].value_counts().head(10)

city           state
Philadelphia   PA       14567
Tucson         AZ        9249
Tampa          FL        9048
Indianapolis   IN        7540
Nashville      TN        6968
New Orleans    LA        6208
Reno           NV        5932
Edmonton       AB        5054
Saint Louis    MO        4827
Santa Barbara  CA        3829
dtype: int64

In [23]:
wikidata_city_dict = {
    'Philadelphia': 'Q1345',
    'Tucson': 'Q18575',
    'Tampa': 'Q49255',
    'Indianapolis': 'Q6346',
    'Nashville': 'Q23197',
    'New Orleans': 'Q34404',
    'Reno': 'Q49225',
    'Edmonton': 'Q2096',
    'Saint Louis': 'Q38022',
    'Santa Barbara': 'Q159288'
    }

In [24]:
list_of_cities = "wd:Q1345 wd:Q18575 wd:Q49255 wd:Q6346 wd:Q23197 wd:Q34404 wd:Q49225 wd:Q2096 wd:Q38022 wd:Q159288"
sparql_query = f"""
SELECT ?city ?population ?cityLabel 
WHERE {{
  ?city p:P1082 ?statement .
  ?statement ps:P1082 ?population .
  ?statement pq:P585 ?date .
  FILTER NOT EXISTS {{
    ?city p:P1082/pq:P585 ?date2 .
    FILTER(?date2 > ?date)
  }}
  VALUES ?city {{{list_of_cities}}} .
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" . }}
}}
"""

wikidata_pop = wikidata_query(sparql_query=sparql_query)

### DBpedia

In [25]:
dbpedia_city_dict = {
    'Philidelphia': '"Philadelphia"',
    'Tucson': '"Tucson, Arizona"',
    'Tampa': '"Tampa, Florida"',
    'Indianapolis': '"Indianapolis"',
    'Nashville': '"Nashville, Tennessee"',
    'New Orleans': '"New Orleans"',
    'Reno': '"Reno, Nevada"',
    'Edmonton': '"Edmonton"',
    'Saint Louis': '"St. Louis"',
    'Santa Barbara': '"Santa Barbara, California"@en'}

In [26]:
"@en ".join([i for i in dbpedia_city_dict.values()])

'"Philadelphia"@en "Tucson, Arizona"@en "Tampa, Florida"@en "Indianapolis"@en "Nashville, Tennessee"@en "New Orleans"@en "Reno, Nevada"@en "Edmonton"@en "St. Louis"@en "Santa Barbara, California"@en'

In [27]:
# list_of_cities = '"Philadelphia"@en "Tucson, Arizona"@en "Tampa"@en "Indianapolis"@en "Nashville"@en "New Orleans"@en "Reno"@en "Edmonton"@en "Saint Louis"@en "Santa Barbara"@en'
list_of_cities = "@en ".join([i for i in dbpedia_city_dict.values()])
query = f"""
SELECT DISTINCT ?city ?population ?cityname
WHERE {{
    ?city a dbo:City .
    ?city dbo:populationTotal ?population .
    ?city rdfs:label ?cityname .
    VALUES ?cityname{{{list_of_cities}}}
}}
"""
dbpedia_pop = dbpedia_query(query)

In [28]:
list_of_cities

'"Philadelphia"@en "Tucson, Arizona"@en "Tampa, Florida"@en "Indianapolis"@en "Nashville, Tennessee"@en "New Orleans"@en "Reno, Nevada"@en "Edmonton"@en "St. Louis"@en "Santa Barbara, California"@en'

### compare WikiData and DBpedia

In [29]:
wikidata_pop[['city.value','cityLabel.value', 'population.value']].sort_values(by='population.value', ascending=False)

Unnamed: 0,city.value,cityLabel.value,population.value
3,http://www.wikidata.org/entity/Q6346,Indianapolis,887642
0,http://www.wikidata.org/entity/Q159288,Santa Barbara,88665
5,http://www.wikidata.org/entity/Q23197,Nashville,684410
4,http://www.wikidata.org/entity/Q18575,Tucson,542629
9,http://www.wikidata.org/entity/Q49255,Tampa,384959
6,http://www.wikidata.org/entity/Q34404,New Orleans,383997
7,http://www.wikidata.org/entity/Q38022,St. Louis,301578
8,http://www.wikidata.org/entity/Q49225,Reno,264165
1,http://www.wikidata.org/entity/Q1345,Philadelphia,1603797
2,http://www.wikidata.org/entity/Q2096,Edmonton,1010899


In [30]:
dbpedia_pop[['city.value','cityname.value', 'population.value']].sort_values(by='population.value', ascending=False)

Unnamed: 0,city.value,cityname.value,population.value
9,http://dbpedia.org/resource/Indianapolis,Indianapolis,887642
0,"http://dbpedia.org/resource/Santa_Barbara,_Cal...","Santa Barbara, California",88665
3,"http://dbpedia.org/resource/Nashville,_Tennessee","Nashville, Tennessee",715884
4,"http://dbpedia.org/resource/Tucson,_Arizona","Tucson, Arizona",542629
1,"http://dbpedia.org/resource/Tampa,_Florida","Tampa, Florida",384959
5,http://dbpedia.org/resource/New_Orleans,New Orleans,383997
8,http://dbpedia.org/resource/St._Louis,St. Louis,301578
2,"http://dbpedia.org/resource/Reno,_Nevada","Reno, Nevada",264165
6,http://dbpedia.org/resource/Philadelphia,Philadelphia,1603797
7,http://dbpedia.org/resource/Edmonton,Edmonton,1010899


## CQ 2: How many of 10 random cities from the yelp dataset are in both Wikidata and DBpedia?
### How many of these cities have a population?

### Wikidata

In [31]:
city_sample = biz[['city','state']].drop_duplicates().sample(10, random_state=42)
city_sample

Unnamed: 0,city,state
90691,Edgemont,PA
699,Safety Harbor,FL
95702,Folsom,NJ
136,Land O Lakes,FL
955,Fort Washington,PA
6538,Avondale,LA
4226,Willingboro,NJ
1145,Glen Carbon,IL
943,Mount Laurel,NJ
90,Plainfield,IN


In [32]:
sample_dict = city_sample.set_index('city')['state'].to_dict()

In [33]:
state_update = {'PA': 'Pennsylvania',
'FL': 'Florida',
'NJ': 'New Jersey',
'LA': 'Louisiana',
'IL': 'Illinois',
'IN': 'Indiana'}

sample_dict_updated = {}
# Iterate through the key-value pairs in the first dictionary
for key, value in sample_dict.items():
    # Update the value in the first dictionary with the value from the second dictionary
    sample_dict_updated[key] = state_update[value]
sample_dict_updated["Land O' Lakes"] = sample_dict_updated.pop('Land O Lakes')

sample_dict_updated

{'Edgemont': 'Pennsylvania',
 'Safety Harbor': 'Florida',
 'Folsom': 'New Jersey',
 'Fort Washington': 'Pennsylvania',
 'Avondale': 'Louisiana',
 'Willingboro': 'New Jersey',
 'Glen Carbon': 'Illinois',
 'Mount Laurel': 'New Jersey',
 'Plainfield': 'Indiana',
 "Land O' Lakes": 'Florida'}

In [34]:
df = pd.DataFrame()
for key, value in sample_dict_updated.items():
    sparql_query = f"""
    SELECT DISTINCT ?city ?cityLabel ?state ?stateLabel
    WHERE{{
    FILTER(STRSTARTS(?cityLabel, "{key}"@en)).
    VALUES ?stateLabel {{"{value}"@en}}
    ?city rdfs:label ?cityLabel.
    ?city wdt:P131/wdt:P131 | wdt:P131 ?state .
    ?city wdt:P31/wdt:P279* wd:Q486972 .
    ?state rdfs:label ?stateLabel .
    SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }} 
    }}
    LIMIT 1
    """
    df2 = wikidata_query(sparql_query=sparql_query)
    df = pd.concat([df, df2], ignore_index=True)
df[['city.value', 'cityLabel.value', 'state.value', 'stateLabel.value']]

Unnamed: 0,city.value,cityLabel.value,state.value,stateLabel.value
0,http://www.wikidata.org/entity/Q5337787,Edgemont,http://www.wikidata.org/entity/Q1400,Pennsylvania
1,http://www.wikidata.org/entity/Q952992,Safety Harbor,http://www.wikidata.org/entity/Q812,Florida
2,http://www.wikidata.org/entity/Q1083022,Folsom,http://www.wikidata.org/entity/Q1408,New Jersey
3,http://www.wikidata.org/entity/Q1133576,Fort Washington,http://www.wikidata.org/entity/Q1400,Pennsylvania
4,http://www.wikidata.org/entity/Q1994608,Avondale,http://www.wikidata.org/entity/Q1588,Louisiana
5,http://www.wikidata.org/entity/Q1072819,"Willingboro Township, New Jersey",http://www.wikidata.org/entity/Q1408,New Jersey
6,http://www.wikidata.org/entity/Q1375820,Glen Carbon,http://www.wikidata.org/entity/Q1204,Illinois
7,http://www.wikidata.org/entity/Q1072657,Mount Laurel,http://www.wikidata.org/entity/Q1408,New Jersey
8,http://www.wikidata.org/entity/Q986631,Plainfield,http://www.wikidata.org/entity/Q1415,Indiana
9,http://www.wikidata.org/entity/Q2375799,Land O' Lakes,http://www.wikidata.org/entity/Q812,Florida


In [35]:
list_city_qid = " ".join(df['city.value'].apply(lambda x: 'wd:' + x.split('/')[-1]).to_list())
list_city_qid

'wd:Q5337787 wd:Q952992 wd:Q1083022 wd:Q1133576 wd:Q1994608 wd:Q1072819 wd:Q1375820 wd:Q1072657 wd:Q986631 wd:Q2375799'

In [36]:
sparql_query = f"""
SELECT ?city ?population ?cityLabel 
WHERE {{
  ?city p:P1082 ?statement .
  ?statement ps:P1082 ?population .
  ?statement pq:P585 ?date .
  FILTER NOT EXISTS {{
    ?city p:P1082/pq:P585 ?date2 .
    FILTER(?date2 > ?date)
  }}
  VALUES ?city {{{list_city_qid}}} .
  SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" . }}
}}
"""

wikidata_query(sparql_query=sparql_query)[['city.value','cityLabel.value', 'population.value']]

Unnamed: 0,city.value,cityLabel.value,population.value
0,http://www.wikidata.org/entity/Q952992,Safety Harbor,17072
1,http://www.wikidata.org/entity/Q986631,Plainfield,34625
2,http://www.wikidata.org/entity/Q1072657,Mount Laurel,44633
3,http://www.wikidata.org/entity/Q1072819,Willingboro Township,31889
4,http://www.wikidata.org/entity/Q1083022,Folsom,1811
5,http://www.wikidata.org/entity/Q1133576,Fort Washington,5910
6,http://www.wikidata.org/entity/Q1375820,Glen Carbon,13842
7,http://www.wikidata.org/entity/Q1994608,Avondale,4582
8,http://www.wikidata.org/entity/Q2375799,Land O' Lakes,35929


## DBPedia

In [37]:

df = pd.DataFrame()
for key, value in sample_dict_updated.items():
    sparql_query = f"""
    SELECT ?val ?city ?cityName ?county ?state ?stateName
    WHERE {{
    VALUES ?val {{"{key}"}}
    VALUES ?val2 {{"{value}"}}
    ?city a dbo:City ;
            dbp:name ?cityName  .
            ?county dbo:county ?city .
    ?county dbo:state ?state .
    ?state dbp:name ?stateName .
    FILTER(contains(str(?cityName), ?val) && contains(str(?stateName), ?val2)).
    }}

    """
    df2 = dbpedia_query(sparql_query=sparql_query)
    df = pd.concat([df, df2], ignore_index=True)
df

## CQ 3: How many cities are in the 10 most prevelent states/provinces in Yelp?

### Wikidata

In [38]:
yelp_state = biz['state'].value_counts().head(10)
yelp_state

PA    34039
FL    26330
TN    12056
IN    11247
MO    10913
LA     9924
AZ     9912
NJ     8536
NV     7715
AB     5573
Name: state, dtype: int64

In [39]:
wd_state_dict = {
'PA': 'wd:Q1400',
'FL': 'wd:Q812',
'TN': 'wd:Q1509',
'IN': 'wd:Q1415',
'MO': 'wd:Q1581',
'LA': 'wd:Q1588',
'AZ': 'wd:Q816',
'NJ': 'wd:Q1408',
'NV': 'wd:Q1227',
'AB': 'wd:Q1951',
}

In [40]:
wd_state_list = " ".join([v for v in wd_state_dict.values()])
sparql_query = f"""
SELECT DISTINCT ?stateLabel ?numCities
WHERE {{
VALUES ?state {{{wd_state_list}}}
{{?state wdt:P31 wd:Q35657 .}}
UNION
{{?state wdt:P31 wd:Q11828004 .}}

{{ SELECT ?state (COUNT(DISTINCT ?city) as ?numCities)
WHERE
{{
    {{?city wdt:P31/wdt:P279* wd:Q1093829.}}
    UNION
    {{?city wdt:P31/wdt:P279* wd:Q515.}}
    UNION
    {{?city wdt:P31/wdt:P279* wd:Q15127012.}}
    ?city wdt:P131/wdt:P131 ?state .
}}
GROUP BY ?state
}}
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
}}
"""

wikidata_query(sparql_query=sparql_query)[['stateLabel.value', 'numCities.value']]

Unnamed: 0,stateLabel.value,numCities.value
0,Alberta,4
1,Florida,387
2,Nevada,20
3,Tennessee,324
4,Missouri,645
5,Indiana,491
6,Arizona,88
7,Louisiana,111
8,New Jersey,92
9,Pennsylvania,58


### DBpedia

In [41]:
# Cities of list entities in DBpedia from https://dbpedia.org/page/Template:Cities_in_the_United_States
sparql_query = """
SELECT (COUNT(DISTINCT ?city) AS ?count) ?var
WHERE {
    ?city a dbo:City .
    ?city dct:subject ?location .
    ?location rdfs:label ?var
    VALUES ?var {
"Cities in Florida"@en  
"Cities in Nevada"@en
"Cities in Tennessee"@en
"Cities in Missouri"@en
"Cities in Indiana"@en
"Cities in Arizona"@en
"Cities in Louisiana"@en
"Cities in New Jersey"@en
"Cities in Pennsylvania"@en
"Cities in Alberta"@en
}
}
"""
dbpedia_query(sparql_query=sparql_query)[['count.value', 'var.value']]

Unnamed: 0,count.value,var.value
0,57,Cities in Pennsylvania
1,267,Cities in Florida
2,19,Cities in Alberta
3,118,Cities in Indiana
4,20,Cities in Nevada
5,181,Cities in Tennessee
6,671,Cities in Missouri
7,70,Cities in Louisiana
8,47,Cities in Arizona


## CQ 4: What drinks exist that are beverages?
## TODO: check more categories

In [49]:
category_occurences = pd.read_csv(get_path('category_occurences.csv'))

In [53]:
category_occurences.head(20)

Unnamed: 0,category,occurences,split_category,SchemaType,schema_or_yelp_category,qid,qid_label
0,Restaurants,52268,Restaurant,Restaurant,Restaurant,Q11707,restaurant
1,Food,27781,Food,,Food,Q2095,food
2,Shopping,24395,Shopping,Retail,Retail,Q126793,retail
3,Home Services,14356,HomeService,Service,Service,Q44127,server
4,Beauty & Spas,14292,Beauty,DaySpa,DaySpa,,
5,Beauty & Spas,14292,Spa,DaySpa,DaySpa,,
6,Nightlife,12281,Nightlife,NightClub,NightClub,Q622425,nightclub
7,Health & Medical,11890,Health,Physician,Physician,Q33749,Ivan Alev
8,Health & Medical,11890,Medical,Physician,Physician,Q33749,Ivan Alev
9,Local Services,11198,LocalService,LocalBusiness,LocalBusiness,,


- Q40050: Drink
- Q177: Pizza
- Q11707: Restaurant
- Q62849941: LocalBusiness
- Q786803: Car dealership

In [59]:
sparql_query = """
SELECT ?value ?valueLabel (COUNT (DISTINCT ?category) AS ?count)
    WHERE {
        VALUES ?value {wd:Q40050 wd:Q177 wd:Q11707 wd:Q4830453 wd:Q786803}
        ?category wdt:P279+ ?value .
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
    }
    GROUP BY ?value ?valueLabel
"""
wikidata_query(sparql_query=sparql_query)[['value.value', 'valueLabel.value', 'count.value']]

Unnamed: 0,value.value,valueLabel.value,count.value
0,http://www.wikidata.org/entity/Q11707,restaurant,234
1,http://www.wikidata.org/entity/Q177,pizza,60
2,http://www.wikidata.org/entity/Q40050,drink,6662
3,http://www.wikidata.org/entity/Q786803,car dealership,1
4,http://www.wikidata.org/entity/Q4830453,business,3518


In [58]:
sparql_query = """
SELECT ?value (COUNT (DISTINCT ?category) AS ?count)
WHERE {
    VALUES ?value {dbo:Beverage dbo:Car_dealership dbo:Business dbo:Restaurant dbo:Pizza}
    ?category rdf:type ?value
}
GROUP BY ?value
"""
dbpedia_query(sparql_query=sparql_query)[['value.value', 'count.value']]

Unnamed: 0,value.value,count.value
0,http://dbpedia.org/ontology/Beverage,1884
1,http://dbpedia.org/ontology/Restaurant,2838
