In [80]:
import json
import requests

import pandas as pd

from Code.UtilityFunctions.wikidata_functions import wikidata_query
from Code.UtilityFunctions.get_data_path import get_path

In [4]:
# Lookup from the short codes found in the Yelp `state` column to full names.
# The normalisation cell below indexes this dict directly (`states[x]`), so a code
# that is missing here raises KeyError instead of passing through silently.
states = {
    'AK': 'Alaska',
    'AL': 'Alabama',
    'AR': 'Arkansas',
    'AS': 'American Samoa',
    'AZ': 'Arizona',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DC': 'District of Columbia',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'GU': 'Guam',
    'HI': 'Hawaii',
    'IA': 'Iowa',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'MA': 'Massachusetts',
    'MD': 'Maryland',
    'ME': 'Maine',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MO': 'Missouri',
    'MP': 'Northern Mariana Islands',
    'MS': 'Mississippi',
    'MT': 'Montana',
    'NA': 'National',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NV': 'Nevada',
    'NY': 'New York',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto Rico',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VA': 'Virginia',
    'VI': 'Virgin Islands',
    'VT': 'Vermont',
    'WA': 'Washington',
    'WI': 'Wisconsin',
    'WV': 'West Virginia',
    'WY': 'Wyoming',
    # Canadian province code (not a US state)
    'AB': 'Alberta',
    # Non-standard three-letter code, deliberately mapped to Missouri
    # NOTE(review): presumably a data-entry error in the source data — confirm
    'XMS': 'Missouri',
}

In [81]:
# Wikidata class QIDs keyed by administrative level. The SPARQL builders below
# interpolate these into `wdt:P31/wdt:P279*` (instance-of / subclass-of) filters.
q_codes = {
    "county": "Q28575",
    "state": "Q7275",
    "province": "Q11828004",
    "country": "Q6256"
}

In [82]:
# Load the Yelp business dataset; `lines=True` reads it as one JSON object per line.
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

In [59]:
# Keep the raw Yelp values so the resolved locations can be joined back onto them later.
biz["city_og"], biz["state_og"] = biz["city"], biz["state"]
# Cleaned city: everything before the first comma.
biz["city"] = biz["city"].map(lambda raw_city: raw_city.partition(",")[0])
# Full state name; a code missing from `states` raises KeyError.
biz["state"] = biz["state"].map(states.__getitem__)

In [44]:
# Distinct (cleaned city, full state name) pairs together with their raw Yelp values;
# used further down to map the resolved locations back onto the raw `city`/`state` columns.
city_state_keys = biz[["city", "state", "city_og", "state_og"]].drop_duplicates()

In [8]:
# One row per (city, state) holding the mean coordinates of its businesses.
df = (
    biz.groupby(["city", "state"])[["latitude", "longitude"]]
    .mean()
    .reset_index()
)
# "longitude,latitude" rounded to two decimals — the order used inside the WKT Point(...) literal.
df["location"] = (
    df["longitude"].round(2).astype(str)
    .str.cat(df["latitude"].round(2).astype(str), sep=",")
)

In [9]:
# "<city>, <state>" with spaces percent-encoded, ready to be appended to the search URL.
# Columns are addressed by label: positional `row[0]` on a string-labelled Series is
# deprecated in pandas 2.x and would silently pick the wrong column if the order changed.
df["search_string"] = (df["city"] + ", " + df["state"]).str.replace(" ", "%20", regex=False)

In [10]:
def return_city_q_ids(search_string):
    """Search Wikidata for a "city, state" string and return the candidate entity IDs.

    The full string is tried first. If that yields no hits, the search is repeated
    with only the part before the first comma (the city name on its own).

    Args:
        search_string: Search text, already URL-encoded by the caller.

    Returns:
        The matching IDs in API order, each prefixed with ``wd:`` and joined by single
        spaces (ready for a SPARQL ``VALUES`` clause); an empty string if nothing matched.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer in time.
    """

    def _search(term):
        # One wbsearchentities round trip; returns the hit IDs in the order the API gave them.
        url = f"https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json&language=en&type=item&continue=0&search={term}"
        # This runs once per row, so a stalled connection must not hang the whole pass.
        response = requests.get(url, timeout=30)
        # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
        response.raise_for_status()
        return [hit["id"] for hit in response.json()["search"]]

    q_ids = _search(search_string)

    if not q_ids:  # Empty – no result given
        q_ids = _search(search_string.partition(',')[0])

    return " ".join("wd:" + qid for qid in q_ids)

In [11]:
# One Wikidata search per row (two when the first returns nothing) — network-bound.
df["city_q_ids"] = df["search_string"].apply(return_city_q_ids)

In [12]:
def return_state_q_ids(search_string):
    """Search Wikidata for a state name and return the candidate entity IDs.

    Args:
        search_string: Text to search for (the full state name).

    Returns:
        The matching IDs in API order, each prefixed with ``wd:`` and joined by single
        spaces; an empty string if nothing matched.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer in time.
    """
    url = f"https://www.wikidata.org/w/api.php?action=wbsearchentities&format=json&language=en&type=item&continue=0&search={search_string}"

    # A timeout keeps one stalled connection from hanging the whole lookup pass.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = json.loads(response.text)

    return " ".join("wd:" + hit["id"] for hit in data["search"])

In [13]:
# Many rows share the same state name, so search each distinct name once
# and broadcast the result instead of issuing one HTTP request per row.
state_q_ids_by_name = {name: return_state_q_ids(name) for name in df["state"].unique()}
df["state_q_ids"] = df["state"].map(state_q_ids_by_name)

In [14]:
df

Unnamed: 0,city,state,latitude,longitude,location,search_string,city_q_ids,state_q_ids
0,AB Edmonton,Alberta,53.541407,-113.491451,"-113.49,53.54","AB%20Edmonton,%20Alberta",,wd:Q1951 wd:Q687536 wd:Q1965138 wd:Q4063377 wd...
1,AMBLER,Pennsylvania,40.153880,-75.223794,"-75.22,40.15","AMBLER,%20Pennsylvania",wd:Q372248,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...
2,ARDMORE,Pennsylvania,40.004962,-75.285838,"-75.29,40.0","ARDMORE,%20Pennsylvania",wd:Q1132213,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...
3,AVON,Indiana,39.764524,-86.380229,"-86.38,39.76","AVON,%20Indiana",wd:Q791269,wd:Q1415 wd:Q1184769 wd:Q6023245 wd:Q141305 wd...
4,Abington,Pennsylvania,40.124513,-75.123322,"-75.12,40.12","Abington,%20Pennsylvania",wd:Q88491559,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...
...,...,...,...,...,...,...,...,...
1440,wilmington,Delaware,39.737432,-75.554725,"-75.55,39.74","wilmington,%20Delaware",wd:Q174224 wd:Q64156483 wd:Q64156489 wd:Q85815841,wd:Q1393 wd:Q986183 wd:Q82048 wd:Q622910 wd:Q2...
1441,wimauma,Florida,27.712212,-82.298825,"-82.3,27.71","wimauma,%20Florida",wd:Q1843074,wd:Q812 wd:Q842472 wd:Q643617 wd:Q2255913 wd:Q...
1442,​Clayton,Missouri,38.649840,-90.336491,"-90.34,38.65","​Clayton,%20Missouri",wd:Q966922,wd:Q1581 wd:Q5419 wd:Q1939219 wd:Q111823126 wd...
1443,​Largo,Florida,27.918354,-82.760554,"-82.76,27.92","​Largo,%20Florida",wd:Q932577,wd:Q812 wd:Q842472 wd:Q643617 wd:Q2255913 wd:Q...


## Cities
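Resolve each Yelp city to a single Wikidata entity: among the search candidates, keep the human settlement closest to the mean coordinates of that city's businesses.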

In [15]:
def city_query(q_ids, location, radius_km=100):
    """Build the SPARQL query that picks the candidate settlement nearest to a point.

    Args:
        q_ids: Space-separated, ``wd:``-prefixed candidate IDs for the ``VALUES`` clause.
        location: Centre point as ``"<longitude>,<latitude>"``, inserted into ``Point(...)``.
        radius_km: Search radius passed to ``wikibase:around``. Defaults to 100, the
            value that was previously hard-coded.

    Returns:
        The query text. It keeps only candidates that are instances of (a subclass of)
        human settlement (Q486972) with coordinates inside the radius, and returns the
        single closest one.
    """
    query = f"""
    SELECT DISTINCT ?qid ?qidLabel
    WHERE {{
        VALUES ?qid {{{q_ids}}}
        {{?qid wdt:P31/wdt:P279* wd:Q486972.}} # Human Settlement
        SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}

        SERVICE wikibase:around {{
            ?qid wdt:P625 ?location .
            bd:serviceParam wikibase:center "Point({location})"^^geo:wktLiteral   .
            bd:serviceParam wikibase:radius "{radius_km}" .
            bd:serviceParam wikibase:distance ?distance .}} .
    }}
    ORDER BY ?distance
    LIMIT 1
    """
    return query

In [16]:
def qid_city(q_ids: str, location: str):
    """Resolve a row's city candidates to one ``(qid, label)`` pair.

    Args:
        q_ids: Space-separated ``wd:``-prefixed candidate IDs; may be empty.
        location: ``"<longitude>,<latitude>"`` centre point for the proximity search.

    Returns:
        ``(qid, label)`` taken from the first result row, or ``(None, None)`` when
        there are no candidates or the query returns nothing.
    """
    if not q_ids:
        return None, None

    result = wikidata_query(city_query(q_ids, location))
    if result.empty:
        return None, None

    # The query service returns full entity URIs; keep only the bare QID.
    entity_ids = result["qid.value"].str.removeprefix("http://www.wikidata.org/entity/")
    return entity_ids.values[0], result["qidLabel.value"].values[0]

In [17]:
# Resolve every row to a single (QID, label); `result_type='expand'` spreads the
# returned tuple across the two new columns.
df[["city_qid", "city_label"]] = df.apply(lambda x: qid_city(x["city_q_ids"], x["location"]), result_type='expand', axis=1)

In [18]:
df

Unnamed: 0,city,state,latitude,longitude,location,search_string,city_q_ids,state_q_ids,city_qid,city_label
0,AB Edmonton,Alberta,53.541407,-113.491451,"-113.49,53.54","AB%20Edmonton,%20Alberta",,wd:Q1951 wd:Q687536 wd:Q1965138 wd:Q4063377 wd...,,
1,AMBLER,Pennsylvania,40.153880,-75.223794,"-75.22,40.15","AMBLER,%20Pennsylvania",wd:Q372248,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,Q372248,Ambler
2,ARDMORE,Pennsylvania,40.004962,-75.285838,"-75.29,40.0","ARDMORE,%20Pennsylvania",wd:Q1132213,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,Q1132213,Ardmore
3,AVON,Indiana,39.764524,-86.380229,"-86.38,39.76","AVON,%20Indiana",wd:Q791269,wd:Q1415 wd:Q1184769 wd:Q6023245 wd:Q141305 wd...,Q791269,Avon
4,Abington,Pennsylvania,40.124513,-75.123322,"-75.12,40.12","Abington,%20Pennsylvania",wd:Q88491559,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,,
...,...,...,...,...,...,...,...,...,...,...
1440,wilmington,Delaware,39.737432,-75.554725,"-75.55,39.74","wilmington,%20Delaware",wd:Q174224 wd:Q64156483 wd:Q64156489 wd:Q85815841,wd:Q1393 wd:Q986183 wd:Q82048 wd:Q622910 wd:Q2...,Q174224,Wilmington
1441,wimauma,Florida,27.712212,-82.298825,"-82.3,27.71","wimauma,%20Florida",wd:Q1843074,wd:Q812 wd:Q842472 wd:Q643617 wd:Q2255913 wd:Q...,Q1843074,Wimauma
1442,​Clayton,Missouri,38.649840,-90.336491,"-90.34,38.65","​Clayton,%20Missouri",wd:Q966922,wd:Q1581 wd:Q5419 wd:Q1939219 wd:Q111823126 wd...,Q966922,Clayton
1443,​Largo,Florida,27.918354,-82.760554,"-82.76,27.92","​Largo,%20Florida",wd:Q932577,wd:Q812 wd:Q842472 wd:Q643617 wd:Q2255913 wd:Q...,Q932577,Largo


## States
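Resolve each Yelp state name to a Wikidata entity: keep the first search candidate that Wikidata classifies as a state or province.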

In [19]:
def state_query(q_ids: str) -> str:
    """Build the SPARQL query that finds state/province entities for a set of candidates.

    Starting from every candidate in ``q_ids``, the query follows ``wdt:P131*`` (zero
    or more "located in" hops, so the candidate itself is included) and keeps entities
    typed as ``q_codes["state"]`` or ``q_codes["province"]``, with a filter against
    entities that are also typed as ``q_codes["country"]``.

    Args:
        q_ids: Space-separated ``wd:``-prefixed IDs for the ``VALUES`` clause.

    Returns:
        The query text, selecting ``?qid`` and ``?qidLabel``.
    """
    query = f"""
    SELECT ?qid ?qidLabel
    WHERE
    {{
      VALUES ?Q {{{q_ids}}}

      ?Q wdt:P131* ?qid .
      {{?qid wdt:P31/wdt:P279* wd:{q_codes["state"]}.}}
      UNION
      {{?qid wdt:P31/wdt:P279* wd:{q_codes["province"]}.}}

      FILTER NOT EXISTS {{
        ?qid wdt:P31/wdt:P279* wd:{q_codes["country"]}.
      }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}
    }}
    """
    return query

In [24]:
def qid_state(row: pd.Series):
    """Pick the state/province entity for one row of the location frame.

    The row's search candidates keep the order the search API returned them in.
    The first candidate that the state query also returns wins, so the best-ranked
    search hit that really is a state or province is chosen.

    Args:
        row: A row providing ``state_q_ids`` — space-separated ``wd:``-prefixed IDs.

    Returns:
        ``(qid, label)`` of the chosen entity, or ``(None, None)`` when the row has no
        candidates or none of them is returned by the state query.
    """
    entity_prefix = "http://www.wikidata.org/entity/"

    candidates = [token.removeprefix("wd:") for token in row["state_q_ids"].split()]
    if not candidates:
        # Nothing to resolve — do not send a query with an empty VALUES clause.
        return None, None

    returned_table = wikidata_query(state_query(row["state_q_ids"]))
    if returned_table.empty:
        return None, None

    returned_qids = returned_table["qid.value"].str.removeprefix(entity_prefix)
    for candidate in candidates:
        labels = returned_table.loc[returned_qids == candidate, "qidLabel.value"]
        if not labels.empty:
            return candidate, labels.iloc[0]

    return None, None

In [25]:
# One SPARQL query per row; `result_type='expand'` spreads the (QID, label) tuple
# across the two new columns.
df[["state_qid", "state_label"]] = df.apply(qid_state, result_type='expand', axis=1)

In [26]:
df

Unnamed: 0,city,state,latitude,longitude,location,search_string,city_q_ids,state_q_ids,city_qid,city_label,state_qid,state_label
0,AB Edmonton,Alberta,53.541407,-113.491451,"-113.49,53.54","AB%20Edmonton,%20Alberta",,wd:Q1951 wd:Q687536 wd:Q1965138 wd:Q4063377 wd...,,,Q1951,Alberta
1,AMBLER,Pennsylvania,40.153880,-75.223794,"-75.22,40.15","AMBLER,%20Pennsylvania",wd:Q372248,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,Q372248,Ambler,Q1400,Pennsylvania
2,ARDMORE,Pennsylvania,40.004962,-75.285838,"-75.29,40.0","ARDMORE,%20Pennsylvania",wd:Q1132213,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,Q1132213,Ardmore,Q1400,Pennsylvania
3,AVON,Indiana,39.764524,-86.380229,"-86.38,39.76","AVON,%20Indiana",wd:Q791269,wd:Q1415 wd:Q1184769 wd:Q6023245 wd:Q141305 wd...,Q791269,Avon,Q1415,Indiana
4,Abington,Pennsylvania,40.124513,-75.123322,"-75.12,40.12","Abington,%20Pennsylvania",wd:Q88491559,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,,,Q1400,Pennsylvania
...,...,...,...,...,...,...,...,...,...,...,...,...
1440,wilmington,Delaware,39.737432,-75.554725,"-75.55,39.74","wilmington,%20Delaware",wd:Q174224 wd:Q64156483 wd:Q64156489 wd:Q85815841,wd:Q1393 wd:Q986183 wd:Q82048 wd:Q622910 wd:Q2...,Q174224,Wilmington,Q1393,Delaware
1441,wimauma,Florida,27.712212,-82.298825,"-82.3,27.71","wimauma,%20Florida",wd:Q1843074,wd:Q812 wd:Q842472 wd:Q643617 wd:Q2255913 wd:Q...,Q1843074,Wimauma,Q812,Florida
1442,​Clayton,Missouri,38.649840,-90.336491,"-90.34,38.65","​Clayton,%20Missouri",wd:Q966922,wd:Q1581 wd:Q5419 wd:Q1939219 wd:Q111823126 wd...,Q966922,Clayton,Q1581,Missouri
1443,​Largo,Florida,27.918354,-82.760554,"-82.76,27.92","​Largo,%20Florida",wd:Q932577,wd:Q812 wd:Q842472 wd:Q643617 wd:Q2255913 wd:Q...,Q932577,Largo,Q812,Florida


## Counties
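Derived from the resolved cities: for each city entity, follow "located in" (P131) upwards to the county that contains it.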

In [28]:
# Distinct resolved city QIDs, so each city is looked up only once.
# `unique()` keeps the missing value too when some cities were not resolved.
unique_cities = pd.Series(df["city_qid"].unique())


def county_query(q_id: str) -> str:
    """Build the SPARQL query that finds the county containing an entity.

    The query follows ``wdt:P131*`` from ``wd:<q_id>`` and keeps entities typed as
    ``q_codes["county"]``, excluding any that are also typed as a state, a country,
    or a consolidated city-county (Q3301053).

    Args:
        q_id: Bare Wikidata ID (no ``wd:`` prefix) of the starting entity.

    Returns:
        The query text, selecting ``?qid`` and ``?qidLabel``.
    """
    query = f"""
    SELECT ?qid ?qidLabel
    WHERE
    {{
      wd:{q_id} wdt:P131* ?qid .
      ?qid wdt:P31/wdt:P279* wd:{q_codes["county"]}.

      FILTER NOT EXISTS {{
        ?qid wdt:P31/wdt:P279* wd:{q_codes["state"]}.
      }}
      FILTER NOT EXISTS {{
        ?qid wdt:P31/wdt:P279* wd:{q_codes["country"]}.
      }}
      FILTER NOT EXISTS {{
        ?qid wdt:P31/wdt:P279* wd:Q3301053. # consolidated city-county
      }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}

    }}
    """
    return query


def qid_return(q_id):
    """Look up the county of a city entity.

    Args:
        q_id: Bare Wikidata ID of the city, or a missing value (None / NaN).

    Returns:
        ``(county_qid, county_label)`` from the first result row, or ``(None, None)``
        when ``q_id`` is missing or the query returns nothing.
    """
    # Unresolved cities arrive as None/NaN; formatting them into the query would
    # ask about the non-existent entity `wd:None`, so skip the round trip.
    if not isinstance(q_id, str) or not q_id:
        return None, None

    returned_qid = wikidata_query(county_query(q_id))
    if returned_qid.empty:
        return None, None

    # Positional access: do not rely on the result frame's index containing label 0.
    first_match = returned_qid.iloc[0]
    return (first_match["qid.value"].removeprefix("http://www.wikidata.org/entity/"),
            first_match["qidLabel.value"])


# Query each distinct city once, then split the (qid, label) pairs into two parallel tuples.
county_qids, county_labels = zip(*unique_cities.apply(qid_return))

In [29]:
# Attach the county columns to every row through its resolved city QID.
df = pd.merge(
    df,
    pd.DataFrame({
        "city_qid": unique_cities,
        "county_qid": county_qids,
        "county_label": county_labels,
    }),
    how="left",
    on="city_qid",
)

## Countries
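Derived from the resolved states: for each state entity, follow "located in" (P131) upwards to the country that contains it.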

In [30]:
# Distinct resolved state QIDs, so each state is looked up only once.
unique_states = pd.Series(df["state_qid"].unique())


def country_query(q_id: str) -> str:
    """Build the SPARQL query that finds the country containing an entity.

    The query follows ``wdt:P131*`` from ``wd:<q_id>`` and keeps entities typed as
    (a subclass of) ``q_codes["country"]``.

    Args:
        q_id: Bare Wikidata ID (no ``wd:`` prefix) of the starting entity.

    Returns:
        The query text, selecting ``?qid`` and ``?qidLabel``.
    """
    query = f"""
    SELECT ?qid ?qidLabel
    WHERE
    {{
      wd:{q_id} wdt:P131* ?qid .
      ?qid wdt:P31/wdt:P279* wd:{q_codes["country"]}.

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }}
    }}
    """
    return query


def qid_return(q_id):
    """Look up the country of a state entity.

    NOTE(review): this redefines the ``qid_return`` declared in the Counties section;
    from this cell on the name refers to the country lookup. Consider distinct names.

    Args:
        q_id: Bare Wikidata ID of the state, or a missing value (None / NaN).

    Returns:
        ``(country_qid, country_label)`` from the first result row, or ``(None, None)``
        when ``q_id`` is missing or the query returns nothing.
    """
    # A missing state would otherwise be formatted into the query as `wd:None`.
    if not isinstance(q_id, str) or not q_id:
        return None, None

    returned_qid = wikidata_query(country_query(q_id))
    if returned_qid.empty:
        return None, None

    # Positional access: do not rely on the result frame's index containing label 0.
    first_match = returned_qid.iloc[0]
    return (first_match["qid.value"].removeprefix("http://www.wikidata.org/entity/"),
            first_match["qidLabel.value"])


# Query each distinct state once, then split the (qid, label) pairs into two parallel tuples.
country_qids, country_labels = zip(*unique_states.apply(qid_return))

In [31]:
# Attach the country columns to every row through its resolved state QID.
df = pd.merge(
    df,
    pd.DataFrame({
        "state_qid": unique_states,
        "country_qid": country_qids,
        "country_label": country_labels,
    }),
    how="left",
    on="state_qid",
)

In [32]:
df

Unnamed: 0,city,state,latitude,longitude,location,search_string,city_q_ids,state_q_ids,city_qid,city_label,state_qid,state_label,county_qid,county_label,country_qid,country_label
0,AB Edmonton,Alberta,53.541407,-113.491451,"-113.49,53.54","AB%20Edmonton,%20Alberta",,wd:Q1951 wd:Q687536 wd:Q1965138 wd:Q4063377 wd...,,,Q1951,Alberta,,,Q16,Canada
1,AMBLER,Pennsylvania,40.153880,-75.223794,"-75.22,40.15","AMBLER,%20Pennsylvania",wd:Q372248,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,Q372248,Ambler,Q1400,Pennsylvania,Q378527,Montgomery County,Q30,United States of America
2,ARDMORE,Pennsylvania,40.004962,-75.285838,"-75.29,40.0","ARDMORE,%20Pennsylvania",wd:Q1132213,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,Q1132213,Ardmore,Q1400,Pennsylvania,Q378527,Montgomery County,Q30,United States of America
3,AVON,Indiana,39.764524,-86.380229,"-86.38,39.76","AVON,%20Indiana",wd:Q791269,wd:Q1415 wd:Q1184769 wd:Q6023245 wd:Q141305 wd...,Q791269,Avon,Q1415,Indiana,Q506489,Hendricks County,Q30,United States of America
4,Abington,Pennsylvania,40.124513,-75.123322,"-75.12,40.12","Abington,%20Pennsylvania",wd:Q88491559,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,,,Q1400,Pennsylvania,,,Q30,United States of America
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1440,wilmington,Delaware,39.737432,-75.554725,"-75.55,39.74","wilmington,%20Delaware",wd:Q174224 wd:Q64156483 wd:Q64156489 wd:Q85815841,wd:Q1393 wd:Q986183 wd:Q82048 wd:Q622910 wd:Q2...,Q174224,Wilmington,Q1393,Delaware,Q156156,New Castle County,Q30,United States of America
1441,wimauma,Florida,27.712212,-82.298825,"-82.3,27.71","wimauma,%20Florida",wd:Q1843074,wd:Q812 wd:Q842472 wd:Q643617 wd:Q2255913 wd:Q...,Q1843074,Wimauma,Q812,Florida,Q488874,Hillsborough County,Q30,United States of America
1442,​Clayton,Missouri,38.649840,-90.336491,"-90.34,38.65","​Clayton,%20Missouri",wd:Q966922,wd:Q1581 wd:Q5419 wd:Q1939219 wd:Q111823126 wd...,Q966922,Clayton,Q1581,Missouri,Q498034,St. Louis County,Q30,United States of America
1443,​Largo,Florida,27.918354,-82.760554,"-82.76,27.92","​Largo,%20Florida",wd:Q932577,wd:Q812 wd:Q842472 wd:Q643617 wd:Q2255913 wd:Q...,Q932577,Largo,Q812,Florida,Q494556,Pinellas County,Q30,United States of America


# Population

In [33]:
def city_population_query(city_qid: str):
    """Fetch the most recently dated population figure for a city from Wikidata.

    The query reads the entity's population (P1082) statements that carry a
    point-in-time (P585) qualifier and keeps the one for which no other population
    statement of the same entity has a later date.

    Args:
        city_qid: Bare Wikidata ID of the city, or a missing value (None / NaN).

    Returns:
        The population as ``int``, or ``None`` when the ID is missing, the entity has
        no dated population statement, or the lookup fails.
    """
    # Unresolved cities have no ID; there is nothing to ask Wikidata about.
    if not isinstance(city_qid, str) or not city_qid:
        return None

    query = f"""
        SELECT DISTINCT ?population
        WHERE {{
            ?city p:P1082 ?statement .
            VALUES ?city {{wd:{city_qid}}}
            ?statement ps:P1082 ?population .
            ?statement pq:P585 ?date .
            FILTER NOT EXISTS {{
                ?city p:P1082/pq:P585 ?date2 .
                FILTER(?date2 > ?date)
	        }}
        }}
        """
    try:
        result = wikidata_query(query)
        return int(result['population.value'][0])
    except Exception:
        # Deliberately best-effort: an empty result or a failed request becomes None so
        # one bad city does not abort the whole pass. `Exception` (not a bare `except`)
        # lets KeyboardInterrupt / SystemExit stop a long run.
        return None

In [34]:
# One SPARQL lookup per row; rows whose lookup yields nothing get None.
df["population"] = df["city_qid"].apply(city_population_query)

In [35]:
df

Unnamed: 0,city,state,latitude,longitude,location,search_string,city_q_ids,state_q_ids,city_qid,city_label,state_qid,state_label,county_qid,county_label,country_qid,country_label,population
0,AB Edmonton,Alberta,53.541407,-113.491451,"-113.49,53.54","AB%20Edmonton,%20Alberta",,wd:Q1951 wd:Q687536 wd:Q1965138 wd:Q4063377 wd...,,,Q1951,Alberta,,,Q16,Canada,
1,AMBLER,Pennsylvania,40.153880,-75.223794,"-75.22,40.15","AMBLER,%20Pennsylvania",wd:Q372248,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,Q372248,Ambler,Q1400,Pennsylvania,Q378527,Montgomery County,Q30,United States of America,6807.0
2,ARDMORE,Pennsylvania,40.004962,-75.285838,"-75.29,40.0","ARDMORE,%20Pennsylvania",wd:Q1132213,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,Q1132213,Ardmore,Q1400,Pennsylvania,Q378527,Montgomery County,Q30,United States of America,13566.0
3,AVON,Indiana,39.764524,-86.380229,"-86.38,39.76","AVON,%20Indiana",wd:Q791269,wd:Q1415 wd:Q1184769 wd:Q6023245 wd:Q141305 wd...,Q791269,Avon,Q1415,Indiana,Q506489,Hendricks County,Q30,United States of America,21474.0
4,Abington,Pennsylvania,40.124513,-75.123322,"-75.12,40.12","Abington,%20Pennsylvania",wd:Q88491559,wd:Q1400 wd:Q2475732 wd:Q49117 wd:Q3960056 wd:...,,,Q1400,Pennsylvania,,,Q30,United States of America,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1440,wilmington,Delaware,39.737432,-75.554725,"-75.55,39.74","wilmington,%20Delaware",wd:Q174224 wd:Q64156483 wd:Q64156489 wd:Q85815841,wd:Q1393 wd:Q986183 wd:Q82048 wd:Q622910 wd:Q2...,Q174224,Wilmington,Q1393,Delaware,Q156156,New Castle County,Q30,United States of America,70898.0
1441,wimauma,Florida,27.712212,-82.298825,"-82.3,27.71","wimauma,%20Florida",wd:Q1843074,wd:Q812 wd:Q842472 wd:Q643617 wd:Q2255913 wd:Q...,Q1843074,Wimauma,Q812,Florida,Q488874,Hillsborough County,Q30,United States of America,9467.0
1442,​Clayton,Missouri,38.649840,-90.336491,"-90.34,38.65","​Clayton,%20Missouri",wd:Q966922,wd:Q1581 wd:Q5419 wd:Q1939219 wd:Q111823126 wd...,Q966922,Clayton,Q1581,Missouri,Q498034,St. Louis County,Q30,United States of America,17355.0
1443,​Largo,Florida,27.918354,-82.760554,"-82.76,27.92","​Largo,%20Florida",wd:Q932577,wd:Q812 wd:Q842472 wd:Q643617 wd:Q2255913 wd:Q...,Q932577,Largo,Q812,Florida,Q494556,Pinellas County,Q30,United States of America,82485.0


In [None]:
# Re-key the resolved locations by the raw Yelp city/state values: join on the cleaned
# names, drop the working columns, then give the raw columns their original names back.
df = (
    city_state_keys.merge(df, how="left", on=["city", "state"])
    .drop(columns=["latitude", "longitude", "location", "search_string", "city_q_ids", "state_q_ids", "city", "state"])
    .rename(columns={"city_og": "city", "state_og": "state"})
)

In [52]:
# Persist the mapping without the index column; it is read back from this file in the next cell.
df.to_csv(path_or_buf=get_path('location_mappings_search_location.csv'), index=False)

In [83]:
# Reload the saved mapping; `df` and `wikidata_df` are two names for the same frame.
# After the CSV round trip, missing values come back as NaN rather than None.
df = wikidata_df = pd.read_csv(get_path("location_mappings_search_location.csv"))

In [60]:
# Discard the cleaned city/state columns and put the raw Yelp values back under their
# original names, so `biz` can be joined to the mapping on the raw values.
# Mutates `biz` in place: re-running this cell without re-running the cell that creates
# `city_og`/`state_og` will fail.
biz.drop(columns=["city", "state"], inplace=True)
biz.rename(columns={"city_og": "city", "state_og": "state"}, inplace=True)

In [84]:
# Attach the Wikidata columns to every business; a left join keeps businesses with no mapping row.
data = biz.merge(df, how="left", on=["city", "state"])

In [85]:
data

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,hours,city_qid,city_label,state_qid,state_label,county_qid,county_label,country_qid,country_label,population
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,...,,Q159288,Santa Barbara,Q99,California,Q108106,Santa Barbara County,Q30,United States of America,88665.0
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,...,"{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ...",Q382985,Affton,Q1581,Missouri,Q498034,St. Louis County,Q30,United States of America,20417.0
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,...,"{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...",Q18575,Tucson,Q816,Arizona,Q58688,Pima County,Q30,United States of America,542629.0
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,...,"{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",Q1345,Philadelphia,Q1400,Pennsylvania,Q496900,Philadelphia County,Q30,United States of America,1603797.0
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,...,"{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...",Q1183476,Green Lane,Q1400,Pennsylvania,Q378527,Montgomery County,Q30,United States of America,490.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,...,"{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3...",Q2096,Edmonton,Q1951,Alberta,,,Q16,Canada,1010899.0
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,...,"{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3...",Q23197,Nashville,Q1509,Tennessee,Q1177705,Davidson County,Q30,United States of America,684410.0
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,...,,Q6346,Indianapolis,Q1415,Indiana,Q506230,Marion County,Q30,United States of America,887642.0
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,...,"{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...",Q577939,Edwardsville,Q1204,Illinois,Q486448,Madison County,Q30,United States of America,26808.0


In [88]:
from rdflib import Namespace, Graph, URIRef, Literal, XSD
from rdflib.namespace import RDFS

In [89]:
# Define a namespace for your graph (optional)
schema = Namespace("https://schema.org/")
skos = Namespace("https://www.w3.org/2004/02/skos/core#")
business_uri = Namespace("https://www.yelp.com/biz/")
user_uri = Namespace("https://www.yelp.com/user_details?userid=")
yelpcat = Namespace("https://purl.archive.org/purl/yelp/business_categories#")
yelpont = Namespace("https://purl.archive.org/purl/yelp/ontology#")
yelpent = Namespace("https://purl.archive.org/purl/yelp/yelp_entities#")
wiki = Namespace("https://www.wikidata.org/entity/")
population_predicate = wiki + "P1082"  # P1082 = population
location_predicate = wiki + "P131"  # P131 = located in the administrative territorial entity
instance_of_predicate = wiki + "P31"  # P31 = instance of

In [95]:
df

Unnamed: 0,city,state,city_qid,city_label,state_qid,state_label,county_qid,county_label,country_qid,country_label,population
0,Santa Barbara,CA,Q159288,Santa Barbara,Q99,California,Q108106,Santa Barbara County,Q30,United States of America,88665.0
1,Affton,MO,Q382985,Affton,Q1581,Missouri,Q498034,St. Louis County,Q30,United States of America,20417.0
2,Tucson,AZ,Q18575,Tucson,Q816,Arizona,Q58688,Pima County,Q30,United States of America,542629.0
3,Philadelphia,PA,Q1345,Philadelphia,Q1400,Pennsylvania,Q496900,Philadelphia County,Q30,United States of America,1603797.0
4,Green Lane,PA,Q1183476,Green Lane,Q1400,Pennsylvania,Q378527,Montgomery County,Q30,United States of America,490.0
...,...,...,...,...,...,...,...,...,...,...,...
1462,Slidell,LA,Q988156,Slidell,Q1588,Louisiana,,,Q30,United States of America,28781.0
1463,Wales,PA,,,Q1400,Pennsylvania,,,Q30,United States of America,
1464,UPPER MORELAND,PA,Q7898809,Upper Moreland Township,Q1400,Pennsylvania,Q378527,Montgomery County,Q30,United States of America,26116.0
1465,Aston,DE,,,Q1393,Delaware,,,Q30,United States of America,


In [97]:
# Plain column access; building the attribute name as a string and eval-ing it is unnecessary.
df["state_qid"]

0         Q99
1       Q1581
2        Q816
3       Q1400
4       Q1400
        ...  
1462    Q1588
1463    Q1400
1464    Q1400
1465    Q1393
1466     Q812
Name: state_qid, Length: 1467, dtype: object

In [107]:
def add_to_graph(row, lower_level, higher_level, higher_instance):
    """Build the triples linking one administrative level of a row to the level above it.

    Three triples are produced:

    * ``<lower> P131 <higher>`` — the lower entity is located in the higher one,
    * ``<higher> rdfs:label "<label>"`` — the higher entity's label,
    * ``<higher> P31 <higher_instance>`` — the higher entity's class.

    Args:
        row: Record exposing ``<level>_qid`` and ``<level>_label`` attributes
            (e.g. a tuple from ``DataFrame.itertuples()``).
        lower_level: Attribute prefix of the contained level, e.g. ``"city"``.
        higher_level: Attribute prefix of the containing level, e.g. ``"county"``.
        higher_instance: Bare Wikidata ID of the class of the higher level.

    Returns:
        A new ``Graph`` holding exactly these triples, meant to be merged by the caller
        (``G += add_to_graph(...)``). Nothing outside the returned graph is modified.
    """
    graph = Graph()

    # getattr replaces eval-built attribute access: same lookup, no code evaluation.
    lower = URIRef(wiki[getattr(row, f"{lower_level}_qid")])
    higher = URIRef(wiki[getattr(row, f"{higher_level}_qid")])
    higher_label = getattr(row, f"{higher_level}_label")

    # Add to the local graph: previously the triples were written straight into the
    # global `G` and the graph returned to the caller was always empty.
    graph.add((lower, URIRef(location_predicate), higher))
    graph.add((higher, URIRef(RDFS.label), Literal(higher_label, datatype=XSD.string)))
    graph.add((higher, URIRef(instance_of_predicate), URIRef(wiki + higher_instance)))

    return graph

In [108]:
# Build the location graph: link every business to the most specific resolved place
# (city, else state) and chain that place upwards through county / state / country,
# skipping any level that could not be resolved.
# NOTE(review): businesses are minted in the `yelpont` (ontology) namespace although a
# `business_uri` namespace is also defined — confirm which one is intended.
G = Graph()
for row in data.itertuples():
    # After the CSV round trip and the left join, missing values are NaN, which is truthy;
    # test presence explicitly so that no triples are created for missing entities.
    has_city = pd.notna(row.city_qid)
    has_county = pd.notna(row.county_qid)
    has_state = pd.notna(row.state_qid)
    has_country = pd.notna(row.country_qid)

    if has_city:
        city = URIRef(wiki[row.city_qid])
        G.add((URIRef(yelpont[row.business_id]), URIRef(schema['location']), city))
        G.add((city, URIRef(RDFS.label), Literal(row.city_label, datatype=XSD.string)))
        if pd.notna(row.population) and row.population:
            # The column is float because it contains NaN; convert so the xsd:integer
            # literal is "88665", not "88665.0".
            G.add((city, URIRef(population_predicate), Literal(int(row.population), datatype=XSD.integer)))

        if has_county:
            G += add_to_graph(row, "city", "county", "Q28575")
            if has_state:
                G += add_to_graph(row, "county", "state", "Q7275")
                if has_country:
                    G += add_to_graph(row, "state", "country", "Q6256") # to state
            elif has_country:
                G += add_to_graph(row, "county", "country", "Q6256") # to county
        elif has_state:
            G += add_to_graph(row, "city", "state", "Q7275") # to city
            if has_country:
                G += add_to_graph(row, "state", "country", "Q6256") # to state
        elif has_country:
            G += add_to_graph(row, "city", "country", "Q6256") # to city
    elif has_state:
        state = URIRef(wiki[row.state_qid])
        G.add((URIRef(yelpont[row.business_id]), URIRef(schema['location']), state))
        G.add((state, URIRef(RDFS.label), Literal(row.state_label, datatype=XSD.string)))
        # The state is an instance of the state class (Q7275), as in the add_to_graph calls
        # above. The previous line read the non-existent `row.state_id` and pointed the
        # state at itself.
        G.add((state, URIRef(instance_of_predicate), URIRef(wiki["Q7275"])))
        if has_country:
            G += add_to_graph(row, "state", "country", "Q6256") # to state