In [55]:
import pandas as pd
import numpy as np
from Code.UtilityFunctions.wikidata_functions import wikidata_query, category_query, min_qid, get_all_wikidata_claims, compare_qids, categories_dict_singular, get_qid_label
from Code.UtilityFunctions.get_data_path import get_path
from rdflib import Namespace, Graph, URIRef, Literal, XSD
from rdflib.namespace import RDFS
import os
import gzip

In [22]:
location_mappings = pd.read_csv(get_path('location_mappings_expanded.csv'))

In [23]:
location_mappings

Unnamed: 0,coordinates,city_qid,cityLabel,population,county_qid,countyLabel,state_qid,stateLabel,country_qid,countryLabel
0,"-119.71,34.43",Q159288,Santa Barbara,88665.0,Q108106,Santa Barbara County,Q99,California,Q30,United States of America
1,"-90.34,38.55",Q602686,Lakeshire,1554.0,Q498034,St. Louis County,Q1581,Missouri,Q30,United States of America
2,"-110.88,32.22",Q18575,Tucson,542629.0,Q58688,Pima County,Q816,Arizona,Q30,United States of America
3,"-75.16,39.96",Q1345,Philadelphia,1603797.0,Q496900,Philadelphia County,Q1400,Pennsylvania,Q30,United States of America
4,"-75.47,40.34",Q1183476,Green Lane,490.0,Q378527,Montgomery County,Q1400,Pennsylvania,Q30,United States of America
...,...,...,...,...,...,...,...,...,...,...
11410,"-82.68,27.87",Q2151669,Pinellas Park,53093.0,Q494556,Pinellas County,Q812,Florida,Q30,United States of America
11411,"-119.8,39.32",Q40881,Carson City,58639.0,,,Q1227,Nevada,Q30,United States of America
11412,"-90.05,30.02",Q34404,New Orleans,383997.0,Q486231,Orleans Parish,Q1588,Louisiana,Q30,United States of America
11413,"-90.23,38.66",Q38022,St. Louis,301578.0,,,Q1581,Missouri,Q30,United States of America


In [24]:
location_mappings = location_mappings.dropna(subset=['city_qid']).sort_values(by=['cityLabel'])
location_mappings

Unnamed: 0,coordinates,city_qid,cityLabel,population,county_qid,countyLabel,state_qid,stateLabel,country_qid,countryLabel
5474,"-75.28,39.93",Q1131419,Aldan,4244.0,Q27844,Delaware County,Q1400,Pennsylvania,Q30,United States of America
730,"-75.3,39.92",Q1131419,Aldan,4244.0,Q27844,Delaware County,Q1400,Pennsylvania,Q30,United States of America
1004,"-75.29,39.92",Q1131419,Aldan,4244.0,Q27844,Delaware County,Q1400,Pennsylvania,Q30,United States of America
8823,"-75.28,39.92",Q1131419,Aldan,4244.0,Q27844,Delaware County,Q1400,Pennsylvania,Q30,United States of America
5525,"-90.15,38.91",Q443852,Alton,25676.0,Q486448,Madison County,Q1204,Illinois,Q30,United States of America
...,...,...,...,...,...,...,...,...,...,...
8011,"-82.23,28.15",Q2155683,Zephyrhills,17194.0,Q500992,Pasco County,Q812,Florida,Q30,United States of America
9396,"-82.22,28.14",Q2155683,Zephyrhills,17194.0,Q500992,Pasco County,Q812,Florida,Q30,United States of America
9448,"-82.22,28.23",Q2155683,Zephyrhills,17194.0,Q500992,Pasco County,Q812,Florida,Q30,United States of America
10485,"-82.2,28.28",Q2155683,Zephyrhills,17194.0,Q500992,Pasco County,Q812,Florida,Q30,United States of America


In [25]:
location_mappings.loc[location_mappings['cityLabel'].str.startswith('Q'), 'cityLabel'].unique()

array(['Q12530186', 'Q12536050', 'Q12551744', 'Q31455252', 'Q31455253',
       'Q31455254', 'Quakertown'], dtype=object)

# Create triples for locations on business ids

### Issue 1: some entities not being found, or not being mapped correctly
This is mostly because it is not being mapped to the borough or "census designated place" in wikidata, because we only look for the cities in radius of 20 km. This could possibly be handled by creating a new column searching for the borough or "census designated place" in wikidata, and then using that to create the triples.

### Issue 2: Canada entities are not correctly placed in the mapped dataframe
This is because the hierarchy is different from the US.

### Step 1: Load the business data from yelp

In [26]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

In [28]:
biz2 = biz.copy()

Add "long_lat_round" column to the dataframe for mapping to wikidata

In [29]:
biz2['long_lat_round'] = (biz2["longitude"].apply(round, args=(2,)).astype(str) + "," + biz2["latitude"].apply(round, args=(2,)).astype(str))

Select only the columns we need

In [31]:
biz2 = biz2[['business_id','long_lat_round', 'address', 'city', 'state']]

Merge the business data with the location data on the "long_lat_round" column

In [32]:
biz_location_mapping_merge = biz2.merge(location_mappings, left_on='long_lat_round', right_on='coordinates', how='left')

Create the triples from the merged dataframe

In [74]:
schema = Namespace("https://schema.org/")
example = Namespace("https://example.org/")
wiki = Namespace("https://www.wikidata.org/entity/")

## If file exists, delete it ##
remove_files="/home/ubuntu/vol1/virtuoso/import/wikidata_location_mappings.nt.gz"
if os.path.isfile(remove_files):
    os.remove(remove_files)
else:    ## Show an error ##
    print("Error: %s file not found" % remove_files)

location_predicate = wiki + "P131" # P131 = located in the administrative territorial entity
population_predicate = wiki + "P1082" # P1082 = population
instance_of_predicate = wiki + "P31" # P31 = instance of
city_object = wiki + "Q515" # Q515 = city
county_object = wiki + "Q28575" # Q28575 = county
state_object = wiki + "Q35657" # Q35657 = U.S. state
province_object = wiki + "Q11828004" # Q11828004 = province of Canada
country_object = wiki + "Q6256" # Q6256 = country

list_of_us_states = list(wikidata_query(sparql_query="SELECT ?state WHERE{?state wdt:P31 wd:Q35657.}")['state.value'].apply(lambda x: x[31:]))
list_of_canada_provinces = list(wikidata_query(sparql_query="SELECT ?province WHERE{?province wdt:P31 wd:Q11828004.}")['province.value'].apply(lambda x: x[31:]))

# triple_file = gzip.open(filename="/home/ubuntu/vol1/virtuoso/import/wikidata_location_mappings.nt.gz", mode="at", encoding="utf-8")

G = Graph()
for i in biz_location_mapping_merge.itertuples():
    # G.add((URIRef(example[i.business_id]), URIRef(schema.geo), Literal(i.long_lat_round)))
    if not pd.isna(i.city_qid):
        G.add((URIRef(example[i.business_id]), URIRef(location_predicate), URIRef(wiki[i.city_qid])))
        G.add((URIRef(wiki[i.city_qid]), URIRef(RDFS.label), Literal(i.cityLabel, datatype=XSD.string)))
        G.add((URIRef(wiki[i.city_qid]), URIRef(instance_of_predicate), URIRef(city_object)))
        if not pd.isna(i.population):
            G.add((URIRef(wiki[i.city_qid]), URIRef(population_predicate), Literal(i.population, datatype=XSD.integer)))
    if not pd.isna(i.county_qid):
        G.add((URIRef(example[i.business_id]), URIRef(location_predicate), URIRef(wiki[i.county_qid])))
        G.add((URIRef(wiki[i.county_qid]), URIRef(RDFS.label), Literal(i.countyLabel, datatype=XSD.string)))
        G.add((URIRef(wiki[i.county_qid]), URIRef(instance_of_predicate), URIRef(county_object)))
    if not pd.isna(i.state_qid):
        G.add((URIRef(example[i.business_id]), URIRef(location_predicate), URIRef(wiki[i.state_qid])))
        G.add((URIRef(wiki[i.state_qid]), URIRef(RDFS.label), Literal(i.stateLabel, datatype=XSD.string)))
        if i.state_qid in list_of_us_states:
            G.add((URIRef(wiki[i.state_qid]), URIRef(instance_of_predicate), URIRef(state_object)))
        elif i.state_qid in list_of_canada_provinces:
            G.add((URIRef(wiki[i.state_qid]), URIRef(instance_of_predicate), URIRef(province_object)))
    if not pd.isna(i.country_qid):
        G.add((URIRef(example[i.business_id]), URIRef(location_predicate), URIRef(wiki[i.country_qid])))
        G.add((URIRef(wiki[i.country_qid]), URIRef(RDFS.label), Literal(i.countryLabel, datatype=XSD.string)))
        G.add((URIRef(wiki[i.country_qid]), URIRef(instance_of_predicate), URIRef(country_object)))
        
G.serialize(destination="wikidata_location_mappings.nt", format="nt", encoding="utf-8")
# triple_file.write(G.serialize(format="nt"))
# triple_file.close()

Error: /home/ubuntu/vol1/virtuoso/import/wikidata_location_mappings.nt.gz file not found


<Graph identifier=N212cb9f41ef5475e912f062742046f0a (<class 'rdflib.graph.Graph'>)>

# Evaluate the results