In [3]:
import pandas as pd
import numpy as np
from Code.UtilityFunctions.wikidata_functions import wikidata_query, category_query, min_qid, get_all_wikidata_claims, compare_qids, categories_dict_singular, get_qid_label
from Code.UtilityFunctions.get_data_path import get_path
from rdflib import Namespace, Graph, URIRef, Literal, XSD
from rdflib.namespace import RDFS
import os
import gzip

# Create triples for locations on business ids

# Wanted triples:

<business_id> <has_location> <location_id>
<example:business_id> <wiki:P131> <wiki:city_qid>  
<example:business_id> <wiki:P131> <wiki:county_qid>  
<example:business_id> <wiki:P131> <wiki:state_qid>  
<example:business_id> <wiki:P131> <wiki:country_qid>  

<wiki:city_qid> <wiki:P1082> <wiki:population>  

<wiki:city_qid> <rdfs:label> <wiki:city_label>  
<wiki:county_qid> <rdfs:label> <wiki:county_label>  
<wiki:state_qid> <rdfs:label> <wiki:state_label>  
<wiki:country_qid> <rdfs:label> <wiki:country_label>  

<wiki:city_qid> <wiki:P31> <wiki:Q515>  
<wiki:county_qid> <wiki:P31> <wiki:Q28575>  
<wiki:state_qid> <wiki:P31> <wiki:Q35657> or <wiki:Q11828004>  
<wiki:country_qid> <wiki:P31> <wiki:Q6256>  

### Issue 1: some entities not being found, or not being mapped correctly
This is mostly because it is not being mapped to the borough or "census designated place" in wikidata, because we only look for the cities in radius of 20 km. This could possibly be handled by creating a new column searching for the borough or "census designated place" in wikidata, and then using that to create the triples.

### Issue 2: Canada entities are not correctly placed in the mapped dataframe
This is because the hierarchy is different from the US.

### Issue 3: Location is being mapped to Google API data in [create_nt_files](https://github.com/christiannielsen98/DVML-P7/blob/main/Code/create_nt_files.py)


In [4]:
biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)
biz['long_lat_round'] = (biz["longitude"].apply(round, args=(2,)).astype(str) + "," + biz["latitude"].apply(round, args=(2,)).astype(str))
biz['long_lat_exact'] = (biz["longitude"].astype(str) + "," + biz["latitude"].astype(str))

In [5]:
biz

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,long_lat_round,long_lat_exact
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",,"-119.71,34.43","-119.7111968,34.4266787"
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ...","-90.34,38.55","-90.335695,38.551126"
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ...","-110.88,32.22","-110.880452,32.223236"
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...","-75.16,39.96","-75.1555641,39.9555052"
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2...","-75.47,40.34","-75.4716585,40.3381827"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3...","-113.49,53.47","-113.4920537,53.4684188"
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3...","-86.77,36.12","-86.766925,36.115118"
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",,"-86.07,39.91","-86.065088,39.908707"
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ...","-89.95,38.78","-89.9505584,38.7823508"


In [6]:
def yelp_wiki_location_mappings():
    # load the wikidata location mappings
    location_mappings = pd.read_csv(get_path('location_mappings_expanded.csv'))
    # Load the business data from yelp
    biz = pd.read_json(get_path("yelp_academic_dataset_business.json"), lines=True)

    # Add "long_lat_round" column to the dataframe for mapping to wikidata
    biz['long_lat_round'] = (biz["longitude"].apply(round, args=(2,)).astype(str) + "," + biz["latitude"].apply(round, args=(2,)).astype(str))
    # Select only the columns we need
    biz2 = biz[['business_id','long_lat_round', 'address', 'city', 'state']]
    # Merge the business data with the location data on the "long_lat_round" column
    biz_location_mapping_merge = biz2.merge(location_mappings, left_on='long_lat_round', right_on='coordinates', how='left')
    return biz_location_mapping_merge

In [42]:
def create_wikidata_location_mappings():
    # Create the triples from the merged dataframe
    #TODO: replace the example namespace with the PURL namespace

    ## If file exists, delete it ##
    remove_files="/home/ubuntu/vol1/virtuoso/import/wikidata_location_mappings.nt.gz"
    if os.path.isfile(remove_files):
        os.remove(remove_files)
    else:    ## Show an error ##
        print("Error: %s file not found" % remove_files)
    
    schema = Namespace("https://schema.org/")
    example = Namespace("https://example.org/")
    wiki = Namespace("https://www.wikidata.org/entity/")

    location_predicate = wiki + "P131" # P131 = located in the administrative territorial entity
    population_predicate = wiki + "P1082" # P1082 = population
    instance_of_predicate = wiki + "P31" # P31 = instance of
    city_object = wiki + "Q515" # Q515 = city
    county_object = wiki + "Q28575" # Q28575 = county
    state_object = wiki + "Q35657" # Q35657 = U.S. state
    province_object = wiki + "Q11828004" # Q11828004 = province of Canada
    country_object = wiki + "Q6256" # Q6256 = country

    list_of_us_states = list(wikidata_query(sparql_query="SELECT ?state WHERE{?state wdt:P31 wd:Q35657.}")['state.value'].apply(lambda x: x[31:]))
    list_of_canada_provinces = list(wikidata_query(sparql_query="SELECT ?province WHERE{?province wdt:P31 wd:Q11828004.}")['province.value'].apply(lambda x: x[31:]))

    triple_file = gzip.open(filename="/home/ubuntu/vol1/virtuoso/import/wikidata_location_mappings.nt.gz", mode="at", encoding="utf-8")

    G = Graph()
    for i in yelp_wiki_location_mappings().itertuples():
        if not pd.isna(i.city_qid):
            G.add((URIRef(example[i.business_id]), URIRef(location_predicate), URIRef(wiki[i.city_qid])))
            G.add((URIRef(wiki[i.city_qid]), URIRef(RDFS.label), Literal(i.cityLabel, datatype=XSD.string)))
            G.add((URIRef(wiki[i.city_qid]), URIRef(instance_of_predicate), URIRef(city_object)))
            if not pd.isna(i.population):
                G.add((URIRef(wiki[i.city_qid]), URIRef(population_predicate), Literal(i.population, datatype=XSD.integer)))
        if not pd.isna(i.county_qid):
            G.add((URIRef(example[i.business_id]), URIRef(location_predicate), URIRef(wiki[i.county_qid])))
            G.add((URIRef(wiki[i.county_qid]), URIRef(RDFS.label), Literal(i.countyLabel, datatype=XSD.string)))
            G.add((URIRef(wiki[i.county_qid]), URIRef(instance_of_predicate), URIRef(county_object)))
        if not pd.isna(i.state_qid):
            G.add((URIRef(example[i.business_id]), URIRef(location_predicate), URIRef(wiki[i.state_qid])))
            G.add((URIRef(wiki[i.state_qid]), URIRef(RDFS.label), Literal(i.stateLabel, datatype=XSD.string)))
            if i.state_qid in list_of_us_states:
                G.add((URIRef(wiki[i.state_qid]), URIRef(instance_of_predicate), URIRef(state_object)))
            elif i.state_qid in list_of_canada_provinces:
                G.add((URIRef(wiki[i.state_qid]), URIRef(instance_of_predicate), URIRef(province_object)))
        if not pd.isna(i.country_qid):
            G.add((URIRef(example[i.business_id]), URIRef(location_predicate), URIRef(wiki[i.country_qid])))
            G.add((URIRef(wiki[i.country_qid]), URIRef(RDFS.label), Literal(i.countryLabel, datatype=XSD.string)))
            G.add((URIRef(wiki[i.country_qid]), URIRef(instance_of_predicate), URIRef(country_object)))
        
    triple_file.write(G.serialize(format="nt"))
    triple_file.close()

Error: /home/ubuntu/vol1/virtuoso/import/wikidata_location_mappings.nt.gz file not found


# Evaluate the results

In [7]:
biz_location_mapping_merge = yelp_wiki_location_mappings()

In [8]:
len(biz_location_mapping_merge['city'].drop_duplicates()), len(biz_location_mapping_merge['cityLabel'].drop_duplicates())

(1416, 245)

In [9]:
biz_location_mapping_merge[['city', 'cityLabel']].drop_duplicates()

Unnamed: 0,city,cityLabel
0,Santa Barbara,Santa Barbara
1,Affton,Grantwood Village
2,Tucson,Tucson
3,Philadelphia,Philadelphia
4,Green Lane,
...,...,...
149841,Wales,
150054,UPPER MORELAND,Beverly
150094,Glenside,Beverly
150183,St.Louis,Country Club Hills


In [10]:
pd.DataFrame([biz_location_mapping_merge['city'].value_counts(), biz_location_mapping_merge['cityLabel'].value_counts()]).T.head(20)

Unnamed: 0,city,cityLabel
Philadelphia,14569.0,15291.0
Tucson,9250.0,5648.0
Tampa,9050.0,6110.0
Indianapolis,7540.0,1743.0
Nashville,6971.0,3623.0
New Orleans,6209.0,5277.0
Reno,5935.0,5311.0
Edmonton,5054.0,2834.0
Saint Louis,4827.0,
Santa Barbara,3829.0,3608.0


## Precision, recall, F1 and F10 score for city names

In [11]:
tp = biz_location_mapping_merge[biz_location_mapping_merge['city'] == biz_location_mapping_merge['cityLabel']][['city','cityLabel']].shape[0]
fp = biz_location_mapping_merge[biz_location_mapping_merge['city'] != biz_location_mapping_merge['cityLabel']][['city','cityLabel']].shape[0]
fn = biz_location_mapping_merge[biz_location_mapping_merge['cityLabel'].isna() & biz_location_mapping_merge['city'].notna()][['city','cityLabel']].shape[0]
fn

8547

In [12]:
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f_1 = (1+1**2) * (precision * recall) / (1**2 * precision + recall)
f_10 = (1+10**2) * (precision * recall) / (10**2 * precision + recall)

In [13]:
precision, recall, f_1, f_10

(0.4794141513575353,
 0.8939906976744186,
 0.6241303020725546,
 0.8864013765751848)

## Precision, recall, F1 and F10 score for state names

In [59]:
state_abv = {'ALABAMA':'AL',
'ALASKA':'AK',
'AMERICAN SAMOA': 'AS',
'ARIZONA':'AZ',
'ARKANSAS':'AR',
'CALIFORNIA':'CA',
'COLORADO':'CO',
'CONNECTICUT':'CT',
'DELAWARE':'DE',
'DISTRICT OF COLUMBIA':	'DC',
'FLORIDA':'FL',
'GEORGIA':'GA',
'GUAM':'GU',
'HAWAII':'HI',
'IDAHO':'ID',
'ILLINOIS':'IL',
'INDIANA':'IN',
'IOWA':'IA',
'KANSAS':'KS',
'KENTUCKY':'KY',
'LOUISIANA':'LA',
'MAINE':'ME',
'MARYLAND':'MD',
'MASSACHUSETTS':'MA',
'MICHIGAN':'MI',
'MINNESOTA':'MN',
'MISSISSIPPI':'MS',
'MISSOURI':'MO',
'MONTANA':'MT',
'NEBRASKA':'NE',
'NEVADA':'NV',
'NEW HAMPSHIRE': 'NH',
'NEW JERSEY': 'NJ',
'NEW MEXICO': 'NM',
'NEW YORK':	'NY',
'NORTHCAROLINA': 'NC',
'NORTH DAKOTA': 'ND',
'NORTHERN MARIANA IS': 'MP',
'OHIO':'OH',
'OKLAHOMA':'OK',
'OREGON':'OR',
'PENNSYLVANIA': 'PA',
'PUERTO RICO': 'PR',
'RHODE ISLAND':	'RI',
'SOUTH CAROLINA': 'SC',
'SOUTH DAKOTA':	'SD',
'TENNESSEE': 'TN',
'TEXAS': 'TX',
'UTAH': 'UT',
'VERMONT': 'VT',
'VIRGINIA': 'VA',
'VIRGIN ISLANDS': 'VI',
'WASHINGTON':'WA',
'WEST VIRGINIA': 'WV',
'WISCONSIN':'WI',
'WYOMING':'WY'}