In [1]:
import numpy as np
import pandas as pd
import folium
import geocoder
from requests import Session

In [14]:
import qwikidata.sparql

def get_city_wikidata(city, country):
    query = """
    SELECT ?city ?cityLabel ?country ?countryLabel ?population
    WHERE
    {
      ?city rdfs:label '%s'@en.
      ?city wdt:P1082 ?population.
      ?city wdt:P17 ?country.
      ?city rdfs:label ?cityLabel.
      ?country rdfs:label ?countryLabel.
      FILTER(LANG(?cityLabel) = "en").
      FILTER(LANG(?countryLabel) = "en").
      FILTER(CONTAINS(?countryLabel, "%s")).
    }
    """ % (city, country)

    res = qwikidata.sparql.return_sparql_query_results(query)
    out = res['results']['bindings'][0]
    return out

In [12]:
numbeo_beer = pd.read_table('numbeo_beer.txt', names=["rank", "city", "beer"], index_col="rank")
numbeo_bread = pd.read_table('numbeo_bread.txt', names=["rank", "city", "bread"], index_col="rank")
numbeo_coffee = pd.read_table("numbeo_coffee.txt", names=['rank', 'city', 'coffee'], index_col="rank")
numbeo = pd.merge(numbeo_beer, numbeo_bread, how="outer", on="city")
numbeo = pd.merge(numbeo, numbeo_coffee, how="outer", on="city")

Unnamed: 0,city,beer,bread
0,"Doha, Qatar",8.51,1.45
1,"Amman, Jordan",5.30,0.41
2,"Newcastle, Australia",4.70,1.67
3,"Muscat, Oman",4.66,1.08
4,"Melbourne, Australia",4.57,2.34
...,...,...,...
606,"Rawalpindi, Pakistan",,0.47
607,"Karachi, Pakistan",,0.44
608,"Kozhikode (Calicut), India",,0.41
609,"Kabul, Afghanistan",,0.39


In [3]:
numbeo['country'] = numbeo['city'].str.split(", ").str[1]
numbeo['city'] = numbeo['city'].str.split(', ').str[0]

In [46]:
numbeo.loc[numbeo['city'] == "Tromso", "city"] = "Tromsø"
numbeo.loc[numbeo['city'] == "Washington", 'city'] = "Washington DC"

AttributeError: 'Series' object has no attribute 'length'

In [51]:
import time

populations = []
for city, country in zip(numbeo["city"], numbeo['country']):
    # TODO replace this with a proper check against a list of US state abbr.
    if len(country) == 2:
        print(country)
        country = "United States"
    try:
        data = get_city_wikidata(city, country)
        populations.append(int(data['population']['value']))
    except IndexError:
        populations.append("NaN")
    time.sleep(1)

MA
NY
PA
CA
CO
NC
TX
CA
CA
OR
CA
CO
LA
FL
NE
NY
MI
GA
CA
TX
WA
FL
NY
FL
CA
HI
MN
DC
MD
IN
OH
IL
NC
TX
MI
MO
TN
KY
MN
UT
KY
AZ
TN
CA
TX
WA
NY
MO
NV
IA
ID
WI
AZ
NV
CA
FL
TX
OH
SC
WA
CO


In [52]:
populations

['1312947',
 '4007526',
 '71665',
 '1421409',
 '4529500',
 '22063',
 '183378',
 '2360241',
 '196037',
 '200181',
 '5888926',
 '129300',
 '40471',
 '2502715',
 '1467800',
 '257087',
 '53199',
 '693494',
 '223170',
 '92515',
 '1000000',
 '187604',
 '143986',
 '497097',
 '642045',
 '222356',
 '1173179',
 '208669',
 '333871',
 '617594',
 '418500',
 '19795791',
 'NaN',
 '7477',
 'NaN',
 '13942856',
 '138905',
 '1025350',
 '58319',
 'NaN',
 '416427',
 '474069',
 '638090',
 '79504',
 '789194',
 '157474',
 '626108',
 '638516',
 '547627',
 '1197816',
 '269802',
 '200680',
 '4488',
 '1239220',
 '246376',
 '631486',
 '705244',
 '281087',
 '80032',
 '4975000',
 '494665',
 '66937',
 '123867',
 '881549',
 '158454',
 '482',
 '42615',
 '378715',
 '152000',
 '518090',
 '919438',
 '8908081',
 '417650',
 '4370000',
 '200526',
 '466893',
 '2592149',
 '2187526',
 '10348348',
 '309346',
 '680250',
 '531902',
 '486290',
 '3976322',
 '1138',
 '274400',
 '608660',
 '225118',
 '2731571',
 '932546',
 '377200',
 

In [None]:
locations = []
with Session() as s:
    for city, country in zip(numbeo["city"], numbeo['country']):
        locations.append(geocoder.osm(city + ", " + country, session=s).latlng)

In [6]:
# For some reason, one city consistently returns None during the for loop
i = locations.index(None)
locations[i] = geocoder.osm(numbeo["city"][i]).latlng

In [7]:
# visually checking my data is complete
locations

[[25.2856329, 51.5264162],
 [31.9515694, 35.9239625],
 [-32.9272881, 151.7812534],
 [23.5882019, 58.3829448],
 [-37.8142176, 144.9631608],
 [-34.9281805, 138.5999312],
 [63.4305658, 10.3951929],
 [-27.4689682, 153.0234991],
 [-35.2975906, 149.1012676],
 [-33.8548157, 151.2164539],
 [1.357107, 103.8194992],
 [59.1020129, 5.712611357275702],
 [69.649208, 18.9543434],
 [25.0750095, 55.18876088183319],
 [-36.852095, 174.7631803],
 [60.3943055, 5.3259192],
 [-31.9527121, 115.8604796],
 [59.9133301, 10.7389701],
 [49.1977086, -123.1912406],
 [44.648618, -63.5859487],
 [24.4538352, 54.3774014],
 [60.4517531, 22.2670522],
 [40.5508527, -105.0668085],
 [1.4953041, 103.7550839],
 [60.1674881, 24.9427473],
 [-42.8825088, 147.3281233],
 [53.3497645, -6.2602732],
 [51.898627, -8.4705942],
 [54.5964411, -5.9302761],
 [42.3602534, -71.0582912],
 [-41.2887953, 174.7772114],
 [40.7127281, -74.0060152],
 [32.0852997, 34.7818064],
 [39.9527237, -75.1635262],
 [64.145981, -21.9422367],
 [35.6828387, 139.7

In [8]:
numbeo["coordinates"] = locations
numbeo['population'] = populations

In [10]:
m = folium.Map(location=numbeo['coordinates'][1], zoom_level=1)
for coord, price in zip(numbeo['coordinates'], numbeo['price']):
    folium.Marker(location=coord, popup=price, tooltip=price).add_to(m)

In [11]:
m