In [1]:
import numpy as np
import pandas as pd
import json
import decimal

In [2]:
# given a df, a column to search, a search string, a column to edit, and a string to edit in,
# return the df with the column edited appropriately
def clean(df, find_str, find_col="city_ascii", edit_str=None, edit_col=None, regex=True):
    """Overwrite ``edit_col`` on every row whose ``find_col`` contains ``find_str``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to correct. It is modified in place and also returned.
    find_str : str
        Text searched for inside ``find_col`` (case-sensitive).
    find_col : str, default "city_ascii"
        Column that is searched.
    edit_str : str, optional
        Value written to matching rows. Falls back to ``find_str`` when omitted
        (or otherwise falsy), which collapses longer variants to the bare name.
    edit_col : str, optional
        Column that receives ``edit_str``. Falls back to ``find_col``.
    regex : bool, default True
        Whether ``find_str`` is a regular expression (the historical behaviour).
        Pass False to match text containing characters such as ``(`` or ``.``
        literally.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    if not edit_str:
        edit_str = find_str
    if not edit_col:
        edit_col = find_col
    # na=False: a missing value in find_col counts as "no match". Without it the
    # mask can contain NaN, which .loc refuses to use as a boolean indexer.
    matches = df[find_col].str.contains(find_str, regex=regex, na=False)
    df.loc[matches, edit_col] = edit_str
    return df

In [3]:
# given a city name and a row of data for that city, merge the existing row in lost with the new data
def update_row(wrong_city, right_data):
    """Rebuild the ``lost`` rows for ``wrong_city`` using the values in ``right_data``.

    Reads the module-level ``lost`` frame and ``admin_cols`` column list, which
    must already exist when this is called.

    Parameters
    ----------
    wrong_city : str
        The ``city_ascii`` value currently stored in ``lost``.
    right_data : pandas.DataFrame
        Replacement data. It must provide ``city_ascii``, ``admin_name``,
        ``region``, ``latitude``, ``longitude`` and ``population``.

    Returns
    -------
    pandas.DataFrame
        The matching ``lost`` rows with the name and location columns taken
        from ``right_data``, reindexed to ``admin_cols``.
    """
    # Tag the replacement data on a copy. Callers usually pass a filtered slice,
    # and assigning a column to it directly would mutate their frame (and can
    # trigger SettingWithCopyWarning).
    tagged = right_data.assign(city=wrong_city)

    stale_cols = ['admin_name', 'region', 'latitude', 'longitude', 'population']
    y = lost[lost['city_ascii'] == wrong_city].drop(columns=stale_cols)

    merge_cols = ['city_ascii', 'admin_name', 'region', 'latitude', 'longitude', 'population', 'city']
    # Join on the old name: lost.city_ascii (wrong) == tagged.city (wrong).
    z = pd.merge(y, tagged[merge_cols], how='left', left_on=['city_ascii'], right_on=['city'])
    # drop the wrong city name
    z = z.drop(columns=['city_ascii_x', 'city'])
    # rename correct city name column
    z = z.rename(columns={'city_ascii_y': 'city_ascii'})
    # reorder columns
    z = z.reindex(columns=admin_cols)

    return z

In [3]:
# Read the world-cities table and title-case its region labels.
city_data = pd.read_csv('data/city-data/world-cities-data.csv')
city_data = city_data.assign(region=city_data['region'].str.title())

# Country names that contain a comma, with per-column non-null counts for each.
has_comma = city_data['country'].str.contains(',')
city_data.loc[has_comma].groupby("country").count()

Unnamed: 0_level_0,city_ascii,region,latitude,longitude,population,iso2,iso3,admin_name,city
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"Bahamas, The",3,3,3,3,3,3,3,2,3
"Gambia, The",9,9,9,9,8,9,9,9,9
"Korea, North",37,37,37,37,36,37,37,37,37
"Korea, South",84,84,84,84,80,84,84,84,84
"Micronesia, Federated States Of",5,5,5,5,1,5,5,5,5
"Saint Helena, Ascension, And Tristan Da Cunha",2,0,2,2,0,2,2,2,2


In [4]:
# Country-name corrections: (substring to find in `country`, replacement name).
for pattern, fixed_name in (("Bahamas", "Bahamas"), ("Korea, South", "South Korea")):
    needs_fix = city_data["country"].str.contains(pattern)
    city_data.loc[needs_fix, "country"] = fixed_name

In [5]:
# Load the Numbeo prices, discard the saved index column and fix the column order.
numbeo_columns = ['city_ascii', 'admin_name', 'country', 'region', 'beer_market', 'bread', 'coffee', 'beer_pub', 'latitude', 'longitude', 'population']
numbeo = (
    pd.read_csv('data/numbeo/numbeo_clean.csv')
    .drop(columns=["Unnamed: 0"])
    .reindex(columns=numbeo_columns)
)

numbeo.head()

Unnamed: 0,city_ascii,admin_name,country,region,beer_market,bread,coffee,beer_pub,latitude,longitude,population
0,Richmond,Virginia,United States,North America,3.71,2.63,4.54,5.75,37.5295,-77.4756,1075798.0
1,Fort Collins,Colorado,United States,North America,3.47,2.93,4.29,4.5,40.5478,-105.0656,312666.0
2,Boston,Massachusetts,United States,North America,3.37,3.16,4.31,7.0,42.3188,-71.0846,4688346.0
3,New York,New York,United States,North America,3.29,3.78,4.82,7.35,40.6943,-73.9249,18713220.0
4,Philadelphia,Pennsylvania,United States,North America,3.26,2.83,3.92,5.0,40.0077,-75.1339,5649300.0


In [6]:
# corrections: every city name containing the search string is replaced by a
# canonical spelling. clean() uses the search string itself as the replacement
# unless edit_str is given. Commented-out calls are corrections that are
# currently disabled.
# numbeo = clean(numbeo, find_str="Tel Aviv")
# numbeo = clean(numbeo, find_str="Jeddah")
# numbeo = clean(numbeo, find_str="Arhus", edit_str="Aarhus")
numbeo = clean(numbeo, find_str="Freiburg")
# numbeo = clean(numbeo, find_str="Calicut")
# numbeo = clean(numbeo, find_str="Lucknow")
numbeo = clean(numbeo, find_str="Mumbai")
numbeo = clean(numbeo, find_str="Padova", edit_str="Padua")
numbeo = clean(numbeo, find_str="Astana")
# numbeo = clean(numbeo, find_col="country", find_str="Kosovo")
# numbeo = clean(numbeo, find_str="The Hague")
# numbeo = clean(numbeo, find_str="Krakow")
# numbeo = clean(numbeo, find_str="Zaragoza")
# numbeo = clean(numbeo, find_str="Seville")
# numbeo = clean(numbeo, find_str="Kyiv")
# numbeo = clean(numbeo, find_str="Odessa")
# numbeo = clean(numbeo, find_str="Novgorod", edit_str="Nizhniy Novgorod")


In [7]:
# load in expatistan data
# The file handle is named `fh`, not `f`: later cells use `f` as a module alias
# (f.Map, f.Popup, ...), and rebinding it here to a closed file would break them.
with open("data/expatistan/expatistan-data.json") as fh:
    data = json.load(fh)
expat = pd.json_normalize(data)
# Rename the flattened JSON fields to the column names used by the other sources.
expat.columns = ['city_ascii', 'country', 'beer_market', 'bread', 'coffee', 'beer_pub']

expat.head()

Unnamed: 0,city_ascii,country,beer_market,bread,coffee,beer_pub
0,Kabul,Afghanistan,2.61,0.5,2.81,5.8
1,Tirana,Albania,1.06,0.78,1.56,3.58
2,Algiers,Algeria,0.88,0.23,2.29,2.1
3,Athens,Greece,1.53,0.91,4.72,5.27
4,Buenos Aires,Argentina,0.94,0.49,2.1,1.86


In [8]:
# Outer-join Numbeo and Expatistan on city + country. Price columns that exist
# in both get the suffix _n (Numbeo) or _e (Expatistan).
on_cols = ['city_ascii', 'country']
df_all = numbeo.merge(expat, how="outer", on=on_cols, suffixes=("_n", "_e"))

In [9]:
# Rows where all four Numbeo prices are missing, i.e. Expatistan-only cities.
numbeo_price_cols = ["beer_pub_n", "bread_n", "coffee_n", "beer_market_n"]
lost_n = df_all[df_all[numbeo_price_cols].isna().all(axis=1)]


In [10]:
# Display the Expatistan-only rows ordered by country.
lost_n.sort_values("country")

Unnamed: 0,city_ascii,admin_name,country,region,beer_market_n,bread_n,coffee_n,beer_pub_n,latitude,longitude,population,beer_market_e,bread_e,coffee_e,beer_pub_e
620,Santa Rosa,,Argentina,,,,,,,,,1.15,0.4,1.16,2.24
621,Chittagong,,Bangladesh,,,,,,,,,2.4,0.52,1.92,3.63
645,Kingston,,Canada,,,,,,,,,3.33,1.75,3.73,5.21
622,Santa Clara,,Cuba,,,,,,,,,1.22,1.5,3.53,5.91
623,Brno,,Czech Republic,,,,,,,,,0.81,0.62,2.52,1.7
624,Olomouc,,Czech Republic,,,,,,,,,0.73,0.67,2.04,1.49
625,Ostrava,,Czech Republic,,,,,,,,,0.83,0.85,1.95,1.32
626,Prague,,Czech Republic,,,,,,,,,0.8,0.83,3.41,1.79
627,Hanover,,Germany,,,,,,,,,1.06,1.41,3.49,4.05
629,Patras,,Greece,,,,,,,,,1.38,1.1,4.14,3.92


In [12]:
# Rows where all four Expatistan prices are missing, i.e. Numbeo-only cities.
expat_price_cols = ["beer_pub_e", "bread_e", "coffee_e", "beer_market_e"]
lost_e = df_all[df_all[expat_price_cols].isna().all(axis=1)]


In [13]:
# Display rows 25-49 (by position) of the Numbeo-only rows, ordered by country.
lost_e.sort_values("country").iloc[25:50]

Unnamed: 0,city_ascii,country,beer_market_n,bread_n,coffee_n,beer_pub_n,beer_market_e,bread_e,coffee_e,beer_pub_e
287,Breda,Netherlands,1.44,2.25,3.12,6.04,,,,
377,Quezon City,Philippines,1.03,1.3,2.24,1.25,,,,
268,Makati,Philippines,1.52,,2.78,1.46,,,,
460,Lodz,Poland,0.82,0.78,2.27,2.13,,,,
483,Katowice,Poland,0.74,0.91,2.09,1.86,,,,
390,Gdynia,Poland,1.0,0.75,2.64,2.13,,,,
417,Kosice,Slovakia,0.92,1.16,2.24,1.45,,,,
203,Lund,Sweden,1.88,3.01,4.33,7.74,,,,
491,Al Marsa,Tunisia,0.72,0.18,1.28,1.84,,,,
608,Ajman,United Arab Emirates,,0.71,4.4,,,,,


In [14]:
# Load the pintprice table, discard the saved index column and drop every row
# that has a missing value.
pp = (
    pd.read_csv("data/pintprice/pintprice.csv")
    .drop(columns=['Unnamed: 0'])
    .dropna()
)

pp.head()

Unnamed: 0,city_ascii,country,beer_pub
1,Aberdeen,United Kingdom,5.07
2,Aberystwyth,United Kingdom,3.58
3,Acton,United Kingdom,6.26
4,Airdrie,United Kingdom,3.43
5,Aldershot,United Kingdom,4.47


In [15]:
# Spelling / casing corrections for pintprice city names, applied in order as
# (substring to find, replacement city name).
pp_city_fixes = [
    ("ManäMa", "Manama"),
    ("Jakarta", "Jakarta"),
    ("Quenec City", "Quebec City"),
    ("Washonhton", "Washington"),
    ("Newcastle Upon Tyne", "Newcastle upon Tyne"),
    ("Marrakesh", "Marrakech"),
    ("Bengaluru", "Bangalore"),
    ("Taipei", "Taipei"),
    ("Thessalonika", "Thessaloniki"),
    ("Rijeka", "Rijeka"),
    ("Guatemala", "Guatemala City"),
    ("Dimashq", "Damascus"),
    ("Rio De Janeiro", "Rio de Janeiro"),
    ("Dar Es Salaam", "Dar es Salaam"),
    ("Goiã¢Nia", "Goiania"),
    ("Palma De Mallorca", "Palma de Mallorca"),
    ("Constanța", "Constanta"),
    ("Dnepropetrovsk", "Dnipro"),
    ("Fredricton", "Fredericton"),
    ("Rochester, Ny", "Rochester"),
    ("Salem, Or", "Salem"),
    ("St John", "St. John's"),
    ("Stoke-On-Trent", "Stoke-on-Trent"),
    ("Malã©", "Male"),
    ("Santa Cruz De Tenerife", "Santa Cruz de Tenerife"),
]
for misspelt, corrected in pp_city_fixes:
    pp = clean(pp, find_str=misspelt, edit_str=corrected)

In [16]:
# Attach the pintprice price to every already-merged city (left join on city + country).
df_all = df_all.merge(pp, how="left", on=on_cols)

In [17]:
# Label the pintprice price column by its source, then collect the rows where it is missing.
df_all = df_all.rename(columns={"beer_pub": "beer_pub_pp"})
missing_pp = df_all['beer_pub_pp'].isna()
lost_pp = df_all.loc[missing_pp]

In [18]:
# Number of merged rows that have no pintprice price (sorting does not affect the count).
len(lost_pp.sort_values("country"))

164

In [19]:
# pintprice rows that pick up no latitude from city_data in a left join on
# city + country - probably misspelled city names.
lost_cities_pp = pp.merge(city_data, how="left", on=on_cols)
misspelled = lost_cities_pp.loc[lost_cities_pp['latitude'].isna()]

In [23]:
# for each row missing pintprice data, use edit distance to identify
# the most likely matching but misspelled city with matching country
from fuzzywuzzy import fuzz, process

for city, country in zip(lost_pp['city_ascii'], lost_pp["country"]):
    # Pre-filter: same country, and the name contains the first character of the
    # city we are looking for. regex=False makes that character a literal, so a
    # name starting with "(", "[" or "." cannot raise or match everything.
    shares_first_char = misspelled.city_ascii.str.contains(city[0], regex=False, na=False)
    same_country = misspelled.country == country
    candidates = misspelled.loc[shares_first_char & same_country, "city_ascii"]
    # extractOne yields nothing when there are no candidates; answer[0] is the best match.
    answer = process.extractOne(city, candidates)
    if answer:
        print(city, answer[0])

San Jose Saint Louis
Dallas Salem/Dallas
Fresno Disney World Fl
Denver Disney World Fl
Brooklyn Bloomer
St. Paul St Louis
Mississauga Fort Mcmurray
Indianapolis Kent Island
Charlotte Oak Cliff
Vadodara Vijayawada
Lexington Saint Louis
Sacramento Saint Louis
Bergamo Bellagio
Genoa Garda
Nagpur New Dehli
Kansas City Kent Island
Des Moines Ditnietis
Gurgaon Goa
Madison Portland Maine
Phoenix Portland Maine
Berkeley Bloomer
Patras Parga
Ghaziabad Goa
Guwahati Goa
Noida New Dehli
Charleston Oak Cliff
Spokane Saint Louis
Visakhapatnam Vijayawada
Kingston upon Hull Kingston
Makati Malay
Cagliari Conversano
Funchal Figueira Da Foz
Palermo Palestrina
Merida Mexico
Shenzhen Shekou
Dumaguete Davao City
Guadalajara Leon, Gt (Mx)
Oradea Poiana BraèOv
Arad Curtea De Arges
Erlangen Ellingen
Hangzhou Hainan
Essen Ellingen
Timisoara Targu Mures
Lodz Leå¼Ajsk
Ploiesti Poiana BraèOv
Mykolaiv Melitopol
Santa Rosa Saint Louis
Irvine Kent Island
Fairfax Disney World Fl
Fremont Disney World Fl
Coquitlam Ca

In [31]:
# Compare the row counts of the merged frame and numbeo_clean.
# NOTE(review): numbeo_clean is not assigned in any earlier cell of this notebook -
# confirm where it is defined before relying on Restart & Run All.
len(df_all), len(numbeo_clean)

(623, 620)

In [26]:
# City-name corrections applied to df_all ahead of the merge with city_data,
# in order, as (substring to find, replacement city name).
corrections = [
    ('Newcastle', "Newcastle"),
    ("Penang", "George Town"),
    ('Heraklion', 'Irakleio'),
    ('Ahmedabad', 'Ahmadabad'),
    ('Patras', 'Patra'),
    ('Pattaya', 'Phatthaya'),
    ('Visakhapatnam', 'Vishakhapatnam'),
    ('Hanover', 'Hannover'),
    ('Yangon', 'Rangoon'),
    ('Rostov', 'Rostov'),
    ("Goa", "Panaji"),
    ('Ain', "Al `Ayn"),
    ("Santa Cruz", "Santa Cruz"),
    ("Ajman", "`Ajman"),
    ('Chittagong', 'Chattogram'),
    ('Macao', 'Macau'),
    ('Marsa', 'La Marsa'),
]

for find, edit in corrections:
    df_all = clean(df_all, find_str=find, edit_str=edit)

In [28]:
# Country corrections, applied in order as
# (column to search, substring to find, country name to write).
country_fixes = [
    ("city_ascii", "Rangoon", "Burma"),
    ("city_ascii", "Macau", "Macau"),
    ("country", "Cz", "Czechia"),
    ("country", "Mace", "Macedonia"),
    ("country", "Ivory", "Côte D’Ivoire"),
    ("country", "Kosovo", "Kosovo"),
]
for lookup_col, pattern, country_name in country_fixes:
    hits = df_all[lookup_col].str.contains(pattern)
    df_all.loc[hits, "country"] = country_name

In [None]:
# TODO: perform the pending merge here (presumably df_all with city_data - confirm).
# The bare `pd.merge()` placeholder that stood here takes no frames, so it raised
# TypeError and would stop Restart & Run All at this cell.

In [25]:
# Deutsche Bank beer and cappuccino tables, without the saved index column.
db_beer = pd.read_csv("data/deutschebank/beer-clean.csv")
db_beer = db_beer.drop(columns=['Unnamed: 0'])
db_cafe = pd.read_csv("data/deutschebank/cappuccino-clean.csv")
db_cafe = db_cafe.drop(columns=['Unnamed: 0'])

In [32]:
# Give each price column a product-specific name, then outer-join the two tables
# on their shared city / location columns and fix the column order.
db_keys = ["city_ascii", "country", "region", "latitude", "longitude", "population"]
db_beer = db_beer.rename(columns={"price": "beer_pub"})
db_cafe = db_cafe.rename(columns={"price": "coffee"})
db = db_beer.merge(db_cafe, how="outer", on=db_keys)
db = db.reindex(columns=["city_ascii", "country", "region", "coffee", "beer_pub", "latitude", "longitude", "population"])

db.head()

Unnamed: 0,city_ascii,country,region,coffee,beer_pub,latitude,longitude,population
0,Dubai,United Arab Emirates,Middle East,6.0,11.4,25.2697,55.3094,2502715
1,Oslo,Norway,Europe,5.5,9.8,59.9111,10.7528,693494
2,Copenhagen,Denmark,Europe,6.3,8.5,55.6786,12.5635,1085000
3,Hong Kong,Hong Kong,Asia & Pacific,5.7,8.2,22.305,114.185,7347000
4,Singapore,Singapore,Asia & Pacific,5.4,8.1,1.3,103.8,5745000


In [None]:
# Row-wise average of the available Numbeo / Expatistan pub-beer prices:
# the sum skips missing values and the divisor counts only the present ones.
source_prices = beer_pub[['beer_pub_numbeo', 'beer_pub_expat']]
beer_pub['avg'] = source_prices.sum(axis=1) / source_prices.count(axis=1)
# Two-decimal version of the average.
beer_pub['avg_round'] = beer_pub['avg'].apply(lambda price: round(price, 2))

In [3]:
# Show the 25 rows with the highest 'beer' value.
# NOTE(review): the cell that loads `numbeo` reindexes it to columns that include
# beer_market / beer_pub but no 'beer' column, so this lookup looks stale - confirm
# which frame and column are intended.
numbeo.sort_values('beer', ascending=False).head(25)

Unnamed: 0,city_ascii,admin_name,country,region,bread,beer,coffee,latitude,longitude,population
63,Doha,Ad Dawḩah,Qatar,Middle East,1.45,8.51,4.63,25.3,51.5333,1312947.0
64,Amman,Al ‘Āşimah,Jordan,Middle East,0.41,5.3,4.14,31.95,35.9333,4007526.0
65,Newcastle,New South Wales,Australia,Asia & Pacific,1.67,4.7,3.52,-32.9167,151.75,322278.0
66,Muscat,Masqaţ,Oman,Middle East,1.08,4.66,4.61,23.6139,58.5922,1421409.0
67,Melbourne,Victoria,Australia,Asia & Pacific,2.34,4.57,3.4,-37.8136,144.9631,5078193.0
68,Adelaide,South Australia,Australia,Asia & Pacific,2.1,4.54,3.31,-34.9289,138.6011,1345777.0
69,Trondheim,Sør-Trøndelag,Norway,Europe,3.66,4.52,5.21,63.44,10.4,183378.0
70,Brisbane,Queensland,Australia,Asia & Pacific,2.12,4.5,3.55,-27.4678,153.0281,2514184.0
71,Canberra,Australian Capital Territory,Australia,Asia & Pacific,2.57,4.26,3.45,-35.2931,149.1269,426704.0
72,Sydney,New South Wales,Australia,Asia & Pacific,2.05,4.09,3.27,-33.865,151.2094,5312163.0


In [6]:
import altair as alt

# One-row frame with Seoul's prices; transposing turns it into one row per item.
data = numbeo_clean.loc[numbeo_clean['city_ascii'] == "Seoul", ['beer', 'bread', 'coffee']]
d = data.transpose()
d = d.reset_index()
# After the transpose the price column is labelled with Seoul's row label in
# numbeo_clean. Look that label up instead of hard-coding it (it was 141), so
# the rename still applies if the row order of the data changes.
d = d.rename(columns={'index': "Seoul", data.index[0]: "price ($)"})

# Bar chart: item on x (first column), price on y (second column).
chart = alt.Chart(d).mark_bar().encode(x=d.columns[0], y=d.columns[1])

In [7]:
# Display the Seoul bar chart.
chart

In [8]:
# Write the chart to disk as JSON and as HTML, and keep its JSON text in `c`.
chart.save("Seoul.json")
chart.save("Seoul.html")
c = chart.to_json()

In [104]:
import json

# Parse the chart's JSON text (`c`) back into a dict for folium's VegaLite element.
seoul_chart = json.loads(c)
# Read the saved page inside a context manager so the file handle is closed
# (the previous bare open(...).read() left it open).
with open('Seoul.html') as html_file:
    seoul_html = html_file.read()
popup = f.Popup(max_width=450)
f.VegaLite(seoul_chart).add_to(popup)

# Test map with a single marker that carries the chart popup.
n = f.Map([30,-80],zoom_start=2)
f.Marker([30, -80], popup=popup).add_to(n)
n

In [33]:
# `import bokeh` on its own does not load the subpackages, so attribute access
# such as bokeh.io / bokeh.models / bokeh.plotting only works if something else
# imported them first. Import the ones this notebook uses explicitly.
import bokeh
import bokeh.io
import bokeh.models
import bokeh.plotting

bokeh.io.output_notebook()

In [44]:
# Wrap the frame `d` in a Bokeh data source and draw its 'price ($)' column as
# vertical bars. Output goes to Seoul_bokeh.html.
e = bokeh.models.sources.ColumnDataSource(d)
bokeh.io.output_file("Seoul_bokeh.html")
plot = bokeh.plotting.figure(width=200, height=150)
# The x positions 1-3 are hard-coded, so this assumes the source holds three prices.
plot.vbar(x=[1, 2, 3], bottom=0, width=0.75, top=e.data['price ($)'], color="firebrick")
bokeh.plotting.show(plot)

# e.data

In [98]:
# Build a popup from a placeholder HTML snippet. The commented-out lines are
# earlier attempts that used the saved Bokeh page / an IFrame instead.
# b_html = open("Seoul_bokeh.html").read()
b_html = f.Html("<em>Example HTML</em>")
# iframe = f.IFrame(html=b_html, width=250, height=200)
popup = f.Popup(html=b_html, max_width=450)

In [99]:
# Map centred on the coordinates stored under row label 1 of numbeo_clean.
m = f.Map(location=(numbeo_clean["latitude"][1], numbeo_clean["longitude"][1]), zoom_start=2)

fg = f.FeatureGroup(name="markers").add_to(m)
# One (city, (lat, lng), beer price) tuple per row.
m_data = zip(numbeo_clean['city_ascii'], zip(numbeo_clean['latitude'], numbeo_clean['longitude']), numbeo_clean["beer"])
for city, coord, price in m_data:
    # Build a fresh popup for every marker. A folium element has a single parent,
    # so one Popup instance shared by all markers ends up bound to the last marker
    # only. parse_html=True escapes the text, which matters for city names that
    # contain characters such as a backtick.
    marker_popup = f.Popup(f"{city}: {price}", parse_html=True, max_width=450)
    fg.add_child(
        f.CircleMarker(location=coord,
                        radius=3,
                        popup=marker_popup,
                        tooltip=city,
                        color="red",
                        fill=True,
                        fill_color="red"))

In [22]:
# For every city in `lost`, look for exactly one pintprice row in the same
# country whose name contains the last four characters of the city name or,
# failing that, its first four characters, and print the pairing.
for index, row in lost.iterrows():
    city = row['city_ascii']
    country = row['country']
    bare_city = city.strip("()")
    same_country = pp['country'] == country
    # regex=False: the fragments are plain text. As patterns, an interior "(" (which
    # strip() does not remove) raises an error, and "." matches any character.
    x = pp[pp.city_ascii.str.contains(bare_city[-4:], regex=False) & same_country]
    y = pp[pp.city_ascii.str.contains(bare_city[:4], regex=False) & same_country]
    if len(x) == 1:
        print(city, x)
#         lost[lost.index == index] = update_row(city, x).values
    elif len(y) == 1:
        print(city, y)
#         lost[lost.index == index] = update_row(city, y).values

Tel Aviv-Yafo      city_ascii country  beer_pub
1606   Tel Aviv  Israel      9.39
Dallas         city_ascii        country  beer_pub
2798  Salem/Dallas  United States      1.74
Jakarta                   city_ascii    country  beer_pub
1503  Jakarta Capital Region  Indonesia      3.16
Quebec City       city_ascii country  beer_pub
862  Quenec City  Canada      8.76
Jacksonville      city_ascii        country  beer_pub
2747    Jackson  United States      4.08
St. Paul      city_ascii        country  beer_pub
2810  St. Louis  United States      3.17
Washington          city_ascii        country  beer_pub
2817  Washonhton Dc  United States      6.36
Ann Arbor        city_ascii        country  beer_pub
2785  Palm Harbor  United States       1.8
Navi Mumbai      city_ascii country  beer_pub
1488     Mumbai   India      1.31
Raipur      city_ascii country  beer_pub
1481     Jaipur   India      1.48
Newcastle upon Tyne               city_ascii         country  beer_pub
238  Newcastle Upon Tyne

In [100]:
# Add a layer-control widget to the map, then display the map.
f.LayerControl().add_to(m)
m

In [77]:
# Bounding box of the map contents; the recorded output shows the shape
# [[min_lat, min_lng], [max_lat, max_lng]].
latlng = m.get_bounds()

In [78]:
# Display the bounding box.
latlng

[[-43.5309, -157.846], [69.6489, 174.7833]]

In [58]:
def between(x, bounds):
    """Return whether ``x`` lies strictly between ``bounds[0]`` and ``bounds[1]``.

    Both ends are exclusive: a value equal to either bound gives False.
    """
    lower = bounds[0]
    return lower < x and x < bounds[1]

In [76]:
# Per row: is the latitude strictly inside the map's latitude range
# (latlng[0][0] is the lower latitude, latlng[1][0] the upper one)?
numbeo_clean['latitude'].apply(between, bounds=[latlng[0][0], latlng[1][0]])

0      True
1      True
2      True
3      True
4      True
       ... 
487    True
488    True
489    True
490    True
491    True
Name: latitude, Length: 492, dtype: bool

In [79]:
# Exploration only: list the attributes available on the map object.
dir(m)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_children',
 '_env',
 '_get_self_bounds',
 '_id',
 '_name',
 '_parent',
 '_png_image',
 '_repr_html_',
 '_repr_png_',
 '_template',
 '_to_png',
 'add_child',
 'add_children',
 'add_to',
 'choropleth',
 'control_scale',
 'crs',
 'default_css',
 'default_js',
 'fit_bounds',
 'get_bounds',
 'get_name',
 'get_root',
 'global_switches',
 'height',
 'keep_in_front',
 'left',
 'location',
 'objects_to_stay_in_front',
 'options',
 'png_enabled',
 'position',
 'render',
 'save',
 'to_dict',
 'to_json',
 'top',
 'width']

In [95]:
# NOTE(review): per the recorded output this raises
# "AssertionError: You cannot render this Element if it is not in a Figure."
# As written, the cell will stop a Restart & Run All.
popup.render()

AssertionError: You cannot render this Element if it is not in a Figure.