In [1]:
import numpy as np
import pandas as pd

In [21]:
numbeo_beer_market = pd.read_table('numbeo_beer_market.txt', names=["rank", "city", "beer_market"], index_col="rank")
numbeo_beer_pub = pd.read_table("numbeo_beer_pub.txt", names=["rank", "city", "beer_pub"], index_col="rank")
numbeo_bread = pd.read_table('numbeo_bread.txt', names=["rank", "city", "bread"], index_col="rank")
numbeo_coffee = pd.read_table("numbeo_coffee.txt", names=['rank', 'city', 'coffee'], index_col="rank")

numbeo = pd.merge(numbeo_beer_market, numbeo_beer_pub, how="outer", on="city")
numbeo = pd.merge(numbeo, numbeo_bread, how="outer", on="city")
numbeo = pd.merge(numbeo, numbeo_coffee, how="outer", on="city")

city_data = pd.read_csv('../city-data/world-cities-data.csv')
city_data['region'] = city_data['region'].str.title()

In [22]:
# create a separate country column
split = numbeo['city'].str.split(", ")
numbeo['country'] = split.str[-1]
numbeo['city'] = split.str[0]

cols = ["city_ascii", 'country','region', 'latitude', 'longitude', 'population', 'admin_name']
admin_cols = ['city_ascii', 'admin_name', 'country', 'region', 'beer_market', 'bread', 'coffee', 'beer_pub', 'latitude', 'longitude', 'population']

In [23]:
# a quick look at wrong country names
# city_data[city_data.country.str.contains(',')].groupby(by="country").count()

In [24]:
# to align with city_data column names
numbeo = numbeo.rename(columns={"city": "city_ascii"})

# some data cleaning
numbeo.loc[numbeo.city_ascii.str.contains("Jeddah"), "city_ascii"] = "Jeddah"
numbeo.loc[numbeo.city_ascii.str.contains("Calicut"), "city_ascii"] = "Calicut"
numbeo.loc[numbeo.city_ascii.str.contains("Newcastle"), "city_ascii"] = "Newcastle"
numbeo.loc[numbeo.city_ascii.str.contains("Penang"), "city_ascii"] = "George Town"
numbeo.loc[numbeo.city_ascii.str.contains("Heraklion"), "city_ascii"] = "Irakleio"
numbeo.loc[numbeo.city_ascii.str.contains("Ahmedabad"), "city_ascii"] = "Ahmadabad"
numbeo.loc[numbeo.city_ascii.str.contains("Patras"), "city_ascii"] = "Patra"
numbeo.loc[numbeo.city_ascii.str.contains("Pattaya"), "city_ascii"] = "Phatthaya"
numbeo.loc[numbeo.city_ascii.str.contains("Visakhapatnam"), "city_ascii"] = "Vishakhapatnam"
numbeo.loc[numbeo.city_ascii.str.contains("Hague"), "city_ascii"] = "The Hague"
numbeo.loc[numbeo.city_ascii.str.contains("Hanover"), "city_ascii"] = "Hannover"
numbeo.loc[numbeo.city_ascii.str.contains("Yangon"), "city_ascii"] = "Rangoon"
numbeo.loc[numbeo.city_ascii.str.contains("Rangoon"), "country"] = "Burma"
numbeo.loc[numbeo.city_ascii.str.contains("Rostov"), "city_ascii"] = "Rostov"
numbeo.loc[numbeo.city_ascii.str.contains("Novgorod"), "city_ascii"] = "Nizhniy Novgorod"
numbeo.loc[numbeo.city_ascii.str.contains("Goa"), "city_ascii"] = "Panaji"
numbeo.loc[numbeo.city_ascii.str.contains("Ain"), "city_ascii"] = "Al `Ayn"
numbeo.loc[numbeo.city_ascii.str.contains("Santa Cruz"), "city_ascii"] = "Santa Cruz"
numbeo.loc[numbeo.city_ascii.str.contains("Ajman"), "city_ascii"] = "`Ajman"
numbeo.loc[numbeo.city_ascii.str.contains("Chittagong"), "city_ascii"] = "Chattogram"
numbeo.loc[numbeo.city_ascii.str.contains("Macao"), "city_ascii"] = "Macau"
numbeo.loc[numbeo.city_ascii.str.contains("Macau"), "country"] = "Macau"
numbeo.loc[numbeo.city_ascii.str.contains("Marsa"), "city_ascii"] = "La Marsa"

numbeo.loc[numbeo.country.str.contains("Cz"), "country"] = "Czechia"
numbeo.loc[numbeo.country.str.contains("Mace"), "country"] = "Macedonia"
numbeo.loc[numbeo.country.str.contains("Ivory"), "country"] = "Côte D’Ivoire"
numbeo.loc[numbeo.country.str.contains("Kosovo"), "country"] = "Kosovo"
numbeo.city_ascii = numbeo.city_ascii.str.replace("Saint", "St.")

numbeo.loc[numbeo.city_ascii.str.contains("Petersburg"), "city_ascii"] = "Saint Petersburg"

city_data.loc[city_data.country.str.contains("Bahamas"), "country"] = "Bahamas"
city_data.loc[city_data.country.str.contains("Korea, South"), "country"] = "South Korea"

In [25]:
# for US cities, generate a list of unabbreviated state names
def state_unabbr(state):
    abbr = pd.read_table("../city-data/us-state-abbr.tsv", names=['abbr', 'name'], header=0)
    name = abbr.loc[abbr['abbr'] == state, 'name']
    if len(name) > 0:
        return name.values[0]
    else:
        return np.nan

states = []
for i in split:
    if i[-1] == "United States":
        states.append(state_unabbr(i[1]))

In [26]:
# separate numbeo into US and non-US cities
# for US cities, add states under admin_name, then merge with city_data latitude, longitude, region, and population
numbeo_us = numbeo[numbeo['country'] == 'United States']
numbeo_us["admin_name"] = states

numbeo_us = pd.merge(numbeo_us, city_data[cols], how='left', on=['city_ascii', 'country', 'admin_name'])
numbeo_us = numbeo_us.reindex(columns=admin_cols)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [27]:
len(numbeo_us) == len(numbeo[numbeo['country'] == 'United States'])

True

In [28]:
# for non-US cities, don't match on admin_name
numbeo_globe = numbeo[numbeo['country'] != "United States"]

numbeo_globe = pd.merge(numbeo_globe, city_data[cols], how='left', on=['city_ascii', 'country'])
numbeo_globe = numbeo_globe.reindex(columns=admin_cols)

In [29]:
x = numbeo[numbeo['country'] != "United States"]
for city, country in zip(x['city_ascii'], x['country']):
    y = numbeo_globe.loc[(numbeo_globe.city_ascii == city) & (numbeo_globe.country == country)]
    if len(y) != 1:
        print(y)

   city_ascii admin_name country   region  beer_market  bread  coffee  \
75    Uppsala    Uppsala  Sweden  Europe          2.47   2.92    4.11   
76    Uppsala  Stockholm  Sweden  Europe          2.47   2.92    4.11   

    beer_pub  latitude  longitude  population  
75      8.34   59.8498    17.6389    164535.0  
76      8.34   59.8601    17.6400    133117.0  
    city_ascii admin_name country          region  beer_market  bread  coffee  \
132    Windsor    Ontario  Canada  North America          2.01   2.15    3.27   
133    Windsor     Quebec  Canada  North America          2.01   2.15    3.27   

     beer_pub  latitude  longitude  population  
132      4.72   42.2833      -83.0    276165.0  
133      4.72   45.5667      -72.0      5367.0  
    city_ascii           admin_name country                region  \
309      Natal             Amazonas  Brazil  South/Latin America    
310      Natal  Rio Grande do Norte  Brazil  South/Latin America    

     beer_market  bread  coffee  beer

In [30]:
numbeo_globe = numbeo_globe.drop(index=[76, 133, 309, 388])

In [31]:
def update_row(wrong_city, right_data):
    right_data["city"] = wrong_city
    
    y = lost[lost['city_ascii'] == wrong_city]
    y = y.drop(columns=['admin_name', 'region', 'latitude', 'longitude', 'population'])
    
    merge_cols = ['city_ascii', 'admin_name', 'region', 'latitude', 'longitude', 'population', 'city']
    z = pd.merge(y, right_data[merge_cols], how='left', left_on=['city_ascii'], right_on=['city'])
    z = z.drop(columns=['city_ascii_x', 'city'])
    z = z.rename(columns={'city_ascii_y': 'city_ascii'})
    z = z.reindex(columns=admin_cols)
    
    return z

In [32]:
lost = numbeo_globe[np.isnan(numbeo_globe['latitude'])]

for i in range(len(lost)):
    row = lost.iloc[i]
    city = row['city_ascii']
    country = row['country']
    index = row.name
    x = city_data[city_data.city.str.contains(city.strip("()")[-4:]) & (city_data['country'] == country)]
    y = city_data[city_data.city.str.contains(city.strip("()")[:4]) & (city_data['country'] == country)]
    if len(x) == 1:
        lost[lost.index == index] = update_row(city, x).values
    elif len(y) == 1:
        lost[lost.index == index] = update_row(city, y).values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docume

In [33]:
import qwikidata.sparql

def get_city_wikidata(city, country):
    query = """
    SELECT ?city ?cityLabel ?country ?countryLabel ?population
    WHERE
    {
      ?city rdfs:label '%s'@en.
      ?city wdt:P1082 ?population.
      ?city wdt:P17 ?country.
      ?city rdfs:label ?cityLabel.
      ?country rdfs:label ?countryLabel.
      FILTER(LANG(?cityLabel) = "en").
      FILTER(LANG(?countryLabel) = "en").
      FILTER(CONTAINS(?countryLabel, "%s")).
    }
    """ % (city, country)

    res = qwikidata.sparql.return_sparql_query_results(query)
    out = res['results']['bindings'][0]
    return out

In [34]:
import geocoder
import time
from requests import Session

with Session() as s:
    for row in lost[np.isnan(lost.latitude)].values:
        city = row[0]
        country = row[2]
        try:
            latlng = geocoder.osm(city + ", " + country, session=s).latlng
            if latlng:
#             print(city, country, latlng)
                lost.loc[lost.city_ascii == city, 'latitude'] = latlng[0]
                lost.loc[lost.city_ascii == city, "longitude"] = latlng[1]
            
            wikidata = get_city_wikidata(city, country)
            lost.loc[lost.city_ascii == city, "population"] = int(wikidata['population']['value'])
        except:
            pass
        time.sleep(1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [35]:
for i in range(len(lost)):
    row = lost.iloc[i]
    index = row.name
    numbeo_globe[numbeo_globe.index == index] = row.values

In [40]:
numbeo_clean = pd.concat([numbeo_us, numbeo_globe])
len(numbeo) == len(numbeo_clean)

True

In [41]:
numbeo.to_csv("numbeo.csv")

numbeo_clean.index = list(range(len(numbeo_clean)))
numbeo_clean.to_csv("numbeo_clean.csv")
numbeo_clean.to_json("numbeo_clean.json")

In [39]:
numbeo_clean[np.isnan(numbeo_clean.latitude)]

Unnamed: 0,city_ascii,admin_name,country,region,beer_market,bread,coffee,beer_pub,latitude,longitude,population
