In [1]:
import numpy as np
import pandas as pd
import geocoder
from requests import Session

In [2]:
# Load the cleaned CSV from the cell above
# Only run the subsequent cells if you want to gather fresh data
numbeo_beer = pd.read_table('numbeo_beer.txt', names=["rank", "city", "beer"], index_col="rank")
numbeo_bread = pd.read_table('numbeo_bread.txt', names=["rank", "city", "bread"], index_col="rank")
numbeo_coffee = pd.read_table("numbeo_coffee.txt", names=['rank', 'city', 'coffee'], index_col="rank")
numbeo = pd.merge(numbeo_beer, numbeo_bread, how="outer", on="city")
numbeo = pd.merge(numbeo, numbeo_coffee, how="outer", on="city")

city_data = pd.read_csv('../city-data/world-cities-data.csv')
city_data['region'] = city_data['region'].str.title()

In [3]:
# create a separate country column
split = numbeo['city'].str.split(", ")
numbeo['country'] = split.str[-1]
numbeo['city'] = split.str[0]

In [50]:
# a quick look at wrong country names
city_data[city_data.country.str.contains(',')].groupby(by="country").count()

Unnamed: 0_level_0,city_ascii,region,latitude,longitude,population,iso2,iso3,admin_name,city
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"Bahamas, The",3,3,3,3,3,3,3,2,3
"Gambia, The",9,9,9,9,8,9,9,9,9
"Korea, North",37,37,37,37,36,37,37,37,37
"Korea, South",84,84,84,84,80,84,84,84,84
"Micronesia, Federated States Of",5,5,5,5,1,5,5,5,5
"Saint Helena, Ascension, And Tristan Da Cunha",2,0,2,2,0,2,2,2,2


In [4]:
# to align with city_data column names
numbeo = numbeo.rename(columns={"city": "city_ascii"})

# some data cleaning
numbeo.loc[numbeo.city_ascii.str.contains("Jeddah"), "city_ascii"] = "Jeddah"
numbeo.loc[numbeo.city_ascii.str.contains("Calicut"), "city_ascii"] = "Calicut"
numbeo.city_ascii = numbeo.city_ascii.str.replace("Saint", "St.")

city_data.loc[city_data.country.str.contains("Bahamas"), "country"] = "Bahamas"
city_data.loc[city_data.country.str.contains("Korea, South"), "country"] = "South Korea"

In [5]:
# for US cities, generate a list of unabbreviated state names
def state_unabbr(state):
    abbr = pd.read_table("us-state-abbr.tsv", names=['abbr', 'name'], header=0)
    name = abbr.loc[abbr['abbr'] == state, 'name']
    if len(name) > 0:
        return name.values[0]
    else:
        return np.nan

states = []
for i in split:
    if i[-1] == "United States":
        states.append(state_unabbr(i[1]))

In [6]:
# separate numbeo into US and non-US cities
# for US cities, add states under admin_name, then merge with city_data latitude, longitude, region, and population
numbeo_us = numbeo[numbeo['country'] == 'United States']
numbeo_us["admin_name"] = states
us_cols = ["city_ascii", 'country','region', 'latitude', 'longitude', 'population', 'admin_name']
numbeo_us = pd.merge(numbeo_us, city_data[us_cols], how='left', on=['city_ascii', 'country', 'admin_name'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [7]:
# for non-US cities, don't match on admin_name
numbeo_globe = numbeo[numbeo['country'] != "United States"]
globe_cols = ["city_ascii", 'country','region', 'latitude', 'longitude', 'population', "admin_name"]
numbeo_globe = pd.merge(numbeo_globe, city_data[globe_cols], how='left', on=['city_ascii', 'country'])

In [8]:
# I don't know why they aren't the same length after they're reunited
# probably some duplicated values in numbeo_globe due to ambiguities in city, country
numbeo_admin = pd.concat([numbeo_us, numbeo_globe])

# reorder columns
admin_cols = ['city_ascii', 'admin_name', 'country', 'region', 'bread', 'beer', 'coffee', 'latitude', 'longitude', 'population']
numbeo_admin = numbeo_admin.reindex(columns=admin_cols)

len(numbeo) == len(numbeo_admin)

False

In [10]:
lost = numbeo_admin[np.isnan(numbeo_admin['latitude'])]
len(lost)

56

In [11]:
def update_row(wrong_city, right_data):
    right_data["city"] = wrong_city
    
    y = lost[lost['city_ascii'] == wrong_city]
    y = y.drop(columns=['admin_name', 'region', 'latitude', 'longitude', 'population'])
    
    merge_cols = ['city_ascii', 'admin_name', 'region', 'latitude', 'longitude', 'population', 'city']
    z = pd.merge(y, right_data[merge_cols], how='left', left_on=['city_ascii'], right_on=['city'])
    z = z.drop(columns=['city_ascii_x', 'city'])
    z = z.rename(columns={'city_ascii_y': 'city_ascii'})
    z = z.reindex(columns=admin_cols)
    
    return z

In [12]:
found = pd.DataFrame()
for i in range(len(lost)):
#     print(lost.iloc[i])
    row = lost.iloc[i]
    city = row['city_ascii']
    x = city_data[city_data.city.str.contains(city.strip("()")[-4:]) & (city_data['country'] == row[2])]
    y = city_data[city_data.city.str.contains(city.strip("()")[:4]) & (city_data['country'] == row[2])]
    if len(x) == 1:
        found = pd.concat([found, update_row(city, x)])
    elif len(y) == 1:
        found = pd.concat([found, update_row(city, y)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
len(found)

20

In [17]:
numbeo_admin = pd.concat([numbeo_admin, found])
numbeo_clean = numbeo_admin.dropna()

In [19]:
numbeo.to_csv("numbeo.csv")
numbeo_admin.to_csv("numbeo_admin.csv")
numbeo_clean.to_csv("numbeo_clean.csv")

In [18]:
numbeo_clean

Unnamed: 0,city_ascii,admin_name,country,region,bread,beer,coffee,latitude,longitude,population
0,Richmond,Virginia,United States,North America,2.63,3.71,4.54,37.5295,-77.4756,1075798.0
1,Fort Collins,Colorado,United States,North America,2.93,3.47,4.29,40.5478,-105.0656,312666.0
2,Boston,Massachusetts,United States,North America,3.16,3.37,4.31,42.3188,-71.0846,4688346.0
3,New York,New York,United States,North America,3.78,3.29,4.82,40.6943,-73.9249,18713220.0
4,Philadelphia,Pennsylvania,United States,North America,2.83,3.26,3.92,40.0077,-75.1339,5649300.0
...,...,...,...,...,...,...,...,...,...,...
0,Lorca,Murcia,Spain,Europe,1.46,0.90,2.52,37.6833,-1.7000,94404.0
0,Sevilla,Andalusia,Spain,Europe,1.27,0.79,1.74,37.3828,-5.9731,1212045.0
0,Odesa,Odes’ka Oblast’,Ukraine,Europe,0.54,0.73,1.20,46.4775,30.7326,1010783.0
0,Kyiv,"Kyyiv, Misto",Ukraine,Europe,0.59,0.72,1.21,50.4500,30.5236,2967000.0


In [None]:
import qwikidata.sparql

def get_city_wikidata(city, country):
    query = """
    SELECT ?city ?cityLabel ?country ?countryLabel ?population
    WHERE
    {
      ?city rdfs:label '%s'@en.
      ?city wdt:P1082 ?population.
      ?city wdt:P17 ?country.
      ?city rdfs:label ?cityLabel.
      ?country rdfs:label ?countryLabel.
      FILTER(LANG(?cityLabel) = "en").
      FILTER(LANG(?countryLabel) = "en").
      FILTER(CONTAINS(?countryLabel, "%s")).
    }
    """ % (city, country)

    res = qwikidata.sparql.return_sparql_query_results(query)
    out = res['results']['bindings'][0]
    return out

In [None]:
# This cell takes >10min to run D:
import time

numbeo['population2'] = np.zeros(len(numbeo), dtype=int)
for city, country in zip(numbeo["city_ascii"], numbeo['country']):
    try:
        data = get_city_wikidata(city, country)
        numbeo.loc[(numbeo['city_ascii'] == city) & (numbeo['country'] == country), "population2"] = int(data['population']['value'])
    except:
        numbeo.loc[(numbeo['city_ascii'] == city) & (numbeo['country'] == country), "population2"] = np.nan
    time.sleep(1)

In [None]:
# This cell also takes a while
numbeo["latitude2"] = np.zeros(len(numbeo))
numbeo['longitude2'] = np.zeros(len(numbeo))
with Session() as s:
    for city, country in zip(numbeo["city_ascii"], numbeo['country']):
        latlng = geocoder.osm(city + ", " + country, session=s).latlng
        if latlng:
            numbeo.loc[(numbeo['city_ascii'] == city) & (numbeo['country'] == country), 'latitude2'] = latlng[0]
            numbeo.loc[(numbeo['city_ascii'] == city) & (numbeo['country'] == country), 'longitude2'] = latlng[1]
        time.sleep(1)

In [None]:
# For some reason, one city consistently returns None during the for loop
# while None in locations:
#     i = locations.index(None)
#     locations[i] = geocoder.osm(numbeo["city"][i]).latlng