In [1]:
from datetime import date
from os import getenv

import country_converter as coco
import folium
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, text

In [2]:
# The connection string is read from the POSTGRES_CONN environment variable.
postgres_conn = getenv('POSTGRES_CONN')
engine = create_engine(postgres_conn)

# I get the country codes and the U.S. state codes tables from Wikipedia and save them in the database.
I also make those codes the primary keys.

### Table of all countries and Alpha 3 codes

In [3]:
url_countries = 'https://en.wikipedia.org/wiki/ISO_3166-1'

# Alpha-3 codes whose names must stay untouched (presumably because the two
# Virgin Islands entries differ only by their parenthetical suffix — TODO confirm).
keep_full_name_codes = ['VGB', 'VIR']


def clean_country_name(name):
    """Return the country name without quotes, qualifiers, footnotes or comma tails."""
    name = name.strip('"')
    name = name.split(' (')[0]  # drop a parenthetical qualifier
    name = name.split('[')[0]   # drop a footnote marker
    return name.split(',')[0]   # drop everything after the first comma


# The second table on the page holds the country list.
df_countries = pd.read_html(url_countries)[1]
df_countries = df_countries.rename(columns={'English short name (using title case)': 'country',
                                            'Alpha-3 code': 'country_code',
                                            'Independent': 'independent'})
# Keep only country_code, country and independent (in that order).
df_countries = df_countries.iloc[:, [2, 0, 5]].copy()

# iterrows() yields the positional index (never an Alpha-3 code) and writing to
# its rows is not guaranteed to reach the frame, so the exclusion is expressed
# as a mask on the country_code column and the result is written back via .loc.
needs_cleaning = ~df_countries['country_code'].isin(keep_full_name_codes)
df_countries.loc[needs_cleaning, 'country'] = (
    df_countries.loc[needs_cleaning, 'country'].map(clean_country_name)
)

df_countries.head(2)

Unnamed: 0,country_code,country,independent
0,AFG,Afghanistan,Yes
1,ALA,Åland Islands,No


In [4]:
# Store the countries table, then declare country_code as its primary key.
df_countries.to_sql('countries', engine, index=False)
# engine.begin() commits the DDL when the block exits without error; text() is
# required because SQLAlchemy 2.x no longer executes plain SQL strings.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE countries ADD CONSTRAINT PK_country_code PRIMARY KEY (country_code);"""))

### Table of all U.S. states in ISO format

In [5]:
url_states = 'https://en.wikipedia.org/wiki/ISO_3166-2:US'
# Take the first table on the page and relabel its three columns.
state_tables = pd.read_html(url_states)
df_states = state_tables[0]
df_states = df_states.set_axis(['state_code', 'state_name', 'category'], axis=1)
df_states.head(2)

Unnamed: 0,state_code,state_name,category
0,US-AL,Alabama,State
1,US-AK,Alaska,State


In [6]:
# Store the states table, then declare state_code as its primary key.
df_states.to_sql('states', engine, index=False)
# engine.begin() commits the DDL when the block exits without error; text() is
# required because SQLAlchemy 2.x no longer executes plain SQL strings.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE states ADD CONSTRAINT PK_state_code PRIMARY KEY (state_code);"""))

# Numbeo. Analyzing and scraping cities and their indices.

As a basis I take Numbeo's "Current Cost of Living Index" table, which currently lists 522 cities. So far it is not clear how many countries it covers or how many cities each of those countries has.

In [7]:
# Base table: the second HTML table of the current cost-of-living ranking page.
url = 'https://www.numbeo.com/cost-of-living/rankings_current.jsp'
cost_of_living_tables = pd.read_html(url)
df = cost_of_living_tables[1]
df.head(2)

Unnamed: 0,Rank,City,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
0,,"Hamilton, Bermuda",145.04,94.69,120.85,150.27,142.75,84.02
1,,"Zug, Switzerland",116.6,59.0,88.92,117.28,119.03,149.67


I add other Numbeo indexes related to cities. The indexes are duplicated in different tables but cover a different number of cities. I choose the ones with more cities.

In [8]:
# Pairs of (ranking page URL, columns to take from that page).
# Every column list starts with 'City'.
index_sources = [
    (
        'https://www.numbeo.com/quality-of-life/rankings_current.jsp',
        [
            'City',
            'Quality of Life Index',
            'Property Price to Income Ratio',
            'Traffic Commute Time Index',
            'Climate Index',
        ],
    ),
    (
        'https://www.numbeo.com/crime/rankings_current.jsp',
        ['City', 'Safety Index'],
    ),
    (
        'https://www.numbeo.com/health-care/rankings_current.jsp',
        ['City', 'Health Care Index'],
    ),
    (
        'https://www.numbeo.com/pollution/rankings_current.jsp',
        ['City', 'Pollution Index'],
    ),
]
indexes_dict = dict(index_sources)

In [9]:
# Pull the selected columns from each extra ranking page and attach them by city.
for page_url, wanted_columns in indexes_dict.items():
    extra_indexes = pd.read_html(page_url)[1][wanted_columns]
    df = df.merge(extra_indexes, how='left', on='City')

df = df.drop(columns=['Rank', 'Restaurant Price Index'])
# Normalise the labels, e.g. "Cost of Living Index" -> "cost_of_living".
df.columns = [label.removesuffix(' Index').lower().replace(' ', '_') for label in df.columns]
df = df.rename(columns={'city': 'city_country'})

I add to the table a link to each city's page, with the display currency explicitly set to USD.

In [10]:
# Fetch the ranking page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() stops on an HTTP error page
# instead of silently parsing it.
page = requests.get(url, timeout=30)
page.raise_for_status()
response = page.text
soup = BeautifulSoup(response, 'lxml')
links = soup.find_all('td', class_='cityOrCountryInIndicesTable')

# Collect [displayed name, link with the USD display currency] for each cell.
href_list = []
for cell in links:
    anchor = cell.find('a')
    if anchor is None:
        continue  # cell without a link: nothing to record
    href = anchor.get('href')
    if not href:
        continue  # anchor without a target: nothing to record
    href_list.append([anchor.text, href + '?displayCurrency=USD'])

city_link = pd.DataFrame(href_list, columns=['city_country', 'link'])

df = df.merge(city_link, how='left', on='city_country')

### Split City - State - Country

In [11]:
# Split on the LAST ", ": everything before it is "city[, state]", the rest is the country.
split_table1 = (
    df['city_country']
    .str.rsplit(', ', n=1, expand=True)
    .set_axis(['city_state', 'country'], axis=1)
)

# Split on the FIRST ", ": city on the left, optional state on the right.
split_table2 = (
    split_table1['city_state']
    .str.split(', ', n=1, expand=True)
    .set_axis(['city', 'state_code'], axis=1)
)

df = pd.concat([split_table2, split_table1['country'], df], axis=1)

# Removing duplicate cities in brackets: keep only the text before " (".
df['city'] = df['city'].map(lambda name: name.split(' (')[0])

### Adding alpha_3 code to countries

I will delete Kosovo first because it has no alpha3 code.

In [12]:
# Index labels of the rows whose country is Kosovo, then drop exactly those rows.
kosovo_rows = df.index[df['country'] == 'Kosovo (Disputed Territory)']
df = df.drop(index=kosovo_rows)

In [13]:
# Convert every distinct country name once and spread the codes back through a merge.
# (Converting row by row on df works on one line too, but the author measured it
# as taking 4 times as long.)
unique_countries = df['country'].unique()
new_df = pd.DataFrame({'country': unique_countries})
new_df['country_code'] = new_df['country'].map(lambda name: coco.convert(names=name, to='ISO3'))
df = df.merge(new_df, how='left', on='country')

df.head(3)

Unnamed: 0,city,state_code,country,city_country,cost_of_living,rent,cost_of_living_plus_rent,groceries,local_purchasing_power,quality_of_life,property_price_to_income_ratio,traffic_commute_time,climate,safety,health_care,pollution,link,country_code
0,Hamilton,,Bermuda,"Hamilton, Bermuda",145.04,94.69,120.85,150.27,84.02,,,,,,,,https://www.numbeo.com/cost-of-living/in/Hamil...,BMU
1,Zug,,Switzerland,"Zug, Switzerland",116.6,59.0,88.92,117.28,149.67,,,,,,,,https://www.numbeo.com/cost-of-living/in/Zug?d...,CHE
2,Basel,,Switzerland,"Basel, Switzerland",116.34,41.87,80.56,122.3,116.43,201.27,7.98,20.43,82.82,80.08,,,https://www.numbeo.com/cost-of-living/in/Basel...,CHE


### Adding a code to USA states
A few non-U.S. rows also carry a state/region value (four at the time of this run, shown below).
I will set all of these values to None.
I will add the "US-" prefix to the U.S. state codes to bring them to the ISO standard for U.S. states.

In [14]:
# Rows that have a state value although the country is not the United States.
has_state = df['state_code'].notna()
outside_us = df['country'] != "United States"
df[has_state & outside_us]

Unnamed: 0,city,state_code,country,city_country,cost_of_living,rent,cost_of_living_plus_rent,groceries,local_purchasing_power,quality_of_life,property_price_to_income_ratio,traffic_commute_time,climate,safety,health_care,pollution,link,country_code
50,Nanaimo,BC,Canada,"Nanaimo, BC, Canada",75.62,35.22,56.21,76.61,88.48,,,,,50.77,68.65,,https://www.numbeo.com/cost-of-living/in/Nanai...,CAN
62,St. John's,Newfoundland and Labrador,Canada,"St. John's, Newfoundland and Labrador, Canada",74.58,20.17,48.43,72.64,151.18,,,,,53.01,,,https://www.numbeo.com/cost-of-living/in/St-Jo...,CAN
441,Qingdao,Shandong,China,"Qingdao, Shandong, China",32.51,10.64,22.0,34.81,74.03,160.28,19.13,25.12,70.4,,77.48,51.65,https://www.numbeo.com/cost-of-living/in/Qingd...,CHN
453,Batumi,Ajara,Georgia,"Batumi, Ajara, Georgia",31.65,12.44,22.42,30.99,27.42,,,,,,,,https://www.numbeo.com/cost-of-living/in/Batum...,GEO


In [15]:
# Blank out the state value on every row whose country is not the United States.
non_us_with_state = df['state_code'].notna() & (df['country'] != "United States")
indexes = df.index[non_us_with_state]
df.loc[indexes, 'state_code'] = None


def to_iso_state_code(code):
    """Return the 'US-XX' form of a state code, or None when the code is missing.

    Missing covers both None and NaN, so a NaN never turns into 'US-nan'.
    Codes that already carry the prefix are returned unchanged, which keeps the
    cell safe to re-run.
    """
    if pd.isna(code):
        return None
    if code.startswith('US-'):
        return code
    return f"US-{code}"


df['state_code'] = df['state_code'].map(to_iso_state_code)

# Map
There are 505 cities in the Numbeo index, but it is not clear which countries they belong to, or whether those countries are many or few. How many cities does each country have, and in what proportions? The total numbers won't tell me much; it is much clearer to show them on a map.

In [16]:
# Number of distinct cities per country code, largest first.
cities_per_country = df.groupby('country_code')['city'].nunique()
map_df = cities_per_country.reset_index().sort_values('city', ascending=False)

m = folium.Map(location=[37.87820990704326, 6.555063556986549], zoom_start=1.5)

# Colour each country of the GeoJSON by its city count; the count is matched to
# a feature through the feature's ISO_A3 property.
choropleth = folium.Choropleth(
    geo_data='scripts/data/world.geojson',
    name="choropleth",
    data=map_df,
    columns=['country_code', 'city'],
    key_on="feature.properties.ISO_A3",
    fill_color="YlGn",
    nan_fill_color='pink',
    fill_opacity=0.8,
    bins=7,
    reset=True,
    highlight=True,
    legend_name='Count of cities',
)
choropleth.add_to(m)

<folium.features.Choropleth at 0x7f3ccb497640>

In [17]:
#m - show interactive map by folium

![title](./data/map.png)

On GitHub the interactive map is not displayed, so instead of displaying a map, I inserted a picture.

Now I can see which countries are in the index and which are not, and how many cities are in each country. I'm fine with it, except that I don't need so many cities in India. I will remove the unnecessary ones and keep only two cities (Delhi and Mumbai).

In [18]:
# Every Indian city except Delhi and Mumbai is removed.
is_india = df['country'] == 'India'
is_kept_city = df['city'].isin(['Delhi', 'Mumbai'])
india = df.loc[is_india & ~is_kept_city]

df = df.drop(index=india.index)

I will prepare the indexes (Primary Key) for the future table in postgres, so I can immediately load it into the database.

In [19]:
# Renumber the rows from 0, then add a 1-based city_id as the first column.
df = df.reset_index(drop=True)
df.insert(0, 'city_id', range(1, len(df) + 1))
df.head(3)

Unnamed: 0,city_id,city,state_code,country,city_country,cost_of_living,rent,cost_of_living_plus_rent,groceries,local_purchasing_power,quality_of_life,property_price_to_income_ratio,traffic_commute_time,climate,safety,health_care,pollution,link,country_code
0,1,Hamilton,,Bermuda,"Hamilton, Bermuda",145.04,94.69,120.85,150.27,84.02,,,,,,,,https://www.numbeo.com/cost-of-living/in/Hamil...,BMU
1,2,Zug,,Switzerland,"Zug, Switzerland",116.6,59.0,88.92,117.28,149.67,,,,,,,,https://www.numbeo.com/cost-of-living/in/Zug?d...,CHE
2,3,Basel,,Switzerland,"Basel, Switzerland",116.34,41.87,80.56,122.3,116.43,201.27,7.98,20.43,82.82,80.08,,,https://www.numbeo.com/cost-of-living/in/Basel...,CHE


I add 'state_name' from the database to the table. Then I save the columns needed for scraping to a file, with the primary key (city_id) as the index.

In [20]:
# Attach the full state name from the 'states' table stored in the database.
states_tbl = pd.read_sql('states', engine)[['state_code', 'state_name']]
df = df.merge(states_tbl, on='state_code', how='left')

# Columns are selected by name rather than by position (including negative
# offsets), so the selection no longer depends on the exact column order of df.
link_columns = [
    'city_id',
    'city',
    'state_code',
    'state_name',
    'country',
    'country_code',
    'city_country',
    'link',
]
df_to_file = df[link_columns].set_index('city_id')
df_to_file.to_pickle("./scripts/data/numbeo_links.pkl")

I form and load a table of cities in the database with information about who added the data to the table and assign keys.

In [21]:
# Columns of the cities table, selected by name so the result does not depend
# on the column order of df.
df_cities = df[['city_id', 'city', 'state_code', 'country_code']].copy()
df_cities.to_sql('cities', engine, if_exists='append', index=False)

# engine.begin() commits the DDL when the block exits without error; text() is
# required because SQLAlchemy 2.x no longer executes plain SQL strings.
# The statements run one per execute() call, in the original order.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE cities ADD CONSTRAINT PK_city_id PRIMARY KEY (city_id);"""))
    con.execute(text("""ALTER TABLE cities ADD CONSTRAINT FK_country_code FOREIGN KEY (country_code) REFERENCES public.countries (country_code);"""))
    con.execute(text("""ALTER TABLE cities ADD CONSTRAINT FK_state_code FOREIGN KEY (state_code) REFERENCES public.states (state_code);"""))

# I form a table of Numbeo indices by city and upload it to the database

In [22]:
# city_id (position 0) plus the index columns at positions 5..16.
index_positions = [0, *range(5, 17)]
df_city_index = (
    df.iloc[:, index_positions]
    .copy()
    # add columns recording when and by whom the data were added to the table
    .assign(sys_updated_date=date.today(), sys_updated_by=getenv('NB_USER'))
)
df_city_index.head(2)

Unnamed: 0,city_id,cost_of_living,rent,cost_of_living_plus_rent,groceries,local_purchasing_power,quality_of_life,property_price_to_income_ratio,traffic_commute_time,climate,safety,health_care,pollution,sys_updated_date,sys_updated_by
0,1,145.04,94.69,120.85,150.27,84.02,,,,,,,,2022-07-17,analyst_k2
1,2,116.6,59.0,88.92,117.28,149.67,,,,,,,,2022-07-17,analyst_k2


In [23]:
# Store the per-city indexes, then link city_id to the cities table.
df_city_index.to_sql('numbeo_indexes_by_city', engine, index=False)
# engine.begin() commits the DDL when the block exits without error; text() is
# required because SQLAlchemy 2.x no longer executes plain SQL strings.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE numbeo_indexes_by_city ADD CONSTRAINT FK_numbeo_indexes_by_city FOREIGN KEY (city_id) REFERENCES public.cities (city_id);"""))

# I form a table of Numbeo indices by country and upload it to the database

#### From Cost of Living Index by Country 2022

In [24]:
# Second table of the cost-of-living-by-country page, without the Rank column.
col_tables = pd.read_html('https://www.numbeo.com/cost-of-living/rankings_by_country.jsp')
df_col_index = col_tables[1].drop(columns='Rank')
# Normalise the labels, e.g. "Cost of Living Index" -> "cost_of_living".
df_col_index.columns = [
    label.removesuffix(' Index').lower().replace(' ', '_')
    for label in df_col_index.columns
]

#### From Quality of Life Index by Country

In [25]:
# Second table of the quality-of-life-by-country page, without Rank and
# without the Cost of Living Index and Safety Index columns.
qol_tables = pd.read_html('https://www.numbeo.com/quality-of-life/rankings_by_country.jsp')
df_qol_index = qol_tables[1].drop(columns=['Rank', 'Cost of Living Index', 'Safety Index'])
# Normalise the labels, e.g. "Quality of Life Index" -> "quality_of_life".
df_qol_index.columns = [
    label.removesuffix(' Index').lower().replace(' ', '_')
    for label in df_qol_index.columns
]

#### From Average Monthly Net Salary by Country

In [26]:
def salary_to_float(cell):
    """Turn a salary string into a float: strip trailing '\xa0'/'$' characters and remove commas."""
    return float(cell.rstrip('\xa0$').replace(',', ''))


# The columns of this table are labelled 0..3; columns 1 and 3 are kept.
salary_tables = pd.read_html('https://www.numbeo.com/cost-of-living/country_price_rankings?itemId=105')
df_salary_index = salary_tables[1].drop(columns=[0, 2])
df_salary_index[3] = df_salary_index[3].map(salary_to_float)
df_salary_index = df_salary_index.rename(columns={1: 'country', 3: 'avg_salary(usd)'})

#### From Crime Index by Country

In [27]:
# From the second table of the crime-by-country page keep only the country and
# its Safety Index, under the short column names used by the other tables.
crime_tables = pd.read_html('https://www.numbeo.com/crime/rankings_by_country.jsp')
df_safety_index = crime_tables[1][['Country', 'Safety Index']].rename(
    columns={'Country': 'country', 'Safety Index': 'safety'}
)

#### Merge 4 Country Tables

In [28]:
# Left-join the three other country tables onto the cost-of-living table.
df_country_index = (
    df_col_index
    .merge(df_qol_index, how='left', on='country')
    .merge(df_salary_index, how='left', on='country')
    .merge(df_safety_index, how='left', on='country')
    # add columns recording when and by whom the data were added to the table
    .assign(sys_updated_date=date.today(), sys_updated_by=getenv('NB_USER'))
)

Removing Kosovo

In [29]:
# Index labels of the rows whose country is Kosovo, then drop exactly those rows.
kosovo_country_rows = df_country_index.index[df_country_index['country'] == 'Kosovo (Disputed Territory)']
df_country_index = df_country_index.drop(index=kosovo_country_rows)

In [30]:
# Convert each country name to its ISO3 code.
df_country_index['country_code'] = df_country_index['country'].map(
    lambda name: coco.convert(names=name, to='ISO3')
)
# Put country_code first; the original first column (the country name) is left out.
middle_columns = list(df_country_index.columns[1:-1])
df_country_index = df_country_index.reindex(columns=['country_code', *middle_columns])
df_country_index.head(2)

Unnamed: 0,country_code,cost_of_living,rent,cost_of_living_plus_rent,groceries,restaurant_price,local_purchasing_power,quality_of_life,purchasing_power,health_care,property_price_to_income_ratio,traffic_commute_time,pollution,climate,avg_salary(usd),safety,sys_updated_date,sys_updated_by
0,BMU,141.74,98.96,121.39,142.62,144.74,82.34,,,,,,,,,,2022-07-17,analyst_k2
1,CHE,110.34,50.21,81.73,113.35,104.3,116.19,195.06,116.19,75.32,8.86,28.37,19.39,79.56,5979.26,77.88,2022-07-17,analyst_k2


In [31]:
# Store the per-country indexes, then link country_code to the countries table.
df_country_index.to_sql('numbeo_indexes_by_country', engine, if_exists='append', index=False)
# engine.begin() commits the DDL when the block exits without error; text() is
# required because SQLAlchemy 2.x no longer executes plain SQL strings.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE numbeo_indexes_by_country ADD CONSTRAINT FK_country_code FOREIGN KEY (country_code) REFERENCES public.countries (country_code);"""))

# Legatum Prosperity Index 2021 to Data Base

In [32]:
# Load the semicolon-separated Legatum data set, skipping its first column.
df_prosperity = pd.read_csv('./scripts/data/2021_Full_Data_Set_-_Legatum_Prosperity_Index.csv', sep=';').iloc[:, 1:]
df_prosperity = df_prosperity.rename(columns={'area_code': 'country_code'})
df_prosperity.to_sql('legatum_indexes', engine, if_exists='append', index=False)
# engine.begin() commits the DDL when the block exits without error; text() is
# required because SQLAlchemy 2.x no longer executes plain SQL strings.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE legatum_indexes ADD CONSTRAINT FK_country_code FOREIGN KEY (country_code) REFERENCES public.countries (country_code);"""))