In [25]:
from datetime import date
from os import getenv

import country_converter as coco
import folium
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, text

In [26]:
# The database URL is read from the AIRFLOW__CORE__SQL_ALCHEMY_CONN environment variable.
db_url = getenv('AIRFLOW__CORE__SQL_ALCHEMY_CONN')
engine = create_engine(db_url)

# Достану из википедии и сохраню в БД таблицы кодов стран и штатов США.
И сделаю эти коды первичными ключами.

### Таблица всех стран и кодов Alpha 3

In [27]:
url_countries = 'https://en.wikipedia.org/wiki/ISO_3166-1'


def _clean_country_name(name):
    """Return ``name`` without surrounding double quotes and without anything
    that follows the first ' (', '[' or ','."""
    name = name.strip('"')
    for separator in (' (', '[', ','):
        name = name.split(separator)[0]
    return name


# The second table on the page holds the officially assigned codes.
df_countries = pd.read_html(url_countries)[1].rename(
    columns={'English short name (using title case)': 'country',
             'Alpha-3 code': 'country_code',
             'Independent': 'independent'})

# Keep alpha-3 code, name and independence flag; .copy() gives an independent
# frame so the assignment below does not write into a slice of the parsed table.
df_countries = df_countries.iloc[:, [2, 0, 5]].copy()

# VGB and VIR are excluded from the trimming. The mask is built from the
# country_code column: the row label produced by iterrows() is just the
# integer position, so comparing it with 'VGB'/'VIR' never matched, and
# writing into the row yielded by iterrows() is not guaranteed to reach the frame.
needs_cleaning = ~df_countries['country_code'].isin(['VGB', 'VIR'])
df_countries.loc[needs_cleaning, 'country'] = (
    df_countries.loc[needs_cleaning, 'country']
    .map(_clean_country_name, na_action='ignore')
)

df_countries.head(2)

Unnamed: 0,country_code,country,independent
0,AFG,Afghanistan,Yes
1,ALA,Åland Islands,No


In [28]:
# Store the country lookup table. With pandas' default if_exists='fail' this
# raises if the table already exists.
df_countries.to_sql('country', engine, index=False)

# engine.begin() commits on success and rolls back on error; text() is required
# because Connection.execute() no longer accepts a plain SQL string in SQLAlchemy 2.x.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE country ADD CONSTRAINT PK_country_code PRIMARY KEY (country_code);"""))

### Таблица всех штатов США в формате ISO

In [29]:
# ISO 3166-2:US subdivision codes; the first table on the page is used.
url_states = 'https://en.wikipedia.org/wiki/ISO_3166-2:US'
state_tables = pd.read_html(url_states)
df_states = state_tables[0]
df_states.columns = ['state_code', 'state_name', 'category']
df_states.head(2)

Unnamed: 0,state_code,state_name,category
0,US-AL,Alabama,State
1,US-AK,Alaska,State


In [30]:
# Store the US state lookup table. With pandas' default if_exists='fail' this
# raises if the table already exists.
df_states.to_sql('state', engine, index=False)

# engine.begin() commits on success and rolls back on error; text() is required
# because Connection.execute() no longer accepts a plain SQL string in SQLAlchemy 2.x.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE state ADD CONSTRAINT PK_state_code PRIMARY KEY (state_code);"""))

# Numbeo. Анализ и скрапинг городов и их индексов.

За основу беру таблицу Numbeo "Current Cost of Living Index", в которой на данный момент 506 городов, и пока непонятно, сколько в ней стран и по сколько городов в этих странах.

In [31]:
# Base table: Numbeo "Current Cost of Living Index" (second table parsed from the page).
url = 'https://www.numbeo.com/cost-of-living/rankings_current.jsp'
ranking_tables = pd.read_html(url)
df = ranking_tables[1]
df.head(2)

Unnamed: 0,Rank,City,Cost of Living Index,Rent Index,Cost of Living Plus Rent Index,Groceries Index,Restaurant Price Index,Local Purchasing Power Index
0,,"Hamilton, Bermuda",162.21,106.16,135.75,167.42,155.99,88.24
1,,"Basel, Switzerland",119.6,45.36,84.55,126.05,112.92,108.32


Добавляю другие индексы Numbeo, относящиеся к городам. В разных таблицах индексы дублируются, но охватывают разное количество городов. Выбираю те, в которых больше городов.

In [32]:
# Ranking page URL -> columns to take from that page's table.
# 'City' is listed first for every page because it is the merge key.
indexes_dict = dict([
    ('https://www.numbeo.com/quality-of-life/rankings_current.jsp',
     ['City',
      'Quality of Life Index',
      'Property Price to Income Ratio',
      'Traffic Commute Time Index',
      'Climate Index']),
    ('https://www.numbeo.com/crime/rankings_current.jsp',
     ['City', 'Safety Index']),
    ('https://www.numbeo.com/health-care/rankings_current.jsp',
     ['City', 'Health Care Index']),
    ('https://www.numbeo.com/pollution/rankings_current.jsp',
     ['City', 'Pollution Index']),
])

In [33]:
# Left-join the selected columns of every extra ranking page onto the base table by 'City'.
for page_url, wanted_columns in indexes_dict.items():
    df_temp = pd.read_html(page_url)[1]
    df = df.merge(df_temp[wanted_columns], on='City', how='left')

df.drop(columns=['Rank', 'Restaurant Price Index'], inplace=True)

# Normalise headers: strip the trailing ' Index', lower-case, spaces -> underscores.
df.columns = [name.removesuffix(' Index').lower().replace(' ', '_') for name in df.columns]

# 'city' still holds "City, [State,] Country" at this point, hence the rename.
df.rename(columns={'city': 'city_country'}, inplace=True)
#df.head(3)

Добавляю в таблицу ссылки на страницы данных городов со строгим указанием валюты USD

In [34]:
# Download the ranking page. The timeout stops the cell from hanging forever on
# a stalled connection, and raise_for_status() stops an HTTP error page from
# being parsed silently (which would leave every link empty after the merge).
page = requests.get(url, timeout=30)
page.raise_for_status()
response = page.text
soup = BeautifulSoup(response, 'lxml')
links = soup.find_all('td', class_='cityOrCountryInIndicesTable')

# Collect [displayed city name, city page URL forced to USD] pairs.
href_list = []
for cell in links:
    anchor = cell.find('a')
    # A cell without a usable link cannot contribute a URL; skip it instead of crashing.
    if anchor is None or anchor.get('href') is None:
        continue
    href_list.append([anchor.text, anchor.get('href') + '?displayCurrency=USD'])

city_link = pd.DataFrame(href_list, columns=['city_country', 'link'])

df = df.merge(city_link, on='city_country', how='left')

### Разделить Город - Штат - Страна

In [35]:
# Step 1: split off the country, i.e. the part after the last ', '.
split_table1 = df['city_country'].str.rsplit(', ', n=1, expand=True)
split_table1.columns = ['city_state', 'country']

# Step 2: split the remainder into city and (optional) state at the first ', '.
split_table2 = split_table1['city_state'].str.split(', ', n=1, expand=True)
split_table2.columns = ['city', 'state']

# Put the new city / state / country columns in front of the existing ones.
frames = [split_table2, split_table1['country'], df]
df = pd.concat(frames, axis=1)

# Drop the duplicate city name given in parentheses.
df['city'] = df['city'].map(lambda name: name.split(' (')[0])

#df.head()

### Добавляю alpha_3 код к странам

Сначала удалю Косово, тк у него нет alpha3-кода.

In [36]:
# Remove every row whose country is Kosovo.
kosovo_rows = df.index[df['country'] == 'Kosovo (Disputed Territory)']
df = df.drop(index=kosovo_rows)

Добавляю Alpha_3 код через lambda к каждой строке. Не самый эффективный вариант: обрабатывается каждая строка, страны дублируются, но хотелось просто попробовать (merge уже был).

In [37]:
# Convert each row's country name to its ISO alpha-3 code (one coco call per row).
df['country_code'] = df['country'].apply(lambda name: coco.convert(names=name, to='ISO3'))
df.head(3)

Unnamed: 0,city,state,country,city_country,cost_of_living,rent,cost_of_living_plus_rent,groceries,local_purchasing_power,quality_of_life,property_price_to_income_ratio,traffic_commute_time,climate,safety,health_care,pollution,link,country_code
0,Hamilton,,Bermuda,"Hamilton, Bermuda",162.21,106.16,135.75,167.42,88.24,,,,,,,,https://www.numbeo.com/cost-of-living/in/Hamil...,BMU
1,Basel,,Switzerland,"Basel, Switzerland",119.6,45.36,84.55,126.05,108.32,196.29,8.23,20.79,82.82,79.89,,,https://www.numbeo.com/cost-of-living/in/Basel...,CHE
2,Zurich,,Switzerland,"Zurich, Switzerland",118.33,61.58,91.54,120.83,117.21,195.42,10.04,32.62,81.48,82.68,74.84,17.58,https://www.numbeo.com/cost-of-living/in/Zuric...,CHE


### Добавляю код к штатам USA
Есть три значения штатов, не относящихся к штатам США.
Всем этим значениям присвою значение None.
А к штатам США добавлю "US-" - приведу к ISO стандарту обозначения штатов США.

In [38]:
# Show the rows that have a "state" value but are not in the United States.
has_state = df['state'].notnull()
outside_us = df['country'] != "United States"
df[has_state & outside_us]

Unnamed: 0,city,state,country,city_country,cost_of_living,rent,cost_of_living_plus_rent,groceries,local_purchasing_power,quality_of_life,property_price_to_income_ratio,traffic_commute_time,climate,safety,health_care,pollution,link,country_code
49,Nanaimo,BC,Canada,"Nanaimo, BC, Canada",78.61,37.71,59.3,81.02,83.51,,,,,50.77,68.65,,https://www.numbeo.com/cost-of-living/in/Nanai...,CAN
74,St. John's,Newfoundland and Labrador,Canada,"St. John's, Newfoundland and Labrador, Canada",75.68,23.21,50.9,75.34,143.22,,,,,53.06,,,https://www.numbeo.com/cost-of-living/in/St-Jo...,CAN
446,Batumi,Ajara,Georgia,"Batumi, Ajara, Georgia",32.06,13.04,23.08,31.82,26.04,,,,,,,,https://www.numbeo.com/cost-of-living/in/Batum...,GEO


In [39]:
# Rows with a "state" value outside the United States: blank the value out.
indexes = df[df['state'].notnull() & (df['country'] != "United States")].index
df.loc[indexes, 'state'] = None


def _to_iso_state(code):
    """Return the 'US-XX' form of a state code, or None for a missing value.

    pd.isna() treats both None and NaN as missing; a plain `!= None` test lets
    NaN through and produces the string 'US-nan'. Codes that already carry the
    'US-' prefix are returned unchanged, so re-running the cell does not
    produce 'US-US-XX'.
    """
    if pd.isna(code):
        return None
    if code.startswith('US-'):
        return code
    return f"US-{code}"


df['state'] = df['state'].map(_to_iso_state)

# Карта
В индексе Numbeo 505 городов, но непонятно, какие это страны, много их или мало, сколько городов в этих странах и в каких пропорциях. Суммарные цифры мне мало что скажут, гораздо понятнее будет отобразить их на карте.

In [40]:
# Number of distinct cities per country code, largest first.
cities_per_country = df.groupby('country_code')['city'].nunique()
map_df = cities_per_country.reset_index().sort_values('city', ascending=False)

m = folium.Map(location=[37.87820990704326, 6.555063556986549], zoom_start=1.5)

# Colour each country by its city count; features are matched to rows through
# the ISO_A3 property of the geojson file. nan_fill_color is the colour folium
# uses for features with no matching value.
choropleth = folium.Choropleth(
    geo_data='data/world.geojson',
    name="choropleth",
    data=map_df,
    columns=['country_code', 'city'],
    key_on="feature.properties.ISO_A3",
    fill_color="YlGn",
    nan_fill_color='pink',
    fill_opacity=0.8,
    bins=7,
    reset=True,
    highlight=True,
    legend_name='Count of cities',
)
choropleth.add_to(m)

FileNotFoundError: [Errno 2] No such file or directory: 'data/world.geojson'

![title](data/map.png)

На github интерактивная карта не отображается, поэтому вместо вывода карты, вставил картинку.

Теперь видно какие страны есть в индексе, а каких нет, и сколько городов в каждой стране. Меня всё устраивает, вот только мне не нужно в Индии так много городов. Удалю лишние (оставлю 2).

In [41]:
# Indian cities other than Delhi and Mumbai are removed from the table.
is_india = df['country'] == 'India'
is_kept_city = df['city'].isin(['Delhi', 'Mumbai'])
india = df.loc[is_india & ~is_kept_city]

df = df.drop(index=india.index)

Подготовлю индексы (Primary Key) для будущей таблицы в postgres, чтобы сразу можно было загрузить её в БД.

In [42]:
# Renumber the rows after the drops, then add a 1-based city_id as the first column.
df = df.reset_index(drop=True)
city_ids = df.index + 1
df.insert(0, 'city_id', city_ids)
df.head(3)

Unnamed: 0,city_id,city,state,country,city_country,cost_of_living,rent,cost_of_living_plus_rent,groceries,local_purchasing_power,quality_of_life,property_price_to_income_ratio,traffic_commute_time,climate,safety,health_care,pollution,link,country_code
0,1,Hamilton,,Bermuda,"Hamilton, Bermuda",162.21,106.16,135.75,167.42,88.24,,,,,,,,https://www.numbeo.com/cost-of-living/in/Hamil...,BMU
1,2,Basel,,Switzerland,"Basel, Switzerland",119.6,45.36,84.55,126.05,108.32,196.29,8.23,20.79,82.82,79.89,,,https://www.numbeo.com/cost-of-living/in/Basel...,CHE
2,3,Zurich,,Switzerland,"Zurich, Switzerland",118.33,61.58,91.54,120.83,117.21,195.42,10.04,32.62,81.48,82.68,74.84,17.58,https://www.numbeo.com/cost-of-living/in/Zuric...,CHE


Сохраню таблицу (с нужными колонками) для скрапинга в файл

In [68]:
#df.iloc[:, [0, 1, 2, 3, -1, 4, -2]].to_pickle("./data/numbeo_links.pkl")

Формирую и загружаю в БД таблицу городов с информацией о том кем добавлены данные в таблицу и назначаю ключи

In [51]:
# Select the city table columns by name rather than by position, so the
# selection does not silently shift if columns are added or reordered upstream.
df_cities = df[['city_id', 'city', 'state', 'country_code']].copy()
df_cities.to_sql('city', engine, if_exists='append', index=False)

# engine.begin() commits on success and rolls back on error; text() is required
# because Connection.execute() no longer accepts a plain SQL string in SQLAlchemy 2.x.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE city ADD CONSTRAINT PK_city_id PRIMARY KEY (city_id);
                ALTER TABLE city ADD CONSTRAINT FK_country_code FOREIGN KEY (country_code) REFERENCES public.country (country_code);
                ALTER TABLE city ADD CONSTRAINT FK_state_code FOREIGN KEY (state) REFERENCES public.state (state_code);"""))

# Формирую таблицу Индексов намбео по городам + загрузка в БД

In [54]:
# city_id (position 0) plus the index columns at positions 5..16.
index_positions = [0, *range(5, 17)]
df_city_index = df.iloc[:, index_positions].copy()

# Add columns recording when and by whom the rows were loaded.
df_city_index['sys_updated_date'] = date.today()
df_city_index['sys_updated_by'] = getenv('NB_USER')
df_city_index.head(2)

In [57]:
# Store the per-city indexes. With pandas' default if_exists='fail' this raises
# if the table already exists.
df_city_index.to_sql('numbeo_indexes_by_city', engine, index=False)

# engine.begin() commits on success and rolls back on error; text() is required
# because Connection.execute() no longer accepts a plain SQL string in SQLAlchemy 2.x.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE numbeo_indexes_by_city ADD CONSTRAINT FK_numbeo_indexes_by_city FOREIGN KEY (city_id) REFERENCES public.city (city_id);"""))

# Формирую таблицу Индексов намбео по странам + загрузка в БД

#### From Cost of Living Index by Country 2022

In [58]:
# Cost-of-living ranking by country: drop the rank column and normalise the headers
# (strip the trailing ' Index', lower-case, spaces -> underscores).
col_tables = pd.read_html('https://www.numbeo.com/cost-of-living/rankings_by_country.jsp')
df_col_index = col_tables[1].drop(columns='Rank')
df_col_index.columns = [
    header.removesuffix(' Index').lower().replace(' ', '_')
    for header in df_col_index.columns
]

#### From Quality of Life Index by Country

In [59]:
# Quality-of-life ranking by country: the rank and 'Cost of Living Index' columns
# are dropped, then the headers are normalised
# (strip the trailing ' Index', lower-case, spaces -> underscores).
qol_tables = pd.read_html('https://www.numbeo.com/quality-of-life/rankings_by_country.jsp')
df_qol_index = qol_tables[1].drop(columns=['Rank', 'Cost of Living Index'])
df_qol_index.columns = [
    header.removesuffix(' Index').lower().replace(' ', '_')
    for header in df_qol_index.columns
]

#### From Average Monthly Net Salary by Country

In [60]:
def _parse_usd(amount):
    """Turn a cell's text into a float: trailing non-breaking spaces and '$'
    are stripped, thousands separators removed."""
    return float(amount.rstrip('\xa0$').replace(',', ''))


# Average monthly net salary by country; the columns carry integer labels here.
salary_tables = pd.read_html('https://www.numbeo.com/cost-of-living/country_price_rankings?itemId=105')
df_salary_index = salary_tables[1].drop(columns=[0, 2])
df_salary_index[3] = df_salary_index[3].map(_parse_usd)
df_salary_index = df_salary_index.rename(columns={1: 'country', 3: 'avg_salary(usd)'})

#### Merge 3 Country Tables

In [61]:
# Cost-of-living is the base table; quality-of-life and salary are left-joined by country.
df_country_index = (
    df_col_index
    .merge(df_qol_index, on='country', how='left')
    .merge(df_salary_index, on='country', how='left')
)

# Add columns recording when and by whom the rows were loaded.
df_country_index['sys_updated_date'] = date.today()
df_country_index['sys_updated_by'] = getenv('NB_USER')

Удаляю Косово

In [64]:
# Remove the Kosovo row from the country table.
kosovo_rows = df_country_index.index[df_country_index['country'] == 'Kosovo (Disputed Territory)']
df_country_index = df_country_index.drop(index=kosovo_rows)

In [65]:
# Convert each country name to its ISO alpha-3 code (one coco call per row).
df_country_index['country_code'] = df_country_index['country'].apply(
    lambda name: coco.convert(names=name, to='ISO3')
)

# Put country_code first and keep every column except the first one (the country
# name) and the last one (country_code itself, already placed in front).
ordered_columns = ['country_code', *df_country_index.columns[1:-1]]
df_country_index = df_country_index.reindex(columns=ordered_columns)
df_country_index.head(2)

Unnamed: 0,country_code,cost_of_living,rent,cost_of_living_plus_rent,groceries,restaurant_price,local_purchasing_power,quality_of_life,purchasing_power,safety,health_care,property_price_to_income_ratio,traffic_commute_time,pollution,climate,avg_salary(usd),sys_updated_date,sys_updated_by
0,BMU,146.04,98.58,123.80,148.66,159.17,81.07,,,,,,,,,,2022-06-11,analyst_k2
1,CHE,123.35,53.54,90.62,128.13,122.09,118.44,195.27,118.44,78.32,74.85,8.29,28.50,19.59,80.21,5764.23,2022-06-11,analyst_k2
2,NOR,100.90,34.68,69.86,97.31,105.49,83.11,176.39,83.11,66.15,76.83,8.04,26.91,17.95,68.68,3349.62,2022-06-11,analyst_k2
3,ISL,94.86,41.93,70.05,90.22,99.42,77.06,182.26,77.06,76.47,66.36,6.36,19.77,15.83,68.81,3370.51,2022-06-11,analyst_k2
4,BRB,92.37,21.99,59.38,87.81,78.18,32.08,,,,,,,,,,2022-06-11,analyst_k2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134,DZA,26.87,4.59,16.43,28.82,14.48,24.63,,,,,,,,,240.16,2022-06-11,analyst_k2
135,COL,26.72,8.18,18.03,23.47,19.44,28.85,103.54,28.85,42.29,67.19,16.89,47.80,62.98,88.30,331.71,2022-06-11,analyst_k2
136,IND,24.43,5.30,15.47,26.43,17.56,49.72,110.99,49.72,55.37,65.66,10.16,46.52,73.05,65.13,737.99,2022-06-11,analyst_k2
137,AFG,20.37,2.72,12.09,14.92,12.41,23.04,,,,,,,,,,2022-06-11,analyst_k2


In [66]:
# Append the per-country indexes to the table (created on first run).
df_country_index.to_sql('numbeo_indexes_by_country', engine, if_exists='append', index=False)

# engine.begin() commits on success and rolls back on error; text() is required
# because Connection.execute() no longer accepts a plain SQL string in SQLAlchemy 2.x.
with engine.begin() as con:
    con.execute(text("""ALTER TABLE numbeo_indexes_by_country ADD CONSTRAINT FK_country_code FOREIGN KEY (country_code) REFERENCES public.country (country_code);"""))