In [107]:
import pandas as pd
import os
from bs4 import BeautifulSoup

In [108]:
# Load the per-country statistics table. The path is resolved against the
# current user's home directory (as the other cells in this notebook do)
# rather than being pinned to one particular account.
df = pd.read_csv(os.path.expanduser('~/Downloads/world_countries.csv'))

In [109]:
# Build a lookup from field name (e.g. "Government type") to the HTML file
# that lists that field, by scanning the links on notesanddefs.html.
path = os.path.expanduser('~/Downloads/factbook-2017/docs/notesanddefs.html')
# A context manager closes the file handle as soon as the text has been read.
with open(path) as notes_file:
    page = BeautifulSoup(notes_file.read())
file_by_category = {}
cols = page.select("span.category")
for col in cols:
    cells = col.select('td')
    colname = cells[0].text
    links = cells[1].select('a')
    # Entries without a link have no per-field file and are skipped.
    if links:
        fpath = links[0]['href']
        # Drop the '#...' fragment so only the file part of the href is kept.
        file_by_category[colname] = fpath.split('#')[0]

In [110]:
def extract_col(col_name):
    """Return one Factbook field as a dict keyed by upper-cased row id.

    Looks up the HTML file registered for ``col_name`` in
    ``file_by_category``, parses it, and reads every row of the
    ``#fieldListing`` table except header rows (rows carrying the
    ``fieldHeading`` class).

    Parameters
    ----------
    col_name : str
        Field name; must be a key of ``file_by_category``.

    Returns
    -------
    dict
        Maps ``str(row['id']).upper()`` to the stripped text of the row's
        first ``.fieldData`` element.

    Raises
    ------
    KeyError
        If ``col_name`` is not in ``file_by_category``, or a data row has
        no ``id`` attribute.
    IndexError
        If a data row contains no ``.fieldData`` element.

    Notes
    -----
    NOTE(review): the row ids are whatever two-letter codes the Factbook
    pages use; they may not be ISO 3166 alpha-2 codes -- confirm before
    joining on them.
    """
    path = os.path.expanduser('~/Downloads/factbook-2017/docs/' + file_by_category[col_name])
    # A context manager closes the file handle as soon as the text is read.
    with open(path) as field_file:
        page = BeautifulSoup(field_file.read())

    result = {}
    for row in page.select('#fieldListing tr'):
        # Header rows carry no country data.
        if row.has_attr('class') and 'fieldHeading' in row['class']:
            continue
        country_code = str(row['id']).upper()
        field_data = row.select('.fieldData')[0].text
        result[country_code] = field_data.strip()

    return result

    
# Quick look at the first two (code, label) pairs of the extracted field.
[*extract_col("Government type").items()][:2]

[('AF', 'presidential Islamic republic'), ('AL', 'parliamentary republic')]

In [111]:
# Show which columns the country statistics table provides.
df.columns

Index(['Country', 'Code', 'Region', 'Population', 'Area', 'Pop. Density',
       'Coastline', 'Net migration', 'Infant mortality', 'GDP', 'Literacy',
       'Phones', 'Arable', 'Crops', 'Other', 'Climate', 'Birthrate',
       'Deathrate', 'Agriculture', 'Industry', 'Service'],
      dtype='object')

In [112]:
import requests
# Fetch the country-code table. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops an HTTP
# error page from being parsed as if it were the code table.
response = requests.get('https://www.iban.com/country-codes', timeout=30)
response.raise_for_status()
page = BeautifulSoup(response.text)

In [113]:
from collections import defaultdict
# Collect the 3-letter and numeric code of every table row, keyed by the
# row's 2-letter code.
code3_by_code2, num_by_code2 = {}, {}
codes_dict = {'code3': code3_by_code2, 'code_num': num_by_code2}
for row in page.select('tbody tr'):
    cells = row.select('td')
    # Cells are read in order: name, 2-letter, 3-letter, numeric code.
    # The name is read but not stored anywhere below.
    country_name, code2, code3 = cells[0].text, cells[1].text, cells[2].text
    code_num = int(cells[3].text)
    code3_by_code2[code2] = code3
    num_by_code2[code2] = code_num

# The 2-letter codes form the index of the frame built from the nested
# dicts; reset_index() moves them into an ordinary column.
codes_df = pd.DataFrame(codes_dict).reset_index()
codes_df.columns = ['Code2', 'Code3', 'Code Num']
codes_df.head()

Unnamed: 0,Code2,Code3,Code Num
0,AD,AND,20
1,AE,ARE,784
2,AF,AFG,4
3,AG,ATG,28
4,AI,AIA,660


In [114]:
# Wrap the extracted {code: label} mapping in a one-column frame whose index
# is the code.
govt_type_by_code = extract_col("Government type")
df_govt_type = pd.DataFrame({'Government type': govt_type_by_code})
df_govt_type.head()

Unnamed: 0,Government type
AA,parliamentary democracy (Legislature); part of...
AC,parliamentary democracy (Parliament) under a c...
AE,federation of monarchies
AF,presidential Islamic republic
AG,presidential republic


In [115]:
# (number of distinct labels, number of rows)
df_govt_type['Government type'].nunique(dropna=False), df_govt_type.shape[0]

(72, 234)

In [116]:
# Coarse categories that the free-text government labels are folded into.
# Several of these constants share the same category value.
PRESIDENTIAL_REPUBLIC = 'REPUBLIC'
PARLIAMENTARY_DEMOCRACY = 'DEMOCRACY'
PARLIAMENTARY_REPUBLIC = 'REPUBLIC'
PRESIDENTIAL_DEMOCRACY = 'REPUBLIC'
MONARCHY = 'MONARCHY'
COMMUNIST = 'COMMUNIST'

# (substring, category) rules, tried top to bottom; the first hit wins, so
# the order is significant for labels that contain several substrings.
_GOVT_TYPE_RULES = (
    ('presidential republic', PRESIDENTIAL_REPUBLIC),
    ('parliamentary democracy', PARLIAMENTARY_DEMOCRACY),
    ('parliamentary republic', PARLIAMENTARY_REPUBLIC),
    ('monarchy', MONARCHY),
    ('communist', COMMUNIST),
    ('presidential democracy', PRESIDENTIAL_DEMOCRACY),
)
# Category returned when no rule matches.
_GOVT_TYPE_FALLBACK = 'DIFFICULT'
# Every value this function can return.
_GOVT_TYPE_CATEGORIES = frozenset(
    category for _, category in _GOVT_TYPE_RULES
) | {_GOVT_TYPE_FALLBACK}


def normalize_govt_type(orig_label):
    """Fold a free-text government-type label into a coarse category.

    Parameters
    ----------
    orig_label : str
        Raw label, e.g. ``'parliamentary republic'``. Matching is by
        case-sensitive substring, using the rules in ``_GOVT_TYPE_RULES``
        in order.

    Returns
    -------
    str
        One of ``'REPUBLIC'``, ``'DEMOCRACY'``, ``'MONARCHY'``,
        ``'COMMUNIST'``, or ``'DIFFICULT'`` when no rule matches.

    Notes
    -----
    A label that already is one of the category values is returned as is,
    so applying the function twice gives the same result as applying it
    once. Without this, a second pass would turn every category into
    ``'DIFFICULT'``.
    """
    # Already-normalised values are fixed points (keeps re-runs harmless).
    if isinstance(orig_label, str) and orig_label in _GOVT_TYPE_CATEGORIES:
        return orig_label

    for needle, category in _GOVT_TYPE_RULES:
        if needle in orig_label:
            return category

    return _GOVT_TYPE_FALLBACK

# Sanity check: list the rows whose label is left unchanged by
# normalize_govt_type (an empty frame means every label was rewritten).
df_govt_type.loc[
    lambda frame: frame['Government type'] == frame['Government type'].apply(normalize_govt_type)
]

Unnamed: 0,Government type


In [117]:
# Replace the raw labels with their coarse category. This overwrites the
# column in place, so re-running this cell feeds already-normalised labels
# back into normalize_govt_type.
normalized_labels = df_govt_type['Government type'].apply(normalize_govt_type)
df_govt_type['Government type'] = normalized_labels

# Attach the code table to the statistics via the 3-letter code, then attach
# the government type via the 2-letter code (the index of df_govt_type).
# Both merges use the default inner join, so unmatched countries are dropped.
# NOTE(review): the Factbook row ids do not look like ISO alpha-2 codes --
# e.g. 'AG' is labelled "presidential republic" in df_govt_type yet is joined
# to Antigua & Barbuda here. Confirm the code system and translate the codes
# before trusting the 'Government type' column.
df_uni = (
    df
    .merge(codes_df, left_on='Code', right_on='Code3')
    .merge(df_govt_type, left_on='Code2', right_index=True)
)
df_uni.head()

Unnamed: 0,Country,Code,Region,Population,Area,Pop. Density,Coastline,Net migration,Infant mortality,GDP,...,Climate,Birthrate,Deathrate,Agriculture,Industry,Service,Code2,Code3,Code Num,Government type
0,Afghanistan,AFG,ASIA (EX. NEAR EAST),31056997,647500,48.0,0.0,23.06,163.07,700.0,...,1.0,46.6,20.34,0.38,0.24,0.38,AF,AFG,4,DIFFICULT
1,Albania,ALB,EASTERN EUROPE,3581655,28748,124.6,1.26,-4.93,21.52,4500.0,...,3.0,15.11,5.22,0.232,0.188,0.579,AL,ALB,8,REPUBLIC
3,American Samoa,ASM,OCEANIA,57794,199,290.4,58.29,-20.71,9.27,8000.0,...,2.0,22.46,3.27,,,,AS,ASM,16,DEMOCRACY
5,Angola,AGO,SUB-SAHARAN AFRICA,12127071,1246700,9.7,0.13,0.0,191.19,1900.0,...,,45.11,24.2,0.096,0.658,0.246,AO,AGO,24,REPUBLIC
7,Antigua & Barbuda,ATG,LATIN AMER. & CARIB,69108,443,156.0,34.54,-6.15,19.46,11000.0,...,2.0,16.93,5.37,0.038,0.22,0.743,AG,ATG,28,REPUBLIC


In [123]:
# Mean GDP for each (region, government type) pair, rounded to 0 decimals.
(
    df_uni
    .pivot_table(values='GDP', index='Region', columns='Government type', aggfunc='mean')
    .round(0)
)

Government type,COMMUNIST,DEMOCRACY,DIFFICULT,MONARCHY,REPUBLIC
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ASIA (EX. NEAR EAST),1700.0,1900.0,9250.0,7780.0,10200.0
BALTICS,,,,11400.0,
C.W. OF IND. STATES,,3500.0,,,3500.0
EASTERN EUROPE,,13300.0,2200.0,6100.0,10050.0
LATIN AMER. & CARIB,2900.0,9925.0,6800.0,,6606.0
NEAR EAST,,16900.0,23200.0,12533.0,11250.0
NORTHERN AFRICA,,,6400.0,6900.0,4000.0
NORTHERN AMERICA,,24900.0,37800.0,,21450.0
OCEANIA,,7529.0,3500.0,,10017.0
SUB-SAHARAN AFRICA,,2680.0,4900.0,7200.0,1721.0
