# Statistical learning final project

## dataset selection and preprocessing

### Daniel A.
### UID: 100444499

In [242]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from autoimpute.imputations import MiceImputer
import pycountry
from difflib import get_close_matches

Prior to importing the data I have made a find and replace with the following regex: \"\s\[[\w\S]{1,}\]\", in order to remove some tags that the world bank databank adds to their columns

In [243]:
# Importing the raw data
raw_data = pd.read_csv('./data/raw/wb_raw_data.csv')

In [244]:
# excluding aggregates
codes_to_exclude = raw_data.iloc[2593:2640,3].values

# filtering the dataset
data = raw_data[~(raw_data['Country Code'].isin(codes_to_exclude))] 

# removing final diagnostic columns
data = data[~(data['Time'].isna()) & ~(data['Time Code'].isna())]

# converting year column to integer
data['Time'] = data['Time'].astype(int)

# replacing .. with NAN as the raw data indends this to be a NAN
data = data.replace('..',pd.NA)

# sorting values
data = data.sort_values(['Time','Country Name'])

In [245]:
# checking years we have queried
years = data.Time.unique()
print(years)

[2002 2004 2005 2008 2010 2012 2013 2015 2018 2020]


In [246]:
# making a dictionary with the subsets of the main dataframe
dfs = {}
for year in years:
    dfs[year] = data[data['Time'] == year].reset_index(drop=True)

# replacing nans in the 2020 dataframe with previous years data, as 
# the previous years' data still serves us a purpose for the analysis
for year in years:   
    dfs[2020] = dfs[2020].fillna(dfs[year])

# removing columns where there's too many NANs
cols_to_keep = []
df = {}
for col,val in zip(dfs[2020].columns,dfs[2020].isna().sum()):
    if val < 45:
        cols_to_keep.append(col)
df = dfs[2020][cols_to_keep]

In [247]:
# checking which countries have the most NANs
all_countries = df['Country Name'].values
countries_removed = []
for country,val in zip(all_countries,df.isna().sum(axis=1)):
    # removing at 3 nans per row
    if val > 4:
        countries_removed.append(country)

# finally filtering to remove them
# either way, these countries are mostly dependencies or 
# complex countries to get data from, like North Korea
# so even after imputing, this would probably
# yield unrealistic values
df = df[~(df['Country Name'].isin(countries_removed))]

In [248]:
# scraping the wikipedia page for list of countries by human development index
wikipage = requests.get('https://en.wikipedia.org/wiki/List_of_countries_by_Human_Development_Index').text
soup = BeautifulSoup(wikipage,'lxml')

# get the HDI table
table = soup.find('table',{'class':'wikitable sortable'})
links = table.findAll('a')
tds = table.findAll('td')
countries, hdi = [], []

# going through the links to find the countries wikipedia has data for
for link in links:
    countries.append(link.get('title'))

# cleaning countriesif not string
countries = [x for x in countries if x != None]

# finding the HDIs and appending them to the list hdi
for td in tds:
    try:
        num = float(td.text)
        if str(num)[0:2] == '0.':
            hdi.append(num)
    except:
        continue

In [249]:
# adding the HDI to the dataframe for the countries which we have in the wikipedia list
df['HDI'] = pd.NA
for n,country in enumerate(countries):
    df.loc[df['Country Name'] == country,'HDI'] = hdi[n]

# not all country names are written the same, we must find the country names
# using ISO 3166 country code and then finding those that match
missing_codes = list(df[~df['Country Name'].isin(countries)]['Country Code'].values)
for code in missing_codes:
    country = pycountry.countries.get(alpha_3=code)
    if country != None and country.name in countries: 
        idx = countries.index(country.name)
        df.loc[df['Country Code'] == code, 'HDI'] = hdi[idx]

        # remove the code from the list when done
        missing_codes.remove(code)

In [252]:
# to deal with the rest of the missing values
# we take the most important word of the country and find it in countries
missing_names = []
for code in missing_codes:
    country = pycountry.countries.get(alpha_3=code)
    if country != None:
        missing_names.append(get_close_matches(country.name,countries))
missing_names

[['Cuba'],
 ['The Bahamas', 'Panama'],
 [],
 [],
 ['Cape Verde'],
 ['Solomon Islands', 'Marshall Islands'],
 ['Democratic Republic of the Congo'],
 ['Togo', 'Mongolia', 'Tonga'],
 [],
 [],
 ['Zambia', 'Namibia', 'The Gambia'],
 ['Serbia'],
 [],
 ['Iceland', 'Iran', 'Poland'],
 ['Czech Republic', 'Dominican Republic'],
 [],
 ['Malta'],
 ['Federated States of Micronesia'],
 [],
 [],
 [],
 ['São Tomé and Príncipe'],
 [],
 ['Saint Kitts and Nevis', 'Saint Vincent and the Grenadines'],
 ['Saint Vincent and the Grenadines', 'Saint Kitts and Nevis'],
 ['Dominican Republic'],
 [],
 [],
 []]