<a href="https://colab.research.google.com/github/christoph-fraller/dopp_2020w_group03_ex3/blob/main/dopp_2020w_group03_ex3_with_git.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generate SSH-Keys for Accessing Git Repository

In [None]:
# import and mount google drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# generate ssh keys (insert your username@github.com + hit enter when prompted for any answer)
! ssh-keygen -t rsa -b 4096 -C 'christoph.fraller@gmail.com'

In [None]:
# check whether or not the ssh keys have been created ('id_rsa' and 'id_rsa.pub' should be displayed)
! ls /root/.ssh/

In [None]:
# create directory for saving the ssh keys
! mkdir -p /content/drive/MyDrive/Ssh

In [None]:
# copy ssh keys from /root/.ssh/* to /content/drive/MyDrive/Ssh/*
! cp /root/.ssh/id_rsa /content/drive/MyDrive/Ssh/
! cp /root/.ssh/id_rsa.pub /content/drive/MyDrive/Ssh/

In [None]:
# display public ssh key for copy/paste
! cat /content/drive/MyDrive/Ssh/id_rsa.pub

In [None]:
# add github to known hosts and adapt file access permissions
! ssh-keyscan github.com >> /root/.ssh/known_hosts
! chmod 644 /root/.ssh/known_hosts
! chmod 600 /root/.ssh/id_rsa
! ssh -T git@github.com

# Git Setup

In [None]:
# git config settings (replace with your credentials)
! git config --global user.email "maximilian.loesch97@gmail.com"
! git config --global user.name "Maxiking1997"

In [None]:
# create directory for git repositories
! mkdir -p /content/drive/MyDrive/Git

In [None]:
# git-clone has to be performed only once when setting up the git repo at your google drive
! git clone git@github.com:christoph-fraller/dopp_2020w_group03_ex3.git /content/drive/MyDrive/Git/dopp_2020w_group03_ex3

## Important Shell and Git Commands


**NOTICE:** Always ensure that you are in the right directory when performing git commands (e.g. /content/drive/MyDrive/Git/dopp_2020w_group03_ex3). In case of any issues that might occur when switching directories it is highly recommended to restart the runtime engine (CTRL + M + .).

In [None]:
# check current working directory
! pwd

In [None]:
# switch to specified working directory
%cd /content/drive/MyDrive/Git/dopp_2020w_group03_ex3

In [None]:
# list content of current working directory
! ls

In [None]:
# check git status
! git status

In [None]:
# always perform a git pull before you start working or commit/push some changes
! git pull

In [None]:
# add a new data file to git repo directly from colab
# at first upload the file into the folder of your google drive 
! git add data/CPI-2005.csv
! git commit -m 'New file added.'
! git push

# Perform these steps every time a new session is started

In [None]:
# import and mount google drive
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# create directory
! mkdir -p /root/.ssh

In [None]:
# copy ssh keys from /content/drive/MyDrive/Ssh/* to /root/.ssh/*
! cp /content/drive/MyDrive/Ssh/id_rsa /root/.ssh/
! cp /content/drive/MyDrive/Ssh/id_rsa.pub /root/.ssh/ 

In [None]:
# add github to known hosts and adapt file access permissions
! ssh-keyscan github.com >> /root/.ssh/known_hosts
! chmod 644 /root/.ssh/known_hosts
! chmod 600 /root/.ssh/id_rsa
! ssh -T git@github.com

In [None]:
# switch to specified working directory
%cd /content/drive/MyDrive/Git/dopp_2020w_group03_ex3

In [None]:
# always perform a git pull before you start working or commit/push some changes
! git pull

# Geopandas installation

In [None]:
# Important library for many geopython libraries
!apt install gdal-bin python-gdal python3-gdal 
# Install rtree - Geopandas requirment
!apt install python3-rtree 
# Install Geopandas
!pip install git+git://github.com/geopandas/geopandas.git
# Install descartes - Geopandas requirment
!pip install descartes 
# Install Folium for Geographic data visualization
!pip install folium
# Install plotlyExpress
!pip install plotly_express

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sb
import geopandas
from ipywidgets import IntSlider, interact
from scipy import stats
from statistics import mean
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score
from datetime import datetime

# Preprocessing of Income Data

## Load and merge income data


In [None]:
def load_merge_income_data():
  """Load GNI, population and country-code data and merge them into one frame.

  The GNI file only holds rows for years in which a value was reported.
  To make those gaps visible as NaN, a complete (country, calendar_year)
  grid for the years 1970..2018 is built first and the GNI values are
  left-joined onto it. Population figures (converted from thousands to
  absolute numbers) and the ISO/M49 country codes are left-joined afterwards.

  Returns:
    DataFrame with one row per (country, calendar_year) of the grid and the
    columns of all three sources.
  """

  # raw GNI per capita observations
  gni_raw = pd.read_csv('/content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data/undata_gni_per_capita.csv', sep = ';')

  # one [country_code, country_name] pair per country present in the GNI file
  country_pairs = gni_raw[['country_code', 'country_name']].drop_duplicates().values.tolist()

  # full grid: every country combined with every year of the observation period
  grid_rows = [[year, code, name]
               for code, name in country_pairs
               for year in range(1970, 2019)]
  country_year_grid = pd.DataFrame(grid_rows, columns = ['calendar_year', 'country_code', 'country_name'])

  # attach the reported GNI values; unreported (country, year) pairs stay NaN
  gni_values = gni_raw[['calendar_year', 'country_code', 'gni_per_capita_us_dollar']]
  gni_on_grid = country_year_grid.merge(gni_values, how = 'left', on = ['calendar_year', 'country_code'])

  # population is specified in thousands -> convert to absolute numbers
  population = (
      pd.read_csv('/content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data/undata_population_total.csv', sep = ';',
                  usecols = ['country_code', 'calendar_year', 'population_total'])
      .drop_duplicates()
      .assign(population_total = lambda frame: frame['population_total'] * 1000)
  )

  # ISO 3166 / UNSD M49 country codes incl. the small-island marker
  iso_codes = pd.read_csv('/content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data/iso3166_unsd_country_codes.csv', sep = ';',
                          usecols = ['m49_code', 'iso_alpha2_code', 'iso_alpha3_code', 'small_island_developing_states'])

  # left joins keep every row of the grid
  return (
      gni_on_grid
      .merge(population, how = 'left', on = ['calendar_year', 'country_code'])
      .merge(iso_codes, how = 'left', left_on = 'country_code', right_on = 'm49_code')
  )

merged_income_data = load_merge_income_data()

## Clean issues in income data

In [None]:
def clean_income_data(input_data):
  """Remove redundant columns and all small-island entities from the merged income data.

  Args:
    input_data: merged frame holding at least the columns 'calendar_year',
      'country_code', 'country_name', 'population_total',
      'gni_per_capita_us_dollar', 'm49_code', 'iso_alpha2_code',
      'iso_alpha3_code' and 'small_island_developing_states'
      (small islands are marked with 'x', everything else is empty).

  Returns:
    New frame (the input is not modified) without small-island rows, with a
    fresh 0..n-1 row index and the columns reordered to
    calendar_year, iso_alpha3_code, iso_alpha2_code, m49_code, country_name,
    population_total, gni_per_capita_us_dollar.
  """

  sids_column = 'small_island_developing_states'

  # entities that are dropped together with the small islands although the
  # country-code file does not flag them
  # NOTE(review): 'United Republic of Tanzania: Mainland' is treated as a small
  # island here exactly as before - confirm that this is intentional
  additional_small_islands = ['Former Netherlands Antilles',
                              'United Republic of Tanzania: Mainland',
                              'United Republic of Tanzania: Zanzibar']

  # 'country_code' is dropped and 'm49_code' from the country-code file is kept;
  # drop() returns a new frame, so the caller's data stays untouched
  output_data = input_data.drop(columns = ['country_code'])

  # boolean mask instead of chained in-place replace()/fillna() on a column
  # selection, which does not write back to the frame under pandas copy-on-write
  is_small_island = (output_data[sids_column].eq('x')
                     | output_data['country_name'].isin(additional_small_islands))

  # drop small island countries and reset row index
  output_data = output_data.loc[~is_small_island].reset_index(drop = True)

  # reorder columns of dataframe (the small-island marker is no longer needed)
  return output_data[['calendar_year', 'iso_alpha3_code', 'iso_alpha2_code', 'm49_code', 'country_name', 'population_total', 'gni_per_capita_us_dollar']]

cleaned_income_data = clean_income_data(merged_income_data)

## Exploring missing values in income data

In [None]:
# overview of the missing values in the income data:
# there seems to be a pattern between the missing values of the columns
# iso_alpha3_code, iso_alpha2_code, m49_code and population_total
missing_value_mask = cleaned_income_data.isna()
ax = sb.heatmap(missing_value_mask, cbar = False)
ax.set_title('Visualization of missing values in income dataset')
plt.show()

In [None]:
# ad 1) missing values in country codes
# collect, per code column, the names of all countries with at least one missing code
code_columns = ['iso_alpha3_code', 'iso_alpha2_code', 'm49_code']
countries_missing_code = {
    code_column: set(cleaned_income_data.loc[cleaned_income_data[code_column].isna(), 'country_name'].unique().tolist())
    for code_column in code_columns
}
iso_alpha3_code_set = countries_missing_code['iso_alpha3_code']
iso_alpha2_code_set = countries_missing_code['iso_alpha2_code']
m49_code_set = countries_missing_code['m49_code']

# every country that misses any of the three codes, in alphabetical order
union_list_sorted = sorted(set().union(*countries_missing_code.values()))
print('\nCountries with missing entries in their country codes:')
print('------------------------------------------------------')
print(*union_list_sorted, sep = '\n')

# strategy on dealing with missing in country codes:

## most of the missing entries in country codes can be traced back to former countries that no longer exist and therefore their codes are missing in 
## the actual iso3166 standard but in order to obtain a complete dataset we will refill based on historical data: Former Czechoslovakia, Former Ethiopia,
## Former Sudan, Former USSR, Former Yugoslavia, Yemen: Former Democratic Yemen, Yemen: Former Yemen Arab Republic

## Namibia's iso_alpha2_code is 'NA', which pandas interprets as a missing value by default; we have to fix this when inserting the other missing country codes

## Kosovo declared its independence from Serbia in 2008, but this declaration is still controversial today
## for reasons of simplicity, and without taking a political position, we have decided to exclude Kosovo from our analysis

In [None]:
# ad 2) missing values in population
# names of all countries with at least one missing population total, in alphabetical order
population_is_missing = cleaned_income_data['population_total'].isna()
population_list_sorted = sorted(cleaned_income_data.loc[population_is_missing, 'country_name'].unique().tolist())
print('\nCountries with missing entries in their population values:')
print('----------------------------------------------------------')
print(*population_list_sorted, sep = '\n')

# strategy on dealing with missing in population values:

## missing entries in population values can be traced back to former countries that no longer exist
## because we know the former composition of that countries we can easily calculate their population values based on their components
## at least this is a possible approach for large countries that have been split up: 
### Former Sudan, Former USSR, Former Yugoslavia, Yemen: Former Democratic Yemen, Yemen: Former Yemen Arab Republic
### Former Czechoslovakia -> Czech Republic, Slovakia,
### Former Ethiopia -> Ethiopia, Eritrea,
### Former Sudan -> Sudan, South Sudan,
### Former USSR -> Armenia, Azerbaijan, Belarus, Estonia, Georgia, Kazakhstan, Kyrgyzstan, Latvia, Lithuania, Republic of Moldova, Russian Federation, Tajikistan, Turkmenistan, Ukraine, Uzbekistan
### Former Yugoslavia -> Bosnia and Herzegovina, Croatia, Montenegro, Republic of North Macedonia, Serbia, Slovenia

## in the case of Yemen, two former countries were merged into one, so the above-mentioned approach is not possible
## even if such a merge of two quite similar countries (at least in gni_per_capita) is politically important, for our purposes it is not
## therefore we decided to consider Yemen in our data as one country for the entire observation period

## Kosovo declared its independence from Serbia in 2008, but this declaration is still controversial today
## for reasons of simplicity, and without taking a political position, we have decided to exclude Kosovo from our analysis

In [None]:
# ad 3) missing values in gni per capita
# count, per country, the number of years without a gni per capita value
gni_is_missing = cleaned_income_data['gni_per_capita_us_dollar'].isnull()
gni_null_values = (
    gni_is_missing
    .groupby(cleaned_income_data['country_name'])
    .sum()
    .astype(int)
    .reset_index(name = 'null_count')
)
print('\nCountries with missing entries in their gni per capita values:')
print('--------------------------------------------------------------')
# only countries that actually have gaps are listed
has_gaps = gni_null_values['null_count'] > 0
print(gni_null_values[has_gaps])

# strategy on dealing with missing in gni per capita values:
## when taking a closer look at the data, almost all of the missing gni per capita values can be traced back to years in which a country did not exist
## therefore such entries with missing gni per capita values can be safely removed, but Yemen requires special handling: 
## due to the similarity of 'Yemen: Former Democratic Yemen' and 'Yemen: Former Yemen Arab Republic' we decided to calculate the mean of the gni per capita
## of both countries and use it to replace the historical missing values of Yemen; for our work the separation between those two former countries is negligible

## Handling missing country codes

In [None]:
def handle_missing_country_codes(input_data):
  """Drop Kosovo and fill missing country codes from the list of formerly used codes.

  Missing values in 'iso_alpha3_code', 'iso_alpha2_code' and 'm49_code' are
  looked up by 'country_name' in iso3166_formerly_used_country_codes.csv.
  Codes that are already present are left untouched.

  Args:
    input_data: frame with a 'country_name' column and the three code columns.

  Returns:
    New frame (the input is not modified) without Kosovo rows, with a fresh
    0..n-1 row index and the missing codes filled in.

  Raises:
    IndexError: if a country with a missing code is not listed in the file of
      formerly used country codes.
  """

  code_columns = ['iso_alpha3_code', 'iso_alpha2_code', 'm49_code']

  # drop entries of Kosovo; boolean selection + reset_index yields a new frame
  output_data = input_data.loc[input_data['country_name'] != 'Kosovo'].reset_index(drop = True)

  # load formerly used country codes from csv, set na_filter to false so that
  # Namibia's iso_alpha2_code 'NA' is read as a string and not as a missing value
  former_country_codes = pd.read_csv('/content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data/iso3166_formerly_used_country_codes.csv', sep = ';', na_filter = False)

  # lookup table country_name -> codes; the first entry wins if a name is listed twice
  code_lookup = former_country_codes.drop_duplicates(subset = 'country_name').set_index('country_name')

  # fail early, and name the offenders, if a required country is not in the lookup
  rows_needing_codes = output_data[code_columns].isna().any(axis = 1)
  unknown_countries = sorted(set(output_data.loc[rows_needing_codes, 'country_name']) - set(code_lookup.index), key = str)
  if unknown_countries:
    raise IndexError('no formerly used country codes available for: ' + ', '.join(map(str, unknown_countries)))

  # fill each code column in one vectorised step instead of cell by cell
  for code_column in code_columns:
    is_missing = output_data[code_column].isna()
    if is_missing.any():
      output_data.loc[is_missing, code_column] = output_data.loc[is_missing, 'country_name'].map(code_lookup[code_column])

  return output_data
    
income_data_country_codes_complete = handle_missing_country_codes(cleaned_income_data)

## Compute missing population data

In [None]:
def compute_missing_population_total(input_data):
    """Derive the population of former countries from their successor states.

    For every former country and calendar year, 'population_total' is set to
    the sum of the populations of those successor states that have no GNI
    value in that year (i.e. that are not reported as separate countries yet).
    If no successor qualifies, or if the sum is not a positive number (e.g.
    because a successor's population is missing), the value is set to NaN.
    Successor states without any row in the data contribute nothing.

    Args:
        input_data: frame with the columns 'calendar_year', 'country_name',
            'population_total' and 'gni_per_capita_us_dollar'.

    Returns:
        Copy of the input with 'population_total' of the former countries
        replaced; all other rows and columns are unchanged.
    """

    output_data = input_data.copy()

    # former country -> successor states it is composed of
    successors_by_former_country = {
        'Former Czechoslovakia': ['Czech Republic', 'Slovakia'],
        'Former Ethiopia': ['Ethiopia', 'Eritrea'],
        'Former Sudan': ['Sudan', 'South Sudan'],
        'Former USSR': ['Armenia', 'Azerbaijan', 'Belarus', 'Estonia', 'Georgia', 'Kazakhstan', 'Kyrgyzstan', 'Latvia', 'Lithuania',
                        'Republic of Moldova', 'Russian Federation', 'Tajikistan', 'Turkmenistan', 'Ukraine', 'Uzbekistan'],
        'Former Yugoslavia': ['Bosnia and Herzegovina', 'Croatia', 'Montenegro', 'Republic of North Macedonia', 'Serbia', 'Slovenia'],
    }

    # a successor only counts in years for which it has no GNI value of its own
    gni_is_missing = output_data['gni_per_capita_us_dollar'].isna()

    for former_country_name, successor_names in successors_by_former_country.items():

        former_rows = output_data['country_name'] == former_country_name
        if not former_rows.any():
            continue

        successor_rows = output_data['country_name'].isin(successor_names) & gni_is_missing
        population_groups = output_data.loc[successor_rows].groupby('calendar_year')['population_total']

        # yearly sum; a year with any missing successor population yields NaN
        # (count() ignores NaN, size() does not)
        population_by_year = population_groups.sum().where(population_groups.count() == population_groups.size())

        # years without qualifying successors map to NaN
        new_population_total = output_data.loc[former_rows, 'calendar_year'].map(population_by_year)

        # only positive sums are valid populations, everything else becomes NaN
        output_data.loc[former_rows, 'population_total'] = new_population_total.where(new_population_total > 0)

    return output_data
    
income_data_population_total_complete = compute_missing_population_total(income_data_country_codes_complete)

## Fix missing gni per capita values

In [None]:
def fix_missing_gni_per_capita_values(input_data):
  """Consolidate the Yemen entries and drop all rows that still contain missing values.

  Steps:
    1. Per calendar year, average the gni per capita of
       'Yemen: Former Democratic Yemen' and 'Yemen: Former Yemen Arab Republic'.
    2. Use that average to fill the missing gni per capita values of 'Yemen'.
    3. Drop the rows of the two former Yemen entities.
    4. Drop every row that still has a missing value in any column.

  Args:
    input_data: frame with (at least) the columns 'calendar_year',
      'country_name' and 'gni_per_capita_us_dollar'.

  Returns:
    New frame (the input is copied first) with a fresh 0..n-1 row index.
    Because of the set_index/reset_index round trip, 'calendar_year' and
    'country_name' end up as the first two columns.
  """
  
  output_data = input_data.copy()
  
  # set index on calendar_year so that the Series arithmetic below aligns by year
  output_data.set_index(['calendar_year'], inplace = True)

  # special handling for 'Yemen': 
  # compute mean of gni per capita from 'Yemen: Former Democratic Yemen' and 'Yemen: Former Yemen Arab Republic'
  # and replace missing values of 'Yemen' with the computed mean
  # (years in which either of the two has no value yield NaN and are removed by dropna)
  yemen_missing_gni_values = (output_data[output_data['country_name'] == 'Yemen: Former Democratic Yemen'].gni_per_capita_us_dollar + output_data[output_data['country_name'] == 'Yemen: Former Yemen Arab Republic'].gni_per_capita_us_dollar) / 2
  yemen_missing_gni_values.dropna(inplace = True)
  # Yemen's own values win; only its gaps are filled with the yearly mean
  yemen_df = pd.DataFrame(output_data[output_data['country_name'] == 'Yemen'].gni_per_capita_us_dollar.fillna(yemen_missing_gni_values))
  yemen_df['country_name'] = 'Yemen'
  yemen_df.reset_index(inplace = True)
  yemen_df.set_index(['calendar_year', 'country_name'], inplace = True)
  
  # fill missing values of 'Yemen' with above computed values
  # yemen_df only carries the gni column and only (year, 'Yemen') index entries,
  # so fillna is expected to touch nothing but Yemen's gni values
  output_data.reset_index(inplace = True)
  output_data.set_index(['calendar_year', 'country_name'], inplace = True)
  output_data.fillna(yemen_df, inplace = True)

  # reset index (moves calendar_year and country_name back to regular columns)
  output_data.reset_index(inplace = True)
  
  # drop 'Yemen: Former Democratic Yemen' and 'Yemen: Former Yemen Arab Republic' due to consolidation
  output_data.drop(output_data[output_data['country_name'] == 'Yemen: Former Democratic Yemen'].index, inplace = True)
  output_data.drop(output_data[output_data['country_name'] == 'Yemen: Former Yemen Arab Republic'].index, inplace = True)
  
  # dropping the entries that still contain missing values is now safe
  # (dropna without arguments removes a row if ANY of its columns is missing)
  output_data.dropna(inplace = True)
  output_data.reset_index(inplace = True, drop = True)

  return output_data

income_data_preprocessed = fix_missing_gni_per_capita_values(income_data_population_total_complete)

# check whether or not all missing values have been replaced
sb.heatmap(income_data_preprocessed.isna(), cbar = False);

## Load thresholds and sdr deflators


In [None]:
def load_thresholds_sdr_deflators():
  """Load the income-level thresholds and the SDR deflator data, joined per calendar year.

  The columns 'banks_fiscal_year' (thresholds file) and
  'sdr_deflator_us_dollar' (deflator file) are discarded. Both tables are
  indexed by 'calendar_year' and combined with an inner join, so only years
  present in both files are returned.

  Returns:
    DataFrame indexed by 'calendar_year' holding the remaining columns of
    both files.
  """

  # income-level thresholds, keyed by calendar year
  thresholds_by_year = (
      pd.read_csv('/content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data/world_bank_income_level_thresholds.csv', sep = ',')
      .drop(columns = ['banks_fiscal_year'])
      .set_index('calendar_year')
  )

  # sdr deflator values, keyed by calendar year
  deflators_by_year = (
      pd.read_csv('/content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data/world_bank_sdr_deflator.csv', sep = ',')
      .drop(columns = ['sdr_deflator_us_dollar'])
      .set_index('calendar_year')
  )

  # inner join on the shared 'calendar_year' index
  return thresholds_by_year.merge(deflators_by_year, how = 'inner', left_index = True, right_index = True)

threshold_data = load_thresholds_sdr_deflators()

# visualize missing threshold values using a heatmap
sb.heatmap(threshold_data[['low_income_level_threshold', 'middle_income_level_threshold', 'high_income_level_threshold']].isna(), cbar = False);

## Calculate missing thresholds using sdr deflators


In [None]:
def calculate_missing_thresholds(input_data):
    """Back-cast missing income-level thresholds with the annual SDR inflation rate.

    Starting at the most recent year and walking backwards, a missing
    threshold of year y is derived from the following year:

        threshold[y] = threshold[y + 1] / (100 + rate[y + 1]) * 100

    where rate is 'sdr_inflation_rate_annual_change'. Afterwards all
    thresholds are rounded to the nearest multiple of 5.

    Args:
        input_data: frame indexed by consecutive integer calendar years with
            the three threshold columns and 'sdr_inflation_rate_annual_change'.

    Returns:
        New frame (the input is copied first) with the three threshold
        columns only.
    """

    threshold_columns = ['low_income_level_threshold', 'middle_income_level_threshold', 'high_income_level_threshold']
    rate_column = 'sdr_inflation_rate_annual_change'
    rounding_step = 5

    result = input_data.copy()

    # newest year first: each gap is filled from the (already complete) following year
    years_descending = [result.index.max() - offset for offset in range(len(result))]

    for year in years_descending:
      for column in threshold_columns:

        # existing thresholds are kept as they are
        if not np.isnan(result.loc[year, column]):
          continue

        following_year = year + 1
        result.loc[year, column] = result.loc[following_year, column] / (100 + result.loc[following_year, rate_column]) * 100

    # round thresholds to the nearest multiple of rounding_step
    result[threshold_columns] = (result[threshold_columns] / rounding_step).round() * rounding_step

    return result[threshold_columns]

threshold_data_preprocessed = calculate_missing_thresholds(threshold_data)

# check whether or not all missing values have been replaced
sb.heatmap(threshold_data_preprocessed.isna(), cbar = False);

## Assign income-level labels based on gni and income-level thresholds

In [None]:
def assign_income_level_labels(input_income_data, input_threshold_data):
  """Attach the yearly thresholds to the income data and derive an income-level label.

  Labels (thresholds are inclusive upper bounds):
    gni <= low                  -> 'low_income'
    low < gni <= middle         -> 'lower_middle_income'
    middle < gni <= high        -> 'upper_middle_income'
    gni > high                  -> 'high_income'
  Rows that satisfy none of the conditions (e.g. missing gni) keep a missing label.

  Args:
    input_income_data: frame with the columns 'calendar_year' and 'gni_per_capita_us_dollar'.
    input_threshold_data: frame keyed by 'calendar_year' with the three threshold columns.

  Returns:
    Merged frame indexed by 'calendar_year' with the additional column 'income_level_label'.
  """

  # left join: every income row is kept, thresholds are matched by calendar_year
  labelled = input_income_data.merge(input_threshold_data, how = 'left', on = 'calendar_year')

  gni = labelled['gni_per_capita_us_dollar']
  low_threshold = labelled['low_income_level_threshold']
  middle_threshold = labelled['middle_income_level_threshold']
  high_threshold = labelled['high_income_level_threshold']

  # label -> rows belonging to that income level
  rows_by_label = {
      'low_income': gni <= low_threshold,
      'lower_middle_income': (gni > low_threshold) & (gni <= middle_threshold),
      'upper_middle_income': (gni > middle_threshold) & (gni <= high_threshold),
      'high_income': gni > high_threshold,
  }
  for label, rows in rows_by_label.items():
    labelled.loc[rows, 'income_level_label'] = label

  # downstream cells group and select by calendar_year via the index
  return labelled.set_index('calendar_year')

income_classification_data = assign_income_level_labels(income_data_preprocessed, threshold_data_preprocessed)

# Question 1

## Calculating the population living in the different income levels

In [None]:
def calculate_sum_population_per_income_level_and_year(input_data):
  """Sum the population per calendar year, in total and per income level.

  Args:
    input_data: frame indexed by 'calendar_year' with the columns
      'population_total' and 'income_level_label'.

  Returns:
    DataFrame indexed by every calendar year of the input with the columns
    'low_income_population', 'lower_middle_income_population',
    'upper_middle_income_population', 'high_income_population' and
    'total_population'. An income level without any country in a given year
    is reported as 0.
  """

  population = input_data['population_total']
  labels = input_data['income_level_label']

  # the yearly total defines the row index, so no year is lost just because
  # one income level happens to have no country in that year
  total_population = population.groupby('calendar_year').sum()
  output_data = pd.DataFrame(index = total_population.index)

  # create sum of population of each income category for each year
  for income_level in ['low_income', 'lower_middle_income', 'upper_middle_income', 'high_income']:
    level_population = population.loc[labels == income_level].groupby('calendar_year').sum()
    output_data[income_level + '_population'] = level_population.reindex(total_population.index, fill_value = 0)

  output_data['total_population'] = total_population

  return output_data

population_income = calculate_sum_population_per_income_level_and_year(income_classification_data)

## Visualization of results

In [None]:
# plot cumulative total population according to income-levels
# (stacked areas per income level plus the total and half-of-total reference lines)

# select years to include in plot
# NOTE: range() is half-open, so the most recent year of the index is not part of
# the stacked areas; the x-axis limit below stops at max - 1 accordingly
years = range(min(population_income.index), max(population_income.index))

# calculate cumulative total_population according to income-levels
# each *_cumulative array is the upper edge of the corresponding stacked band
low_income_cumulative = population_income.loc[population_income.index.isin(years), 'low_income_population'].values
lower_middle_income_cumulative = low_income_cumulative + population_income.loc[population_income.index.isin(years), 'lower_middle_income_population'].values
upper_middle_income_cumulative = lower_middle_income_cumulative + population_income.loc[population_income.index.isin(years), 'upper_middle_income_population'].values
high_income_cumulative = upper_middle_income_cumulative + population_income.loc[population_income.index.isin(years), 'high_income_population'].values

# create plot
fig, ax = plt.subplots()

# reference lines: total population and half of it
# NOTE(review): population_income presumably already has one row per calendar_year,
# in which case groupby(...).max() leaves the values unchanged - confirm
ax.plot(population_income['total_population'].groupby('calendar_year').max(), label = 'total_population', color = 'Black')
ax.plot(population_income['total_population'].groupby('calendar_year').max() * 0.5, label = 'half_of_total_population', color = 'Red', linestyle = 'dashed')

# stacked bands, drawn from the top (high income) down to the bottom (low income)
ax.fill_between(years ,upper_middle_income_cumulative, high_income_cumulative, alpha = 0.3, color = 'dodgerblue')
ax.fill_between(years ,lower_middle_income_cumulative, upper_middle_income_cumulative, alpha= 0.3, color = 'skyblue')
ax.fill_between(years, low_income_cumulative, lower_middle_income_cumulative, alpha = 0.3, color = 'lightcoral')
ax.fill_between(years, low_income_cumulative, 0, alpha = 0.3, color = 'maroon')
# band captions; the positions are hand-picked data coordinates (year, population)
ax.text(1990, 1.5e9, 'low income', fontsize = 9, color = 'Black')
ax.text(1995, 3.5e9, 'lower_middle_income', fontsize = 9, color = 'Black')
ax.text(2003, 5e9, 'upper_middle_income', fontsize = 9, color = 'Black')
ax.text(2009, 6.5e9, 'high_income', fontsize = 9, color = 'Black')

ax.set_title('Visualizing historical development of world\'s population according to different \n income-levels based on the information of more than 160 countries', pad = 20)
ax.set_xlabel('Calendar Year')
ax.set_ylabel('World\'s Population')

# limit the x-axis to the years covered by the stacked bands
plt.xlim([min(population_income.index), max(population_income.index) - 1])
plt.ylim([0, population_income['total_population'].max()])
plt.legend()

plt.show()

In [None]:
# show pie-plot of the different income levels in the last available year (=2018)

fig, ax = plt.subplots()
# the row is selected via the largest calendar year in the index; the labels and
# colors are listed in the same order as the selected population columns
population_income.loc[max(population_income.index),['low_income_population','lower_middle_income_population','upper_middle_income_population','high_income_population']].plot.pie(autopct='%.1f', pctdistance = 0.7, labels = ['low income', 'lower middle income', 'upper middle income', 'high income'], colors = ['maroon','lightcoral','skyblue','dodgerblue'])
# NOTE(review): the year in the title is hard-coded, while the data row is selected
# dynamically - keep both in sync when the data is extended
ax.set_title('Distribution of World\'s Population according to different income-levels in 2018', pad = 20)
# remove the automatically generated y-axis label of the pie plot
plt.ylabel('')
plt.show() 

In [None]:
# plot income of the 5 most populous countries in the last available year (= 2018)

# select years to include in plot
# NOTE: range() is half-open, so the most recent year of the index is excluded
# from the threshold bands; the x-axis limit below stops at max - 1 accordingly
years = range(min(population_income.index), max(population_income.index))

# select 5 largest countries (by population_total in the most recent year)
countries_most_populous = income_classification_data.loc[max(population_income.index)].nlargest(5,'population_total')['country_name'].values

# extract income thresholds for the selected years
high_income_treshold = threshold_data_preprocessed[threshold_data_preprocessed.index.isin(years)]['high_income_level_threshold'].values
middle_income_treshold = threshold_data_preprocessed[threshold_data_preprocessed.index.isin(years)]['middle_income_level_threshold'].values
low_income_treshold = threshold_data_preprocessed[threshold_data_preprocessed.index.isin(years)]['low_income_level_threshold'].values

# create plot
fig, ax = plt.subplots()

# one line per country: gni per capita over the calendar years (index of the frame)
for i in range(5):
  ax.plot(income_classification_data['gni_per_capita_us_dollar'].loc[income_classification_data['country_name'] == countries_most_populous[i]],label = countries_most_populous[i])

# plot on logarithmic scale
plt.yscale('log')

# visualise thresholds and the different income classes as colored bands
# (1e5 is the upper y-limit set below and serves as the top of the high-income band)
ax.fill_between(years,high_income_treshold, 1e5,alpha = 0.3,color = 'dodgerblue')
ax.fill_between(years,high_income_treshold, middle_income_treshold, alpha = 0.3,color = 'skyblue')
ax.fill_between(years,low_income_treshold, middle_income_treshold, alpha = 0.3, color = 'lightcoral')
ax.fill_between(years,low_income_treshold, 0, alpha = 0.3, color = 'maroon')
# band captions; the positions are hand-picked data coordinates (year, gni)
ax.text(2004, 1.5e2, 'low income', fontsize = 9, color = 'Black')
ax.text(1980, 1.3e3, 'lower_middle_income', fontsize = 9, color = 'Black')
ax.text(1990, 5e3, 'upper_middle_income', fontsize = 9, color = 'Black')
ax.text(2000, 2e4, 'high_income', fontsize = 9, color = 'Black')

ax.set_title('GNI per Capita over Time for the 5 most populous Countries', pad = 20)
ax.set_xlabel('Calendar Year')
ax.set_ylabel('log(GNI per Capita in US Dollar)')

# limit the x-axis to the years covered by the threshold bands
plt.xlim([min(population_income.index), max(population_income.index) - 1])
plt.ylim([5e1,1e5])
plt.legend(loc='upper left',fontsize = 8)

plt.show()

In [None]:
# count the number of countries per income classification and calendar year
# (result is a Series indexed by (calendar_year, income_level_label))
income_classification_count = income_classification_data['income_level_label'].groupby('calendar_year').value_counts(dropna = False)

# plot total count of classification entries each year [sum of countries]
fig, ax = plt.subplots()
# [:, label] selects the yearly counts of one label; the four Series are added per year
# NOTE(review): a year in which one of the four labels does not occur would presumably
# yield NaN in this sum - confirm that every label is present in every year
income_classification_total_count = income_classification_count[:,'high_income']+income_classification_count[:,'upper_middle_income']+income_classification_count[:,'lower_middle_income']+income_classification_count[:,'low_income']
plt.plot(income_classification_total_count, label = 'total_classification_count', color = 'black')
ax.set_title('Number of Classified Countries per Year', pad = 20)
plt.xlabel('Calendar Year')
plt.ylabel('Number of Countries')
# the most recent year is cut off, consistent with the other time-series plots
plt.xlim([min(income_classification_total_count.index), max(income_classification_total_count.index) - 1])
plt.ylim([0, 200])
plt.legend()
plt.show()

In [None]:
# TODO(team): do we want to keep these two plots? They seem rather hard to interpret; I would be in favour of removing them for now
# plot the population per income level over time [number of people]
plt.plot(population_income['high_income_population'], label = 'high_income' )
plt.plot(population_income['upper_middle_income_population'], label = 'upper_middle_income')
plt.plot(population_income['lower_middle_income_population'], label = 'lower_middle_income')
plt.plot(population_income['low_income_population'], label = 'low_income')
plt.xlabel('Calendar Year')
plt.ylabel('Number of People')
plt.legend()
plt.show()

# disabled experiment: restrict the analysis to countries with more than one million inhabitants
# income_classification_data =  income_classification_data.loc[income_classification_data['population_total'] > 1000000.0]
# print(income_classification_data)
# print(income_classification_data[['population_total', 'income_level_label']].groupby(['calendar_year', 'income_level_label']).sum().groupby('calendar_year').sum())

# plot income classification count over time [sum of countries]
# income_classification_count[:, label] selects the yearly country count of one label
plt.plot(income_classification_count[:,'high_income'], label = 'high_income' )
plt.plot(income_classification_count[:,'upper_middle_income'], label = 'upper_middle_income')
plt.plot(income_classification_count[:,'lower_middle_income'], label = 'lower_middle_income')
plt.plot(income_classification_count[:,'low_income'], label = 'low_income')
plt.xlabel('Calendar Year')
plt.ylabel('Number of Countries')
plt.legend()
plt.show()

## Interactive world map (choropleth map)

In [None]:
def preprocessing_world_map_data(input_data):
  """Back-fill the successor states of dissolved countries for the world map.

  For every year in which a dissolved country (e.g. 'Former USSR') has a row,
  one row per successor state is added that carries the successor's own
  iso_alpha3_code and country_name together with the income level of the
  dissolved country in that year. All other columns of the added rows are NaN.

  Parameters:
    input_data: DataFrame indexed by calendar_year with (at least) the columns
                country_name, iso_alpha3_code and income_level_label.

  Returns:
    A new DataFrame (input_data is not modified), sorted by country_name and
    calendar_year and indexed by calendar_year.

  Raises:
    KeyError: a dissolved country has rows, but one of its successor states has
              no row in the latest calendar year (its iso code cannot be derived).
  """

  # move calendar_year from the index into a regular column (creates a new frame)
  output_data = input_data.reset_index()

  # first entry = dissolved country, remaining entries = its successor states
  former_country_lists = [['Former Czechoslovakia', 'Czech Republic', 'Slovakia'],
                             ['Former Ethiopia', 'Ethiopia', 'Eritrea'],
                             ['Former Sudan', 'Sudan', 'South Sudan'],
                             ['Former USSR', 'Armenia', 'Azerbaijan', 'Belarus', 'Estonia', 'Georgia', 'Kazakhstan', 'Kyrgyzstan', 'Latvia', 'Lithuania', 
                              'Republic of Moldova', 'Russian Federation', 'Tajikistan', 'Turkmenistan', 'Ukraine', 'Uzbekistan'],
                             ['Former Yugoslavia', 'Bosnia and Herzegovina', 'Croatia', 'Montenegro', 'Republic of North Macedonia', 'Serbia', 'Slovenia']]

  # the iso code of a successor state is read from its row in the latest calendar year
  max_calendar_year = output_data['calendar_year'].max()
  is_latest_year = output_data['calendar_year'] == max_calendar_year

  # collect all new rows first and concatenate once (DataFrame.append was removed in pandas 2.0)
  new_rows = []

  for former_country_name, *successor_names in former_country_lists:

    former_rows = output_data.loc[output_data['country_name'] == former_country_name]
    if former_rows.empty:
      continue

    # resolve the iso codes once per successor state - they do not depend on the calendar year
    successor_iso_codes = {}
    for country_name in successor_names:
      iso_codes = output_data.loc[is_latest_year & (output_data['country_name'] == country_name), 'iso_alpha3_code']
      if iso_codes.empty:
        raise KeyError(f"no row for '{country_name}' in calendar year {max_calendar_year}; cannot derive its iso_alpha3_code")
      # last match = highest row number, like the max-index lookup used before
      successor_iso_codes[country_name] = iso_codes.iloc[-1]

    for calendar_year in former_rows['calendar_year'].tolist():

      # income level of the dissolved country in this year (last matching row)
      income_level_label = former_rows.loc[former_rows['calendar_year'] == calendar_year, 'income_level_label'].iloc[-1]

      for country_name in successor_names:
        new_rows.append({'calendar_year' : calendar_year,
                         'iso_alpha3_code' : successor_iso_codes[country_name],
                         'country_name' : country_name,
                         'income_level_label' : income_level_label})

  if new_rows:
    output_data = pd.concat([output_data, pd.DataFrame(new_rows)], ignore_index = True)

  # set index on calendar_year
  output_data = output_data.sort_values(by = ['country_name', 'calendar_year'])
  output_data = output_data.set_index('calendar_year')

  return output_data

# build the map dataset: income_classification_data plus additional rows for the successor states of dissolved countries
world_map_data = preprocessing_world_map_data(income_classification_data)

In [None]:
@interact(selected_year = IntSlider(value = min(world_map_data.index), min = min(world_map_data.index), max = max(world_map_data.index)))
def interactive_world_map_visualization_(selected_year):
  """Draw a world map coloured by income level for the year chosen on the slider.

  The slider ranges over the calendar years found in the index of world_map_data.
  """

  # country outlines from the 'naturalearth_lowres' dataset shipped with geopandas
  world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

  # overwrite the iso codes that are wrong in the geometry dataset
  for name, iso_code in (('France', 'FRA'), ('Norway', 'NOR'), ('Kosovo', 'XKX')):
    world.loc[world['name'] == name, 'iso_a3'] = iso_code

  # income data of the selected year, joined to the outlines via the iso code (only geometry and iso code are kept from the outlines)
  income_of_year = world_map_data.loc[selected_year]
  geo_income_data = world[['geometry', 'iso_a3']].merge(income_of_year, left_on = 'iso_a3', right_on = 'iso_alpha3_code')

  # one colour per income level, legend placed outside the map on the right
  # NOTE(review): presumably the colours are assigned to the income levels in sorted label order - confirm, also for years in which a level does not occur
  income_level_cmap = matplotlib.colors.ListedColormap(['dodgerblue','maroon','lightcoral','skyblue'])
  geo_income_data.plot(column = 'income_level_label',
                       legend = True,
                       legend_kwds = dict(bbox_to_anchor = (1., 1), loc = 'upper left'),
                       figsize = (15, 10),
                       cmap = income_level_cmap)


# Data Preprocessing of Features

## Load Data

In [None]:
def load_world_bank_indicator_data(file_path = '/content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data/data_indicators_2.csv', n_rows = 16104):
  """Load the World Bank development indicators for the calendar years 1995 - 2018.

  Source: https://databank.worldbank.org/source/world-development-indicators/

  Parameters:
    file_path: csv export of the World Bank data bank (defaults to the copy in the project repository).
    n_rows: number of data rows to read, None reads the whole file.
            NOTE(review): the default 16104 presumably cuts off footer rows of the export - confirm.

  Returns:
    DataFrame with the columns calendar_year, iso_alpha3_code and eight indicator
    columns of dtype float; entries given as '..' in the file become NaN.
  """
  # load indicator data from csv; '..' marks a missing value in the file
  data = pd.read_csv(file_path, sep = ',', nrows = n_rows)
  data = data.replace('..', np.nan)

  #rename columns
  rename_columns_dict = {
      'Time':'calendar_year',
      'Country Code':'iso_alpha3_code',
      'Fertility rate, total (births per woman) [SP.DYN.TFRT.IN]':'fertility_rate',
      'Urban population (% of total population) [SP.URB.TOTL.IN.ZS]':'urban_population',
      'Access to electricity (% of population) [EG.ELC.ACCS.ZS]':'access_electricity',
      'Agriculture, forestry, and fishing, value added (% of GDP) [NV.AGR.TOTL.ZS]':'agriculture_forestry_fishing_sector',
      'Unemployment, total (% of total labor force) (modeled ILO estimate) [SL.UEM.TOTL.ZS]':'unemployment',
      'Total natural resources rents (% of GDP) [NY.GDP.TOTL.RT.ZS]':'natural_resources_rent',
      'Inflation, consumer prices (annual %) [FP.CPI.TOTL.ZG]':'inflation',
      'External balance on goods and services (% of GDP) [NE.RSB.GNFS.ZS]':'external_balance_on_goods_and_services',
      'School enrollment, primary (% gross) [SE.PRM.ENRR]':'primary_school_enrollment',
      'Life expectancy at birth, total (years) [SP.DYN.LE00.IN]':'life_expectancy'}
  data = data.rename(columns = rename_columns_dict)

  #only use data from 1995-2018. Some indicators are not available before 1995
  data = data[data['calendar_year'].isin(range(1995,2019))]

  # select only certain parameters (drop columns with very few entries)
  indicator_columns = ['fertility_rate','urban_population','access_electricity', 'agriculture_forestry_fishing_sector','external_balance_on_goods_and_services','natural_resources_rent','inflation','life_expectancy']
  data = data[['calendar_year','iso_alpha3_code'] + indicator_columns]

  #change data types of columns
  data = data.astype({column: float for column in indicator_columns})

  return data

# load the World Bank indicators once; they are merged with the income data further below
indicator_data = load_world_bank_indicator_data()

In [None]:
def load_corruption_perceptions_index(data_dir = '/content/drive/MyDrive/Git/dopp_2020w_group03_ex3/data'):
  """Load the corruption perceptions index (CPI) for the calendar years 1995 - 2018 in long format.

  Source: Transparency International (https://www.transparency.org/en/cpi/2019/results)

  Parameters:
    data_dir: directory that contains 'CPI2019.xlsx' (scores 2012 - 2018) and one
              'CPI-<year>.csv' file for every year from 1995 to 2011.

  Returns:
    DataFrame with the columns iso_alpha3_code, calendar_year (int) and
    corruption_perceptions_index (float) - one row per country and year.
  """
  # 2012 - 2018: one excel sheet with one score column per year
  score_columns = {'CPI score 2018':2018, 'CPI score 2017':2017, 'CPI score 2016':2016, 'CPI score 2015':2015, 'CPI score 2014':2014, 'CPI Score 2013':2013, 'CPI Score 2012':2012}
  data = pd.read_excel(data_dir + '/CPI2019.xlsx', sheet_name = 'CPI Timeseries 2012 - 2019', header = 2, usecols = ['ISO3'] + list(score_columns))
  data = data.rename(columns = {'ISO3': 'iso_alpha3_code', **score_columns})

  #transform dataframe from one column per year to one row per country and year
  yearly_frames = [pd.melt(data, id_vars = ['iso_alpha3_code'], value_vars = list(range(2012,2019)), var_name = 'calendar_year', value_name = 'corruption_perceptions_index')]

  #load data from 1995 - 2011 (one csv file per year)
  for year in range(1995, 2011+1):
    year_data = pd.read_csv(data_dir + '/CPI-' + str(year) + '.csv', sep = ',', usecols = ['iso','score'])
    year_data = year_data.rename(columns = {'iso': 'iso_alpha3_code', 'score': 'corruption_perceptions_index'})
    year_data['calendar_year'] = year

    #multiply corruption_perceptions_index with factor 10 for calendar years 1995 - 2011. The score was then between 0 and 10. From 2012-2018 the score is between 0 and 100. Therefore the score is scaled up to match the data from 2012 to 2018
    year_data['corruption_perceptions_index'] = year_data['corruption_perceptions_index'].astype(float) * 10

    yearly_frames.append(year_data)

  # concatenate all years in one go instead of growing the frame inside the loop
  cpi_data_complete = pd.concat(yearly_frames, ignore_index = True)

  # explicit dtypes: the year labels produced by melt come from column names, so their dtype is not left to inference
  cpi_data_complete = cpi_data_complete.astype({'calendar_year': 'int64', 'corruption_perceptions_index': float})
  return cpi_data_complete

# load the corruption perceptions index once; it is merged with the other data further below
cpi_data = load_corruption_perceptions_index()


## Merge with Data from Question 1

In [None]:
def merge_indicator_and_income_data(indicator, cpi,income):
  """Join income classification, indicators and CPI on (calendar_year, iso_alpha3_code).

  Income rows without indicator data are dropped (inner join), whereas a missing
  CPI score only leaves NaN in the CPI column (left join). The result is indexed
  by (calendar_year, country_name).
  """
  join_keys = ['calendar_year','iso_alpha3_code']

  merged_data_complete = (income
                          .merge(indicator, how = 'inner', on = join_keys)
                          .merge(cpi, how = 'left', on = join_keys))

  return merged_data_complete.set_index(['calendar_year','country_name'])

# combine income classification, World Bank indicators and CPI into one dataset indexed by (calendar_year, country_name)
indicator_income_data = merge_indicator_and_income_data(indicator_data,cpi_data, income_classification_data)

## Deal with missing data and outliers

In [None]:
# overview of the missing values in the merged dataset (the heatmap shows the boolean isna() mask)
sb.heatmap(indicator_income_data.isna(), cbar = False);

In [None]:
def clean_data(input_data):
  """Drop every row (country and year) in which at least one World Bank indicator is missing.

  The corruption_perceptions_index is deliberately not checked: it exists since
  1995, but only for few countries.

  Parameters:
    input_data: DataFrame indexed by (calendar_year, country_name).

  Returns:
    output_data: copy of input_data without the incomplete rows.
    droped_rows: one row per removed input row, indexed by calendar_year with the
                 column country_name (kept for further analysis of the missing data).
  """
  required_indicators = ['fertility_rate','urban_population','access_electricity', 'agriculture_forestry_fishing_sector','external_balance_on_goods_and_services','natural_resources_rent','inflation','life_expectancy']

  # one mask decides both results, so a removed row is reported even if its index label also occurs on a row that is kept
  has_missing_indicator = input_data[required_indicators].isna().any(axis = 1)

  output_data = input_data.loc[~has_missing_indicator].copy()

  #store the index of the dropped rows for further analysis of the missing data
  droped_rows = input_data.index[has_missing_indicator.to_numpy()].to_frame(index = False)
  droped_rows = droped_rows.set_index('calendar_year')

  return output_data, droped_rows

# rows with complete indicators, plus the (calendar_year, country_name) keys of the rows that were removed
indicator_income_data_cleaned, droped_rows_cleaning = clean_data(indicator_income_data)

# show the cleaned dataset
indicator_income_data_cleaned



In [None]:
# droped_rows_cleaning has country_name as its only column, so value_counts() yields the number of dropped years per country
# histogram of the number of dropped years per country
droped_rows_cleaning.value_counts().hist(bins = 29)
plt.show()

# number of dropped years per country as a table
display(droped_rows_cleaning.value_counts())


# In the droped_rows_cleaning dataframe it can be seen that some countries were completely deleted (all 24 years).
# This is due to the fact that some indicators weren't available for those countries. When you look at the dropped rows in more detail it can be seen that they are mostly missing in many consecutive years.
# Also the dropped rows are always located on the "edges" of the available data ([1995 - x] or [y - 2018]). Therefore the missing data could only be extrapolated and not interpolated. This would lead to highly uncertain predictions.
# We choose not to guess the missing indicators to avoid making wrong predictions. In the following ML algorithm we only use years where few rows were dropped.


In [None]:
# heatmap of the values that are still missing after clean_data
sb.heatmap(indicator_income_data_cleaned.isna(), cbar = False);
plt.show()


## Deal with outliers

In [None]:
def outliersBoxPlot (input_data):
  """Discover outliers visually: draw one box plot per indicator.

  Only rows in which all nine indicators are present are plotted; input_data
  itself is not modified.
  """
  features = ['fertility_rate','urban_population','access_electricity', 'agriculture_forestry_fishing_sector','external_balance_on_goods_and_services','natural_resources_rent','inflation','life_expectancy','corruption_perceptions_index']

  #Remove all rows with missing indicator values
  data = input_data.dropna(subset = features)

  # 3 x 4 grid, filled row by row in the order of the feature list
  fig, axs = plt.subplots(3,4)
  flat_axes = axs.flatten()

  for ax, feature in zip(flat_axes, features):
    ax.boxplot(data[feature])
    ax.set_title(feature)

  # the grid has more panels than there are features - hide the unused ones
  for ax in flat_axes[len(features):]:
    ax.set_visible(False)

  fig.subplots_adjust(left=0.05, right=0.98, bottom=0.05, top=1.5,
                    hspace=1, wspace=1)

# box plots of all indicators (only rows in which every indicator is present)
outliersBoxPlot(indicator_income_data_cleaned)


In [None]:
def outlierZScore(input_data, threshold = 3):
  """Discover outliers mathematically via the z-score and display them as a table.

  The z-score re-scales and centers every feature. Values whose absolute z-score
  lies above the threshold are treated as outliers (3 is the commonly used value).
  Only rows in which all features are present are considered.

  Parameters:
    input_data: DataFrame that contains the feature columns listed below.
    threshold: absolute z-score above which a value counts as an outlier.

  Displays one row per outlying value: the affected feature in the first column
  'outlier', followed by the complete data row. Nothing is returned.
  """
  # features which are considered to be used to train the ML algorithm
  features = ['fertility_rate','urban_population','access_electricity', 'agriculture_forestry_fishing_sector','external_balance_on_goods_and_services','natural_resources_rent','inflation','life_expectancy','corruption_perceptions_index']

  #Remove all rows with missing indicator values
  df = input_data.dropna(subset = features)

  # absolute z-score of every value
  z = np.abs(np.asarray(stats.zscore(df[features])))

  # positions of the outliers: first array = row numbers, second array = respective column numbers
  out_rows, out_columns = np.where(z > threshold)

  # one output row per outlying value; a data row is repeated if several of its features are outliers
  outDataFrame = df.iloc[out_rows].copy()
  outDataFrame.insert(0, 'outlier', [features[column] for column in out_columns])

  # note: this raises the row display limit for the rest of the notebook as well
  pd.set_option('display.max_rows', 500)
  display(outDataFrame)


# list every value of the cleaned dataset whose absolute z-score lies above 3
outlierZScore(indicator_income_data_cleaned)

In [None]:
def exploreOutlier(country_name, feature, data = None):
  """Explore an outlier: print one feature of one country for all available years.

  Parameters:
    country_name: value of the index level 'country_name' to select.
    feature: name of the column to print.
    data: DataFrame with an index level 'country_name'; defaults to the
          notebook-level indicator_income_data_cleaned.
  """
  if data is None:
    data = indicator_income_data_cleaned

  country = data.loc[data.index.get_level_values('country_name') == country_name]
  print(country[feature])

# inspect the inflation values of Nigeria in the cleaned dataset
exploreOutlier('Nigeria','inflation')

## Explore the data

In [None]:
# missing values in the cleaned dataset
# NOTE(review): this is the same heatmap as the one drawn after cleaning the data - consider keeping only one of them
sb.heatmap(indicator_income_data_cleaned.isna(), cbar = False);


In [None]:
# number of rows (country and year) per income level; the column is passed as keyword x because
# recent seaborn versions no longer interpret a positional vector as x
sb.countplot(x = indicator_income_data_cleaned['income_level_label']);

In [None]:
def plot_scatter_matrix(input_data):
  """Draw a scatterplot matrix (histograms on the diagonal) of GNI per capita and all indicators."""
  # NOTE(review): income_level_label holds text labels; presumably scatter_matrix only draws the numeric columns - confirm
  plotted_columns = ['income_level_label','gni_per_capita_us_dollar','fertility_rate','urban_population','access_electricity', 'agriculture_forestry_fishing_sector','external_balance_on_goods_and_services','natural_resources_rent','inflation','life_expectancy','corruption_perceptions_index']
  axes_grid = pd.plotting.scatter_matrix(input_data[plotted_columns], figsize=(18, 18), diagonal='hist')

  for axis in axes_grid.reshape(-1):
    # rotate the labels so that the long column names stay readable
    axis.xaxis.label.set_rotation(90)
    axis.yaxis.label.set_rotation(0)
    # move the rotated y label to the left to prevent overlap with the figure
    axis.get_yaxis().set_label_coords(-0.6,0.5)
    # hide all ticks
    axis.set_xticks(())
    axis.set_yticks(())

  plt.show()

# scatterplot matrix of the cleaned dataset
plot_scatter_matrix(indicator_income_data_cleaned)

In [None]:
def plot_gni_scatter_plot(input_data):
  """Draw one scatter plot per indicator against GNI per capita (logarithmic y axis).

  The points are coloured by income level, encoded as
  low_income = 1, lower_middle_income = 2, upper_middle_income = 3, high_income = 4
  (NaN for any other label). input_data itself is not modified.
  """
  income_level_numbers = {'low_income': 1, 'lower_middle_income': 2, 'upper_middle_income': 3, 'high_income': 4}

  scatter_data = input_data.copy()
  scatter_data['income_classification_number'] = scatter_data['income_level_label'].map(income_level_numbers).astype(float)

  # indicators in the order in which the figures are drawn
  features = ['fertility_rate', 'urban_population', 'access_electricity',
              'agriculture_forestry_fishing_sector', 'inflation',
              'external_balance_on_goods_and_services', 'natural_resources_rent',
              'life_expectancy', 'corruption_perceptions_index']

  for feature in features:
    fig, ax = plt.subplots()
    ax.scatter(scatter_data[feature], scatter_data['gni_per_capita_us_dollar'],
               c = scatter_data['income_classification_number'], s = 1)
    ax.set_yscale('log')
    ax.set_xlabel(feature)
    ax.set_ylabel('GNI per Capita in US Dollar')
    plt.show()


# scatter plots of every indicator against GNI per capita for the cleaned dataset
plot_gni_scatter_plot(indicator_income_data_cleaned)



In [None]:
def plot_correlation_matrix(correlation_matrix_data):
  """Plot the full correlation matrix of GNI per capita and all indicators as an annotated heat map and display it as a table."""
  numeric_columns = ['gni_per_capita_us_dollar','fertility_rate','urban_population','access_electricity', 'agriculture_forestry_fishing_sector','external_balance_on_goods_and_services','natural_resources_rent','inflation','life_expectancy','corruption_perceptions_index']

  # pairwise correlations of the selected columns
  correlations = correlation_matrix_data[numeric_columns].corr()

  # heat map with a diverging colour map, every cell annotated with two decimals
  fig, ax = plt.subplots(figsize=(12, 12))
  sb.heatmap(correlations, cmap = sb.diverging_palette(220, 10, as_cmap=True), annot = True, fmt = ".2f")

  # rotate the x labels so that the long column names do not overlap
  ax.set_xticklabels(numeric_columns, rotation = 45, horizontalalignment = 'right');
  ax.set_yticklabels(numeric_columns);
  plt.show()

  display(correlations)

# correlation matrix of the cleaned dataset
plot_correlation_matrix(indicator_income_data_cleaned)  

# ML Algorithm

## Separate train and test data

In [None]:
def separate_training_test_data(input_series, rnd):
  """Split a dataset into stratified training and test data (test_size = 0.2).

  Parameters:
    input_series: DataFrame whose last column is the label and whose other
                  columns are the features; the index is ignored.
    rnd: random state passed to train_test_split.

  Returns:
    X_train, X_test, y_train, y_test as numpy arrays.
  """
  # split up input_series into X (all columns but the last) and y (last column);
  # the columns are taken positionally from the frame itself, so the result does not depend on the number of index levels
  X = input_series.iloc[:, :-1].values
  y = input_series.iloc[:, -1].values

  # separate training and test data with test_size = 0.2
  X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size = 0.2, random_state = rnd)
  
  return X_train, X_test, y_train, y_test

## Build random forest classification model

In [None]:
def build_random_forest_classifier(X, y, k, rnd):
  """Tune a random forest classifier with a grid search and return the fitted best model.

  Parameters:
    X: feature matrix of the training data.
    y: labels of the training data.
    k: number of cross-validation folds; the grid search uses k // 3 folds
       (but at least 2) to reduce the execution time.
    rnd: random state of the random forest.

  Returns:
    RandomForestClassifier with the best hyperparameters found, fitted on all of X and y.
  """

  # set up grid-search to optimize hyperparameters of random forest algorithm
  ## n_estimators ... number of trees in random forest
  ## max_depth ... maximum depth in trees
  ## max_features ... number of considered features
  ## min_samples_split ... minimum number of samples for node splitting
  ## min_samples_leaf ... minimum number of samples for leaf node
  ## bootstrap ... bootstrap samples used for building trees
  gsc = GridSearchCV(
      estimator = RandomForestClassifier(random_state = rnd),
      param_grid = {
          'n_estimators': [50, 100, 150],
          'max_depth': [2, 4, 8],
          # 'auto' is not listed: for a classifier it was an alias of 'sqrt' (so it only duplicated work)
          # and it is no longer accepted by recent scikit-learn releases
          'max_features' : ['sqrt', 'log2'],
          'min_samples_split': [2, 3, 5],
          'min_samples_leaf' : [1, 2, 5],
          'bootstrap' : [True, False]
      },
      cv = max(2, k // 3), # divide k by 3 to reduce execution time; cross-validation needs at least 2 folds
      scoring = 'f1_weighted', 
      n_jobs = -1, # use all processors available
      refit = True # refit the best parameter combination on the complete training data
  )

  # obtain hyperparameters through grid-search
  gsc.fit(X, y)

  # The model refitted by the grid search is returned directly. Fitting the same model
  # once per k-fold split would not combine the folds: every fit() call replaces the
  # previous fit, so only the last split's training part would be used.
  return gsc.best_estimator_

## Perform classification of labels

In [None]:
def perform_label_classification(input_series, parameters, k, rnd):
   """Train a random forest on the given data and evaluate it on a held-out test split.

   Parameters:
     input_series: DataFrame with the feature columns followed by the label column.
     parameters: feature names, used to label the feature importances.
     k: number of cross-validation folds passed to the model building.
     rnd: random state for the train/test split and the model.

   Returns:
     result_dict_wt: weighted f1, precision and recall.
     result_dict_income_level: precision per income level.
     confusion_mx_data: confusion matrix as DataFrame (rows = true labels, columns = predicted labels).
     feature_importance: feature importances as Series, sorted descending.
   """
   # only rows without nan-values can be used (the caller's data stays untouched)
   complete_rows = input_series.dropna()

   # separate train and test data, fit the model and predict the test data
   X_train, X_test, y_train, y_test = separate_training_test_data(complete_rows, rnd)
   classifier = build_random_forest_classifier(X_train, y_train, k, rnd)
   y_pred = classifier.predict(X_test)

   # income-level labels that occur in the test data
   labels = np.unique(y_test)

   # weighted evaluation metrics
   weighted_settings = dict(average = 'weighted', labels = labels, zero_division = 1)
   result_dict_wt = {
       'f1': f1_score(y_test, y_pred, **weighted_settings),
       'precision': precision_score(y_test, y_pred, **weighted_settings),
       'recall': recall_score(y_test, y_pred, **weighted_settings),
   }

   # precision for each income-level
   per_label_precision = precision_score(y_test, y_pred, average = None, labels = labels)
   result_dict_income_level = dict(zip(labels, per_label_precision))

   # confusion matrix with the income-level labels as index and column names
   confusion_mx_data = pd.DataFrame(confusion_matrix(y_test, y_pred, labels = labels),
                                    index = labels, columns = labels)

   # feature importance, most important feature first
   feature_importance = pd.Series(classifier.feature_importances_, index = parameters,
                                  name = 'feature_importance').sort_values(ascending = False)

   return result_dict_wt, result_dict_income_level, confusion_mx_data, feature_importance

## Selection of calendar year and parameters

In [None]:
def selection_of_calendar_year(input_data, calendar_year, parameters):
  """Return the given columns of all rows that belong to one calendar year.

  The result keeps the (calendar_year, country_name) index; input_data is not modified.
  """
  flat_data = input_data.reset_index()
  rows_of_year = flat_data.loc[flat_data['calendar_year'] == calendar_year]

  return rows_of_year.set_index(['calendar_year', 'country_name'])[parameters]

# Question 2

## Classification of income-level labels

In [None]:
def classify_income_level_labels(input_data, calendar_year, k, seed):
  """Classify the income levels of one calendar year with a random forest.

  Parameters:
    input_data: DataFrame indexed by (calendar_year, country_name) that contains the
                feature columns listed below and the column income_level_label.
    calendar_year: year whose rows are used for training and testing.
    k: number of cross-validation folds passed to the model building.
    seed: seed of the numpy random generator, ensures reproducible results.

  Returns:
    The result tuple of perform_label_classification (weighted metrics, precision
    per income level, confusion matrix, feature importance).
  """

  parameters = ['fertility_rate', 'urban_population', 'access_electricity', 
                    'agriculture_forestry_fishing_sector', 
                    'external_balance_on_goods_and_services', 
                    'life_expectancy', 'natural_resources_rent', 
                    'inflation', 'corruption_perceptions_index']
  
  label = ['income_level_label']

  data = selection_of_calendar_year(input_data, calendar_year, parameters + label)

  # initialize random seed in order to ensure reproducible results
  np.random.seed(seed)
  rnd = np.random.randint(0,1000)

  # extract series of specific calendar_year (drops the calendar_year index level)
  series = data.loc[calendar_year]

  # the train/test split is done inside perform_label_classification, no separate split is needed here

  # starttime 
  print('start: ', datetime.now().time())

  # perform income-level classifications
  result = perform_label_classification(series, parameters, k, rnd)

  # endtime
  print('end: ', datetime.now().time())

  return result

# classify the income levels of calendar year 2018 (k = 10, seed = 56) and unpack the evaluation results
result_dict_wt, result_dict_income_level, confusion_mx_data, feature_importance = classify_income_level_labels(indicator_income_data_cleaned, 2018, 10, 56)

## Evaluation of metrics and visualization of results

In [None]:
# weighted f1, precision and recall on the test data
print(result_dict_wt)

In [None]:
# precision for each income level
print(result_dict_income_level)

In [None]:
# confusion matrix of the test data (rows = true labels, columns = predicted labels)
print(confusion_mx_data.to_string())

In [None]:
# feature importances of the fitted random forest, most important feature first
print(feature_importance)

# Question 3

## Compute a new column that shows whether or not a change in income-level occurred

In [None]:
def get_IndicatorLevelChange(inputdata, firstyear, secondyear):
  """Compare the income level of every country between two calendar years.

  Parameters:
    inputdata: DataFrame indexed by (calendar_year, country_name) with the column income_level_label.
    firstyear: calendar year whose rows form the result.
    secondyear: calendar year the income levels are compared with.

  Returns:
    The rows of firstyear (indexed by country_name) with the additional columns
    income_classification_number and level_change, as produced by get_change.
    inputdata is not modified.
  """
  # numeric income level for every row (works on a copy of the input)
  data = get_incomeClassificationNumber(inputdata.copy())
  data['level_change'] = np.nan

  # rows of the two years; selecting one year drops the calendar_year index level
  data1 = data.loc[firstyear]
  data2 = data.loc[secondyear]

  return get_change(data1, data2)

# NOTE(review): get_IndicatorLevelChange relies on get_incomeClassificationNumber and get_change, which are only
# defined in the two cells below - on a fresh kernel those cells have to be run before this one
# compare the income level of every country between 2008 and 2018
change_dataset = get_IndicatorLevelChange(indicator_income_data_cleaned,2008,2018)
display(change_dataset.loc[:,'level_change'])

In [None]:
def get_incomeClassificationNumber (inputdata):
  """Return a copy of inputdata with the income level encoded as a number.

  The new column income_classification_number holds
  low_income = 1, lower_middle_income = 2, upper_middle_income = 3, high_income = 4
  and NaN for any other value of income_level_label.
  """
  level_numbers = {'low_income': 1, 'lower_middle_income': 2, 'upper_middle_income': 3, 'high_income': 4}

  data = inputdata.copy()
  # float column, so that unknown labels can be represented as NaN
  data['income_classification_number'] = data['income_level_label'].map(level_numbers).astype(float)

  return data

In [None]:
def get_change(data1, data2):
  """Label every row of data1 with the relation of its income level to the one in data2.

  Parameters:
    data1: DataFrame indexed by country with the column income_classification_number.
    data2: DataFrame of the comparison year with the same index type and column.

  Returns:
    Copy of data1 (data1 itself is not modified) with the column level_change:
      'higher' - the level in data1 is higher than the level in data2
      'lower'  - the level in data1 is lower than the level in data2
      'same'   - otherwise
    Rows whose index label is missing in data2 keep their previous level_change
    value (NaN if the column did not exist) and are reported via print.

  NOTE(review): 'higher' describes data1 relative to data2, i.e. a country that moved
  up between the two years is labelled 'lower' - confirm that this reading is intended.
  """
  output = data1.copy()

  # object dtype, so that the text labels can be stored next to the NaN placeholders
  if 'level_change' not in output.columns:
    output['level_change'] = np.nan
  output['level_change'] = output['level_change'].astype(object)

  for x in output.index:

    # countries which are in data1 but not in data2 cannot be compared
    if x not in data2.index:
      print(f'{x} - not included in both years')
      continue

    level1 = output.loc[x, 'income_classification_number']
    level2 = data2.loc[x, 'income_classification_number']

    if level1 > level2:
      output.loc[x, 'level_change'] = 'higher'
    elif level1 < level2:
      output.loc[x, 'level_change'] = 'lower'
    else:
      output.loc[x, 'level_change'] = 'same'

  return output


In [None]:
# index label of the second row of the cleaned dataset
# (a row selected with iloc is a Series whose name is the row's index label; a Series has no attribute 'pd')
indicator_income_data_cleaned.iloc[1].name