<h2> 4. Prepare data for training </h2>

In [1]:
import sys, os; sys.path.insert(0, os.path.dirname(os.getcwd()))
import pandas as pd
import utils.io
import utils.preprocessing
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.plotting import plot_decision_regions

# Environment settings:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (later cells rely on this to show several results at once).
InteractiveShell.ast_node_interactivity = "all"
# Show every column when a DataFrame is displayed. The full option name is used
# because pandas warns that abbreviated names may stop resolving if a similarly
# named option is added.
pd.set_option('display.max_columns', None)

In [6]:
# Load the tables stored by the previous sections, then hand both to
# prepare_data to build the working frame `data`.
df_country_indicators, df_countries = map(
    utils.io.retrieve_table, ('CountryIndicators', 'Countries')
)
data = utils.preprocessing.prepare_data(df_country_indicators, df_countries)

In [8]:
# Sanity check: total number of missing cells in the whole frame (should be 0).
data.isna().sum().sum()

0

### Extract target variable to predict

In [4]:
# GDP-related indicators next to the year. The prediction target is
# 'NY.GDP.MKTP.KD.ZG' (GDP growth %) of the next year.
data.loc[:, ['Year', 'NY.GDP.PCAP.CD', 'NY.GDP.MKTP.KD.ZG', 'NY.GDP.PCAP.KD.ZG']]

Unnamed: 0,Year,NY.GDP.PCAP.CD,NY.GDP.MKTP.KD.ZG,NY.GDP.PCAP.KD.ZG
0,1960,771.879064,5.223728,2.990284
1,1961,811.124632,5.193432,2.456528
2,1962,833.774529,6.781221,4.045457
3,1963,841.389893,4.926467,2.306154
4,1964,918.781408,6.231664,3.650526
...,...,...,...,...
12592,2006,414.680115,-3.461495,-4.516923
12593,2007,397.956872,-3.653327,-4.883986
12594,2008,327.199084,-17.668947,-18.874825
12595,2009,594.495968,5.984391,4.242302


In [5]:
# Target = the same country's GDP growth in the following row: order the rows by
# country and year, then pull each country's next value up by one position.
# The assignment aligns on the index, so `data` keeps its current row order.
# NOTE(review): the shift is positional, so it assumes consecutive years within a
# country, and the last year of every country gets no target (NaN) — confirm both
# are handled downstream.
data['target'] = (
    data
    .sort_values(by=['CountryCode', 'Year'])
    .groupby('CountryCode')['NY.GDP.MKTP.KD.ZG']
    .shift(-1)
)
data.loc[:, ['CountryCode', 'Year', 'NY.GDP.MKTP.KD.ZG', 'target']]

Unnamed: 0,CountryCode,Year,NY.GDP.MKTP.KD.ZG,target
0,ABW,1960,5.223728,5.193432
1,ABW,1961,5.193432,6.781221
2,ABW,1962,6.781221,4.926467
3,ABW,1963,4.926467,6.231664
4,ABW,1964,6.231664,23.722879
...,...,...,...,...
12592,ZWE,2006,-3.461495,-3.653327
12593,ZWE,2007,-3.653327,-17.668947
12594,ZWE,2008,-17.668947,5.984391
12595,ZWE,2009,5.984391,11.375921


Look at the non-numerical columns:

In [6]:
# Collect the columns whose dtype is not numeric so they can be inspected.
non_numeric = data.select_dtypes(exclude=['number'])
non_numeric

Unnamed: 0,CountryCode,Region,IncomeGroup,LongName
0,ABW,Latin America & Caribbean,High income: nonOECD,Aruba
1,ABW,Latin America & Caribbean,High income: nonOECD,Aruba
2,ABW,Latin America & Caribbean,High income: nonOECD,Aruba
3,ABW,Latin America & Caribbean,High income: nonOECD,Aruba
4,ABW,Latin America & Caribbean,High income: nonOECD,Aruba
...,...,...,...,...
12592,ZWE,Sub-Saharan Africa,Low income,Republic of Zimbabwe
12593,ZWE,Sub-Saharan Africa,Low income,Republic of Zimbabwe
12594,ZWE,Sub-Saharan Africa,Low income,Republic of Zimbabwe
12595,ZWE,Sub-Saharan Africa,Low income,Republic of Zimbabwe


In [7]:
# Distinct labels of the two categorical columns. Both results are displayed
# because ast_node_interactivity is set to "all" in the environment settings.
non_numeric.loc[:, 'Region'].unique()
non_numeric.loc[:, 'IncomeGroup'].unique()

array(['Latin America & Caribbean', 'Europe & Central Asia', 'South Asia',
       'Sub-Saharan Africa', '', 'Middle East & North Africa',
       'East Asia & Pacific', 'North America'], dtype=object)

array(['High income: nonOECD', 'Low income', 'Upper middle income', '',
       'Lower middle income', 'High income: OECD'], dtype=object)

##### Encode and drop categorical columns

In [5]:
# Ordinal encoding of the income group, centred on 'Upper middle income' (0).
# The empty label '' is mapped to 0 as well, i.e. it is encoded the same as
# 'Upper middle income'.
income_group_replace_dict = {'Low income': -2,
                             'Lower middle income': -1,
                             'Upper middle income': 0,
                             'High income: nonOECD': 1,
                             'High income: OECD': 2,
                             '': 0}
# Cast explicitly instead of relying on replace() to downcast the object column
# to integers (that silent downcasting is deprecated in recent pandas). Without
# a numeric dtype the column would be dropped by the later
# select_dtypes(include='number'). The cast also fails loudly if a label is
# missing from the mapping.
data['IncomeGroup'] = (
    data['IncomeGroup']
    .replace(income_group_replace_dict)
    .astype('int64')
)

In [9]:
# Inspect the encoded column (integer codes from the mapping, -2 to 2).
data.loc[:, 'IncomeGroup']

0        1
1        1
2        1
3        1
4        1
        ..
12592   -2
12593   -2
12594   -2
12595   -2
12596   -2
Name: IncomeGroup, Length: 12597, dtype: int64

In [11]:
# One-hot encode 'Region'; the original column is replaced by 'Region_*' columns.
# The dtype is pinned to an integer type: since pandas 2.0 get_dummies defaults
# to bool, and bool columns are not selected by the later
# select_dtypes(include='number'), which would silently drop the region features.
data = pd.get_dummies(data, columns=['Region'], dtype='uint8')

In [17]:
# Which non-numeric columns are still left after the encoding steps?
data.select_dtypes(exclude=['number'])

Unnamed: 0,CountryCode,LongName
0,ABW,Aruba
1,ABW,Aruba
2,ABW,Aruba
3,ABW,Aruba
4,ABW,Aruba
...,...,...
12592,ZWE,Republic of Zimbabwe
12593,ZWE,Republic of Zimbabwe
12594,ZWE,Republic of Zimbabwe
12595,ZWE,Republic of Zimbabwe


In [19]:
# Keep only the numeric columns; everything else is dropped from `data`.
# Note that select_dtypes does not count bool columns as 'number'.
data = data.select_dtypes(include=['number'])