## Preparing the dataset for analysis and modelling

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import pystan
import pystan_utils
import os

# matplotlib style options: use the 'ggplot' style sheet, render figures
# inline in the notebook and set the default figure size (width, height).
plt.style.use('ggplot')
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 10)

In [2]:
# Load the raw SDG indicator table and drop the columns that are not used below.
df = pd.read_csv("SDG_Indicators.csv")

# Identification and description columns of the raw table.
metadata_columns = [
    'IndicatorId', 'Series Code', 'Series Type', 'Series Description',
    'Parent Country or Area Code', 'Country or Area Code', 'Country or Area Name',
    'LDC', 'LLDC', 'SIDS', 'Frequency', 'Source type', 'Location', 'Value type',
]
# 'FN' followed by 'FN.1' ... 'FN.34' (presumably per-year footnotes -- TODO confirm).
footnote_columns = ['FN'] + ['FN.{}'.format(i) for i in range(1, 35)]

# The '1983' year column is dropped as well, so the yearly values start at '1984'.
df1 = df.drop(metadata_columns + ['1983'] + footnote_columns, axis=1)
df1.columns

Index(['Goal', 'Target', 'Indicator Ref', 'Indicator Description', 'Age group',
       'Sex', 'Unit', 'Unit multiplier', '1984', '1985', '1986', '1987',
       '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995', '1996',
       '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005',
       '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017'],
      dtype='object')

Selecting relevant rows

In [3]:
# Rows
# Keep only the rows that cover all age groups, then drop the now-constant column.
df2 = df1[df1['Age group'].isin(['All age ranges or no breakdown by age'])]
df3 = df2.drop(['Age group'], axis=1)

# Keep only the rows that cover both sexes, then drop the now-constant column.
df4 = df3[df3['Sex'].isin(['Both sexes or no breakdown by sex'])]
df5 = df4.drop(['Sex'], axis=1)

# Delete rows with no data. Indexing with the boolean frame ~df5.isnull() only
# masks individual cells and never removes a row, so instead drop every row
# whose yearly value columns (the columns named by a year, e.g. '1984') are
# all missing.
year_columns = [c for c in df5.columns if str(c).isdigit()]
df6 = df5.dropna(how='all', subset=year_columns)

Use category sheet to find indicators of interest

In [4]:
# Read the indicator-to-category sheet (a semicolon-separated CSV file) and
# discard its two unnamed columns.
indicator_categories = (
    pd.read_csv('Dev_Indicators.csv', delimiter=';')
    .drop(['Unnamed: 2', 'Unnamed: 3'], axis=1)
)
indicator_categories.head()

FileNotFoundError: File b'Dev_Indicators.csv' does not exist

In [None]:
# Attach the category to every indicator row; merge() defaults to an inner
# join, so only indicator descriptions present in both frames are kept.
ind_cat_joined = pd.merge(indicator_categories, df6, on='Indicator Description')
# Remove the rows whose category is '----' or '---'.
ind_cat_joined_1 = ind_cat_joined.loc[ind_cat_joined['Category'].ne('----')]
ind_cat_joined_2 = ind_cat_joined_1.loc[ind_cat_joined_1['Category'].ne('---')]
ind_cat_joined_2

### Prepare Population Dataframe

In [None]:
# Indicators whose category label contains 'Population'.
df_population = ind_cat_joined_2.loc[ind_cat_joined_2['Category'].str.contains('Population')]

In [None]:
# Labels of the yearly value columns: '1984' through '2017', as strings.
year = [str(y) for y in range(1984, 2018)]

In [None]:
def popToPercent(unit, n):
    """Convert a population-based rate to percent.

    Parameters
    ----------
    unit : str
        Unit of ``n``: 'Percent', 'Per 1,000 population',
        'Per 1,000 uninfected population' or 'Per 100,000 population'.
    n : float, int or str
        The value. Strings may contain thousands separators ('1,234.5').

    Returns
    -------
    float
        The value expressed in percent, or ``np.nan`` when ``n`` is missing
        (None, NaN or a blank string) or ``unit`` is not one of the
        supported units.
    """
    if n is None:
        return np.nan
    if isinstance(n, str):
        # Strip thousands separators before parsing; a blank cell is missing data.
        n = n.replace(',', '').strip()
        if not n:
            return np.nan
    n2 = float(n)
    if np.isnan(n2):
        return np.nan
    if unit == 'Percent':
        return n2
    # Every unit has to be compared explicitly: `unit == a or b` is always
    # true because the non-empty string `b` is truthy, which would send every
    # unit through the per-1,000 branch.
    if unit in ('Per 1,000 population', 'Per 1,000 uninfected population'):
        return n2 / 10
    if unit == 'Per 100,000 population':
        return n2 / 1000
    # Unsupported unit: no conversion is known, so report the value as missing.
    return np.nan

In [None]:
# Rows that still have to be converted to percent. Rows already reported in
# 'Percent' are excluded together with the plain counts ('Number'): they are
# collected separately in df_pop_percent and would otherwise appear twice once
# both frames are combined.
df_pop_soon_percent = df_population[~df_population['Unit'].isin(['Number', 'Percent'])]
# Work on a copy so the column assignments below do not write into a slice.
df_pop_to_percent = df_pop_soon_percent.copy()
for yr in year:
    df_pop_to_percent[yr] = df_pop_to_percent.apply(lambda r: popToPercent(r['Unit'], r[yr]), axis=1)
# All yearly values have been passed through popToPercent, so relabel the unit.
df_pop_to_percent['Unit'] = 'Percent'

In [None]:
# Population indicators whose unit is already 'Percent'.
df_pop_percent = df_population.loc[df_population['Unit'] == 'Percent']

In [None]:
# Stack the rows already in percent on top of the converted rows. pd.concat is
# used because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
df_pop_in_percent = pd.concat([df_pop_percent, df_pop_to_percent])

### Economy and Politics

In [None]:
# Indicators whose category label contains 'E&P' (economy and politics).
df_ecopol = ind_cat_joined_2.loc[ind_cat_joined_2['Category'].str.contains('E&P')]
df_ecopol.head()

In [None]:
# Split the economy/politics indicators by unit.
# Rows whose unit is already 'Percent':
df_ecopol_percent = df_ecopol.loc[df_ecopol['Unit'] == 'Percent']
# Every other row; these still need a conversion to percent or USD:
df_ecopol_to_percent = df_ecopol.loc[df_ecopol['Unit'] != 'Percent']

In [None]:
# Indicators that count countries. .copy() yields an independent frame, so the
# column assignments below do not write into a slice of df_ecopol_to_percent
# (which triggers pandas' SettingWithCopyWarning).
df_ecopol_to_percent_no_countries = df_ecopol_to_percent[
    df_ecopol_to_percent['Indicator Description'].str.contains('Number of countries')
].copy()

# Dividing by 1.93 equals count / 193 * 100, i.e. the share in percent of 193
# countries -- presumably the total number of countries; TODO confirm.
for yr in year:
    df_ecopol_to_percent_no_countries[yr] = df_ecopol_to_percent_no_countries.apply(lambda r: float(r[yr])/1.93, axis=1)
df_ecopol_to_percent_no_countries['Unit'] = 'Percent'

df_ecopol_to_percent_no_countries

In [None]:
# Stack the percent rows and the converted country counts. pd.concat is used
# because DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
df_ecopol_percent1 = pd.concat([df_ecopol_percent, df_ecopol_to_percent_no_countries])

In [None]:
# Economy/politics rows whose unit mentions 'USD' but not 'Megajoules'.
df_ecopol_to_usd = df_ecopol.loc[
    df_ecopol['Unit'].str.contains('USD') & ~df_ecopol['Unit'].str.contains('Megajoules')
]

In [None]:
def toUSD(unit, multiplier, n):
    """Express a monetary value in plain USD (or per single USD).

    Parameters
    ----------
    unit : str
        'USD' or 'Constant USD' (scaled by ``multiplier``), or
        'Per 1,000 USD' (converted to a per-USD rate).
    multiplier : str
        Scale of ``n`` for the USD units: 'Units', 'Thousands', 'Millions'
        or 'Billions'. Not used for 'Per 1,000 USD'.
    n : float, int or str
        The value. Strings may contain thousands separators ('1,234.5').

    Returns
    -------
    float
        The converted value, or ``np.nan`` when ``n`` is missing (None, NaN
        or a blank string) or the unit/multiplier is not supported.
    """
    if n is None:
        return np.nan
    if isinstance(n, str):
        # Strip thousands separators before parsing; a blank cell is missing data.
        n = n.replace(',', '').strip()
        if not n:
            return np.nan
    n2 = float(n)
    if np.isnan(n2):
        return np.nan
    if unit in ('Constant USD', 'USD'):
        # A value given in millions stands for n2 * 1,000,000 units, so the
        # multiplier scales the value up (dividing would shrink it instead).
        factors = {
            'Units': 1,
            'Thousands': 1000,
            'Millions': 1000000,
            'Billions': 1000000000,
        }
        factor = factors.get(multiplier)
        return np.nan if factor is None else n2 * factor
    if unit == 'Per 1,000 USD':
        return n2 / 1000
    # Unsupported unit: no conversion is known, so report the value as missing.
    return np.nan

In [None]:
# Run toUSD over every yearly value column. The frame is copied first so the
# assignments do not write into a slice of df_ecopol (SettingWithCopyWarning).
df_ecopol_to_usd = df_ecopol_to_usd.copy()
for yr in year:
    df_ecopol_to_usd[yr] = df_ecopol_to_usd.apply(lambda r: toUSD(r['Unit'], r['Unit multiplier'], r[yr]), axis=1)
df_ecopol_to_usd

### Environment

In [None]:
# Indicators whose category label contains 'Environment'. Selected from
# ind_cat_joined_2 (the frame without the '----' / '---' categories), the same
# source the population and economy/politics selections use.
df_environment = ind_cat_joined_2[ind_cat_joined_2['Category'].str.contains('Environment')]

In [None]:
# Split the environment indicators by whether the unit mentions 'Percent'.
df_environment_percent = df_environment.loc[df_environment['Unit'].str.contains('Percent')]
df_environment_not_percent = df_environment.loc[~df_environment['Unit'].str.contains('Percent')]
# Of the non-percent rows, keep those measured in one of the weight-based units.
df_environment_weight_units = df_environment_not_percent.loc[
    df_environment_not_percent['Unit'].isin([
        'Metric Tons',
        'Micrograms per cubic meter',
        'Kilograms',
        'Tonne kilometres',
        'kg CO2 equivalent per USD1 constant 2005 PPP GDP',
        'Kilograms per constant USD',
    ])
]


In [None]:
def convertWeightUnits(unit, multiplier, n):
    """Convert a weight-based value to metric tons.

    Parameters
    ----------
    unit : str
        'Metric Tons' or 'Tonne kilometres' (scaled by ``multiplier``);
        'Kilograms', 'kg CO2 equivalent per USD1 constant 2005 PPP GDP' or
        'Kilograms per constant USD' (divided by 1,000);
        'Micrograms per cubic meter' (divided by 10**12).
    multiplier : str
        Scale of ``n`` for the tonne units: 'Units', 'Thousands', 'Millions'
        or 'Billions'. Not used for the other units.
    n : float, int or str
        The value. Strings may contain thousands separators ('1,234.5').

    Returns
    -------
    float
        The converted value, or ``np.nan`` when ``n`` is missing (None, NaN
        or a blank string) or the unit/multiplier is not supported.
    """
    if n is None:
        return np.nan
    if isinstance(n, str):
        # Strip thousands separators before parsing; a blank cell is missing data.
        n = n.replace(',', '').strip()
        if not n:
            return np.nan
    n2 = float(n)
    if np.isnan(n2):
        return np.nan
    if unit in ('Metric Tons', 'Tonne kilometres'):
        factors = {
            'Units': 1,
            'Thousands': 1000,
            'Millions': 1000000,
            'Billions': 1000000000,
        }
        factor = factors.get(multiplier)
        return np.nan if factor is None else n2 * factor
    # Kilogram-based units: 1,000 kg make one metric ton.
    # 'Kilograms per constant USD' is one of the units this notebook selects
    # for weight conversion, so it is converted like the other kilogram units.
    if unit in ('Kilograms',
                'kg CO2 equivalent per USD1 constant 2005 PPP GDP',
                'Kilograms per constant USD'):
        return n2 / 1000
    if unit == 'Micrograms per cubic meter':
        # 10**12 micrograms make one metric ton.
        return n2 / 1000000000000
    # Unsupported unit: no conversion is known, so report the value as missing.
    return np.nan

In [None]:
def newUnitName(unit):
    """Return the unit label to use once a value is expressed in metric tons.

    'Metric Tons' and 'Tonne kilometres' keep their name; the kilogram and
    microgram based units are relabelled with 'Metric Tons'.
    'Kilograms per constant USD' is covered as well because it is one of the
    units this notebook selects for weight conversion.

    Returns None for any unit that is not listed.
    """
    renamed = {
        'Metric Tons': 'Metric Tons',
        'Tonne kilometres': 'Tonne kilometres',
        'Kilograms': 'Metric Tons',
        'kg CO2 equivalent per USD1 constant 2005 PPP GDP':
            'Metric Tons CO2 equivalent per USD1 constant 2005 PPP GDP',
        'Kilograms per constant USD': 'Metric Tons per constant USD',
        'Micrograms per cubic meter': 'Metric Tons per cubic meter',
    }
    return renamed.get(unit)

In [None]:
# Work on a copy so the assignments below do not write into a slice of
# df_environment_not_percent (SettingWithCopyWarning).
df_environment_weight_units = df_environment_weight_units.copy()
# Convert the yearly values first: convertWeightUnits needs the original
# 'Unit' and 'Unit multiplier', which are overwritten afterwards.
for yr in year:
    df_environment_weight_units[yr] = df_environment_weight_units.apply(lambda r: convertWeightUnits(r['Unit'], r['Unit multiplier'], r[yr]), axis=1)
df_environment_weight_units['Unit multiplier']='Units'
# Series.map applies newUnitName element-wise and, unlike a row-wise apply,
# also returns a Series when the frame has no rows.
df_environment_weight_units['Unit'] = df_environment_weight_units['Unit'].map(newUnitName)

Saving everything together in one DataFrame

In [None]:
# Stack all prepared frames into one table. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
complete_df = pd.concat([
    df_pop_in_percent,
    df_ecopol_percent1,
    df_ecopol_to_usd,
    df_environment_percent,
    df_environment_weight_units,
])
# Drop the helper columns. axis=1 is required: without it drop() looks for row
# labels and raises a KeyError. The column is spelled 'Unit multiplier'
# (lower-case 'm') in the data.
complete_df = complete_df.drop(['Category', 'Unit multiplier'], axis=1)

In [None]:
# Write the combined table to disk; the row index is written as the first column.
complete_df.to_csv('TargetsForAnalysis.csv', index=True)