# SDG Test Ground

In [1]:
#Import dependencies 
%matplotlib inline
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import matplotlib.lines as mlines

## Indicator Codes & Names

In [2]:
# Load the World Development Indicators table from the resources folder
WDIData = os.path.join('resources', 'WDIData.csv')
original_df = pd.read_csv(WDIData)

# Distinct indicator names, in order of first appearance
indicators = original_df['Indicator Name'].unique()
print(len(indicators))

# Preview the first eleven names
indicators[:11]

1429


array(['Access to clean fuels and technologies for cooking (% of population)',
       'Access to electricity (% of population)',
       'Access to electricity, rural (% of rural population)',
       'Access to electricity, urban (% of urban population)',
       'Account ownership at a financial institution or with a mobile-money-service provider (% of population ages 15+)',
       'Account ownership at a financial institution or with a mobile-money-service provider, female (% of population ages 15+)',
       'Account ownership at a financial institution or with a mobile-money-service provider, male (% of population ages 15+)',
       'Account ownership at a financial institution or with a mobile-money-service provider, older adults (% of population ages 25+)',
       'Account ownership at a financial institution or with a mobile-money-service provider, poorest 40% (% of population ages 15+)',
       'Account ownership at a financial institution or with a mobile-money-service provider, 

In [3]:
# Series metadata: read the file and keep only its first three columns
WDISeries = os.path.join('resources', 'WDISeries.csv')
series_df = pd.read_csv(WDISeries).iloc[:, :3]

# Split each topic once on its first colon and store the leading part
# as a new second column named "Topic Parent"
topic_df = series_df["Topic"].str.split(":", n=1, expand=True)
series_df.insert(loc=1, column="Topic Parent", value=topic_df[0])
# series_df.insert(2, "Topic Child", topic_df[1])
print(len(series_df))
series_df.head()

1429


Unnamed: 0,Series Code,Topic Parent,Topic,Indicator Name
0,AG.AGR.TRAC.NO,Environment,Environment: Agricultural production,"Agricultural machinery, tractors"
1,AG.CON.FERT.PT.ZS,Environment,Environment: Agricultural production,Fertilizer consumption (% of fertilizer produc...
2,AG.CON.FERT.ZS,Environment,Environment: Agricultural production,Fertilizer consumption (kilograms per hectare ...
3,AG.LND.AGRI.K2,Environment,Environment: Land use,Agricultural land (sq. km)
4,AG.LND.AGRI.ZS,Environment,Environment: Land use,Agricultural land (% of land area)


## Parent Topics

In [4]:
# Distinct parent topics, in order of first appearance
topics = pd.unique(series_df["Topic Parent"])
print(topics.size)
topics

12


array(['Environment', 'Economic Policy & Debt', 'Infrastructure',
       'Financial Sector', 'World Bank, International Debt Statistics.',
       'Public Sector', 'Private Sector & Trade',
       'Social Protection & Labor', 'Education', 'Gender', 'Health',
       'Poverty'], dtype=object)

## Indicator Codes

In [5]:
# List every series whose code starts with the given prefix
substring = 'SI.POV'

# NOTE(review): the result name `agr` is kept as-is although the prefix selects poverty series
agr = series_df.loc[series_df['Series Code'].str.startswith(substring)]
print(len(agr))
agr

13


Unnamed: 0,Series Code,Topic Parent,Topic,Indicator Name
1074,SI.POV.DDAY,Poverty,Poverty: Poverty rates,Poverty headcount ratio at $1.90 a day (2011 P...
1075,SI.POV.GAPS,Poverty,Poverty: Poverty rates,Poverty gap at $1.90 a day (2011 PPP) (%)
1076,SI.POV.GINI,Poverty,Poverty: Income distribution,GINI index (World Bank estimate)
1077,SI.POV.LMIC,Poverty,Poverty: Poverty rates,Poverty headcount ratio at $3.20 a day (2011 P...
1078,SI.POV.LMIC.GP,Poverty,Poverty: Poverty rates,Poverty gap at $3.20 a day (2011 PPP) (%)
1079,SI.POV.NAGP,Poverty,Poverty: Poverty rates,Poverty gap at national poverty lines (%)
1080,SI.POV.NAHC,Poverty,Poverty: Poverty rates,Poverty headcount ratio at national poverty li...
1081,SI.POV.RUGP,Poverty,Poverty: Poverty rates,Rural poverty gap at national poverty lines (%)
1082,SI.POV.RUHC,Poverty,Poverty: Poverty rates,Rural poverty headcount ratio at national pove...
1083,SI.POV.UMIC,Poverty,Poverty: Poverty rates,Poverty headcount ratio at $5.50 a day (2011 P...


### Translate Code to Indicator Name

In [6]:
# Metadata row(s) whose series code matches exactly
code = 'NY.GNP.PCAP.CD'
select = series_df[series_df['Series Code'].eq(code)]
select

Unnamed: 0,Series Code,Topic Parent,Topic,Indicator Name
701,NY.GNP.PCAP.CD,Economic Policy & Debt,Economic Policy & Debt: National accounts: Atl...,"GNI per capita, Atlas method (current US$)"


### Indicator Values

In [7]:
# selection = 'GNI per capita, Atlas method (current US$)'
# All rows of the WDI table for the indicator code chosen in the previous cell (`code`)
select = original_df[original_df['Indicator Code'].eq(code)]
print(len(select))
select

264


Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
501,Arab World,ARB,"GNI per capita, Atlas method (current US$)",NY.GNP.PCAP.CD,,,,,,,...,5780.968493,6053.445004,7000.065606,7448.258147,7544.460299,7084.897629,6654.817513,6190.119471,6367.236882,
1930,Caribbean small states,CSS,"GNI per capita, Atlas method (current US$)",NY.GNP.PCAP.CD,,,,,,,...,8765.981856,8614.941367,9315.326667,9715.496380,9648.995981,9676.421949,9086.556330,9008.397271,9240.085673,
3359,Central Europe and the Baltics,CEB,"GNI per capita, Atlas method (current US$)",NY.GNP.PCAP.CD,,,,,,,...,12760.332390,12908.282170,13063.386660,13416.548580,13557.560400,13215.979210,12837.605580,13024.126870,14563.876080,
4788,Early-demographic dividend,EAR,"GNI per capita, Atlas method (current US$)",NY.GNP.PCAP.CD,,,,,169.550384,180.525954,...,2726.824099,2970.674271,3278.481829,3416.432217,3454.090762,3373.745534,3336.861626,3379.815663,3567.833761,
6217,East Asia & Pacific,EAS,"GNI per capita, Atlas method (current US$)",NY.GNP.PCAP.CD,,,142.719774,156.136131,174.731222,193.500256,...,7319.776405,8119.449224,9087.272233,9688.427266,9908.512445,9783.629718,9837.912546,10151.536670,10982.312520,
7646,East Asia & Pacific (excluding high income),EAP,"GNI per capita, Atlas method (current US$)",NY.GNP.PCAP.CD,,,71.307032,77.570267,86.234680,97.707052,...,3763.230311,4345.421753,5079.446668,5714.289283,6207.062381,6456.829461,6648.648604,6960.829469,7600.548586,
9075,East Asia & Pacific (IDA & IBRD countries),TEA,"GNI per capita, Atlas method (current US$)",NY.GNP.PCAP.CD,,,72.122245,78.447667,87.202716,98.799531,...,3803.685865,4392.008030,5133.750986,5775.216117,6273.073572,6525.332297,6718.986320,7034.239389,7680.525267,
10504,Euro area,EMU,"GNI per capita, Atlas method (current US$)",NY.GNP.PCAP.CD,,,1087.329531,1191.975117,1313.069662,1449.395082,...,39768.317030,40305.785110,39273.152670,39470.036120,39356.781620,37730.844850,36344.960400,35838.442410,38774.486210,
11933,Europe & Central Asia,ECS,"GNI per capita, Atlas method (current US$)",NY.GNP.PCAP.CD,,,,,,,...,24486.212940,24964.496580,25273.509030,25971.850330,25997.632830,24625.549590,23322.184510,22808.815310,24328.652240,
13362,Europe & Central Asia (excluding high income),ECA,"GNI per capita, Atlas method (current US$)",NY.GNP.PCAP.CD,,,,,,,...,7441.475878,8073.418186,9251.169369,10209.739360,10138.192050,8803.015382,7708.167008,7401.174175,7804.194900,


In [8]:
# SDG tier sheet: column names are taken from the second line of the file (header=1)
SDGTiers = os.path.join('resources', 'SDGTiers.csv')
tiers_df = (
    pd.read_csv(SDGTiers, header=1)
    .iloc[:, [1, 2, 6]]  # keep three columns by position
    .dropna()            # discard rows with any missing cell
)
tiers_df.head()

Unnamed: 0,Target,Indicator,Updated Tier Classification (by IAEG-SDG Members)
1,"1.1 By 2030, eradicate extreme poverty for all...",1.1.1 Proportion of population below the inter...,Tier I
2,"1.2 By 2030, reduce at least by half the propo...",1.2.1 Proportion of population living below th...,Tier I
3,\n,"1.2.2 Proportion of men, women and children of...",Tier II
4,1.3 Implement nationally appropriate social pr...,1.3.1 Proportion of population covered by soci...,Tier II
5,"1.4 By 2030, ensure that all men and women, in...",1.4.1 Proportion of population living in house...,Tier I


## API Calls

In [9]:
import json
import requests
import datetime 
from datetime import date
from pprint import pprint

In [10]:
# Start url query: world total population from the World Bank API
url = "https://api.worldbank.org/v2/country/WLD/indicator/SP.POP.TOTL?format=json"
# A timeout keeps the cell from hanging indefinitely if the service does not answer
response = requests.get(url, timeout=30).json()

# Element 1 of the decoded payload is the list of yearly observations.
# pprint() only prints and returns None, so keep the data in `results`
# and pretty-print it as a separate step.
results = response[1]
pprint(results)


[{'country': {'id': '1W', 'value': 'World'},
  'countryiso3code': 'WLD',
  'date': '2019',
  'decimal': 0,
  'indicator': {'id': 'SP.POP.TOTL', 'value': 'Population, total'},
  'obs_status': '',
  'unit': '',
  'value': None},
 {'country': {'id': '1W', 'value': 'World'},
  'countryiso3code': 'WLD',
  'date': '2018',
  'decimal': 0,
  'indicator': {'id': 'SP.POP.TOTL', 'value': 'Population, total'},
  'obs_status': '',
  'unit': '',
  'value': 7594270356},
 {'country': {'id': '1W', 'value': 'World'},
  'countryiso3code': 'WLD',
  'date': '2017',
  'decimal': 0,
  'indicator': {'id': 'SP.POP.TOTL', 'value': 'Population, total'},
  'obs_status': '',
  'unit': '',
  'value': 7510990456},
 {'country': {'id': '1W', 'value': 'World'},
  'countryiso3code': 'WLD',
  'date': '2016',
  'decimal': 0,
  'indicator': {'id': 'SP.POP.TOTL', 'value': 'Population, total'},
  'obs_status': '',
  'unit': '',
  'value': 7426103221},
 {'country': {'id': '1W', 'value': 'World'},
  'countryiso3code': 'WLD',
 

In [21]:
# Set up empty dataframe
# The frame starts with one placeholder row (index 0) whose cells are all NaN;
# the column names are the fields collected for each observation.
columns = ['Code','Indicator','Country','Year','Value']
values_df = pd.DataFrame(columns=columns, index=[0])

In [23]:
# Parameters
code = 'SP.POP.TOTL'
country = 'WLD'

# API call (fail fast on HTTP errors instead of parsing an error page)
url = f"https://api.worldbank.org/v2/country/{country}/indicator/{code}?format=json"
api_reply = requests.get(url, timeout=30)
api_reply.raise_for_status()
response = api_reply.json()

# The observations are expected in element 1 of the payload; report a clear
# error when that element is missing rather than raising a bare IndexError.
if not isinstance(response, list) or len(response) < 2 or response[1] is None:
    raise ValueError(f"Unexpected World Bank API response for {country}/{code}: {response!r}")
results = response[1]
# NOTE(review): only this single response is read; if the API pages its
# results, later pages are not fetched here — confirm against the API docs.

# Build one record per observation (one year each). The frame is rebuilt from
# scratch on every run, so re-running the cell with different parameters
# cannot leave rows from a previous query behind.
records = [
    {
        'Code': obs['indicator']['id'],
        'Indicator': obs['indicator']['value'],
        'Country': obs['country']['value'],
        'Year': obs['date'],
        'Value': obs['value'],
    }
    for obs in results
]

# dtype=object keeps the values exactly as returned by the API;
# dropna() then removes observations that have no value.
values_df = pd.DataFrame(
    records,
    columns=['Code', 'Indicator', 'Country', 'Year', 'Value'],
    dtype=object,
).dropna()
values_df

Unnamed: 0,Code,Indicator,Country,Year,Value
1,SP.POP.TOTL,"Population, total",World,2018,7594270356
2,SP.POP.TOTL,"Population, total",World,2017,7510990456
3,SP.POP.TOTL,"Population, total",World,2016,7426103221
4,SP.POP.TOTL,"Population, total",World,2015,7340548192
5,SP.POP.TOTL,"Population, total",World,2014,7255653881
6,SP.POP.TOTL,"Population, total",World,2013,7170961674
7,SP.POP.TOTL,"Population, total",World,2012,7086993625
8,SP.POP.TOTL,"Population, total",World,2011,7004011262
9,SP.POP.TOTL,"Population, total",World,2010,6922947261
10,SP.POP.TOTL,"Population, total",World,2009,6840591577


## Cleaning the Data
### Datasets included:
 - The World Bank's World Development Indicators Data
 - United Nations Development Program Human Development Reports
   - Human Development Index
   - Education Index
   - Gender Inequality Index

In [None]:
# Load the input tables used by the cleaning steps below.
filepath = os.path.join('resources', 'indicator_df.csv')
indicator_df = pd.read_csv(filepath, header=0)

# Human Development Index
filepath = os.path.join('resources','HDI_df.csv')
hdi_df = pd.read_csv(filepath)

# UNDP Education Index
filepath = os.path.join('resources','undp_hdr_education-index.csv')
edu_index_df = pd.read_csv(filepath)

# UNDP Gender Inequality Index
filepath = os.path.join('resources','undp_hdr_gender-inequality-index.csv')
gii_df = pd.read_csv(filepath)

# WDI series metadata. This rebinds `series_df` to the full file contents,
# replacing the trimmed version built earlier in the notebook.
filepath = os.path.join('resources', 'WDISeries.csv')
series_df = pd.read_csv(filepath, header=0)

In [None]:
#Pull in all indicators from 1990-2019
# `wdi_data_df` was never assigned anywhere above, so this cell failed with a
# NameError on a fresh kernel. The WDI table read from WDIData.csv is held in
# `original_df`; bind the missing name to it and select from that table.
# NOTE(review): confirm `original_df` (not `indicator_df`) is the intended source.
wdi_data_df = original_df

id_columns = ["Country Name", "Country Code", "Indicator Name", "Indicator Code"]
year_columns = [str(year) for year in range(1990, 2020)]  # "1990" .. "2019"
full_ind_1990_2019 = wdi_data_df[id_columns + year_columns]

In [None]:
# Reshape wide -> long: one row per (country, indicator, year).
# melt() names the new columns "variable" (the year label) and "value".
full_ind_reformatted = full_ind_1990_2019.melt(
    id_vars=["Country Name", "Country Code", "Indicator Name", "Indicator Code"],
    value_vars=[str(year) for year in range(1990, 2020)],  # "1990" .. "2019"
)

In [None]:
# Discard every row that has a missing cell in the long 1990-2019 table
ind_nan_dropped = full_ind_reformatted.dropna()

# Switch to snake_case column names; melt's "variable" column holds the year
ind_nan_dropped_renamed = ind_nan_dropped.rename(
    columns={
        "Country Name": "country_name",
        "Country Code": "country_code",
        "Indicator Name": "indicator_name",
        "Indicator Code": "indicator_code",
        "variable": "year",
    }
)

In [None]:
# Series code -> topic lookup taken from the series metadata
code_topics_df = series_df.loc[:, ["Series Code", "Topic"]]
code_topics_renamed = code_topics_df.rename(
    columns={"Series Code": "series_code", "Topic": "topic"}
)

In [None]:
# Attach each observation's topic by matching indicator code to series code
# (default inner join: observations without a matching series code are dropped)
indicators_and_topics = ind_nan_dropped_renamed.merge(
    code_topics_renamed,
    left_on=["indicator_code"],
    right_on=["series_code"],
)
indicators_and_topics.head()

In [None]:
# Reshape the HDI table wide -> long: one row per (Country, year).
# The year label lands in "variable" and the HDI figure in "value".
hdi_reformatted = hdi_df.melt(
    id_vars=["Country"],
    value_vars=[str(year) for year in range(1990, 2019)],  # "1990" .. "2018"
)
hdi_reformatted.head()

In [None]:
# Pair every indicator observation with the HDI of the same country and year.
# Both tables have a "value" column, so the merge suffixes them _x (indicator) and _y (HDI).
indicators_hdi = indicators_and_topics.merge(
    hdi_reformatted,
    left_on=["country_name", "year"],
    right_on=["Country", "variable"],
)
indicators_hdi.head()

In [None]:
# Remove the join-key columns that duplicate information already present
indicators_hdi_dropped = indicators_hdi.drop(columns=["series_code", "Country", "variable"])

# value_x is the indicator value, value_y is the HDI from the merge
indicators_hdi_renamed = indicators_hdi_dropped.rename(
    columns={"value_x": "value", "value_y": "hdi"}
)

# Split the topic on ':'; piece 0 becomes the category and piece 1 the subcategory
topic_pieces = indicators_hdi_renamed["topic"].str.split(':')
indicators_hdi_renamed["category"] = topic_pieces.str[0]
indicators_hdi_renamed["subcategory"] = topic_pieces.str[1]
indicators_hdi_renamed.head()

In [None]:
# Fix the column order of the final WDI + HDI table
twb_wdi_indicators_final = indicators_hdi_renamed.loc[
    :,
    [
        "country_name", "country_code",
        "indicator_name", "indicator_code",
        "topic", "category", "subcategory",
        "year", "value", "hdi",
    ],
]
twb_wdi_indicators_final.head()

In [None]:
# Keep only the five columns shared with the UNDP index tables built below
wdi_values = twb_wdi_indicators_final.loc[
    :, ["country_name", "indicator_name", "year", "value", "hdi"]
]
wdi_values.head()

In [None]:
#Select the relevant columns from the Gender Inequality Index dataframe
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of gii_df (which triggers SettingWithCopyWarning).
gii_2014_2018 = gii_df[["Country","2014","2015","2016","2017","2018"]].copy()

#Add column for indicator name
gii_2014_2018["indicator_name"] = "Gender Inequality Index"

#Strip surrounding whitespace from the Country values.
# The earlier split(" ").str[1] kept only the second space-separated token,
# so a multi-word name such as " United States" was cut down to "United"
# and could no longer be matched against the HDI table.
gii_2014_2018["country_name"] = gii_2014_2018["Country"].str.strip()

In [None]:
# Reshape the GII table wide -> long: the year goes to "variable", the index figure to "value"
gii_2014_2018_pivot = gii_2014_2018.melt(
    id_vars=["country_name", "indicator_name"],
    value_vars=[str(year) for year in range(2014, 2019)],  # "2014" .. "2018"
)

In [None]:
# Left-join the HDI onto the GII rows by country and year;
# GII rows without a matching HDI row are kept with a missing HDI
gii_hdi_combined = gii_2014_2018_pivot.merge(
    hdi_reformatted,
    how="left",
    left_on=["country_name", "variable"],
    right_on=["Country", "variable"],
)

In [None]:
# After the merge, value_x is the GII figure and value_y is the HDI
gii_hdi_renamed = gii_hdi_combined.rename(
    columns={"variable": "year", "value_x": "value", "value_y": "hdi"}
)

# Keep the same five columns as the WDI values table
gii_hdi = gii_hdi_renamed.loc[
    :, ["country_name", "indicator_name", "year", "value", "hdi"]
]

# Preview GII dataframe
gii_hdi.head()

In [None]:
#Select the relevant columns from the Education Index dataframe
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of edu_index_df (which triggers SettingWithCopyWarning).
ei_2014_2018 = edu_index_df[["Country","2014","2015","2016","2017","2018"]].copy()

#Add column for indicator name
ei_2014_2018["indicator_name"] = "Education Index"

#Strip surrounding whitespace from the Country values.
# The earlier split(" ").str[1] kept only the second space-separated token,
# so a multi-word name such as " United States" was cut down to "United"
# and could no longer be matched against the HDI table.
ei_2014_2018["country_name"] = ei_2014_2018["Country"].str.strip()

In [None]:
# Reshape the Education Index table wide -> long: year in "variable", index figure in "value"
ei_2014_2018_pivot = ei_2014_2018.melt(
    id_vars=["country_name", "indicator_name"],
    value_vars=[str(year) for year in range(2014, 2019)],  # "2014" .. "2018"
)

In [None]:
# Left-join the HDI onto the Education Index rows by country and year;
# rows without a matching HDI row are kept with a missing HDI
ei_hdi_combined = ei_2014_2018_pivot.merge(
    hdi_reformatted,
    how="left",
    left_on=["country_name", "variable"],
    right_on=["Country", "variable"],
)

In [None]:
# After the merge, value_x is the Education Index figure and value_y is the HDI
ei_hdi_renamed = ei_hdi_combined.rename(
    columns={"variable": "year", "value_x": "value", "value_y": "hdi"}
)

# Keep the same five columns as the WDI values table
ei_hdi = ei_hdi_renamed.loc[
    :, ["country_name", "indicator_name", "year", "value", "hdi"]
]

# Preview Education Index dataframe
ei_hdi.head()

In [None]:
#Create combined indicators dataframe from both WDI and UNDP datasets
all_indicators = [wdi_values, gii_hdi, ei_hdi]
combined_indicators = pd.concat(all_indicators)

#Drop ".." rows (the placeholder string used for missing figures) in either column.
# .copy() makes the result an independent frame so the dtype conversions
# below do not assign into a slice of combined_indicators.
has_figures = (combined_indicators["value"] != "..") & (combined_indicators["hdi"] != "..")
dropped_non_float = combined_indicators[has_figures].copy()

#Assign value and hdi columns as floats
dropped_non_float["value"] = dropped_non_float["value"].astype(float)
dropped_non_float["hdi"] = dropped_non_float["hdi"].astype(float)

#Drop rows where either figure is missing. The GII/Education Index tables were
# left-joined to the HDI, so unmatched rows carry NaN in "hdi"; a single NaN
# makes stats.linregress return NaN for every statistic in the charts below.
dropped_non_float = dropped_non_float.dropna(subset=["value", "hdi"])
dropped_non_float.head()

In [None]:
# Bind the final name to the cleaned frame (same object, not a copy)
combined_indicators_final = dropped_non_float
# Display the dtype of each column
combined_indicators_final.dtypes

## Working with Primary Indicators 
Indicators tracked by the most countries over the most years

In [None]:
#Set chart style for all included charts in notebook
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so use whichever spelling this
# installation provides. If neither exists the current style is left unchanged.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

In [None]:
# Primary indicators (most commonly tracked by countries over the years)
common_indicators = [
    "Population, total",
    "Agriculture, forestry, and fishing, value added (% of GDP)",
    "Life expectancy at birth, total (years)",
    "GDP (current US$)",
    "GDP per capita (current US$)",
    "GNI per capita, Atlas method (current US$)",
    "Employers, total (% of total employment) (modeled ILO estimate)",
]

# Keep only the rows whose indicator name is in the list above
primary_indicators = combined_indicators_final.loc[
    combined_indicators_final['indicator_name'].isin(common_indicators)
]

# Write the subset to the resources folder
filepath = os.path.join('resources', 'primary_indicators.csv')
primary_indicators.to_csv(filepath)

primary_indicators.head()

In [None]:
# Agriculture share of GDP vs. HDI: scatter plot with a least-squares fit line
agri_values = primary_indicators[
    primary_indicators['indicator_name'].eq('Agriculture, forestry, and fishing, value added (% of GDP)')
]

# x is the indicator value, y is the HDI
x, y = agri_values['value'], agri_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('Agricultural Contribution to GDP (%)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Agricultural Contribution vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'Agriculture_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)


In [None]:
# Life expectancy vs. HDI: scatter plot with a least-squares fit line
life_values = primary_indicators[
    primary_indicators['indicator_name'].eq('Life expectancy at birth, total (years)')
]

# x is the indicator value, y is the HDI
x, y = life_values['value'], life_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('Life expectancy at birth (years)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Life Expectancy vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'Life-expectancy_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

In [None]:
# GDP vs. HDI: scatter plot with a least-squares fit line
gdp_values = primary_indicators[
    primary_indicators['indicator_name'].eq('GDP (current US$)')
]

# x is the indicator value, y is the HDI
x, y = gdp_values['value'], gdp_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('GDP (US $)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Gross Domestic Product vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'GDP_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

In [None]:
# GNI per capita vs. HDI: scatter plot with a least-squares fit line

GNI_values = primary_indicators[
    primary_indicators['indicator_name'].eq('GNI per capita, Atlas method (current US$)')
]

# x is the indicator value, y is the HDI
x, y = GNI_values['value'], GNI_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('GNI per capita, (current US$)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Gross National Income vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'GNI_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

In [None]:
# "Employers" indicator vs. HDI: scatter plot with a least-squares fit line
# NOTE(review): the axis label and title say "Employment", but the series
# plotted is "Employers, total (% of total employment)" — confirm the wording.
employment_values = primary_indicators[
    primary_indicators['indicator_name'].eq('Employers, total (% of total employment) (modeled ILO estimate)')
]

# x is the indicator value, y is the HDI
x, y = employment_values['value'], employment_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('Employment (%)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Employment vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'Employment_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

## Working with Secondary Indicators

In [None]:
# Education indicators chosen for a closer look at how education relates to HDI
education_indicators = [
    "Education Index",
    "Labor force with basic education (% of total working-age population with basic education)",
    "Labor force with intermediate education (% of total working-age population with intermediate education)",
    "Labor force with advanced education (% of total working-age population with advanced education)",
    "Government expenditure on education, total (% of GDP)",
    "Literacy rate, adult total (% of people ages 15 and above)",
]

# Keep only the rows whose indicator name is in the list above
secondary_indicators_education = combined_indicators_final.loc[
    combined_indicators_final['indicator_name'].isin(education_indicators)
]

# Save to CSV
filepath = os.path.join('resources', 'education_df.csv')
secondary_indicators_education.to_csv(filepath)

# Preview data frame
secondary_indicators_education.head()

In [None]:
# Gender indicators chosen for a closer look at how gender parity relates to HDI
gender_indicators = [
    "Gender Inequality Index",
    "Labor force, female (% of total labor force)",
    "Unemployment, female (% of female labor force) (modeled ILO estimate)",
    "School enrollment, primary (gross), gender parity index (GPI)",
    "School enrollment, secondary (gross), gender parity index (GPI)",
    "School enrollment, tertiary (gross), gender parity index (GPI)",
    "Proportion of seats held by women in national parliaments (%)",
]

# Keep only the rows whose indicator name is in the list above
secondary_indicators_gender = combined_indicators_final.loc[
    combined_indicators_final['indicator_name'].isin(gender_indicators)
]

# Save to CSV
filepath = os.path.join('resources', 'gender_df.csv')
secondary_indicators_gender.to_csv(filepath)

# Preview data frame
secondary_indicators_gender.head()

In [None]:
# Accessibility indicators (basic services and technology) chosen for a closer
# look at how access to them relates to HDI
access_indicators = [
    "Mobile cellular subscriptions (per 100 people)",
    "Individuals using the Internet (% of population)",
    "People using at least basic drinking water services (% of population)",
    "Access to clean fuels and technologies for cooking (% of population)",
    "Access to electricity (% of population)",
    "People using at least basic sanitation services (% of population)",
]

# Keep only the rows whose indicator name is in the list above
secondary_indicators_access = combined_indicators_final.loc[
    combined_indicators_final['indicator_name'].isin(access_indicators)
]

# Save to CSV
filepath = os.path.join('resources', 'access_df.csv')
secondary_indicators_access.to_csv(filepath)

# Preview data frame
secondary_indicators_access.head()

In [None]:
# Education Index vs. HDI: scatter plot with a least-squares fit line
edu_1_values = secondary_indicators_education[
    secondary_indicators_education['indicator_name'].eq('Education Index')
]

# x is the indicator value, y is the HDI
x, y = edu_1_values['value'], edu_1_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('Education Index', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Education Index vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'Education_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

In [None]:
# Labor force with basic education vs. HDI: scatter plot with a least-squares fit line
edu_2_values = secondary_indicators_education[
    secondary_indicators_education['indicator_name'].eq('Labor force with basic education (% of total working-age population with basic education)')
]

# x is the indicator value, y is the HDI
x, y = edu_2_values['value'], edu_2_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('Working Age Labor Force with Basic Education (%)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Workforce with Basic Education vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'WorkForceBasicEducation_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

In [None]:
# Labor force with intermediate education vs. HDI: scatter plot with a least-squares fit line
edu_3_values = secondary_indicators_education[
    secondary_indicators_education['indicator_name'].eq('Labor force with intermediate education (% of total working-age population with intermediate education)')
]

# x is the indicator value, y is the HDI
x, y = edu_3_values['value'], edu_3_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('Working Age Labor force with Intermediate Education (%)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Workforce with Intermediate Education vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'WorkForceIntermediateEducation_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

In [None]:
# Labor force with advanced education vs. HDI: scatter plot with a least-squares fit line
edu_4_values = secondary_indicators_education[
    secondary_indicators_education['indicator_name'].eq('Labor force with advanced education (% of total working-age population with advanced education)')
]

# x is the indicator value, y is the HDI
x, y = edu_4_values['value'], edu_4_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('Working Age Labor force with Advanced Education (%)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Workforce with Advanced Education vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'WorkForceAdvancedEducation_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

In [None]:
# Government expenditure on education (% of GDP) vs. HDI: scatter plot with a least-squares fit line
edu_5_values = secondary_indicators_education[
    secondary_indicators_education['indicator_name'].eq('Government expenditure on education, total (% of GDP)')
]

# x is the indicator value, y is the HDI
x, y = edu_5_values['value'], edu_5_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('Government Total Expenditure on Education', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Government Total Expenditure on Education vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'ExpenditureEducation_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

In [None]:
# Adult literacy rate vs. HDI: scatter plot with a least-squares fit line
edu_6_values = secondary_indicators_education[
    secondary_indicators_education['indicator_name'].eq('Literacy rate, adult total (% of people ages 15 and above)')
]

# x is the indicator value, y is the HDI
x, y = edu_6_values['value'], edu_6_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('Adult Literacy Rate (% of people ages 15 and above)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Adult Literacy Rate vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'Literacy_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

In [None]:
# Gender Inequality Index vs. HDI: scatter plot with a least-squares fit line
gender_1_values = secondary_indicators_gender[
    secondary_indicators_gender['indicator_name'].eq('Gender Inequality Index')
]

# x is the indicator value, y is the HDI
x, y = gender_1_values['value'], gender_1_values['hdi']

# Linear regression of y on x, then the fitted y for every x
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = intercept + slope * np.asarray(x)

# Draw through the Figure/Axes objects
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

ax.set_xlabel('Gender Inequality Index', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Gender Inequality Index vs. HDI", fontsize=18)

# Save the image, then show the figure
fig.tight_layout()
img_path = os.path.join('static/images', 'GII_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# R-squared of the fit
print(r_value**2)

In [None]:
# Female Unemployment vs. HDI: scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
gender_2_values = secondary_indicators_gender[
    secondary_indicators_gender['indicator_name']
    == 'Unemployment, female (% of female labor force) (modeled ILO estimate)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = gender_2_values['value'], gender_2_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Unemployed Female Percentage of Labor Force', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Unemployed Female Percentage of Labor Force vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Female_Unemployment_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)


In [None]:
# Female Labor Force Percentage vs. HDI: scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
gender_3_values = secondary_indicators_gender[
    secondary_indicators_gender['indicator_name']
    == 'Labor force, female (% of total labor force)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = gender_3_values['value'], gender_3_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Labor Force, Female (% of total labor force)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Female Labor Force Percentage vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Female_Labor_Force.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)

In [None]:
# Primary School Enrollment Gender Parity Index (GPI) vs. HDI:
# scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
gender_4_values = secondary_indicators_gender[
    secondary_indicators_gender['indicator_name']
    == 'School enrollment, primary (gross), gender parity index (GPI)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = gender_4_values['value'], gender_4_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Primary School Enrollment (Gross), Gender Parity Index (GPI)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Primary School Enrollment Gender Parity Index vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Primary_GPI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)

In [None]:
# Secondary School Enrollment Gender Parity Index (GPI) vs. HDI:
# scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
gender_5_values = secondary_indicators_gender[
    secondary_indicators_gender['indicator_name']
    == 'School enrollment, secondary (gross), gender parity index (GPI)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = gender_5_values['value'], gender_5_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Secondary School Enrollment (Gross), Gender Parity Index (GPI)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Secondary School Enrollment Gender Parity Index vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Secondary_GPI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)

In [None]:
# Tertiary School Enrollment Gender Parity Index (GPI) vs. HDI:
# scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
gender_6_values = secondary_indicators_gender[
    secondary_indicators_gender['indicator_name']
    == 'School enrollment, tertiary (gross), gender parity index (GPI)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = gender_6_values['value'], gender_6_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Tertiary School Enrollment (Gross), Gender Parity Index (GPI)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Tertiary School Enrollment Gender Parity Index vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Tertiary_GPI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)

In [None]:
# Seats Held by Women in National Parliaments vs. HDI:
# scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
gender_7_values = secondary_indicators_gender[
    secondary_indicators_gender['indicator_name']
    == 'Proportion of seats held by women in national parliaments (%)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = gender_7_values['value'], gender_7_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Seats Held by Women in National Parliaments (%)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Seats Held by Women in National Parliaments vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Women_ParliamentSeats.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)

In [None]:
# Mobile Subscriptions vs. HDI: scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
access_1_values = secondary_indicators_access[
    secondary_indicators_access['indicator_name']
    == 'Mobile cellular subscriptions (per 100 people)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = access_1_values['value'], access_1_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Mobile Cellular Subscriptions (per 100 people)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Mobile Subscriptions vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Mobile_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)

In [None]:
# Access to Electricity vs. HDI: scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
access_2_values = secondary_indicators_access[
    secondary_indicators_access['indicator_name']
    == 'Access to electricity (% of population)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = access_2_values['value'], access_2_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Access to Electricity (% of population)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Access to Electricity vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Electric_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)

In [None]:
# Percent of Population Using the Internet vs. HDI:
# scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
access_3_values = secondary_indicators_access[
    secondary_indicators_access['indicator_name']
    == 'Individuals using the Internet (% of population)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = access_3_values['value'], access_3_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Individuals using the Internet (% of population)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Percent of Population Using the Internet vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Internet_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)

In [None]:
# Basic Drinking Water Services vs. HDI: scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
access_4_values = secondary_indicators_access[
    secondary_indicators_access['indicator_name']
    == 'People using at least basic drinking water services (% of population)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = access_4_values['value'], access_4_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Population Using at Least Basic Drinking Water Services (%)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Basic Drinking Water Services vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Water_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)

In [None]:
# Clean Fuels and Technologies for Cooking vs. HDI:
# scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
access_5_values = secondary_indicators_access[
    secondary_indicators_access['indicator_name']
    == 'Access to clean fuels and technologies for cooking (% of population)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = access_5_values['value'], access_5_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Population with clean fuels and technologies for cooking (%)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Clean Fuels and Technologies for Cooking vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Cooking_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)

In [None]:
# Basic Sanitation Services vs. HDI: scatter plot with a least-squares trend line.
# NOTE(review): assumes 'value' and 'hdi' hold no NaN for this indicator;
# otherwise the regression outputs would presumably be NaN -- confirm upstream cleaning.
access_6_values = secondary_indicators_access[
    secondary_indicators_access['indicator_name']
    == 'People using at least basic sanitation services (% of population)'
]

# Regression inputs: indicator value on x, HDI on y
x, y = access_6_values['value'], access_6_values['hdi']

# Fit y = slope*x + intercept; the fifth return value (std. error) is discarded
slope, intercept, r_value, p_value, _ = stats.linregress(x, y)
fit = slope * np.array(x) + intercept

# Points are drawn above the grid (zorder), with the fitted line overlaid
fig, ax = plt.subplots(figsize=(7, 5))
ax.scatter(x, y, color="#233D4D", alpha=0.35, zorder=2)
ax.plot(x, fit, color="#D8D52B", label='fitted line', linewidth=0.8)
ax.grid(zorder=0)

# Axis labels and title
ax.set_xlabel('Population using at least basic sanitation services (%)', fontsize=14)
ax.set_ylabel('HDI', fontsize=14)
ax.set_title("Basic Sanitation Services vs. HDI", fontsize=18)

# Write a transparent PNG under static/images, then render the figure inline
fig.tight_layout()
img_path = os.path.join('static/images', 'Sanitation_HDI.png')
fig.savefig(img_path, transparent=True)
plt.show()

# Coefficient of determination (R squared) of the linear fit
print(r_value**2)