In [3]:
# Data Source: https://www.kaggle.com/datasets/kaggle/world-development-indicators
# Folder: 'world-development-indicators' 

<br><p style="font-family: Arial; font-size:3.75em;color:green; font-weight:bold">
World Development Indicators</p><br><br>
# Exploring Data Visualization 

In [1]:
#import libraries

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

In [2]:
# Load the WDI indicators table into a pandas DataFrame.
# The other CSV files referenced for this dataset (Series.csv, Country.csv,
# CountryNotes.csv, SeriesNotes.csv, Footnotes.csv, same data_wdi folder) are
# not loaded here; no cell in this part of the notebook uses them.
indicators = pd.read_csv(
    filepath_or_buffer='../DS_World_Development_Indicators/data_wdi/Indicators.csv'
)


In [31]:
# Preview the first five rows to see the available columns and typical values
indicators.head(n=5)

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,133.5609
1,Arab World,ARB,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,87.7976
2,Arab World,ARB,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,6.634579
3,Arab World,ARB,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,81.02333
4,Arab World,ARB,Arms exports (SIPRI trend indicator values),MS.MIL.XPRT.KD,1960,3000000.0


In [14]:
# Print a concise summary of the DataFrame: index range, column names and dtypes, and memory usage.
# NOTE(review): the output below lists no per-column non-null counts; pandas appears to omit them
# by default for a frame this large. Pass show_counts=True if they are needed (confirm for the pandas version in use).
indicators.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5656458 entries, 0 to 5656457
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   CountryName    object 
 1   CountryCode    object 
 2   IndicatorName  object 
 3   IndicatorCode  object 
 4   Year           int64  
 5   Value          float64
dtypes: float64(1), int64(1), object(4)
memory usage: 258.9+ MB


### How many UNIQUE country names are there ?

In [4]:
# Keep each country name once, in order of first appearance, then count them
countries = indicators['CountryName'].drop_duplicates().tolist()
len(countries)

247

### Is there the same number of country codes?

In [5]:
# Distinct country codes; their count is expected to equal the number of country names
countryCodes = indicators['CountryCode'].drop_duplicates().tolist()
len(countryCodes)

247

### Are there many indicators or few ?

In [6]:
# Distinct indicator names and how many there are.
# Unlike the country codes, this count has no reason to match the number of countries.
var_indicators = indicators['IndicatorName'].drop_duplicates().tolist()
len(var_indicators)

1344

In [20]:
# Show the first five indicator names in sorted order.
# The list is sorted in place, and the ordering is case-sensitive: upper-case
# letters come before lower-case ones, which is why 'ARI treatment ...' is
# listed ahead of 'Access to electricity ...'.
var_indicators.sort()
var_indicators[0:5]

['2005 PPP conversion factor, GDP (LCU per international $)',
 '2005 PPP conversion factor, private consumption (LCU per international $)',
 'ARI treatment (% of children under 5 taken to a health provider)',
 'Access to electricity (% of population)',
 'Access to electricity, rural (% of rural population)']

### How many years of data do we have ?

In [8]:
# Distinct years present in the data (as plain Python ints) and how many there are
years = indicators['Year'].drop_duplicates().tolist()
len(years)

56

### What's the range of years?

In [15]:
# Report the first and last year covered by the data.
# An f-string keeps exactly one space on each side of "to"; passing the padded
# " to " literal as a separate print() argument produced double spaces because
# print() also inserts its default separator between arguments.
print(f"{min(years)} to {max(years)}")

1960  to  2015


### Creating masks

We only want the data for <b>Brazil</b>.

Creating new DF with the following Indicators:<br>
`Expenditure on education as % of total government expenditure (%)`<br>
`Life expectancy at birth, total (years)`<br>
`Gross domestic income (constant LCU)`

In [35]:
# Build boolean masks that pick three indicators for a single country.
#
# The indicator patterns are passed to str.contains, which treats them as
# regular expressions by default, so the literal "(" must be escaped. The
# escaped pattern is written as a raw string: in a normal string "\(" is an
# invalid escape sequence that newer Python versions warn about. The runtime
# value of the pattern is unchanged.
education_expenditure = 'Expenditure on education as % of total'
life_expectancy = 'Life expectancy at birth, total'
gross_domestic_income_LCU = r'Gross domestic income \(constant LCU'
selected_country = 'BRA' #Brazil

mask1 = indicators['IndicatorName'].str.contains(education_expenditure)
mask2 = indicators['IndicatorName'].str.contains(life_expectancy)
mask3 = indicators['IndicatorName'].str.contains(gross_domestic_income_LCU)
# Country codes are compared for exact equality: a substring/regex match is not
# what is meant here and could also select any other code containing the text.
mask4 = indicators['CountryCode'] == selected_country

# One DataFrame per indicator, restricted to the selected country
df_expenditure = indicators[mask1 & mask4]
df_life_expectancy = indicators[mask2 & mask4]
df_gross_domestic_income_LCU = indicators[mask3 & mask4]

<b>Now</b> we have 3 new dataframes:
`df_expenditure`, `df_life_expectancy`, and `df_gross_domestic_income_LCU` 


In [37]:
# Peek at the first two rows of the education-expenditure frame
df_expenditure.head(n=2)

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
2922667,Brazil,BRA,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1998,11.62986
3060928,Brazil,BRA,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1999,9.60027


In [38]:
# Peek at the first two rows of the life-expectancy frame
df_life_expectancy.head(n=2)

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
5736,Brazil,BRA,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1960,54.205463
29829,Brazil,BRA,"Life expectancy at birth, total (years)",SP.DYN.LE00.IN,1961,54.718707


In [39]:
# Peek at the first two rows of the gross-domestic-income frame
df_gross_domestic_income_LCU.head(n=2)

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
5701,Brazil,BRA,Gross domestic income (constant LCU),NY.GDY.TOTL.KN,1960,200013300000.0
29790,Brazil,BRA,Gross domestic income (constant LCU),NY.GDY.TOTL.KN,1961,218987900000.0


In [40]:
# Checking for null values.
#
# Add up the per-column null counts of the three frames and compare the total
# with zero: a column shows True only when none of the frames has a null in it.
# The previous expression `0 == a & b & c` combined the counts with a bitwise
# AND (`&` binds tighter than `==`), so differing non-zero counts such as 1 and
# 2 (1 & 2 == 0) were reported as "no nulls".
not_null = 0  # the null count a clean column is expected to have
total_nulls = (
    df_expenditure.isnull().sum()
    + df_life_expectancy.isnull().sum()
    + df_gross_domestic_income_LCU.isnull().sum()
)
total_nulls == not_null

CountryName      True
CountryCode      True
IndicatorName    True
IndicatorCode    True
Year             True
Value            True
dtype: bool

### Expenditure on Education (TODO: add details here)

In [21]:
# Animated bubble chart with one frame per Year: gdi_value on a log-scaled
# x axis against ExpenditureEducationValue, bubbles sized by gdi_value and
# coloured by Continent_Name, with CountryName shown on hover.
# NOTE(review): df_final is not created in any cell visible above this one;
# confirm it is built (with these columns) before this cell on a fresh run.
# NOTE(review): animation_group is Continent_Name while hover_name is
# CountryName; check whether the group should identify individual countries.
fig = px.scatter(
    df_final,
    x="gdi_value",
    y="ExpenditureEducationValue",
    animation_frame="Year",
    animation_group="Continent_Name",
    size="gdi_value",
    color="Continent_Name",
    hover_name="CountryName",
    log_x=True,
    size_max=45,
    range_x=[200,150000],
    range_y=[10,100],
)
# Set the frame duration used by the first button of the first update menu
# (presumably the Play button; plotly documents the value in milliseconds).
first_button = fig.layout.updatemenus[0].buttons[0]
first_button.args[1]["frame"]["duration"] = 900
fig.show()