In [3]:
# Data Source: https://www.kaggle.com/datasets/kaggle/world-development-indicators
# Folder: 'world-development-indicators' 

<br><p style="font-family: Arial; font-size:3.75em;color:purple; font-weight:bold">
World Development Indicators</p><br><br>
# Exploring Data Visualization 

In [1]:
#import libraries

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

In [2]:
# Load each World Development Indicators csv file into its own pandas DataFrame.
# DATA_DIR is the single place to edit if the dataset folder is moved or renamed.
DATA_DIR = '../DS_World_Development_Indicators/data_wdi'

indicators = pd.read_csv(f'{DATA_DIR}/Indicators.csv')
country = pd.read_csv(f'{DATA_DIR}/Country.csv')
country_notes = pd.read_csv(f'{DATA_DIR}/CountryNotes.csv')
series = pd.read_csv(f'{DATA_DIR}/Series.csv')
series_notes = pd.read_csv(f'{DATA_DIR}/SeriesNotes.csv')
footnotes = pd.read_csv(f'{DATA_DIR}/Footnotes.csv')


In [3]:
# Preview the first rows of the indicators table to see its columns
indicators.head()

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
0,Arab World,ARB,"Adolescent fertility rate (births per 1,000 wo...",SP.ADO.TFRT,1960,133.5609
1,Arab World,ARB,Age dependency ratio (% of working-age populat...,SP.POP.DPND,1960,87.7976
2,Arab World,ARB,"Age dependency ratio, old (% of working-age po...",SP.POP.DPND.OL,1960,6.634579
3,Arab World,ARB,"Age dependency ratio, young (% of working-age ...",SP.POP.DPND.YG,1960,81.02333
4,Arab World,ARB,Arms exports (SIPRI trend indicator values),MS.MIL.XPRT.KD,1960,3000000.0


### How many UNIQUE country names are there ?

In [3]:
# Distinct country names, in order of first appearance
countries = pd.unique(indicators['CountryName']).tolist()
len(countries)

247

### Is there the same number of country codes?

In [4]:
# Count the distinct country codes; the total is expected to equal the number of country names
countryCodes = pd.unique(indicators['CountryCode']).tolist()
len(countryCodes)

247

### Are there many indicators or few ?

In [5]:
# Count the distinct indicator names
var_indicators = pd.unique(indicators['IndicatorName']).tolist()
len(var_indicators)

1344

In [6]:
# Put the indicator names in sorted order (the same list object is updated) and show the first five
var_indicators[:] = sorted(var_indicators)
var_indicators[:5]

['2005 PPP conversion factor, GDP (LCU per international $)',
 '2005 PPP conversion factor, private consumption (LCU per international $)',
 'ARI treatment (% of children under 5 taken to a health provider)',
 'Access to electricity (% of population)',
 'Access to electricity, rural (% of rural population)']

### How many years of data do we have ?

In [7]:
# Distinct years covered by the indicators table
years = pd.unique(indicators['Year']).tolist()
len(years)

56

### What's the range of years?

In [8]:
# First and last year in the data, joined by the same separator text as before
print(min(years), max(years), sep="  to  ")

1960  to  2015


Creating new DF with the following Indicators:<br>
`Expenditure on education as % of total government expenditure (%)`<br>
`Life expectancy at birth, total (years)`<br>
`Gross domestic income (constant LCU)`

In [27]:
# Select the rows of three indicators by matching a fragment of the indicator name.
# str.contains treats each fragment as a regular expression, so the literal "(" must be
# escaped; the raw string keeps the backslash without relying on an invalid "\(" escape
# sequence in a normal string literal (which Python warns about).
education_expenditure = 'Expenditure on education as % of total'
life_expectancy = 'Life expectancy at birth, total'
gross_domestic_income_LCU = r'Gross domestic income \(constant LCU'

# Boolean masks: True where the indicator name contains the fragment
mask1 = indicators['IndicatorName'].str.contains(education_expenditure)
mask2 = indicators['IndicatorName'].str.contains(life_expectancy)
mask3 = indicators['IndicatorName'].str.contains(gross_domestic_income_LCU)

# One DataFrame per selected indicator
df_expenditure = indicators[mask1]
df_life_expectancy = indicators[mask2]
df_gross_domestic_income_LCU = indicators[mask3]

In [28]:
# Checking for null values
#
# Result: one boolean per column, True when none of the three DataFrames has a missing
# value in that column.
# The per-column null counts are added together before the comparison. Counts are never
# negative, so the total is 0 only when every count is 0. A bitwise AND of the counts is
# not a valid check (1 & 2 == 0), and because `&` binds tighter than `==` the comparison
# has to be applied to the already combined counts.
not_null = 0  # expected number of missing values per column
null_counts = (
    df_expenditure.isnull().sum()
    + df_life_expectancy.isnull().sum()
    + df_gross_domestic_income_LCU.isnull().sum()
)
null_counts == not_null

CountryName      True
CountryCode      True
IndicatorName    True
IndicatorCode    True
Year             True
Value            True
dtype: bool

### Expenditure on Education (TODO: add details here)

In [29]:
# Show the first two rows selected for the education-expenditure indicator
df_expenditure.head(2)

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value
2506276,Australia,AUS,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1995,14.11191
2772785,Albania,ALB,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1997,11.15339


In [30]:
# Download the country / continent code list (JohnSnowLabs csv served from datahub.io)
continent_codes_url = 'https://pkgstore.datahub.io/JohnSnowLabs/country-and-continent-codes-list/country-and-continent-codes-list-csv_csv/data/b7876b7f496677669644f3d1069d3121/country-and-continent-codes-list-csv_csv.csv'
CountryCode = pd.read_csv(continent_codes_url)  # comma is already the default separator
CountryCode.head(2)

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0


I only need two columns - `Continent_Name` and `Three_Letter_Country_Code`

In [32]:
# Keep only the continent name and the three-letter code used as the join key
CountryCode = CountryCode.loc[:, ['Continent_Name', 'Three_Letter_Country_Code']]

In [33]:
# Attach the continent of each country to the education-expenditure rows.

# Lookup restricted to the join key and the continent name, one row per country code.
# NOTE(review): the downloaded list presumably repeats some codes (countries listed under
# two continents) - confirm. Keeping the first row per code guarantees that the left join
# cannot multiply expenditure rows.
continent_lookup = (
    CountryCode[['Three_Letter_Country_Code', 'Continent_Name']]
    .drop_duplicates(subset='Three_Letter_Country_Code')
)

# Left join keeps every expenditure row; rows whose code is not in the lookup get a
# missing Continent_Name. validate raises if the lookup key is not unique.
df_expenditure_final = pd.merge(
    df_expenditure,
    continent_lookup,
    left_on='CountryCode',
    right_on='Three_Letter_Country_Code',
    how='left',
    validate='many_to_one',
)

# Drop the duplicated join key by name, so the result does not depend on column position
df_expenditure_final = df_expenditure_final.drop(columns='Three_Letter_Country_Code')

# Now, we get the final dataset which includes the continent information.
df_expenditure_final.head(2)

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value,Continent_Name
0,Australia,AUS,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1995,14.11191,Oceania
1,Albania,ALB,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1997,11.15339,Europe


In [37]:
# Discard every row that contains a missing value, then order the rows by Year
df_expenditure_final = (
    df_expenditure_final
    .dropna()
    .sort_values('Year')
)
df_expenditure_final.head()

Unnamed: 0,CountryName,CountryCode,IndicatorName,IndicatorCode,Year,Value,Continent_Name
0,Australia,AUS,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1995,14.11191,Oceania
19,United Arab Emirates,ARE,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1997,5.02373,Asia
18,Ukraine,UKR,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1997,12.92467,Europe
16,Slovak Republic,SVK,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1997,8.55968,Europe
15,Philippines,PHL,Expenditure on education as % of total governm...,SE.XPD.TOTL.GB.ZS,1997,15.3684,Asia


In [None]:
# Animated scatter of education expenditure: one frame per year, one marker per country,
# grouped on the y axis and coloured by continent.
# Only columns that exist in df_expenditure_final are referenced (CountryName, Value,
# Year, Continent_Name), and all string literals use plain ASCII quotes.

# Fixed x range taken from the data so the axis does not rescale between frames
value_max = df_expenditure_final["Value"].max()

fig = px.scatter(
    df_expenditure_final,
    x="Value",
    y="Continent_Name",
    animation_frame="Year",
    animation_group="CountryName",
    color="Continent_Name",
    hover_name="CountryName",
    range_x=[0, value_max * 1.05],
    labels={
        "Value": "Expenditure on education (% of total government expenditure)",
        "Continent_Name": "Continent",
    },
    title="Expenditure on education by country and year",
)

# Slow the animation down to 700 ms per frame; the play button only exists when the
# figure has animation controls
if fig.layout.updatemenus:
    fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 700
fig.show()