In [None]:
# Data Source: https://www.kaggle.com/datasets/kaggle/world-development-indicators
# Folder: 'world-development-indicators' 

<br><p style="font-family: Arial; font-size:3.75em;color:purple; font-weight:bold">
World Development Indicators</p><br><br>
# Exploring Data Visualization 

In [None]:
#import libraries

import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px

In [None]:
# Load each World Development Indicators CSV into its own pandas DataFrame.
# DATA_DIR is the single place to edit when the dataset lives elsewhere;
# the same folder prefix used to be repeated in all six read_csv calls.
DATA_DIR = '../DS_World_Development_Indicators/data_wdi'

indicators = pd.read_csv(DATA_DIR + '/Indicators.csv')
country = pd.read_csv(DATA_DIR + '/Country.csv')
country_notes = pd.read_csv(DATA_DIR + '/CountryNotes.csv')
series = pd.read_csv(DATA_DIR + '/Series.csv')
series_notes = pd.read_csv(DATA_DIR + '/SeriesNotes.csv')
footnotes = pd.read_csv(DATA_DIR + '/Footnotes.csv')


In [None]:
# Preview the first five rows of the indicators table
indicators.head(n=5)

### How many UNIQUE country names are there ?

In [None]:
# Distinct country names, in order of first appearance; the cell shows how many there are
countries = pd.unique(indicators['CountryName']).tolist()
len(countries)

### Are there the same number of country codes?

In [None]:
# Distinct country codes; the count is expected to equal the number of country names
countryCodes = indicators['CountryCode'].drop_duplicates().tolist()
len(countryCodes)

### Are there many indicators or few ?

In [None]:
# Distinct indicator names; the cell shows how many different indicators the data holds
var_indicators = pd.unique(indicators['IndicatorName']).tolist()
len(var_indicators)

In [None]:
# Put the indicator names in ascending order (same list object, reordered in place)
# and show the first five
var_indicators[:] = sorted(var_indicators)
var_indicators[0:5]

### How many years of data do we have ?

In [None]:
# Distinct years present in the data; the cell shows how many there are
years = indicators['Year'].drop_duplicates().tolist()
len(years)

### What's the range of years?

In [None]:
# Earliest and latest year covered by the data
first_year, last_year = min(years), max(years)
print(first_year, " to ", last_year)

Creating new DF with the following Indicators:<br>
`Expenditure on education as % of total government expenditure (%)`<br>
`Life expectancy at birth, total (years)`<br>
`Gross domestic income (constant LCU)`

In [None]:
# Select three indicators by matching a distinctive prefix of IndicatorName.
# str.contains interprets each pattern as a regular expression, so the literal
# "(" in the GDI name has to be escaped. The raw string keeps that backslash
# explicitly instead of relying on Python passing the unknown "\(" escape
# through, which is deprecated and triggers a warning.
education_expenditure = 'Expenditure on education as % of total'
life_expectancy = 'Life expectancy at birth, total'
gross_domestic_income_LCU = r'Gross domestic income \(constant LCU'

# na=False makes a missing IndicatorName count as "no match", so every mask
# is strictly boolean and safe to index with.
mask1 = indicators['IndicatorName'].str.contains(education_expenditure, na=False)
mask2 = indicators['IndicatorName'].str.contains(life_expectancy, na=False)
mask3 = indicators['IndicatorName'].str.contains(gross_domestic_income_LCU, na=False)

# One DataFrame per indicator. .copy() gives each subset its own data, so the
# in-place edits made in later cells never write into a slice of `indicators`
# (which would raise SettingWithCopyWarning).
df_expenditure = indicators[mask1].copy()
df_life_expectancy = indicators[mask2].copy()
df_gross_domestic_income_LCU = indicators[mask3].copy()

In [None]:
# Checking for null values.
# Sum the per-column null counts of the three indicator frames and compare the
# total with zero: True means that column has no missing value in any frame.
# The counts must be added, not combined with `&`: `&` is a bitwise AND that
# also binds tighter than `==`, so counts such as 1 and 2 would AND to 0 and
# wrongly report "no nulls".
not_null = 0
total_nulls = (
    df_expenditure.isnull().sum()
    + df_life_expectancy.isnull().sum()
    + df_gross_domestic_income_LCU.isnull().sum()
)
total_nulls == not_null

### Expenditure on Education (TODO: add details here)

In [None]:
# Peek at the first two education-expenditure rows
df_expenditure.iloc[:2]

In [None]:
# Download the country / continent code list (CSV hosted on datahub.io)
# and preview its first two rows.
continent_codes_url = 'https://pkgstore.datahub.io/JohnSnowLabs/country-and-continent-codes-list/country-and-continent-codes-list-csv_csv/data/b7876b7f496677669644f3d1069d3121/country-and-continent-codes-list-csv_csv.csv'
CountryCode = pd.read_csv(continent_codes_url, sep=',')
CountryCode.head(n=2)

I only need two columns - `Continent_Name` and `Three_Letter_Country_Code`

In [None]:
# Keep only the continent name and the three-letter code used as the join key
lookup_columns = ['Continent_Name', 'Three_Letter_Country_Code']
CountryCode = CountryCode[lookup_columns]

In [None]:
# Attach the continent name to every expenditure row. The left join matches
# CountryCode against the lookup's three-letter code and keeps expenditure rows
# that have no match (their Continent_Name becomes NaN).
df_expenditure_final = pd.merge(df_expenditure, CountryCode, left_on='CountryCode', right_on='Three_Letter_Country_Code', how='left')

# Remove the right-hand join key by name. Dropping "the last column" by position
# silently removes the wrong column if the lookup's column order ever changes.
df_expenditure_final = df_expenditure_final.drop(columns='Three_Letter_Country_Code')
# NOTE(review): if the lookup lists the same three-letter code more than once,
# this join duplicates the matching expenditure rows — confirm against the data.

# Now, we get the final dataset which includes the continent information.
df_expenditure_final.head(2)

In [None]:
# Discard every row that has a missing value, then give the generic indicator
# columns expenditure-specific names. The rows are not re-sorted here.
df_expenditure_final = (
    df_expenditure_final
    .dropna(how='any')
    .rename(columns={'IndicatorName':'ExpenditureEducation','Value':'ExpenditureEducationValue'})
)
df_expenditure_final.head(2)

Now we want to <b>merge</b> the `df_gross_domestic_income_LCU` columns IndicatorName and Value with the `df_expenditure_final` dataframe

In [None]:
# Work on an independent copy, so the label edit below is a plain assignment
# on this frame's own data rather than a write into a slice of another frame
# (which raises SettingWithCopyWarning and may not stick).
df_gross_domestic_income_LCU = df_gross_domestic_income_LCU.copy()

# Shorten the indicator label to 'GDI'
is_gdi = df_gross_domestic_income_LCU['IndicatorName'] == 'Gross domestic income (constant LCU)'
df_gross_domestic_income_LCU.loc[is_gdi, 'IndicatorName'] = 'GDI'

# Keep only the columns needed for the merge and the years after 1994, then
# rename the generic columns. rename returns a new frame, so nothing is
# modified in place on a slice.
# NOTE(review): 1995 is taken from the original comment as the first year of
# df_expenditure_final — confirm against the data.
gdi_final = (
    df_gross_domestic_income_LCU
    .loc[df_gross_domestic_income_LCU['Year'] > 1994,
         ['IndicatorName', 'Value', 'Year', 'CountryCode']]
    .rename(columns={'IndicatorName':'GrossDomesticIncome','Value':'gdi_value'})
)
gdi_final.head(2)

In [None]:
# Pair each expenditure row with the GDI row of the same country and year
# (inner join: rows without a partner are dropped), then order by year.
join_keys = ['CountryCode', 'Year']
df_final = (
    df_expenditure_final
    .merge(gdi_final, how='inner', left_on=join_keys, right_on=join_keys)
    .sort_values(by=['Year'])
)
df_final.head(2)



In [None]:
# DataFrame.round returns a new frame and leaves the original untouched, so
# the result has to be assigned; calling it as a bare statement did nothing.
# df1 is therefore a rounded copy (10 decimals) and df_final is unchanged.
df1 = df_final.round(10)

df1.head()


In [None]:
# Animated bubble chart: one bubble per country, one frame per year.
# x = gross domestic income (log scale), y = education expenditure share,
# bubble size = gross domestic income, colour = continent.
# animation_group must identify the object that persists from frame to frame.
# Each bubble is a country, so it is CountryName; grouping by continent made
# plotly treat all countries of a continent as the same object.
# NOTE(review): range_x / range_y are kept from the original; points outside
# them are not visible — verify they fit the gdi_value (constant LCU) and
# expenditure values actually present in df_final.
fig = px.scatter(
    df_final,
    x="gdi_value",
    y="ExpenditureEducationValue",
    animation_frame="Year",
    animation_group="CountryName",
    size="gdi_value",
    color="Continent_Name",
    hover_name="CountryName",
    log_x=True,
    size_max=45,
    range_x=[200,150000],
    range_y=[10,100],
    title="Education expenditure vs. gross domestic income",
    labels={
        "gdi_value": "Gross domestic income (constant LCU)",
        "ExpenditureEducationValue": "Expenditure on education (% of total government expenditure)",
        "Continent_Name": "Continent",
    },
)
# Slow the animation: duration of each frame when the play button is pressed.
fig.layout.updatemenus[0].buttons[0].args[1]["frame"]["duration"] = 900
fig.show()