# Creating an Interactive Animated Scatterplot 
# of Life Expectancy vs Income of 200 Countries over 240 Years

### The work in the notebook consists of two parts:
### 1. Loading and merging the data
### 2. Creating the animated interactive scatterplot

In [1]:
#imports
import os
import pandas as pd
import plotly_express as px
import plotly.io as pio

# Plotly I/O configuration: make "notebook_connected" the default renderer,
# i.e. the one used whenever a figure is shown without naming a renderer.
# NOTE(review): per the Plotly renderer docs this variant loads plotly.js from a
# CDN rather than embedding it in the notebook — confirm that is acceptable if
# the notebook has to be viewed offline.
pio.renderers.default = "notebook_connected"


### The CSV files contain World Bank data that is freely accessible via Gapminder.com. They were downloaded before this notebook was created and are read from the local `data/` directory.

In [2]:
# Read the four source tables from the local data/ directory.
# Only continents.csv is semicolon-delimited; the other files use read_csv's
# default comma separator, so no explicit `sep` is needed for them.
life = pd.read_csv('data/life_expectancy.csv')
income = pd.read_csv('data/income.csv')
population = pd.read_csv('data/population.csv')
continents = pd.read_csv('data/continents.csv', sep=';')

# Preview the life-expectancy table in its original wide layout (one column per year).
life

Unnamed: 0,country,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2091,2092,2093,2094,2095,2096,2097,2098,2099,2100
0,Afghanistan,28.2,28.2,28.2,28.2,28.2,28.2,28.1,28.1,28.1,...,76.5,76.6,76.7,76.9,77.0,77.1,77.3,77.4,77.5,77.7
1,Albania,35.4,35.4,35.4,35.4,35.4,35.4,35.4,35.4,35.4,...,87.4,87.5,87.6,87.7,87.8,87.9,88.0,88.1,88.2,88.3
2,Algeria,28.8,28.8,28.8,28.8,28.8,28.8,28.8,28.8,28.8,...,88.3,88.4,88.5,88.6,88.7,88.8,88.9,89.0,89.1,89.2
3,Andorra,,,,,,,,,,...,,,,,,,,,,
4,Angola,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,...,78.7,78.9,79.0,79.1,79.3,79.4,79.5,79.7,79.8,79.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,Venezuela,32.2,32.2,32.2,32.2,32.2,32.2,32.2,32.2,32.2,...,86.2,86.3,86.5,86.6,86.7,86.9,87.0,87.1,87.2,87.3
183,Vietnam,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,...,84.3,84.4,84.5,84.6,84.7,84.8,84.9,85.0,85.2,85.3
184,Yemen,23.4,23.4,23.4,23.4,23.4,23.4,23.4,23.4,23.4,...,77.3,77.4,77.5,77.7,77.8,77.9,78.0,78.2,78.3,78.4
185,Zambia,32.6,32.6,32.6,32.6,32.6,32.6,32.6,32.6,32.6,...,76.8,77.0,77.1,77.3,77.4,77.6,77.7,77.8,78.0,78.1


### The tables are first converted from wide format into long format, which provides one column for every variable and one row for each observation.

In [3]:
# Reshape wide -> long: 'country' stays as the identifier and every year column
# is stacked into (year, life_expectancy) pairs, with the new columns named
# directly by melt. The cell rebinds `life`, so run it once per kernel session.
life = life.melt(id_vars='country', var_name='year', value_name='life_expectancy')
life

Unnamed: 0,country,year,life_expectancy
0,Afghanistan,1800,28.2
1,Albania,1800,35.4
2,Algeria,1800,28.8
3,Andorra,1800,
4,Angola,1800,27.0
...,...,...,...
56282,Venezuela,2100,87.3
56283,Vietnam,2100,85.3
56284,Yemen,2100,78.4
56285,Zambia,2100,78.1


In [4]:
# Reshape wide -> long: 'country' stays as the identifier and every year column
# is stacked into (year, income) pairs, with the new columns named directly by
# melt. The cell rebinds `income`, so run it once per kernel session.
income = income.melt(id_vars='country', var_name='year', value_name='income')
income

Unnamed: 0,country,year,income
0,Afghanistan,1800,603
1,Albania,1800,667
2,Algeria,1800,715
3,Andorra,1800,1200
4,Angola,1800,618
...,...,...,...
46508,Venezuela,2040,9880
46509,Vietnam,2040,14400
46510,Yemen,2040,3870
46511,Zambia,2040,4180



### Quick sanity check of the data using a familiar country to confirm that the values are plausible.

In [5]:
# Spot-check a single well-known country while `population` is still in its
# wide layout (one column per year).
population.query("country == 'Germany'")

Unnamed: 0,country,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2091,2092,2093,2094,2095,2096,2097,2098,2099,2100
64,Germany,18000000,18300000,18600000,18900000,19200000,19500000,19800000,20200000,20500000,...,74700000,74700000,74700000,74700000,74700000,74700000,74700000,74700000,74700000,74700000


In [6]:
# Reshape wide -> long: 'country' stays as the identifier and every year column
# is stacked into (year, population) pairs, with the new columns named directly
# by melt. The cell rebinds `population`, so run it once per kernel session.
population = population.melt(id_vars='country', var_name='year', value_name='population')
population

Unnamed: 0,country,year,population
0,Afghanistan,1800,3280000
1,Albania,1800,400000
2,Algeria,1800,2500000
3,Andorra,1800,2650
4,Angola,1800,1570000
...,...,...,...
58690,Venezuela,2100,34200000
58691,Vietnam,2100,97400000
58692,Yemen,2100,53200000
58693,Zambia,2100,81500000


In [7]:
# Display the continent lookup table (columns: continent, country) that was
# read from data/continents.csv.
continents

Unnamed: 0,continent,country
0,Africa,Algeria
1,Africa,Angola
2,Africa,Benin
3,Africa,Botswana
4,Africa,Burkina
...,...,...
189,South America,Paraguay
190,South America,Peru
191,South America,Suriname
192,South America,Uruguay


### Mapping the continents to the countries & merging life expectancy, income and population into one data frame

In [8]:
# Attach a continent to every life-expectancy row.
#
# The join key is spelled out instead of relying on pandas' implicit
# "all shared column names" default. The join remains an inner join, so rows
# whose country name has no exact match in `continents` are dropped; those
# countries are reported first so the loss is visible rather than silent.
has_continent = life['country'].isin(continents['country'])
unmatched_countries = life.loc[~has_continent, 'country'].unique().tolist()
if unmatched_countries:
    print(
        f"Dropped {len(unmatched_countries)} countries without a continent "
        f"mapping: {unmatched_countries}"
    )

df_all = life.merge(continents, on='country', how='inner')
df_all

Unnamed: 0,country,year,life_expectancy,continent
0,Afghanistan,1800,28.2,Asia
1,Afghanistan,1801,28.2,Asia
2,Afghanistan,1802,28.2,Asia
3,Afghanistan,1803,28.2,Asia
4,Afghanistan,1804,28.2,Asia
...,...,...,...,...
50864,Zimbabwe,2096,75.1,Africa
50865,Zimbabwe,2097,75.3,Africa
50866,Zimbabwe,2098,75.4,Africa
50867,Zimbabwe,2099,75.5,Africa


In [9]:
# Add the income column. No keys are passed, so pandas joins on the column
# names the two frames have in common (country and year); the inner join keeps
# only country/year pairs present in both frames.
df_all = pd.merge(df_all, income, how='inner')
df_all

Unnamed: 0,country,year,life_expectancy,continent,income
0,Afghanistan,1800,28.2,Asia,603
1,Afghanistan,1801,28.2,Asia,603
2,Afghanistan,1802,28.2,Asia,603
3,Afghanistan,1803,28.2,Asia,603
4,Afghanistan,1804,28.2,Asia,603
...,...,...,...,...,...
40724,Zimbabwe,2036,66.0,Africa,2900
40725,Zimbabwe,2037,66.2,Africa,2960
40726,Zimbabwe,2038,66.4,Africa,3020
40727,Zimbabwe,2039,66.6,Africa,3080


In [10]:
# Add the population column. No keys are passed, so pandas joins on the column
# names the two frames have in common (country and year); the inner join keeps
# only country/year pairs present in both frames.
df_all = pd.merge(df_all, population, how='inner')

# Show the last 50 rows of the fully merged frame as a spot check.
df_all.tail(n=50)

Unnamed: 0,country,year,life_expectancy,continent,income,population
40679,Zimbabwe,1991,59.5,Africa,2910,10700000
40680,Zimbabwe,1992,57.6,Africa,2590,10900000
40681,Zimbabwe,1993,55.6,Africa,2570,11100000
40682,Zimbabwe,1994,53.8,Africa,2770,11300000
40683,Zimbabwe,1995,52.0,Africa,2740,11400000
40684,Zimbabwe,1996,50.8,Africa,2990,11500000
40685,Zimbabwe,1997,49.6,Africa,3040,11700000
40686,Zimbabwe,1998,48.5,Africa,3100,11700000
40687,Zimbabwe,1999,47.5,Africa,3050,11800000
40688,Zimbabwe,2000,46.6,Africa,2950,11900000


### Last check for null values and data types:

In [11]:
# Print the column dtypes and non-null counts of the merged frame.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40729 entries, 0 to 40728
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          40729 non-null  object 
 1   year             40729 non-null  object 
 2   life_expectancy  40150 non-null  float64
 3   continent        40729 non-null  object 
 4   income           40729 non-null  int64  
 5   population       40729 non-null  int64  
dtypes: float64(1), int64(2), object(3)
memory usage: 2.2+ MB


### Income has the fewest observations; it is only projected until 2040. Because the merges above keep only country/year pairs that are present in every table, our timeline automatically stops at the last year of this variable, namely 2040.

# Life Expectancy and Income of 200 Countries over 240 years 
### The Animated Interactive Scatterplot 

In [12]:
# Animated bubble chart: income on a logarithmic x-axis, life expectancy on the
# y-axis, bubble size taken from population and colour from continent.
fig = px.scatter(
    df_all,
    x="income",
    y="life_expectancy",
    animation_frame="year",     # one animation frame per distinct year value
    animation_group="country",  # rows with the same country are the same bubble across frames
    size="population",
    size_max=50,
    color="continent",
    hover_name="country",
    log_x=True,
    range_x=[100, 100000],      # fixed axis ranges so the axes do not rescale between frames
    range_y=[25, 90],
    # Title and readable labels so the figure stands on its own instead of
    # showing raw column names on the axes, legend, slider and hover box.
    title="Life Expectancy vs Income by Country",
    labels={
        "income": "Income (log scale)",
        "life_expectancy": "Life expectancy (years)",
        "population": "Population",
        "continent": "Continent",
        "country": "Country",
        "year": "Year",
    },
)

fig.show()