In [None]:
import numpy
import pandas
from matplotlib import pyplot
%matplotlib inline

#Import rcParams to set font styles
from matplotlib import rcParams

# Use a 16-point serif font for all figure text in this notebook
rcParams.update({
    'font.family': 'serif',
    'font.size': 16,
})

In [None]:
# Remote CSV file (gapminderData.csv) read straight from its URL into a dataframe;
# running this cell therefore needs network access.
url = 'https://python-graph-gallery.com/wp-content/uploads/gapminderData.csv'
life_expect = pandas.read_csv(url)

In [None]:
# Peek at the first five rows of the table
life_expect.head()

In [None]:
# (number of rows, number of columns) of the full table
life_expect.shape

We can get a useful summary of the dataframe with the [`DataFrame.info()`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.info.html) method: it tells us the number of rows and the number of columns (matching the output of the `shape` attribute) and then for each column, it tells us the number of rows that are populated (have non-null entries) and the type of the entries; finally it gives a breakdown of the types of data and an estimate of the memory used by the dataframe.

In [None]:
# Per-column non-null counts and dtypes, plus an estimate of memory usage
life_expect.info()

In [None]:
# How many rows carry each distinct value of the 'year' column
life_expect['year'].value_counts()

We have an even 142 occurrences of each year in the dataframe. Within each year, the distinct entries correspond to the individual countries. It is also clear that we have data every five years, starting in 1952 and ending in 2007.

In [None]:
# Group the rows by the 'year' column; later cells reuse this GroupBy object
by_year = life_expect.groupby('year')

In [None]:
# Inspect the class of the object returned by groupby()
type(by_year)

The [`GroupBy.first()`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.groupby.GroupBy.first.html) method returns the first value in each group—applied to `by_year`, it should show the list of years, with the first country in each year-group.

In [None]:
# First entry of every column within each year-group
by_year.first()

All the year-groups have the same first country, Afghanistan, so what we see is the population, life expectancy and per-capita income in Afghanistan for all the available years.
Let's save that into a new dataframe, and make line plots of the population and the life expectancy over the years.

In [None]:
# Keep the per-year first entries, indexed by year; per the notebook text,
# every one of them belongs to Afghanistan.
Afghanistan = by_year.first()

In [None]:
# Line plot of the 'pop' column against the year index
Afghanistan['pop'].plot.line(title='Population of Afghanistan',
                             figsize=(8,4));

In [None]:
# Line plot of the 'lifeExp' column against the year index
Afghanistan['lifeExp'].plot.line(title='Life expectancy of Afghanistan',
                                 figsize=(8,4));

In [None]:
# Summary statistics (count, mean, spread, quartiles) for the Afghanistan frame
Afghanistan.describe()

In [None]:
# Group the rows by the 'country' column; reused for the per-country plots below
by_country = life_expect.groupby('country')

In [None]:
# First entry of every column within each country-group
by_country.first()

The first year for all groups-by-country is 1952. Let's save those first entries (one per country) into a new dataframe, and keep playing with it.

In [None]:
# Keep the per-country first entries, indexed by country; per the notebook
# text these are the 1952 rows.
year1952 = by_country.first()

In [None]:
# Check what kind of object GroupBy.first() returned
type(year1952)

In [None]:
# Peek at the first five countries of the 1952 data
year1952.head()

In [None]:
# Smallest value in the 'pop' column of the 1952 data
year1952['pop'].min()

In [None]:
# Raw array of the 'pop' values, used below to size the scatter markers
populations = year1952['pop'].values

In [None]:
# Bubble chart for 1952: life expectancy against per-capita GDP, with the
# marker size driven by population.
bubble_axes = year1952.plot.scatter(x='gdpPercap', y='lifeExp',
                                    s=populations/60000,
                                    figsize=(12,8),
                                    title='Life expectancy in the year 1952',
                                    edgecolors="white")
# Switch the GDP axis to a logarithmic scale
bubble_axes.set_xscale('log');

Matplotlib [colormaps](https://matplotlib.org/examples/color/colormaps_reference.html) offer several options for _qualitative_ data, using discrete colors mapped to a sequence of numbers. We'd like to use the `Accent` colormap to code countries by continent. We need a numeric code to assign to each continent, so it can be mapped to a color.

In [None]:
# View the 'continent' column as categorical data to see its distinct categories
pandas.Categorical(year1952['continent'])

In [None]:
# Integer code of each row's continent category; passed as the color value
# to the scatter plot below.
colors = pandas.Categorical(year1952['continent']).codes

In [None]:
# Bubble chart for 1952: life expectancy against per-capita GDP, with the
# marker size driven by population and the color by continent code.
year1952.plot.scatter(figsize=(12,8),
                      x='gdpPercap', y='lifeExp', s=populations/60000,
                      c=colors, cmap='Accent',
                      title='Life expectancy in the year 1952',
                      # Must be the boolean True, not the string 'True':
                      # pandas accepts only a bool, None or 'sym' here.
                      logx=True,
                      ylim=(25,85),
                      edgecolors="white",
                      alpha=0.6);

In [None]:
# One translucent line per country, tracing life expectancy against year
fig, axis = pyplot.subplots(figsize=(12,8))

# Hide the top, right and left borders of the plot frame
for side in ("top", "right", "left"):
    axis.spines[side].set_visible(False)

axis.set_title('Life expectancy in the years 1952–2007, across 142 countries')

for key, group in by_country:
    axis.plot(group['year'], group['lifeExp'], alpha=0.4)

Something catastrophic happened to one country in 1977, and to another country in 1992.
Let's investigate.


In [None]:
# Object type obtained when pulling the 1977 group out of the full GroupBy
type(by_year.get_group(1977))

In [None]:
# Object type obtained when the GroupBy is first narrowed to the 'lifeExp' column
type(by_year['lifeExp'].get_group(1977))

We can find the minimum value of the life expectancy at the specific years of interest.

In [None]:
# Lowest life expectancy among the rows of the 1977 group; the bare name on
# the last line displays the value.
min_lifeExp1977 = by_year['lifeExp'].get_group(1977).min()
min_lifeExp1977

In [None]:
# Lowest life expectancy among the rows of the 1992 group; the bare name on
# the last line displays the value.
min_lifeExp1992 = by_year['lifeExp'].get_group(1992).min()
min_lifeExp1992

Those values of life expectancy are just terrible. We'd like to know, of course, what countries experienced the dramatic drops in life expectancy.

In [None]:
# Row label of the first row whose lifeExp equals the 1977 minimum.
# Note: the comparison runs over every year in the table, not only 1977.
life_expect[life_expect['lifeExp'] == min_lifeExp1977].index[0]

In [None]:
# Find the row label of the 1977 minimum from the data instead of hard-coding
# it, so the lookup stays correct if the dataset's row order ever changes.
row_min1977 = by_year['lifeExp'].get_group(1977).idxmin()
life_expect.loc[row_min1977, 'country']

In [None]:
# All rows whose country is Cambodia, across every year in the table
life_expect[life_expect['country'] == 'Cambodia']

There is a problem with the data.
Cambodia's life expectancy in 1977 was actually much lower than the value we see here.

In [None]:
# Row label of the first row whose lifeExp equals the 1992 minimum.
# Note: the comparison runs over every year in the table, not only 1992.
life_expect[life_expect['lifeExp'] == min_lifeExp1992].index[0]

In [None]:
# Find the row label of the 1992 minimum from the data instead of hard-coding
# it, so the lookup stays correct if the dataset's row order ever changes.
row_min1992 = by_year['lifeExp'].get_group(1992).idxmin()
life_expect.loc[row_min1992, 'country']

In [None]:
# Smallest population recorded in each year, as a Series indexed by year.
# The original loop overwrote `minpop` on every pass and displayed nothing;
# a groupby aggregation keeps every year's value and shows the result.
minpop = life_expect.groupby('year')['pop'].min()
minpop

In [None]:
# Execute this cell to load the notebook's style sheet, then ignore it
from IPython.display import HTML  # public import path for the HTML display class

css_file = '../../style/custom.css'
# Read inside a context manager so the file handle is closed once the
# style sheet has been loaded.
with open(css_file, "r") as css:
    styles = css.read()
HTML(styles)