# Processing of tabular data 

__Import Statements__

In [None]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

## Research question

> ### How does the population change over time in Germany, France and Italy?

## Read the data

The data is obtained from the [Global Health Data Exchange](http://ghdx.healthdata.org/), the world’s most comprehensive catalog of surveys, censuses, vital statistics, and other health-related data.

__Read__ `.csv` __file__

In [None]:
# Load the population estimates CSV (path is relative to the notebook) into a DataFrame.
data_path = "../data/IHME_GBD_2016_POPULATION_ESTIMATES_1990_2016_Y2017M09D14.CSV"
pop = pd.read_csv(data_path)

In [None]:
# Show the loaded DataFrame as the cell output.
pop

## Inspect the data

__Columns__

In [None]:
# List the column labels of the DataFrame.
pop.columns


## Exploratory Data Analysis

In [None]:
# Count the distinct values in the location_name column.
pop["location_name"].nunique()

In [None]:
# Draw 10 random entries from the location_name column.
pop["location_name"].sample(n=10)

In [None]:
# List the distinct values of the sex_name column.
pop["sex_name"].unique()

In [None]:
# List the distinct values of the age_group_name column.
pop["age_group_name"].unique()

## Analyzing a subset of the dataset

__Subsetting__

In [None]:
# Values to keep in each column of the subset.
# NOTE(review): the listed age groups start at 5 years; if the dataset also
# contains a group for younger children, it is excluded from later sums —
# confirm this is intended.
ages = ["5-14 years", "15-49 years", "50-69 years", "70+ years"]
location = ["Germany", "France", "Italy"]
# Presumably restricted to the two sexes to avoid counting an aggregated
# category twice; verify against the output of pop.sex_name.unique().
sex = ["Male", "Female"]

# One boolean mask per column; a row is selected only if all three match.
age_mask = pop["age_group_name"].isin(ages)
location_mask = pop["location_name"].isin(location)
sex_mask = pop["sex_name"].isin(sex)
cond = age_mask & location_mask & sex_mask

In [None]:
# Keep only the rows where the boolean mask `cond` is True (all columns).
pop_subset = pop[cond]
# Report the (rows, columns) shape, then show 5 random rows as the cell output.
print(pop_subset.shape)
pop_subset.sample(n=5)

### Split-Apply-Combine

![](_img/split-apply-combine.png)
Image source: [Jake VanderPlas 2016, Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/)

**Question: How did the population of Germany, Italy and France change over time?**

In [None]:
# Split by year and country, then sum the "pop" column within each group.
# The result is a Series indexed by (year_id, location_name).
group_keys = ["year_id", "location_name"]
gb = pop_subset.groupby(by=group_keys)["pop"].sum()
gb

### Plotting

In [None]:
# Pivot location_name (the innermost index level) into columns so each
# country is drawn as its own line over year_id, then toggle the grid.
per_country = gb.unstack(level="location_name")
ax = per_country.plot(figsize=(14, 4))
ax.grid()

***