## Preliminaries

In [1]:
import zipfile
import os
import glob

import pandas as pd

  from pkg_resources import resource_stream


In [2]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls

In [3]:
# Project-relative locations, anchored at the current working directory.
DATA = os.path.join(os.curdir, 'data')  # folder searched for the zip archive
FIGS = os.path.join(os.curdir, 'fig')   # figures folder (not referenced in the visible cells)
# ZIP_F = os.path.join(DATA, 'HFAAllData20140516.csv.zip')
# CSV_F = 'HFAAllData.csv'  # tab-separated file inside the zip archive
# Pickle file holding the serialized pandas DataFrame.
PKL_F = os.path.join(DATA, 'HFA.pkl')

## Import Dataset

The following utility functions import a CSV file into a Pandas DataFrame and perform some cleaning up. At the end of the process, if no error is encountered, there should be a global dataframe called `HFA` available for the rest of the script.

### Utilities

The `indicator_id` column is a string with the characters `'HFA-'` added as a prefix to the numerical id. The `tidy_df()` function extracts the numerical id and stores it as an integer in the DataFrame.

In [4]:
def tidy_df(df):
    '''Convert the ``indicator_id`` column from 'HFA-<n>' strings to integers.

    The text after the first '-' of each value is parsed as an int. The
    column is replaced on ``df`` itself and the same object is returned.

    The converted column is computed in full before it is assigned, so a
    value that cannot be converted raises without leaving ``df`` with a
    half-converted column.

    Raises:
        AssertionError: if ``df`` is not a pandas DataFrame.
        ValueError: if any extracted id cannot be converted to int.
    '''
    assert isinstance(df, pd.DataFrame)
    # Build the new column first; assign only once every value has converted.
    numeric_ids = df.indicator_id.str.split('-').str.get(1).astype(int)
    df['indicator_id'] = numeric_ids
    return df

The `retrieve_csv_from_zip(zip_f)` function makes a few assumptions: that there is a zip archive inside the local data folder; that there is only one such archive and it has a filename ending in `.zip`; that the first member inside the archive is a file in CSV format with tab separators between fields and with a filename that ends in `.csv`.

If all these assumptions are met, the function returns a dataframe with the contents of the CSV file as well as the filename of the ZIP archive.

In [5]:
def retrieve_csv_from_zip(zip_f):
    '''Read the first member of a zip archive as a tab-separated table.

    Parameters:
        zip_f: path of the zip archive to read.

    Returns:
        A tuple ``(zip_f, dataframe)`` where ``dataframe`` has the columns
        ``year``, ``indicator_id``, ``indicator``, ``country`` and ``value``
        (the file is read without a header row).

    Raises:
        IOError: if the archive has no members, if the first member's name
            does not end in ``.csv``, or if pandas fails to parse it.
    '''
    # Let ZipFile open the path itself: it then owns the file handle and
    # closes it when the ``with`` block exits. A handle opened by the caller
    # and passed in is left open by ZipFile.
    with zipfile.ZipFile(zip_f) as z:
        # get list of objects inside the archive
        zip_objs = z.infolist()
        if not zip_objs:
            raise IOError('Zip archive in data folder is empty.')

        # only the first member is considered; it must be a CSV file
        csv_file = zip_objs[0].filename
        if not csv_file.endswith('.csv'):
            raise IOError('Non-CSV file found in Zip archive.')

        with z.open(csv_file) as f:
            try:
                hfa_df = pd.read_csv(f, sep='\t', header=None,
                                     names=['year', 'indicator_id',
                                            'indicator', 'country',
                                            'value'])
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt / SystemExit are not converted to IOError.
                raise IOError('Invalid CSV contents found; error in Pandas import.')
    return zip_f, hfa_df

Once the ZIP archive has been processed (by extracting the dataframe from inside it), the file is renamed by adding the suffix `.processed` so that it will not be used again for importation but remains available for reference.

In [6]:
def rename_zip(data_dir, zf):
    '''Mark an archive as processed by renaming ``zf`` to ``zf + '.processed'``.

    Parameters:
        data_dir: accepted for interface compatibility; not used, because
            ``zf`` is passed to ``os.rename`` exactly as given.
        zf: path of the zip archive to rename.

    Returns:
        True when the rename succeeds.

    Raises:
        IOError: if the rename fails for any reason.
    '''
    pf = '{}.processed'.format(zf)
    try:
        os.rename(zf, pf)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate unchanged.
        raise IOError('Could not rename zip file after processing.')
    return True

The import logic is governed by the function `retrieve_df(data_dir, pkl_file)`, which follows this sequence:

- Is there an unprocessed zip archive in the data folder? 
    - If so, import the csv file, tidy up the data, serialize the dataframe, and mark the zip file as processed.
    - If not, then is there a pickled Dataframe in the data folder? If so, load it. 
    - If all the above steps are unsuccessful, raise `IOError`

In [7]:
def retrieve_df(data_dir, pkl_file):
    '''Load the dataset from a fresh zip archive or, failing that, a pickle.

    Sequence:
      1. If ``data_dir`` contains any ``*.zip`` file, the first one (in
         sorted order) is imported, tidied, pickled to ``pkl_file`` and
         renamed as processed.
      2. Otherwise, if ``pkl_file`` exists, it is unpickled.
      3. Otherwise ``IOError`` is raised.

    Parameters:
        data_dir: folder searched for ``*.zip`` archives.
        pkl_file: path of the pickle written in step 1 and read in step 2.

    Returns:
        The loaded pandas DataFrame.

    Raises:
        IOError: if neither a zip archive nor the pickle file is found, or
            if importing / renaming the archive fails.
    '''
    # glob returns matches in arbitrary order; sorting makes the choice of
    # archive deterministic if more than one happens to be present.
    zip_f_list = sorted(glob.glob(os.path.join(data_dir, '*.zip')))

    if zip_f_list:
        zip_f, hfa = retrieve_csv_from_zip(zip_f_list[0])
        hfa = tidy_df(hfa)
        hfa.to_pickle(pkl_file)
        # Rename only after the pickle has been written, so a failure above
        # leaves the archive available for another attempt.
        rename_zip(data_dir, zip_f)
        return hfa

    if os.path.isfile(pkl_file):
        # NOTE: unpickling can execute arbitrary code; only load a pickle
        # file that comes from a trusted source.
        return pd.read_pickle(pkl_file)

    raise IOError('No valid data source discovered.')

In [8]:
# Build (or reload) the global dataset used by the cells below.
HFA = retrieve_df(DATA, PKL_F)

HFA.head(n=5)

Unnamed: 0,year,indicator_id,indicator,country,value
0,1970,10,Mid-year population,Albania,2138000
1,1970,10,Mid-year population,Austria,7390900
2,1970,10,Mid-year population,Azerbaijan,5172050
3,1970,10,Mid-year population,Belgium,9637700
4,1970,10,Mid-year population,Bulgaria,8489600


The DataFrame is now in 'tidy' format with each column a variable and each row an observation.

### Indicators Utility Function

The next cell creates a DataFrame called `INDICATORS` with distinct values of all the HFA indicators as well as their serial numbers. 

The function `find_indicators()` accepts a list of Python strings and returns the rows in `INDICATORS` that contain all of these strings; it searches the indicators using a logical `AND` on all the string fragments.

In [9]:
# One row per distinct (indicator_id, indicator) pair present in HFA.
INDICATORS = HFA[['indicator_id', 'indicator']].drop_duplicates()

def find_indicators(search_list, case=True, regex=True):
    '''Return the rows of ``INDICATORS`` whose name matches every pattern.

    The patterns are applied one after another, so a row is kept only if
    its ``indicator`` text matches all of them (logical AND).

    Parameters:
        search_list: iterable of pattern strings. An empty iterable returns
            a copy of the whole ``INDICATORS`` table.
        case: if False, matching ignores case. Defaults to True.
        regex: if True (the default) each pattern is a regular expression,
            so characters such as ``$`` or ``(`` are special; pass False to
            match the patterns as literal text.

    Returns:
        A DataFrame with the matching rows of ``INDICATORS``.
    '''
    result = INDICATORS.copy()
    for pattern in search_list:
        # na=False: a row with a missing indicator name never matches,
        # instead of putting NaN into the boolean mask.
        matches = result.indicator.str.contains(pattern, case=case,
                                                regex=regex, na=False)
        result = result[matches]
    return result

In [16]:
INDICATORS.shape[0]  # number of rows = number of unique indicators

595

## Plotting a Simple Trend

In [20]:
country = 'Russian Federation'
indicator = 10

# Boolean-mask row selection. ``.loc`` is used because the ``DataFrame.ix``
# indexer has been removed from pandas.
data = HFA.loc[(HFA.indicator_id == indicator) & (HFA.country == country), :]
data.head()

Unnamed: 0,year,indicator_id,indicator,country,value
26,1970,10,Mid-year population,Russian Federation,129941000
6606,1971,10,Mid-year population,Russian Federation,130563000
13760,1972,10,Mid-year population,Russian Federation,131304000
20911,1973,10,Mid-year population,Russian Federation,132069000
28070,1974,10,Mid-year population,Russian Federation,132799000


In [38]:
# ``.iloc[0]`` takes the first matching row by position. A plain ``[0]`` is a
# label lookup and only works when the matching row happens to carry label 0.
INDICATORS.indicator[INDICATORS.indicator_id == 10].iloc[0]

'Mid-year population'

In [39]:
x = data.year
y = data.value
# ``.iloc[0]`` takes the first matching row by position. A plain ``[0]`` is a
# label lookup and only works when the matching row happens to carry label 0,
# which would break for other values of ``indicator``.
indicator_label = INDICATORS.indicator[INDICATORS.indicator_id == indicator].iloc[0]

title = '{} - {}<br>(from {} to {})'.format(country,
                                            indicator_label,
                                            data.year.min(),
                                            data.year.max()
                                            )

trace = Scatter(
    x=x,
    y=y
)
D = Data([trace])

# NOTE(review): the y-axis title is hard-coded and will not follow
# ``indicator`` if a different indicator is selected above.
L = Layout(
    title=title,
    xaxis=XAxis(
        title='Year',
        showgrid=False,
        zeroline=True
    ),
    yaxis=YAxis(
        title='Mid-year Population',
        showline=False
    )
)
fig = Figure(data=D, layout=L)
py.iplot(fig, filename='hfa/tmp')

## Graphing Function

In [None]:
def plot_trend(country, indicator, title='', figsize=(10, 6)):
    '''Plot ``value`` against ``year`` for one country and one indicator.

    Parameters:
        country: value matched against the ``country`` column of ``HFA``.
        indicator: value matched against the ``indicator_id`` column of ``HFA``.
        title: plot title passed to ``DataFrame.plot``.
        figsize: figure size passed to ``DataFrame.plot``.

    Returns:
        None; the plot is drawn as a side effect.
    '''
    # ``.loc`` is used because the ``DataFrame.ix`` indexer has been removed
    # from pandas.
    data = HFA.loc[(HFA.indicator_id == indicator) & (HFA.country == country), :]
    data.plot(x='year', y='value', title=title, legend=False, figsize=figsize)

## Graphs

In [None]:
# List the indicators whose name contains all three fragments.
find_indicators(['circ', '0-64', ' male'])

In [None]:
# Trend of indicator 1311 for the Russian Federation.
country, indicator = 'Russian Federation', 1311
title = '{}\nStandardised Circulatory Mortality, 0-64 years, per 100,000, Male'.format(country)

plot_trend(country, indicator, title=title)

In [None]:
# Trend of indicator 1311 for the Republic of Moldova.
country, indicator = 'Republic of Moldova', 1311
title = '{}\nStandardised Circulatory Mortality, 0-64 years, per 100,000, Male'.format(country)

plot_trend(country, indicator, title=title)

In [None]:
# Trend of indicator 1311 for Kazakhstan.
country, indicator = 'Kazakhstan', 1311
title = '{}\nStandardised Circulatory Mortality, 0-64 years, per 100,000, Male'.format(country)

plot_trend(country, indicator, title=title)

## GDP

In [None]:
# List the indicators whose name contains 'GDP'.
find_indicators(['GDP'])

In [None]:
# One short GDP-per-capita trend plot per country.
indicator = 260
countries = ['Russian Federation', 'Republic of Moldova', 'Kazakhstan']

for country in countries:
    title = '{}\nGross domestic product (GDP), US$ per capita'.format(country)
    plot_trend(country, indicator, title=title, figsize=(10, 2))