In [3]:

import pickle
import pandas as pd
import os

# Encodings that have proven handy for reading CSVs; the second one is the one in use
encoding = ('latin1', 'iso8859-1', 'utf-8')[1]

# Point these at your own data and saves folders (relative to this notebook)
data_folder = r'../../data/'
saves_folder = r'../../saves/'

def load_object(obj_name, download_url=None):
    """Load a saved object by name, building the pickle cache if necessary.

    Lookup order:
      1. ``<saves_folder>pickle/<obj_name>.pickle`` -- returned as-is.
      2. ``<saves_folder>csv/<obj_name>.csv`` -- read with pandas.
      3. ``download_url`` -- read with pandas.
    In cases 2 and 3 the resulting DataFrame is pickled so that the next
    call takes the fast path.

    Parameters
    ----------
    obj_name : str
        Base file name (no folder, no extension) of the object.
    download_url : str, optional
        Location handed to ``pd.read_csv`` when neither the pickle nor the
        CSV exists locally.

    Returns
    -------
    The unpickled object, or the freshly read DataFrame.

    Raises
    ------
    ValueError
        If there is no pickle, no CSV, and no ``download_url``.
    """
    pickle_dir = saves_folder + 'pickle/'
    pickle_path = pickle_dir + obj_name + '.pickle'

    if os.path.isfile(pickle_path):
        # NOTE: unpickling can execute arbitrary code; only load pickles that
        # this project wrote itself.
        try:
            return pd.read_pickle(pickle_path)
        except Exception:
            # Fall back to the plain pickle module for files pandas cannot read
            with open(pickle_path, 'rb') as handle:
                return pickle.load(handle)

    csv_path = saves_folder + 'csv/' + obj_name + '.csv'
    if os.path.isfile(csv_path):
        csv_source = csv_path
    elif download_url is not None:
        csv_source = download_url
    else:
        raise ValueError('No pickle or CSV found for ' + repr(obj_name) +
                         ' and no download_url was given.')
    loaded = pd.read_csv(csv_source, low_memory=False, encoding=encoding)

    # A fresh checkout may not have the cache folder yet
    if not os.path.isdir(pickle_dir):
        os.makedirs(pickle_dir)
    try:
        loaded.to_pickle(pickle_path)
    except Exception:
        # Same fallback as the original: retry with the plain pickle module
        with open(pickle_path, 'wb') as handle:
            pickle.dump(loaded, handle, pickle.HIGHEST_PROTOCOL)

    return loaded

# Classes, functions, and methods cannot be pickled
def store_objects(**kwargs):
    """Pickle each keyword argument to ``<saves_folder>pickle/<name>.pickle``.

    DataFrames are written with ``DataFrame.to_pickle``; everything else is
    written with ``pickle.dump`` at the highest protocol.

    Parameters
    ----------
    **kwargs
        ``name=object`` pairs; the keyword becomes the file's base name.

    Raises
    ------
    RuntimeError
        If any value is callable (function, class, method). The check runs
        for every argument before anything is written, so a rejected call
        does not leave part of the batch on disk.
    """
    if not kwargs:
        return

    # Validate the whole batch up front
    for obj in kwargs.values():
        if callable(obj):
            raise RuntimeError('Functions cannot be pickled.')

    # A fresh checkout may not have the cache folder yet
    pickle_dir = saves_folder + 'pickle/'
    if not os.path.isdir(pickle_dir):
        os.makedirs(pickle_dir)

    for obj_name, obj in kwargs.items():
        pickle_path = pickle_dir + str(obj_name) + '.pickle'
        if isinstance(obj, pd.DataFrame):
            obj.to_pickle(pickle_path)
        else:
            with open(pickle_path, 'wb') as handle:
                pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)
                
def attempt_to_pickle(df, pickle_path, raise_exception=False):
    """Try to pickle ``df`` to ``pickle_path``, cleaning up after a failure.

    Parameters
    ----------
    df : object with ``to_pickle`` and ``shape``
        Typically a DataFrame.
    pickle_path : str
        Destination file.
    raise_exception : bool, default False
        When True the original exception is re-raised after the clean-up
        and the message; when False the failure is only reported.
    """
    try:
        print('Pickling to ' + pickle_path)
        df.to_pickle(pickle_path)
    except Exception as e:
        # Remove a partially written file, but only if one was created:
        # an unconditional os.remove would raise FileNotFoundError here and
        # hide the real error.
        if os.path.exists(pickle_path):
            os.remove(pickle_path)
        print(e, ': Couldn\'t save ' + str(df.shape[0]*df.shape[1]) + ' cells as a pickle.')
        if raise_exception:
            raise

In [4]:

# Load the Gapminder data (from the pickle cache, a local CSV, or the URL below)
download_url = 'https://raw.githubusercontent.com/jdantonio/ratistics/master/examples/gapminder.csv'
gapminder_df = load_object('gapminder_df', download_url=download_url)
original_columns = ['country_name', 'income_per_person', 'alcohol_consumption',
                    'armed_forces_rate', 'breast_cancer_per_100th', 'co2_emissions',
                    'female_employment_rate', 'hiv_rate', 'internet_use_rate',
                    'life_expectancy', 'oil_per_person', 'polity_score',
                    'residential_electricity_per_person', 'suicide_per_100th',
                    'employment_rate', 'urban_rate']

# Rename positionally, but only when the column count matches
if len(gapminder_df.columns) == len(original_columns):
    gapminder_df.columns = original_columns

# Every column except the country name, in the same order as original_columns.
# A set difference would give a different order on each kernel restart
# (string hashing is randomized), which makes downstream output irreproducible.
number_column_list = [column_name for column_name in original_columns
                      if column_name != 'country_name']

In [5]:

# Formal (full-length) description of each numeric column; cached as a pickle
obj_path = saves_folder + 'pickle/formal_name_dict.pickle'
if os.path.isfile(obj_path):
    formal_name_dict = load_object('formal_name_dict')
else:
    formal_name_dict = {
        'alcohol_consumption': '2008 alcohol consumption per adult (age 15+) in litres',
        'armed_forces_rate': 'Armed forces personnel as a % of total labor force',
        'breast_cancer_per_100th': '2002 breast cancer new cases per hundred thousand females',
        'co2_emissions': '2006 cumulative CO2 emission in metric tons',
        'employment_rate': '2007 total employees age 15+ as a % of population',
        'female_employment_rate': '2007 female employees age 15+ as a % of population',
        'hiv_rate': '2009 estimated HIV Prevalence % for Ages 15-49',
        'income_per_person': '2010 Gross Domestic Product per capita in constant 2000 USD',
        'internet_use_rate': '2010 Internet users per 100 people',
        'life_expectancy': '2011 life expectancy at birth in years',
        'oil_per_person': '2010 oil Consumption per capita in tonnes per year and person',
        'polity_score': '2009 Democracy score as measured by Polity',
        'residential_electricity_per_person': '2008 residential electricity consumption per person in kWh',
        'suicide_per_100th': '2005 Suicide age adjusted per hundred thousand',
        'urban_rate': '2008 urban population as a % of total',
    }
    store_objects(formal_name_dict=formal_name_dict)

In [6]:

# Short, conversational label for each numeric column; cached as a pickle
obj_path = saves_folder + 'pickle/informal_name_dict.pickle'
if os.path.isfile(obj_path):
    informal_name_dict = load_object('informal_name_dict')
else:
    informal_name_dict = {
        'alcohol_consumption': 'alcohol consumption',
        'armed_forces_rate': 'armed forces rate',
        'breast_cancer_per_100th': 'breast cancer',
        'co2_emissions': 'CO2 emissions',
        'employment_rate': 'employment rate',
        'female_employment_rate': 'female employment rate',
        'hiv_rate': 'HIV rate',
        'income_per_person': 'income per person',
        'internet_use_rate': 'internet use rate',
        'life_expectancy': 'life expectancy',
        'oil_per_person': 'oil per person',
        'polity_score': 'polity score',
        'residential_electricity_per_person': 'residential electricity',
        'suicide_per_100th': 'suicide rate',
        'urban_rate': 'urban rate',
    }
    store_objects(informal_name_dict=informal_name_dict)

In [7]:

# Attribution phrase naming the source of each numeric column; cached as a pickle
obj_path = saves_folder + 'pickle/data_provider_dict.pickle'
if os.path.isfile(obj_path):
    data_provider_dict = load_object('data_provider_dict')
else:
    data_provider_dict = {
        'income_per_person': 'provided by World Bank',
        'alcohol_consumption': 'provided by WHO with additions',
        'armed_forces_rate': 'provided by WDI',
        'breast_cancer_per_100th': 'based on IARC data',
        'co2_emissions': 'provided by CDIAC (Carbon Dioxide Information Analysis Center)',
        'female_employment_rate': 'provided by International Labour Organization',
        'hiv_rate': 'based on UNAIDS',
        'internet_use_rate': 'provided by World Bank',
        'life_expectancy': 'based on various sources',
        'oil_per_person': 'provided by BP',
        'polity_score': 'provided by Polity IV project',
        'residential_electricity_per_person': 'provided by IEA (International Energy Agency)',
        'suicide_per_100th': 'provided by WHO',
        'employment_rate': 'provided by International Labour Organization',
        'urban_rate': 'provided by World Bank',
    }
    store_objects(data_provider_dict=data_provider_dict)

In [10]:

# Paragraph describing how each column's data were collected; cached as a pickle
obj_path = saves_folder + 'pickle/data_procedures_dict.pickle'
if os.path.isfile(obj_path):
    data_procedures_dict = load_object('data_procedures_dict')
else:
    # Adjacent string literals inside the parentheses are joined into one string
    data_procedures_dict = {
        'income_per_person': (
            'World Bank generally relies on official sources collected at the '
            'national level. In calculating income per person, the World Bank '
            'uses the Atlas conversion factor.'
        ),
        'alcohol_consumption': (
            'WHO has been collecting data on alcohol consumption and alcohol control '
            'policies from its Member States since 1996. The current survey '
            'instrument entitled "Global Survey on Alcohol and Health" includes three '
            'sections, namely alcohol policy, alcohol consumption, and surveillance. '
            'The information provided is essential for the preparation of the Global '
            'Status Report on Alcohol and Health and regional publications, as well '
            'as for updating the Global Information Systems on Alcohol and Health '
            '(GISAH) and regional information systems, as requested by the World '
            'Health Assembly Resolution on Public health problems caused by harmful '
            'use of alcohol.'
        ),
        'armed_forces_rate': (
            'WDI estimates of military spending are based on both the official '
            'defence budget and data and estimates for a number of items outside the '
            'budget.'
        ),
        'breast_cancer_per_100th': (
            'The International Association of Cancer Registries (IACR) was formed in '
            '1966 to develop and standardize collection methods across registries to '
            'make their data as comparable as possible.'
        ),
        'co2_emissions': (
            'The CDIAC data have been collected over 30 years of operation. The data '
            'from which these carbon-emissions estimates were derived are values of '
            'fuel consumed: in billions of cubic feet, for natural gas; in millions '
            'of barrels, for petroleum products; and in thousands of short tons, for '
            'coal.'
        ),
        'female_employment_rate': (
            'The International Labour Organization (ILO) data for female employment '
            'rate contains labour force participation rate estimates and projections '
            'by sex, for the standardized age group of 15+, and for the years 1990 to '
            '2030. The participation rates are harmonized to account for differences '
            'in national data collection and tabulation methodologies as well as for '
            'other country-specific factors such as military service requirements. '
            'The series includes both nationally reported and imputed data and only '
            'estimates that are national, meaning there are no geographical '
            'limitations on coverage. There are systematic differences in the type of '
            'data source related to the methodology of collection, definitions, scope '
            'of coverage and reference period that impact the interpretation of '
            'female employment rate from one country to another. An effort has been '
            'made in the examination of country-level data to remove non-comparable '
            'data.'
        ),
        'hiv_rate': (
            'UNAIDS hiv rate data are based on modelled HIV estimates. Modelled HIV '
            'estimates are created by country teams using UNAIDS-supported software. '
            'The country teams are comprised primarily of epidemiologists, '
            'demographers, monitoring and evaluation specialists and technical '
            'partners. Country-submitted files are reviewed at UNAIDS, and selected '
            'HIV service data contained in the files are reviewed and validated in '
            'partnership with WHO and UNICEF. UNAIDS review aims to ensure '
            'comparability of results across regions, countries and over time.'
        ),
        'internet_use_rate': (
            'World Bank generally relies on official sources collected at the '
            'national level.'
        ),
        'life_expectancy': (
            'Life expectancy data mostly come from survey, census, and death registration data.'
        ),
        'oil_per_person': (
            'The data series for oil per person provided by BP does not necessarily '
            'meet the definitions, guidelines and practices used for determining '
            'proved reserves at company level, for instance, as published by the US '
            'Securities and Exchange Commission, nor does it necessarily represent '
            'BP’s view of proved reserves by country. Rather, the data series has '
            'been compiled using a combination of primary official sources and '
            'third-party data.'
        ),
        'polity_score': (
            'The Polity IV dataset covers all major, independent states in the global '
            'system over the period 1800-2015 (i.e., states with a total population '
            'of 500,000 or more in the most recent year; currently 167 countries). '
            'With the support of the Political Instability Task Force, the Polity IV '
            'Project has been transformed into a living data collection effort, '
            'meaning that it constantly monitors regime changes in all major '
            'countries and provides annual assessments of regime authority '
            'characteristics, changes and data updates.'
        ),
        'residential_electricity_per_person': (
            'The data provided by IEA (International Energy Agency) are based on '
            'years, plant efficiency, and capital costs. Years refer to time of plant '
            'order. Costs include owner’s costs but exclude interest during '
            'construction. Plant efficiency is gross, LHV (lower heating value). The '
            'difference between lower and higher heating value, based on IEA '
            'conventions, is 5% for coal and 10% for gas. Capital costs presented are '
            'a weighted average based on deployment for the given scenario. Capital '
            'costs for renewable energy technologies and CCS-equipped power plants '
            'are projected based on the levels of regional and global deployment, '
            'applying an assumed learning rates for each doubling of capacity. '
            'Capital costs for nuclear power and unabated coal- and gas-fired power '
            'plants are assumed throughout the projection period.'
        ),
        'suicide_per_100th': (
            'WHO global estimates of the number of suicides in a country in the year '
            '2005, divided by the population and multiplied with 100,000, represent '
            'the best estimates of WHO, computed using standard categories, '
            'definitions and methods to ensure cross-country comparability, and may '
            'not be the same as official national estimates. The estimates are '
            'rounded to the appropriate number of significant figures and '
            'standardized to the WHO World Standard Population.'
        ),
        'employment_rate': (
            'The International Labour Organization data for employment rate contains '
            'labour force participation rate estimates and projections for the '
            'standardized age group of 15+ and for the years 1990 to 2030. The '
            'participation rates are harmonized to account for differences in '
            'national data collection and tabulation methodologies as well as for '
            'other country-specific factors such as military service requirements. '
            'The series includes both nationally reported and imputed data and only '
            'estimates that are national, meaning there are no geographical '
            'limitations on coverage.'
        ),
        'urban_rate': (
            'Urban population refers to people living in urban areas as defined by '
            'national statistical offices. The indicator is calculated using World '
            'Bank population estimates and urban ratios from the United Nations World '
            'Urbanization Prospects. To estimate urban populations, UN ratios of '
            'urban to total population were applied to the World Bank’s estimates of '
            'total population. Countries differ in the way they classify population '
            'as "urban" or "rural". The population of a city or metropolitan area '
            'depends on the boundaries chosen.'
        ),
    }
    store_objects(data_procedures_dict=data_procedures_dict)

In [13]:

# Paragraph describing what each column measures; cached as a pickle.
# NOTE(review): apart from 'income_per_person' and 'alcohol_consumption', the
# entries below repeat the data_procedures_dict text word for word -- confirm
# whether dedicated measure descriptions were still meant to be written.
# An already-cached pickle keeps the old text until the file is deleted.
obj_path = saves_folder + 'pickle/data_measures_dict.pickle'
if not os.path.isfile(obj_path):
    data_measures_dict = {}
    data_measures_dict['income_per_person'] = ('gross national income (GNI, formerly referred to as GNP) and GNI per capita in U.S. dollars')
    data_measures_dict['alcohol_consumption'] = ('Alcohol Timeline Followback (TLFB), Form 90, Drinking Self–Monitoring ' +
                                                 'Log (DSML), Lifetime Drinking (LDH), and various Quantity–Frequency (QF) ' +
                                                 'measures')
    data_measures_dict['armed_forces_rate'] = ('WDI estimates of military spending are based on both the official ' +
                                               'defence budget and data and estimates for a number of items outside the ' +
                                               'budget.')
    # Typo fix: this displayed sentence previously ended in 'possile.'
    data_measures_dict['breast_cancer_per_100th'] = ('The International Association of Cancer Registries (IACR) was formed in ' +
                                                     '1966 to develop and standardize collection methods across registries to ' +
                                                     'make their data as comparable as possible.')
    data_measures_dict['co2_emissions'] = ('The CDIAC data have been collected over 30 years of operation. The data ' +
                                           'from which these carbon-emissions estimates were derived are values of ' +
                                           'fuel consumed: in billions of cubic feet, for natural gas; in millions ' +
                                           'of barrels, for petroleum products; and in thousands of short tons, for ' +
                                           'coal.')
    data_measures_dict['female_employment_rate'] = ('The International Labour Organization (ILO) data for female employment ' +
                                                    'rate contains labour force participation rate estimates and projections ' +
                                                    'by sex, for the standardized age group of 15+, and for the years 1990 to ' +
                                                    '2030. The participation rates are harmonized to account for differences ' +
                                                    'in national data collection and tabulation methodologies as well as for ' +
                                                    'other country-specific factors such as military service requirements. ' +
                                                    'The series includes both nationally reported and imputed data and only ' +
                                                    'estimates that are national, meaning there are no geographical ' +
                                                    'limitations on coverage. There are systematic differences in the type of ' +
                                                    'data source related to the methodology of collection, definitions, scope ' +
                                                    'of coverage and reference period that impact the interpretation of ' +
                                                    'female employment rate from one country to another. An effort has been ' +
                                                    'made in the examination of country-level data to remove non-comparable ' +
                                                    'data.')
    data_measures_dict['hiv_rate'] = ('UNAIDS hiv rate data are based on modelled HIV estimates. Modelled HIV ' +
                                      'estimates are created by country teams using UNAIDS-supported software. ' +
                                      'The country teams are comprised primarily of epidemiologists, ' +
                                      'demographers, monitoring and evaluation specialists and technical ' +
                                      'partners. Country-submitted files are reviewed at UNAIDS, and selected ' +
                                      'HIV service data contained in the files are reviewed and validated in ' +
                                      'partnership with WHO and UNICEF. UNAIDS review aims to ensure ' +
                                      'comparability of results across regions, countries and over time.')
    data_measures_dict['internet_use_rate'] = ('World Bank generally relies on official sources collected at the ' +
                                               'national level.')
    data_measures_dict['life_expectancy'] = ('Life expectancy data mostly come from survey, census, and death registration data.')
    data_measures_dict['oil_per_person'] = ('The data series for oil per person provided by BP does not necessarily ' +
                                            'meet the definitions, guidelines and practices used for determining ' +
                                            'proved reserves at company level, for instance, as published by the US ' +
                                            'Securities and Exchange Commission, nor does it necessarily represent ' +
                                            'BP’s view of proved reserves by country. Rather, the data series has ' +
                                            'been compiled using a combination of primary official sources and ' +
                                            'third-party data.')
    data_measures_dict['polity_score'] = ('The Polity IV dataset covers all major, independent states in the global ' +
                                          'system over the period 1800-2015 (i.e., states with a total population ' +
                                          'of 500,000 or more in the most recent year; currently 167 countries). ' +
                                          'With the support of the Political Instability Task Force, the Polity IV ' +
                                          'Project has been transformed into a living data collection effort, ' +
                                          'meaning that it constantly monitors regime changes in all major ' +
                                          'countries and provides annual assessments of regime authority ' +
                                          'characteristics, changes and data updates.')
    data_measures_dict['residential_electricity_per_person'] = ('The data provided by IEA (International Energy Agency) are based on ' +
                                                                'years, plant efficiency, and capital costs. Years refer to time of plant ' +
                                                                'order. Costs include owner’s costs but exclude interest during ' +
                                                                'construction. Plant efficiency is gross, LHV (lower heating value). The ' +
                                                                'difference between lower and higher heating value, based on IEA ' +
                                                                'conventions, is 5% for coal and 10% for gas. Capital costs presented are ' +
                                                                'a weighted average based on deployment for the given scenario. Capital ' +
                                                                'costs for renewable energy technologies and CCS-equipped power plants ' +
                                                                'are projected based on the levels of regional and global deployment, ' +
                                                                'applying an assumed learning rates for each doubling of capacity. ' +
                                                                'Capital costs for nuclear power and unabated coal- and gas-fired power ' +
                                                                'plants are assumed throughout the projection period.')
    data_measures_dict['suicide_per_100th'] = ('WHO global estimates of the number of suicides in a country in the year ' +
                                               '2005, divided by the population and multiplied with 100,000, represent ' +
                                               'the best estimates of WHO, computed using standard categories, ' +
                                               'definitions and methods to ensure cross-country comparability, and may ' +
                                               'not be the same as official national estimates. The estimates are ' +
                                               'rounded to the appropriate number of significant figures and ' +
                                               'standardized to the WHO World Standard Population.')
    data_measures_dict['employment_rate'] = ('The International Labour Organization data for employment rate contains ' +
                                             'labour force participation rate estimates and projections for the ' +
                                             'standardized age group of 15+ and for the years 1990 to 2030. The ' +
                                             'participation rates are harmonized to account for differences in ' +
                                             'national data collection and tabulation methodologies as well as for ' +
                                             'other country-specific factors such as military service requirements. ' +
                                             'The series includes both nationally reported and imputed data and only ' +
                                             'estimates that are national, meaning there are no geographical ' +
                                             'limitations on coverage.')
    data_measures_dict['urban_rate'] = ('Urban population refers to people living in urban areas as defined by ' +
                                        'national statistical offices. The indicator is calculated using World ' +
                                        'Bank population estimates and urban ratios from the United Nations World ' +
                                        'Urbanization Prospects. To estimate urban populations, UN ratios of ' +
                                        'urban to total population were applied to the World Bank’s estimates of ' +
                                        'total population. Countries differ in the way they classify population ' +
                                        'as "urban" or "rural". The population of a city or metropolitan area ' +
                                        'depends on the boundaries chosen.')
    store_objects(data_measures_dict=data_measures_dict)
else:
    data_measures_dict = load_object('data_measures_dict')

In [None]:

def create_binned_categories(df, number_of_categories, column_name, prefix):
    
    # Get the percentiles
    out_categorical, percentiles_list = pd.cut([0, 1], number_of_categories, retbins=True)
    describe_series = df[column_name].describe(percentiles=percentiles_list[1:-1]).copy()

    # Get the bin list and group names
    bad_list = ['count', 'mean', 'std']
    if (number_of_categories % 2) == 1:
        bad_list += ['50%']
    
    # array of indexes, e.g. ['min', '50%', 'max']
    index_list = [x for x in describe_series.index.tolist() if x not in bad_list]
    bin_list = describe_series.loc[index_list].tolist()
    if len(set(bin_list)) == len(bin_list):
        
        # Create the extra column
        df[prefix+'_categories'] = pd.cut(x=df[column_name],
                                          bins=bin_list).map(lambda x: (x.left + x.right)/2.)
    else:

        # array of quantiles, e.g. [0, .25, .5, .75, 1.]
        quantiles_list = []
        for index in index_list:
            if index == 'min':
                quantiles_list.append(0)
            elif index == 'max':
                quantiles_list.append(1.)
            else:
                quantiles_list.append(float(index.split('%')[0])/100.)
        
        # Create the extra column
        df[prefix+'_categories'] = pd.qcut(x=df[column_name], q=quantiles_list,
                                           duplicates='drop').map(lambda x: (x.left + x.right)/2.)

    # Fix the bottom row
    null_series = df[prefix+'_categories'].isnull()
    df.loc[null_series, prefix+'_categories'] = df[~null_series][prefix+'_categories'].min()

    return df

In [None]:

from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import squareform, pdist, euclidean
import numpy as np

# From https://stackoverflow.com/questions/2827393/angles-between-two-n-dimensional-vectors-in-python
def unit_vector(vector):
    """Scale ``vector`` to length one by dividing it by its norm."""
    magnitude = np.linalg.norm(vector)
    return vector / magnitude

def angle_between(v1, v2):
    """Angle, in radians, between the vectors ``v1`` and ``v2``.

    Examples::

            >>> angle_between((1, 0, 0), (0, 1, 0))
            1.5707963267948966
            >>> angle_between((1, 0, 0), (1, 0, 0))
            0.0
            >>> angle_between((1, 0, 0), (-1, 0, 0))
            3.141592653589793
    """
    first_direction = unit_vector(v1)
    second_direction = unit_vector(v2)

    # Clip before arccos so the cosine stays inside [-1, 1]
    cosine = np.dot(first_direction, second_direction)
    cosine = np.clip(cosine, -1.0, 1.0)
    return np.arccos(cosine)

def round_down(num, divisor):
    """Return ``num`` lowered to a multiple of ``divisor`` by subtracting ``num % divisor``."""
    remainder = num % divisor
    return num - remainder

def round_up(num, divisor):
    """Return the multiple of ``divisor`` one step above ``round_down``'s result.

    A ``num`` that is already an exact multiple is still moved up by one
    ``divisor`` (e.g. ``round_up(15, 5)`` is 20).
    """
    floor_multiple = num - (num % divisor)
    return floor_multiple + divisor

def get_min_max(df, column_name, circle_min=5, circle_max=500):
    """Rescale one column linearly into the range ``[circle_min, circle_max]``.

    Returns whatever ``MinMaxScaler.fit_transform`` produces for the column
    reshaped to a single feature (one value per row).
    """
    scaler = MinMaxScaler(feature_range=(circle_min, circle_max))
    column_values = df[column_name].values.reshape(-1, 1)

    return scaler.fit_transform(column_values)

def conjunctify_list(noun_list):
    """Join strings into an English list: 'a', 'a and b', 'a, b, and c'.

    Parameters
    ----------
    noun_list : sequence of str
        The items to join, in order.

    Returns
    -------
    str
        An empty string for no items, the item itself for one, 'a and b'
        for two, and a comma-separated list with ', and ' before the last
        item for three or more.
    """
    nouns = list(noun_list)

    # Short lists need no comma; the generic join below would give
    # ', and a' for one item and 'a, and b' for two.
    if not nouns:
        return ''
    if len(nouns) == 1:
        return nouns[0]
    if len(nouns) == 2:
        return nouns[0] + ' and ' + nouns[1]

    return ', '.join(nouns[:-1]) + ', and ' + nouns[-1]