In [2]:

%run ../data-visualization/load-gapminder_df.ipynb
%who

MinMaxScaler	 angle_between	 conjunctify_list	 create_binned_categories	 data_folder	 download_url	 encoding	 euclidean	 formal_name_dict	 
gapminder_df	 get_min_max	 informal_name_dict	 load_object	 np	 number_column_list	 obj_path	 os	 pd	 
pdist	 pickle	 round_down	 round_up	 saves_folder	 squareform	 store_objects	 unit_vector	 


In [None]:

class Descriptions(object):

    def __init__(self, df, **kwargs):
        prop_defaults = {
            'confounder1_category_count': 5,
            'cc1_labels_list': ['Very Low', 'Low', 'Moderate', 'High', 'Very High'],
            'verbose_html': '',
            'cc1_informal_name': 'ethnic',
        }

        for (prop, default) in prop_defaults.items():
            setattr(self, prop, kwargs.get(prop, default))
        
        self.df = df #.dropna(how='any', subset=self.subset_columns).copy()
        self.row_count = self.df.shape[0]
        
        # Categorical Confounder
        if 'cc1_categories' in self.df.columns:
            self.cc1_list = sorted(self.df['cc1_categories'].astype('float64').unique().tolist())
            self.cc1_labels_dict = dict(zip(self.cc1_list, self.cc1_labels_list))
            self.confounder1_category_count = len(self.cc1_list)
            self.confounder1_description_list = []
            for group_label, group_count in self.df['cc1_categories'].value_counts().iteritems():

                # African American (n = 414, 34.4%)
                group_description = '{0} (n = {1:d}, {2:.2%})'.format(self.cc1_labels_dict[group_label],
                                                                      group_count, group_count/self.row_count)

                self.confounde1r_description_list.append(group_description)

    def sample_description(self):
        self.sample_description_html = ('The sample of ' +
                                        self.row_count + ' countries was drawn from the GapMinder.org dataset. The ' +
                                        self.cc1_informal_name + ' composition was ' + 
                                        conjunctify_list(self.confounder1_description_list) + '. The ' +
                                        self.cc2_informal_name + ' composition was ' + 
                                        conjunctify_list(self.confounder2_description_list) + '. The ' +
                                        self.cc3_informal_name + ' composition was ' + 
                                        conjunctify_list(self.confounder3_description_list) + '. The ' +
                                        self.cc4_informal_name + ' composition was ' + 
                                        conjunctify_list(self.confounder4_description_list) + '.')

    def procedures_description(self):
        self.procedures_description_html = ('Random sampling was used to recruit participants for this study.' +
                                            'Surveyors went to considerable lengths to secure a high completion rate,' +
                                            'including up to four call-backs, letters, and monetary incentives.' +
                                            'Trained researcher assistants conducted face-to-face interviews with all study participants.' +
                                            'Sensitive questions about substance use and sexual behavior were asked using' +
                                            'computer assisted interviewing to increase the reliability of responses.')

    def measures_description(self):
        self.measures_description_html = ('The measure of tuberculosis (TB) was drawn from country level surveillance data' +
                                          'compiled by the World Health Organization in their Global Tuberculosis Database (' +
                                          'www.who.int/globalatlas/dataQuery/default.asp' +
                                          '), and made available for download through the Gapminder web site (www.gapminder.org).' +
                                          'It measures the estimated number of new TB cases (all forms) among 100,000 residents' +
                                          'in each country during 2008. For the current analysis, it was binned into four' +
                                          'categories based on a quartile split.')

In [10]:

file_path = data_folder + 'html/gapminder_codebook.html'
gapminder_codebook_df = pd.read_html(file_path)[0]
gapminder_codebook_df.columns

Index(['Indicator name', 'Data provider', 'Category', 'Subcategory',
       'Download View Visualize'],
      dtype='object')

In [9]:

formal_name_list = []
for column_name in gapminder_df.columns[1:]:
    formal_name_list.append(formal_name_dict[column_name])

In [13]:
# %load ../../../load_magic/lists.py

from difflib import SequenceMatcher
import time
import pandas as pd

def check_4_doubles(item_list):
    t0 = time.time()
    rows_list = []
    n = len(item_list)
    for i in range(n-1):
        first_item = item_list[i]
        max_similarity = 0.0
        max_item = first_item
        for j in range(i+1, n):
            second_item = item_list[j]

            # Assume the first item is never identical to the second item
            this_similarity = similar(str(first_item), str(second_item))
            
            if this_similarity > max_similarity:
                max_similarity = this_similarity
                max_item = second_item

        # Get input row in dictionary format; key = col_name
        row_dict = {}
        row_dict['first_item'] = first_item
        row_dict['second_item'] = max_item
        row_dict['first_bytes'] = '-'.join(str(x) for x in bytearray(str(first_item),
                                                                     encoding=encoding, errors="replace"))
        row_dict['second_bytes'] = '-'.join(str(x) for x in bytearray(str(max_item),
                                                                      encoding=encoding, errors="replace"))
        row_dict['max_similarity'] = max_similarity

        rows_list.append(row_dict)

    column_list = ['first_item', 'second_item', 'first_bytes', 'second_bytes', 'max_similarity']
    item_similarities_df = pd.DataFrame(rows_list, columns=column_list)
    t1 = time.time()
    print(t1-t0, time.ctime(t1))

    return item_similarities_df

def similar(a, b):
    return SequenceMatcher(None, str(a), str(b)).ratio()

#Check the closest names for typos
def check_for_typos(left_list, right_list):
    t0 = time.time()
    rows_list = []
    for left_item in left_list:
        max_similarity = 0.0
        max_item = left_item
        for right_item in right_list:
            this_similarity = similar(left_item, right_item)
            if this_similarity > max_similarity:
                max_similarity = this_similarity
                max_item = right_item

        # Get input row in dictionary format; key = col_name
        row_dict = {}
        row_dict['left_item'] = left_item
        row_dict['right_item'] = max_item
        row_dict['max_similarity'] = max_similarity

        rows_list.append(row_dict)

    column_list = ['left_item', 'right_item', 'max_similarity']
    name_similarities_df = pd.DataFrame(rows_list, columns=column_list)
    t1 = time.time()
    print(t1-t0, time.ctime(t1))
    
    return name_similarities_df

In [14]:

name_similarities_df = check_for_typos(formal_name_list, gapminder_codebook_df['Indicator name'])

1.1289007663726807 Tue Dec 26 17:47:59 2017


In [37]:

name_similarities_df.sort_values('max_similarity', ascending=False)

Unnamed: 0,left_item,right_item,max_similarity
7,2010 Internet users per 100 people,Internet users (per 100 people),0.892308
2,Armed forces personnel as a % of total labor f...,Armed forces personnel (% of labor force),0.857143
1,2008 alcohol consumption per adult (age 15+) i...,Alcohol consumption per adult 15+ (litres),0.8125
14,2008 urban population as a % of total,Urban population (% of total),0.787879
11,2008 residential electricity consumption per p...,"Residential electricity use, per person",0.721649
4,2006 cumulative CO2 emission in metric tons,Cumulative CO2 emissions (tonnes),0.710526
8,2011 life expectancy at birth in years,"Life expectancy at birth, temporary update",0.675
10,2009 Democracy score as measured by Polity,Democracy score (use as color),0.583333
3,2002 breast cancer new cases per hundred thous...,"Breast cancer, new cases per 100,000 women",0.565657
9,2010 oil Consumption per capita in tonnes per ...,"Oil consumption, per person",0.545455


In [38]:

problem_child = '2007 total employees age 15+ as a % of population'
match_series = (name_similarities_df['left_item'] == problem_child)
indicator_name = name_similarities_df[match_series]['right_item'].tolist()[0]
match_series = (gapminder_codebook_df['Indicator name'] == indicator_name)
gapminder_codebook_df[match_series].T

Unnamed: 0,283
Indicator name,Males aged 15+ labour force participation rate...
Data provider,International Labour Organization
Category,Work
Subcategory,Labour force participation
Download View Visualize,


In [29]:

# It looks like you need to add the males and females together
match_series = (name_similarities_df['left_item'] == problem_child)
item_list = list(set([problem_child] + gapminder_codebook_df['Indicator name'].tolist()) -
                 set(name_similarities_df[match_series]['right_item'].tolist()))
item_similarities_df = check_4_doubles(item_list)

match_series = (item_similarities_df['first_item'] == problem_child)
item_similarities_df[match_series].T

16.446537256240845 Tue Dec 26 19:08:26 2017


Unnamed: 0,306
first_item,2007 total employees age 15+ as a % of population
second_item,Females aged 15+ labour force participation ra...
first_bytes,50-48-48-55-32-116-111-116-97-108-32-101-109-1...
second_bytes,70-101-109-97-108-101-115-32-97-103-101-100-32...
max_similarity,0.475248


In [33]:

indicator_name = item_similarities_df[match_series]['second_item'].tolist()[0]
match_series = (gapminder_codebook_df['Indicator name'] == indicator_name)
gapminder_codebook_df[match_series].T

Unnamed: 0,161
Indicator name,Females aged 15+ labour force participation ra...
Data provider,International Labour Organization
Category,Work
Subcategory,Labour force participation
Download View Visualize,


In [34]:

problem_child = 'HIV'

match_series = (name_similarities_df['left_item'] == problem_child)
item_list = list(set([problem_child] + gapminder_codebook_df['Indicator name'].tolist()) -
                 set(name_similarities_df[match_series]['right_item'].tolist()))
item_similarities_df = check_4_doubles(item_list)

match_series = (item_similarities_df['first_item'] == problem_child)
item_similarities_df[match_series].T

16.4240140914917 Tue Dec 26 19:11:26 2017


Unnamed: 0,150
first_item,HIV
second_item,"Adults with HIV (%, age 15-49)"
first_bytes,72-73-86
second_bytes,65-100-117-108-116-115-32-119-105-116-104-32-7...
max_similarity,0.181818


In [35]:

indicator_name = item_similarities_df[match_series]['second_item'].tolist()[0]
match_series = (gapminder_codebook_df['Indicator name'] == indicator_name)
gapminder_codebook_df[match_series].T

Unnamed: 0,0
Indicator name,"Adults with HIV (%, age 15-49)"
Data provider,Based on UNAIDS
Category,Health
Subcategory,HIV
Download View Visualize,


In [39]:

gapminder_df.columns

Index(['country_name', 'income_per_person', 'alcohol_consumption',
       'armed_forces_rate', 'breast_cancer_per_100th', 'co2_emissions',
       'female_employment_rate', 'hiv_rate', 'internet_use_rate',
       'life_expectancy', 'oil_per_person', 'polity_score',
       'residential_electricity_per_person', 'suicide_per_100th',
       'employment_rate', 'urban_rate'],
      dtype='object')

In [40]:

data_provider_dict = {}
for column_name in gapminder_df.columns[1:]:
    formal_name = formal_name_dict[column_name]
    match_series = (name_similarities_df['left_item'] == formal_name)
    indicator_name = name_similarities_df[match_series]['right_item'].tolist()[0]
    match_series = (gapminder_codebook_df['Indicator name'] == indicator_name)
    data_provider = gapminder_codebook_df[match_series]['Data provider'].tolist()[0]
    data_provider_dict[column_name] = data_provider
data_provider_dict['hiv_rate'] = 'Based on UNAIDS'
store_objects(data_provider_dict=data_provider_dict)