
# Set up the notebook

In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Make the helper modules in ../py importable. Insert at 1 because 0 is the
# script path (or '' in REPL). Skip the insert when the entry is already there
# so that re-running this cell does not pile up duplicate sys.path entries.
if '../py' not in sys.path:
    sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

# Storage helper: used throughout the notebook to load saved objects and to pickle results
s = Storage()
# Scraping helper: it is handed the same Storage instance
ssu = StatsScrapingUtilities(s=s)

In [52]:

# Reload the objects saved by earlier work
column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

# Index the countries frame by country code
all_countries_df = s.load_object('all_countries_df')
all_countries_df = all_countries_df.set_index('country_code')

# Replace each country name with its entry in ssu.country_name_dict, leaving names without an entry untouched
all_countries_df['country_name'] = [
    ssu.country_name_dict.get(country_name, country_name)
    for country_name in all_countries_df['country_name']
]

# Save the renamed frame back, with country_code restored as an ordinary column
s.store_objects(all_countries_df=all_countries_df.reset_index())

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country Obesity Equivalents


## Clean and prepare US States dataset

In [51]:

# Scrape the tables from the Wikipedia article on obesity in the United States.
# The recorded output lists (table index, shape) pairs; table 0 is the (56, 6) one used below.
url = 'https://en.wikipedia.org/wiki/Obesity_in_the_United_States'
tables_list = ssu.get_page_tables(url)

[(0, (56, 6)), (1, (24, 2)), (10, (11, 2)), (12, (11, 2)), (4, (9, 2)), (5, (8, 2)), (6, (6, 2)), (8, (4, 2)), (2, (3, 2)), (9, (3, 2)), (11, (2, 2)), (3, (1, 2)), (7, (1, 2))]


In [56]:

# Work on a copy of the first scraped table and give its six columns descriptive names
us_states_df = tables_list[0].copy()
us_states_df.columns = [
    'state_name',
    'obesity_rank',
    'adults_obesity_rate_2005',
    'adults_obesity_rate_2020',
    'adults_overweight_rate_2005',
    'children_and_adolescents_obesity_rate_2005',
]
def f(x):
    """
    Convert one scraped table cell such as '30.1%' into a float.

    Parameters
    ----------
    x : object
        Raw cell value; it is converted with str() before parsing.

    Returns
    -------
    float
        The number in front of the first '%' sign, or np.nan when the cell
        contains no '%' sign or the text in front of it is not a valid number.
    """
    rate_str = str(x)

    # Cells without a percent sign carry no rate
    if '%' not in rate_str:
        return np.nan

    # Keep only the text before the first '%'; anything after it is discarded
    number_str = rate_str.split('%')[0]
    try:
        return float(number_str)
    except ValueError:

        # A malformed cell (e.g. '%' or 'n/a%') becomes NaN instead of aborting the whole column conversion
        return np.nan
# Run every percentage column through f to turn its text cells into floats
rate_column_names = [
    'adults_obesity_rate_2005',
    'adults_obesity_rate_2020',
    'adults_overweight_rate_2005',
    'children_and_adolescents_obesity_rate_2005',
]
for cn in rate_column_names:
    us_states_df[cn] = us_states_df[cn].map(f)

# Spot-check a few random rows
us_states_df.sample(5)

Unnamed: 0,state_name,obesity_rank,adults_obesity_rate_2005,adults_obesity_rate_2020,adults_overweight_rate_2005,children_and_adolescents_obesity_rate_2005
19,Kentucky,8,28.4,34.3,66.8,20.6
0,Alabama,5,30.1,36.3,65.4,16.7
14,Idaho,32,24.6,29.3,61.4,10.1
35,North Carolina,20,27.1,32.1,63.4,19.3
45,South Dakota,22,26.1,31.9,64.2,12.1


In [57]:

# Keep every row except the one for the District of Columbia
mask_series = us_states_df.state_name.eq('District of Columbia')
us_states_df = us_states_df.loc[~mask_series]

In [58]:

# Report (not remove) US state names that might be duplicates or misspellings

# Names that appear in only one of the two sources
scraped_state_names = set(us_states_df.state_name)
known_state_names = set(ssu.us_stats_df.index)
states_list = sorted(scraped_state_names ^ known_state_names)
print(states_list)

# Show any pair of unmatched names whose similarity score is above 0.6
doubles_df = ssu.check_4_doubles(states_list)
columns_list = ['first_item', 'second_item', 'max_similarity']
mask_series = (doubles_df.max_similarity > 0.6)
if mask_series.any():
    display(doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose state name occurs more than once
mask_series = us_states_df.duplicated(subset=['state_name'], keep=False)
if mask_series.any():
    display(us_states_df.loc[mask_series])

['American Samoa', 'District of Columbia', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'Virgin Islands (U.S.)']


In [60]:

# Save the cleaned US states frame under the name obesity_us_states_df (the recorded output shows it being pickled)
s.store_objects(obesity_us_states_df=us_states_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\obesity_us_states_df.pkl



## Clean and prepare Countries dataset

In [61]:

# Scrape the tables from the Wikipedia list of countries by obesity rate.
# The recorded output lists (table index, shape) pairs; table 0 is the (191, 3) one used below.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_obesity_rate'
# No driver is passed for this page; the disabled lines show how to create one and close it afterwards
# driver = ssu.get_driver()
tables_list = ssu.get_page_tables(url, driver=None)
# driver.close()

[(0, (191, 3)), (1, (12, 2))]


In [62]:

# Work on a copy of the first scraped table and name its three columns
countries_df = tables_list[0].copy()
countries_df.columns = ['country_name', 'obesity_rank', 'obesity_rate_2016']

# Make sure every obesity rate is a float
countries_df['obesity_rate_2016'] = countries_df['obesity_rate_2016'].map(float)

# Spot-check a few random rows
countries_df.sample(5)

Unnamed: 0,country_name,obesity_rank,obesity_rate_2016
8,Kiribati,9,46.0
25,Canada,26,29.4
1,Cook Islands,2,55.9
136,Mauritius,137,10.8
155,Equatorial Guinea,156,8.0


In [63]:

# Report (not remove) country names that might be duplicates or misspellings

# Replace each scraped name with its entry in ssu.country_name_dict, leaving names without an entry untouched
countries_df['country_name'] = [
    ssu.country_name_dict.get(country_name, country_name)
    for country_name in countries_df['country_name']
]

# Names that appear in only one of the two sources
scraped_country_names = set(countries_df.country_name)
known_country_names = set(all_countries_df.country_name)
countries_list = sorted(scraped_country_names ^ known_country_names)

# Show any pair of unmatched names whose similarity score is above 0.6
doubles_df = ssu.check_4_doubles(countries_list)
columns_list = ['first_item', 'second_item', 'max_similarity']
mask_series = (doubles_df.max_similarity > 0.6)
if mask_series.any():
    display(doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose country name occurs more than once
mask_series = countries_df.duplicated(subset=['country_name'], keep=False)
if mask_series.any():
    display(countries_df.loc[mask_series])

Unnamed: 0,first_item,second_item,max_similarity
8,British Virgin Islands,US Virgin Islands,0.769231
10,Cayman Islands,Åland Islands,0.740741
48,Sint Maarten,St. Martin,0.727273
24,Guernsey,Jersey,0.714286
47,San Marino,St. Martin,0.7
6,Bouvet Island,Faroe Islands,0.692308
16,Faroe Islands,Åland Islands,0.692308
17,French Guiana,French Polynesia,0.62069


In [64]:

# Save the cleaned countries frame under the name obesity_countries_df (the recorded output shows it being pickled)
s.store_objects(obesity_countries_df=countries_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\obesity_countries_df.pkl



## Prepare for and Create Choropleth

In [34]:

# Columns to compare: the countries' 2016 obesity rate against the states' mid-2000s adult obesity rate.
# They are defined here so the call below also works on a fresh kernel.
# NOTE(review): the two columns come from different years; confirm that this pairing is the intended one.
countries_target_column_name = 'obesity_rate_2016'
states_target_column_name = 'adults_obesity_rate_2005'

# Build the two lookup dictionaries (state -> country and country -> state) with a single call
state_to_country_equivalent_dict, country_to_state_equivalent_dict = ssu.get_country_state_equivalents(
    countries_df, 'country_name', countries_target_column_name,
    us_states_df, 'state_name', states_target_column_name,
    cn_col_explanation=None, st_col_explanation=None,
    countries_set=None, states_set=None, verbose=False)

In [38]:

# Register descriptions for the two adult obesity columns and save the updated dictionary
column_description_dict.update({
    'adults_obesity_rate_2020': 'Obese adults (2020)',
    'adults_obesity_rate_2005': 'Obese adults (mid-2000s)',
})
s.store_objects(column_description_dict=column_description_dict)


def _country_equivalent(state_name):
    """Return the country paired with state_name, or state_name itself when there is no pairing."""
    return state_to_country_equivalent_dict.get(state_name, state_name)


# Attach each state's country equivalent to the US stats frame (indexed by state name)
us_stats_df['Country_Equivalent_Obesity'] = us_stats_df.index.map(_country_equivalent)

In [39]:

# Copy the 2020 adult obesity rate into the US stats frame, matching on state name
rate_by_state = us_states_df.set_index('state_name')['adults_obesity_rate_2020']
obesity_dict = rate_by_state.to_dict()
obesity_min = rate_by_state.min()

# NOTE(review): an index entry with no scraped rate receives the minimum rate rather than NaN; confirm this is intended
us_stats_df['adults_obesity_rate_2020'] = [
    obesity_dict.get(state_name, obesity_min) for state_name in us_stats_df.index
]

In [40]:

# Copy the mid-2000s adult obesity rate into the US stats frame, matching on state name
rate_by_state = us_states_df.set_index('state_name')['adults_obesity_rate_2005']
obesity_dict = rate_by_state.to_dict()
obesity_min = rate_by_state.min()

# NOTE(review): an index entry with no scraped rate receives the minimum rate rather than NaN; confirm this is intended
us_stats_df['adults_obesity_rate_2005'] = [
    obesity_dict.get(state_name, obesity_min) for state_name in us_stats_df.index
]


---
## Choropleth

In [41]:

# NOTE(review): `c` is not assigned in any cell visible in this notebook (ChoroplethUtilities is
# imported but never instantiated), so this cell would raise NameError on a fresh kernel; confirm
# where `c` is meant to be created.
c.create_label_line_file()

# Columns handed to the map builder: one numeric, one string
numeric_column_name = 'adults_obesity_rate_2005'
string_column_name = 'Country_Equivalent_Obesity'
map_kwargs = dict(
    numeric_column_name=numeric_column_name,
    string_column_name=string_column_name,
    one_country_df=us_stats_df,
)

# Build the map and print the absolute path of the returned SVG file
svg_file_path = c.create_country_colored_labeled_map(**map_kwargs)
print(os.path.abspath(svg_file_path))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_adults_obesity_rate_2005_Country_Equivalent_Obesity.svg
