
# Set up the notebook

In [1]:

# Toggle IPython's pretty printing; this is a toggle, so re-running the cell flips the state back
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

# Shared helpers used by every later cell: `s` loads/stores saved objects,
# and `ssu` (given the same Storage instance) provides the scraping utilities.
s = Storage()
ssu = StatsScrapingUtilities(s=s)

In [52]:

# Load the previously saved column descriptions and the US states statistics
column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

# Load the countries table keyed by country code, replace any country name that has an
# entry in ssu.country_name_dict (names without an entry are kept as-is), and save
# the result back with country_code restored as an ordinary column
all_countries_df = s.load_object('all_countries_df').set_index('country_code', drop=True)
all_countries_df['country_name'] = all_countries_df['country_name'].map(
    lambda country_name: ssu.country_name_dict.get(country_name, country_name)
)
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country Health Care Cost Equivalents


## Clean and prepare US States dataset

In [4]:

# Scrape the tables from the per-state health care cost ranking page;
# the cell output lists (index, shape) for each table found
url = 'https://worldpopulationreview.com/state-rankings/health-care-costs-by-state'
tables_list = ssu.get_page_tables(url)

[(0, (50, 2))]


In [5]:

# Work on a copy of the first scraped table and give its two columns snake_case names
us_states_df = tables_list[0].copy()
states_target_column_name = 'spending_per_capita'
us_states_df.columns = ['state_name', states_target_column_name]

# Drop every character that is not a digit or a dot, then convert to numbers;
# anything that still cannot be parsed becomes NaN
us_states_df[states_target_column_name] = pd.to_numeric(
    us_states_df[states_target_column_name].map(str).map(lambda text: re.sub(r'[^0-9\.]+', '', text)),
    errors='coerce',
    downcast='integer',
)
us_states_df.sample(5)

Unnamed: 0,state_name,spending_per_capita
30,Indiana,7651.0
40,New Hampshire,7214.0
34,Arizona,7549.0
49,Wyoming,
4,New York,9851.0


In [25]:

# List state names that appear in only one of the two datasets and score above 0.6
# similarity with another such name (candidate duplicates/misspellings to fix by hand).
# Nothing is removed here; an empty result means there is nothing to reconcile.
states_list = sorted(set(us_states_df.state_name) ^ set(us_stats_df.index))
doubles_df = ssu.check_4_doubles(states_list)
columns_list = ['first_item', 'second_item', 'max_similarity']
mask_series = doubles_df.max_similarity > 0.6
doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False)

Unnamed: 0,first_item,second_item,max_similarity



## Clean and prepare Countries dataset

In [6]:

# Try the live World Bank indicator page first, rendered through the browser driver
driver = ssu.get_driver()
try:
    tables_list = ssu.get_page_tables('https://data.worldbank.org/indicator/SH.XPD.CHEX.PC.CD', driver=driver)
finally:
    # Always close the browser, even if the scrape raises, so it is not leaked
    driver.close()

# Fall back to the locally saved copy of the page when the live scrape finds no tables
if not tables_list:
    tables_list = ssu.get_page_tables('../data/html/world_bank_healthcare_apending_per_capita_by_country.html')

Getting the FireFox driver
No tables found
[]
[(0, (248, 3))]


In [7]:

# Work on a copy of the first scraped table and give its three columns snake_case names
countries_df = tables_list[0].copy()
countries_target_column_name = 'spending_per_capita'
countries_df.columns = ['country_name', 'study_year', countries_target_column_name]

# Drop every character that is not a digit or a dot, then convert to numbers;
# anything that still cannot be parsed becomes NaN.
# NOTE(review): the sampled values (e.g. Nauru 104945.0) look out of scale next to the
# US state figures (under 10,000) -- possibly a lost decimal separator upstream; confirm.
countries_df[countries_target_column_name] = pd.to_numeric(
    countries_df[countries_target_column_name].map(str).map(lambda text: re.sub(r'[^0-9\.]+', '', text)),
    errors='coerce',
    downcast='float',
)
countries_df.sample(5)

Unnamed: 0,country_name,study_year,spending_per_capita
128,"Micronesia, Fed. Sts.",2019.0,4152.0
138,Nepal,2019.0,5325.0
137,Nauru,2019.0,104945.0
185,Sudan,2019.0,4693.0
1,Albania,2018.0,27491.0


In [None]:

# Replace any scraped country name that has an entry in ssu.country_name_dict
# (names without an entry are kept as-is)
countries_df['country_name'] = countries_df['country_name'].map(
    lambda country_name: ssu.country_name_dict.get(country_name, country_name)
)

# List country names that appear in only one of the two datasets and score above 0.6
# similarity with another such name (candidate duplicates/misspellings to fix by hand).
# Nothing is removed here.
countries_list = sorted(set(countries_df.country_name) ^ set(all_countries_df.country_name))
doubles_df = ssu.check_4_doubles(countries_list)
columns_list = ['first_item', 'second_item', 'max_similarity']
mask_series = doubles_df.max_similarity > 0.6
doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False)


## Create Equivalence Dictionaries

In [10]:

# Build the two lookup dictionaries (state -> country and country -> state) from the
# spending columns of both tables. Presumably each state is paired with the country whose
# value is closest -- the matching rule lives in ssu.get_country_state_equivalents; confirm there.
# The optional explanations and name sets are left at None and verbose output is off.
state_to_country_equivalent_dict, country_to_state_equivalent_dict = ssu.get_country_state_equivalents(
    countries_df, 'country_name', countries_target_column_name,
    us_states_df, 'state_name', states_target_column_name,
    cn_col_explanation=None, st_col_explanation=None,
    countries_set=None, states_set=None, verbose=False)

In [11]:

# Add the equivalent country for each state (us_stats_df is indexed by state name);
# a state with no entry in the dictionary keeps its own name as the value
string_column_name = 'Country_Equivalent_Health_Care_Costs'
us_stats_df[string_column_name] = us_stats_df.index.map(
    lambda state_name: state_to_country_equivalent_dict.get(state_name, state_name)
)

In [12]:

# Copy the scraped spending figure onto us_stats_df; index entries missing from the
# scraped table fall back to the smallest scraped value.
# NOTE(review): a state that is present in the table with a NaN value (Wyoming in the
# sample above) is found by .get() and therefore stays NaN instead of taking the fallback.
states_min = us_states_df[states_target_column_name].min()
states_dict = us_states_df.set_index('state_name')[states_target_column_name].to_dict()
us_stats_df[states_target_column_name] = us_stats_df.index.map(
    lambda state_name: states_dict.get(state_name, states_min)
)

# Record a human-readable description for the new column and save the dictionary
column_description_dict[states_target_column_name] = 'Health Care Costs by State (2022)'
s.store_objects(column_description_dict=column_description_dict)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\column_description_dict.pkl



---
## Choropleth

In [13]:

# NOTE(review): `c` is never assigned in the visible cells (ChoroplethUtilities is imported
# but not instantiated), so this cell fails on a fresh kernel -- instantiate `c` in the setup cell.
c.create_label_line_file()
# Render the map from us_stats_df using the numeric spending column and the
# country-equivalent string column, then print the absolute path of the SVG
svg_file_path = c.create_country_colored_labeled_map(numeric_column_name=states_target_column_name,
                                                     string_column_name=string_column_name,
                                                     one_country_df=us_stats_df)
print(os.path.abspath(svg_file_path))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_spending_per_capita_Country_Equivalent_Health_Care_Costs.svg
