
# Set up the notebook

In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

s = Storage()
ssu = StatsScrapingUtilities(s=s)

In [52]:

column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

all_countries_df = s.load_object('all_countries_df').set_index('country_code', drop=True)
all_countries_df.country_name = all_countries_df.country_name.map(lambda x: ssu.country_name_dict.get(x, x))
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country Physicians per Capita Equivalents


## Clean and prepare US States dataset

In [4]:

url = 'https://www.beckershospitalreview.com/workforce/this-state-has-the-most-physicians-per-capita.html'
tables_list = ssu.get_page_tables(url)

[(0, (52, 4))]


In [5]:

us_states_df = tables_list[0].copy()
states_target_column_name = 'physicians_per_10k'
us_states_df.columns = ['state_name', 'total_population', 'total_active_physicians', states_target_column_name]
us_states_df = us_states_df.iloc[1:]
us_states_df[states_target_column_name] = pd.to_numeric(us_states_df[states_target_column_name].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x))),
                                                        errors='coerce', downcast='float')
us_states_df[states_target_column_name] = us_states_df[states_target_column_name].map(lambda x: x/10)
us_states_df.sample(5)

Unnamed: 0,state_name,total_population,total_active_physicians,physicians_per_10k
22,Alaska,731545,2101,287.2
1,District of Columbia,705749,6147,87.1
32,Arizona,7278717,18343,25.2
2,Massachusetts,6892503,32116,46.6
15,Ohio,11689100,35333,302.3


In [14]:

# Why does MA only have 46.6 physicians per 10K whilst NY has 389.4?
mask_series = us_states_df.state_name.isin(['Massachusetts', 'New York'])
us_states_df[mask_series]

Unnamed: 0,state_name,total_population,total_active_physicians,physicians_per_10k
2,Massachusetts,6892503,32116,46.6
4,New York,19453561,75749,389.4


In [6]:

# Remove US states duplicates and misspellings
states_list = sorted(set(us_states_df.state_name).symmetric_difference(set(us_stats_df.index)))
doubles_df = ssu.check_4_doubles(states_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
doubles_df[mask_series][columns_list].sort_values('max_similarity', ascending=False)

Unnamed: 0,first_item,second_item,max_similarity



## Clean and prepare Countries dataset

In [7]:

tables_list = ssu.get_page_tables('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_number_of_physicians', driver=None)

[(1, (188, 5)), (0, (1, 2))]


In [8]:

countries_df = tables_list[1].copy()
# print(countries_df.columns.tolist())
countries_target_column_name = 'physicians_per_10k'
LATEST_COLUMN_NAME = countries_target_column_name + '_latest'
EARLIER_COLUMN_NAME = countries_target_column_name + '_2013'
EARLIEST_COLUMN_NAME = countries_target_column_name + '_2009'
countries_df.columns = ['country_name', 'total_population', EARLIEST_COLUMN_NAME, EARLIER_COLUMN_NAME, LATEST_COLUMN_NAME]
for cn in [EARLIEST_COLUMN_NAME, EARLIER_COLUMN_NAME, LATEST_COLUMN_NAME]:
    countries_df[cn] = pd.to_numeric(countries_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x))), errors='coerce', downcast='float')
def f(row_series):
    latest = row_series[LATEST_COLUMN_NAME]
    earlier = row_series[EARLIER_COLUMN_NAME]
    earliest = row_series[EARLIEST_COLUMN_NAME]
    for cv in [latest, earlier, earliest]:
        if str(cv) != 'nan':

            return cv
countries_df[countries_target_column_name] = countries_df.apply(f, axis='columns')
countries_df.head(5)

Unnamed: 0,country_name,total_population,physicians_per_10k_2009,physicians_per_10k_2013,physicians_per_10k_latest,physicians_per_10k
0,Australia,19612,100.0,327.0,,327.0
1,Austria,31175,380.0,483.0,,483.0
2,Azerbaijan,32388,380.0,340.0,,340.0
3,Albania,3626,110.0,115.0,,115.0
4,Algeria,40857,120.0,121.0,172.0,172.0


In [16]:

# France has 327.0 physicians per 10K whilst NY has 389.4?
mask_series = countries_df.country_name.isin(['France'])
countries_df[mask_series]

Unnamed: 0,country_name,total_population,physicians_per_10k_2009,physicians_per_10k_2013,physicians_per_10k_latest,physicians_per_10k
170,France,227683,370.0,319.0,327.0,327.0


In [9]:

# Remove country duplicates and misspellings
countries_df.country_name = countries_df.country_name.map(lambda x: ssu.country_name_dict.get(x, x))
countries_list = sorted(set(countries_df.country_name).symmetric_difference(set(all_countries_df.country_name)))
doubles_df = ssu.check_4_doubles(countries_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
doubles_df[mask_series][columns_list].sort_values('max_similarity', ascending=False)

Unnamed: 0,first_item,second_item,max_similarity
12,British Virgin Islands,US Virgin Islands,0.769231
31,Greenland,Grenada,0.75
16,Cayman Islands,Åland Islands,0.740741
59,Sint Maarten,St. Martin,0.727273
35,Guernsey,Jersey,0.714286
10,Bouvet Island,Faroe Islands,0.692308
25,Faroe Islands,Åland Islands,0.692308
27,French Guiana,French Polynesia,0.62069



## Create Equivalence Dictionaries

In [10]:

state_to_country_equivalent_dict, country_to_state_equivalent_dict = ssu.get_country_state_equivalents(
    countries_df, 'country_name', countries_target_column_name,
    us_states_df, 'state_name', states_target_column_name,
    cn_col_explanation=None, st_col_explanation=None,
    countries_set=None, states_set=None, verbose=False)

In [11]:

string_column_name = 'Country_Equivalent_Physicians_per_Capita'
us_stats_df[string_column_name] = us_stats_df.index.map(lambda x: state_to_country_equivalent_dict.get(x, x))
s.store_objects(us_stats_df=us_stats_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\us_stats_df.pkl


In [12]:

states_dict = us_states_df.set_index('state_name')[states_target_column_name].to_dict()
states_min = us_states_df[states_target_column_name].min()
us_stats_df[states_target_column_name] = us_stats_df.index.map(lambda x: states_dict.get(x, states_min))
column_description_dict[states_target_column_name] = 'Physicians per Capita by State (latest)'
s.store_objects(column_description_dict=column_description_dict)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\column_description_dict.pkl



## Choropleth

In [13]:

c.create_label_line_file()
svg_file_path = c.create_country_colored_labeled_map(numeric_column_name=states_target_column_name,
                                                     string_column_name=string_column_name,
                                                     one_country_df=us_stats_df)
print(os.path.abspath(svg_file_path))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_physicians_per_10k_Country_Equivalent_Physicians_per_Capita.svg
