
# Set up the notebook

In [1]:

# Toggle IPython pretty printing; the recorded output below shows it ended up OFF.
# NOTE(review): this magic flips the current setting, so re-running the cell flips it back.
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

# Shared helpers for the whole notebook: `s` loads/stores the saved objects
# (see s.load_object / s.store_objects below) and is handed to the scraping utilities.
s = Storage()
ssu = StatsScrapingUtilities(s=s)

In [52]:

# Reload the previously saved objects this notebook builds on
column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

# Key the countries table by country code, then replace any country name that
# appears in ssu.country_name_dict (names not in the dict are left unchanged)
all_countries_df = s.load_object('all_countries_df')
all_countries_df = all_countries_df.set_index('country_code', drop=True)
all_countries_df['country_name'] = all_countries_df['country_name'].map(
    lambda name: ssu.country_name_dict.get(name, name)
)

# Persist the renamed table with country_code restored as a regular column
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country Life Expectancy Equivalents


## Clean and prepare US States dataset

In [5]:

# Scrape the tables from the life-expectancy-by-state page.
# The recorded output appears to list (table index, shape) pairs -- here one 50 x 7 table.
url = 'https://worldpopulationreview.com/state-rankings/life-expectancy-by-state'
tables_list = ssu.get_page_tables(url)

[(0, (50, 7))]


In [6]:

# Work on a copy of the first scraped table so tables_list stays untouched
life_expectancy_us_states_df = tables_list[0].copy()

# Give the seven scraped columns snake_case names
life_expectancy_us_states_df.columns = [
    'state_name',
    'life_expectancy',
    'life_expectancy_black',
    'life_expectancy_latino',
    'life_expectancy_asian',
    'life_expectancy_native_american',
    'life_expectancy_white',
]

# Spot-check a few random rows
life_expectancy_us_states_df.sample(5)

Unnamed: 0,state_name,life_expectancy,life_expectancy_black,life_expectancy_latino,life_expectancy_asian,life_expectancy_native_american,life_expectancy_white
33,North Carolina,77.8,74.7,,88.9,76.6,78.3
41,South Carolina,76.2,74.0,,,,77.8
8,Massachusetts,79.9,78.8,87.1,89.1,,80.4
47,Alabama,74.9,72.9,,76.0,,76.0
5,New Jersey,80.4,75.5,84.7,89.4,,80.3



## Clean and prepare Countries dataset

In [7]:

# Scrape the tables from the life-expectancy-by-country page; this page is
# fetched through a browser driver rather than a plain request.
url = 'https://worldpopulationreview.com/countries/life-expectancy'
driver = ssu.get_driver()
try:
    tables_list = ssu.get_page_tables(url, driver=driver)
finally:
    # Always release the browser window, even when the scrape raises,
    # so a failed run does not leave an orphaned browser behind
    driver.close()

Getting the FireFox driver
[(0, (237, 4))]


In [8]:

# Work on a copy of the first scraped table so tables_list stays untouched
life_expectancy_countries_df = tables_list[0].copy()

# Give the four scraped columns snake_case names
life_expectancy_countries_df.columns = [
    'country_name',
    'life_expectancy',
    'life_expectancy_males',
    'life_expectancy_females',
]

# Spot-check a few random rows
life_expectancy_countries_df.sample(5)

Unnamed: 0,country_name,life_expectancy,life_expectancy_males,life_expectancy_females
177,Puerto Rico,79.72,75.58,83.9
55,Senegal,67.91,65.47,70.2
13,Seychelles,71.74,68.24,76.04
162,Bahamas,74.36,70.76,77.84
114,Moldova,68.62,64.22,73.32


In [9]:

# Flag every row whose country_name occurs more than once (keep=False marks
# all members of a duplicate group); an empty result means names are unique
mask_series = life_expectancy_countries_df.duplicated(
    subset=['country_name'], keep=False
)
life_expectancy_countries_df.loc[mask_series]

Unnamed: 0,country_name,life_expectancy,life_expectancy_males,life_expectancy_females


In [10]:

# Replace any scraped country name that appears in ssu.country_name_dict
# (names not in the dict are left unchanged)
life_expectancy_countries_df['country_name'] = life_expectancy_countries_df['country_name'].map(
    lambda name: ssu.country_name_dict.get(name, name)
)

# Names present in exactly one of the two tables
scraped_names = set(life_expectancy_countries_df['country_name'])
reference_names = set(all_countries_df['country_name'])
countries_list = sorted(scraped_names ^ reference_names)

# Among the unmatched names, show the pairs scoring above 0.6 similarity, highest first
doubles_df = ssu.check_4_doubles(countries_list)
mask_series = (doubles_df['max_similarity'] > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False)

Unnamed: 0,first_item,second_item,max_similarity
13,Falkland Islands,Åland Islands,0.83


In [11]:

# State names present in exactly one of: the scraped table, the us_stats_df index
scraped_states = set(life_expectancy_us_states_df['state_name'])
reference_states = set(us_stats_df.index)
states_list = sorted(scraped_states ^ reference_states)

# Among the unmatched names, show the pairs scoring above 0.6 similarity, highest first
doubles_df = ssu.check_4_doubles(states_list)
mask_series = (doubles_df['max_similarity'] > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False)

Unnamed: 0,first_item,second_item,max_similarity


In [12]:

# Build the state -> country and country -> state lookups from the
# 'life_expectancy' column of each table (the matching logic itself lives in
# ssu.get_country_state_equivalents).
# A second, pasted-in call that used names never defined in this notebook
# (countries_df, us_states_df, ...) was removed: it raised NameError on a
# fresh kernel and only repeated this call.
state_to_country_equivalent_dict, country_to_state_equivalent_dict = ssu.get_country_state_equivalents(
    life_expectancy_countries_df, 'country_name', 'life_expectancy',
    life_expectancy_us_states_df, 'state_name', 'life_expectancy',
    verbose=False
)

In [13]:

# Look up each state's equivalent in the state -> country dict;
# a state with no entry keeps its own name as the value
us_stats_df['Country_Equivalent_Life_Expectancy'] = us_stats_df.index.map(
    lambda state_name: state_to_country_equivalent_dict.get(state_name, state_name)
)

In [14]:

# state_name -> overall life expectancy, taken from the scraped states table
life_expectancy_dict = (
    life_expectancy_us_states_df
    .set_index('state_name')['life_expectancy']
    .to_dict()
)

# Index entries missing from the scraped table fall back to the lowest scraped value
min_life = life_expectancy_us_states_df['life_expectancy'].min()
us_stats_df['life_expectancy'] = us_stats_df.index.map(
    lambda state_name: life_expectancy_dict.get(state_name, min_life)
)

# Record a description for the new column and persist the description dict
column_description_dict['life_expectancy'] = 'Overall average life expectency (2020)'
s.store_objects(column_description_dict=column_description_dict)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\column_description_dict.pkl



---
## Choropleth

In [15]:

# NOTE(review): `c` is never assigned in the visible cells. It is presumably a
# ChoroplethUtilities instance (the class is imported at the top) left over from
# an earlier kernel session -- define it explicitly so Restart & Run All works.
# TODO confirm the constructor arguments.
c.create_label_line_file()
# Columns handed to the map builder; judging by the keyword names, one supplies
# the numeric values and the other the text labels -- verify against ChoroplethUtilities
numeric_column_name = 'life_expectancy'
string_column_name = 'Country_Equivalent_Life_Expectancy'
svg_file_path = c.create_country_colored_labeled_map(numeric_column_name=numeric_column_name,
                                                     string_column_name=string_column_name,
                                                     one_country_df=us_stats_df)
# Show the absolute location of the returned SVG path
print(os.path.abspath(svg_file_path))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_life_expectancy_Country_Equivalent_Life_Expectancy.svg
