
# Set up the notebook

In [1]:

# Toggle IPython pretty printing; the captured output below shows this call turned it OFF
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
# This makes the project modules under ../py importable by the imports that follow
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

# Shared helpers for the cells below: `s` loads and stores saved objects and CSVs,
# `ssu` supplies the page-scraping and name-cleaning utilities
s = Storage()
ssu = StatsScrapingUtilities(s=s)

In [3]:

# Load the previously saved lookup objects
column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

# Key the countries table by country code and normalize its country names through
# ssu.country_name_dict (names without an entry are kept unchanged)
all_countries_df = s.load_object('all_countries_df')
all_countries_df = all_countries_df.set_index('country_code', drop=True)
all_countries_df['country_name'] = all_countries_df['country_name'].map(
    lambda name: ssu.country_name_dict.get(name, name)
)

# Persist the normalized table with country_code restored as an ordinary column
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country Fertility Rate Equivalents


## Clean and prepare US States dataset

In [4]:

# Downloaded CSV from https://www.cdc.gov/nchs/pressroom/sosmap/fertility_rate/fertility_rates.htm
csv_name = 'fertility_rates_by_us_state'
us_states_df = s.load_csv(csv_name)
# print(us_states_df.columns.tolist())
us_states_df.columns = ['source_year', 'state_abbreviation', 'fertility_rate', 'births_count', 'source_url']

# Strip everything except digits and dots, then convert to numbers;
# values that still cannot be parsed become NaN (errors='coerce')
for cn in ['source_year', 'fertility_rate', 'births_count']:
    us_states_df[cn] = us_states_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x)))
    us_states_df[cn] = pd.to_numeric(us_states_df[cn], errors='coerce', downcast='integer')

# Invert the lookup so it can be keyed by the abbreviation column
abbreviation_dict = {v: k for k, v in ssu.us_states_abbreviation_dict.items()}

# Drop rows without an abbreviation. Take an explicit copy so the column assignments
# below write to an independent frame instead of a slice of the loaded data
# (avoids pandas' chained-assignment warning when rows were actually dropped)
mask_series = us_states_df.state_abbreviation.isnull()
us_states_df = us_states_df[~mask_series].copy()

# Abbreviations missing from the lookup are kept as-is
us_states_df['state_name'] = us_states_df.state_abbreviation.map(lambda x: abbreviation_dict.get(x, x))

# The CSV stores site-relative links; prefix the CDC host to make them absolute
us_states_df.source_url = us_states_df.source_url.map(lambda x: 'https://www.cdc.gov' + x)

# Keep only the 2020 rows
mask_series = (us_states_df.source_year == 2020)
us_states_df = us_states_df[mask_series]
us_states_df.sample(5).T

Unnamed: 0,5,40,4,21,20
source_year,2020,2020,2020,2020,2020
state_abbreviation,CO,SD,CA,MI,MA
fertility_rate,51.5,66.7,52.4,55.6,47.6
births_count,61494,10960,420259,104074,66428
source_url,https://www.cdc.gov/nchs/pressroom/states/colo...,https://www.cdc.gov/nchs/pressroom/states/sout...,https://www.cdc.gov/nchs/pressroom/states/cali...,https://www.cdc.gov/nchs/pressroom/states/mich...,https://www.cdc.gov/nchs/pressroom/states/mass...
state_name,Colorado,South Dakota,California,Michigan,Massachusetts


In [5]:

# Fetch the tables from the Wikipedia fertility-rate page. The output printed below
# looks like (table index, table shape) pairs — TODO confirm against get_page_tables
url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_fertility_rate'
tables_list = ssu.get_page_tables(url)

[(0, (57, 14)), (1, (11, 2))]


In [6]:

# Work on a private copy of the first scraped table
us_states_df = tables_list[0].copy()
# print(us_states_df.columns.tolist())

# One total-fertility-rate column per year, 2008 through 2020, after the state name
columns_list = ['total_fertility_rate_' + str(year) for year in range(2008, 2021)]
us_states_df.columns = ['state_name', *columns_list]

# Strip everything except digits and dots before converting; unparseable cells become NaN
for cn in columns_list:
    cleaned_series = us_states_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x)))
    us_states_df[cn] = pd.to_numeric(cleaned_series, errors='coerce', downcast='integer')
us_states_df.sample(5).T

Unnamed: 0,4,40,0,7,32
state_name,North Dakota,Montana,Guam,Alaska,North Carolina
total_fertility_rate_2008,2.13,2.08,2.73,2.41,2.12
total_fertility_rate_2009,2.12,1.89,2.66,2.27,2.01
total_fertility_rate_2010,2.04,1.99,3.0,2.35,1.91
total_fertility_rate_2011,2.08,1.96,2.88,2.28,1.86
total_fertility_rate_2012,2.12,1.96,3.15,2.19,1.84
total_fertility_rate_2013,2.14,1.97,2.87,2.22,1.82
total_fertility_rate_2014,2.24,1.95,2.96,2.19,1.83
total_fertility_rate_2015,2.16,1.96,2.93,2.17,1.82
total_fertility_rate_2016,2.17,1.89,3.07,2.14,1.79


In [7]:

# Exclude the District of Columbia so it does not skew the comparison
# NOTE(review): only DC is removed here; territories such as Guam (and presumably the
# 'Total United States' row listed by the next cell) stay in the frame — confirm intended
mask_series = us_states_df.state_name.isin(['District of Columbia'])
us_states_df = us_states_df.loc[~mask_series]

In [8]:

# Look for US state names that are duplicated or spelled differently between sources

# Names that appear in only one of the two sources are the candidates for misspellings
scraped_names_set = set(us_states_df.state_name)
reference_names_set = set(ssu.us_stats_df.index)
states_list = sorted(scraped_names_set ^ reference_names_set)
print(states_list)

# Show only the candidate pairs whose similarity score exceeds 0.6, most similar first
doubles_df = ssu.check_4_doubles(states_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
similar_df = doubles_df[mask_series]
if len(similar_df):
    display(similar_df[columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose state name occurs more than once
mask_series = us_states_df.duplicated(subset=['state_name'], keep=False)
duplicates_df = us_states_df[mask_series]
if len(duplicates_df):
    display(duplicates_df)

['American Samoa', 'District of Columbia', 'Guam', 'Northern Mariana Is.', 'Puerto Rico', 'Total United States', 'U.S. Virgin Islands']



## Clean and prepare Countries dataset

In [9]:

# Try the live World Bank indicator page first. No browser driver is passed here; to use
# one, create it with ssu.get_driver(), pass it as driver=, and close it afterwards.
url = 'https://data.worldbank.org/indicator/SP.DYN.TFRT.IN'
tables_list = ssu.get_page_tables(url, driver=None)

# The live page yielded no tables in the saved run, so fall back to the saved HTML copy
if not tables_list:
    tables_list = ssu.get_page_tables('../data/html/world_bank_fertility_rate_by_country.html')

# Stop here with a clear message rather than letting the next cell fail with an
# IndexError on tables_list[0]
if not tables_list:
    raise ValueError(
        'No fertility-rate tables found at the World Bank URL or in the local HTML fallback'
    )

No tables found
[]
[(0, (215, 3))]


In [10]:

# Work on a private copy of the first table and give its three columns standard names
countries_df = tables_list[0].copy()
print(countries_df.columns.tolist())
columns_list = ['country_name', 'survey_year', 'fertility_rate']
countries_df.columns = columns_list

for cn in ['fertility_rate']:
    # Keep only the text before the first space, then strip anything that is not a
    # digit or a dot; cells that still cannot be parsed become NaN
    leading_token_series = countries_df[cn].map(lambda x: str(x).split(' ')[0])
    digits_series = leading_token_series.map(lambda token: re.sub(r'[^0-9\.]+', '', token))
    countries_df[cn] = pd.to_numeric(digits_series, errors='coerce', downcast='float')

# Drop rows that have no country name
mask_series = countries_df.country_name.isnull()
countries_df = countries_df[~mask_series][columns_list]

# Drop rows whose fertility rate is missing or could not be parsed
mask_series = countries_df.fertility_rate.isnull()
countries_df = countries_df[~mask_series][columns_list]

countries_df.sample(9).T

['country_name', 'survey_year', 'fertility_rate']


Unnamed: 0,24,192,146,0,28,122,171,39,108
country_name,Botswana,Togo,Oman,Afghanistan,Burkina Faso,Marshall Islands,Slovak Republic,China,Lebanon
survey_year,2020.0,2020.0,2020.0,2020.0,2020.0,2011.0,2020.0,2020.0,2020.0
fertility_rate,2.8,4.2,2.8,4.2,5.0,4.0,1.6,1.7,2.1


In [11]:

# Normalize country names, then look for duplicates and misspellings between sources
countries_df['country_name'] = countries_df['country_name'].map(
    lambda name: ssu.country_name_dict.get(name, name)
)

# Names that appear in only one of the two sources are the candidates for misspellings
scraped_names_set = set(countries_df.country_name)
reference_names_set = set(all_countries_df.country_name)
countries_list = sorted(scraped_names_set ^ reference_names_set)
if countries_list:
    print(countries_list)

# Show only the candidate pairs whose similarity score exceeds 0.6, most similar first
doubles_df = ssu.check_4_doubles(countries_list)
mask_series = (doubles_df.max_similarity > 0.6)
similar_df = doubles_df[mask_series]
if len(similar_df):
    columns_list = ['first_item', 'second_item', 'max_similarity']
    display(similar_df[columns_list].sort_values('max_similarity', ascending=False))

['American Samoa', 'Anguilla', 'Antarctica', 'Bonaire, Sint Eustatius & Saba', 'Bouvet Island', 'British Indian Ocean Territory', 'British Virgin Islands', 'Cabo Verde', 'Cayman Islands', 'Channel Islands', 'Christmas Island', 'Cocos (Keeling) Islands', 'Cook Islands', 'Egypt', 'Egypt, Arab Rep.', 'Falkland Islands (Malvinas)', 'Federated States of Micronesia', 'French Guiana', 'French Southern Territories', 'Gibraltar', 'Guadeloupe', 'Guernsey', 'Heard Island & McDonald Islands', 'Holy See', 'Iran', 'Iran, Islamic Rep.', 'Jersey', 'Kosovo', 'Martinique', 'Mayotte', 'Micronesia, Fed. Sts.', 'Monaco', 'Montserrat', 'Nauru', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Palestine', 'Pitcairn', 'ROC', 'Réunion', 'South Georgia & the South Sandwich Islands', 'St. Barthélemy', 'St. Helena, Ascension & Tristan da Cunha', 'St. Martin', 'St. Martin (French part)', 'St. Pierre & Miquelon', 'Svalbard & Jan Mayen', 'Taiwan', 'Tokelau', 'Turks & Caicos Islands', 'Tuvalu', 'United States M

Unnamed: 0,first_item,second_item,max_similarity
12,Cook Islands,Norfolk Island,0.769231
8,Cayman Islands,Channel Islands,0.758621
9,Channel Islands,Åland Islands,0.714286
21,Guernsey,Jersey,0.714286
4,Bouvet Island,Cook Islands,0.64
11,Cocos (Keeling) Islands,Cook Islands,0.628571


In [12]:

# Keep only countries from the reference table that are not on the exclusion list
# (ssu.derisable_countries_list), which the author found draws hostile comments
country_set = set(all_countries_df.country_name).difference(ssu.derisable_countries_list)
mask_series = countries_df.country_name.isin(country_set)
countries_df = countries_df[mask_series]

# Show any country that still appears more than once, ordered by name
mask_series = countries_df.duplicated(subset=['country_name'], keep=False)
duplicates_df = countries_df[mask_series]
if len(duplicates_df):
    display(duplicates_df.sort_values('country_name'))

In [13]:

# Persist the cleaned countries table under the name fertility_countries_df
# (the captured output shows it being pickled)
s.store_objects(fertility_countries_df=countries_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\fertility_countries_df.pkl



## Prepare for and Create Choropleth

In [22]:

# Column names shared with the map-drawing cell: the label column to be filled in and
# the numeric state column the countries are compared against
equivalence_column_name = 'Country_Equivalent_Fertility_Rate'
states_target_column_name = 'total_fertility_rate_2020'

# Only pass countries that are present in the reference countries table
mask_series = countries_df.country_name.isin(all_countries_df.country_name)
matched_countries_df = countries_df[mask_series]

# verbose=True prints one "<country> is close to ... <state>" line per country
ssu.prepare_for_choroplething(
    matched_countries_df, 'fertility_rate', us_states_df,
    st_col_name=states_target_column_name,
    st_col_explanation='Fertility_Rate (2020)',
    equivalence_column_name=equivalence_column_name,
    verbose=True,
)


Afghanistan (4.20) is close to the fertility rate of Guam (2.64)
Albania (1.60) is close to the fertility rate of Nevada (1.60)
Algeria (2.90) is close to the fertility rate of Guam (2.64)
Andorra (1.30) is close to the fertility rate of Vermont (1.36)
Angola (5.40) is close to the fertility rate of Guam (2.64)
Antigua & Barbuda (2) is close to the fertility rate of South Dakota (1.98)
Argentina (2.20) is close to the fertility rate of Northern Mariana Is. (2.41)
Armenia (1.80) is close to the fertility rate of Kentucky (1.80)
Aruba (1.90) is close to the fertility rate of Alaska (1.89)
Australia (1.60) is close to the fertility rate of Nevada (1.60)
Austria (1.40) is close to the fertility rate of U.S. Virgin Islands (1.39)
Azerbaijan (1.70) is close to the fertility rate of Tennessee (1.70)
Bahamas (1.70) is close to the fertility rate of Tennessee (1.70)
Bahrain (1.90) is close to the fertility rate of Alaska (1.89)
Bangladesh (2) is close to the fertility rate of South Dakota (1.9

In [23]:

# Build the US choropleth helper from the shared state statistics and the countries table
c = ChoroplethUtilities(iso_3166_2_code='us', one_country_df=ssu.us_stats_df, all_countries_df=all_countries_df)
# NOTE(review): presumably writes the label-line file the map below relies on — confirm
c.create_label_line_file()
# Color by the numeric 2020 rate column and label with the country-equivalent column;
# the call returns the path of the SVG it produced
svg_file_path = c.create_country_colored_labeled_map(
    numeric_column_name=states_target_column_name, string_column_name=equivalence_column_name,
    one_country_df=ssu.us_stats_df, cmap='summer')
# Print the absolute location of the generated SVG
print(os.path.abspath(svg_file_path))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_total_fertility_rate_2020_Country_Equivalent_Fertility_Rate.svg
