
# Set up the notebook

In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

s = Storage()
ssu = StatsScrapingUtilities(s=s)


----
# Get State/Country Income Equivalents


## Clean and prepare US States dataset

In [4]:

if s.pickle_exists('disposable_income_states_df'): disposable_income_states_df = s.load_object('disposable_income_states_df')
else:
    tables_url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_income'
    page_tables_list = ssu.get_page_tables(tables_url)
    disposable_income_states_df = page_tables_list[3].copy().iloc[:, [0, 1, 4]]
    disposable_income_states_df.columns = ['Rank', 'State', 'Income']
    disposable_income_states_df['Year'] = 2020
    disposable_income_states_df.Income = pd.to_numeric(disposable_income_states_df.Income.map(lambda x: re.sub(r'\D+', '', str(x))), errors='coerce')
    mask_series = ~disposable_income_states_df.Income.isnull()
    disposable_income_states_df = disposable_income_states_df[mask_series]
    mask_series = (disposable_income_states_df.State == 'Washington, D.C.')
    disposable_income_states_df.loc[mask_series, 'State'] = 'District of Columbia'
    s.store_objects(disposable_income_states_df=disposable_income_states_df)
disposable_income_states_df

[(3, (57, 10)), (1, (52, 14)), (5, (11, 2)), (0, (6, 1)), (2, (5, 14)), (4, (5, 2))]
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\disposable_income_states_df.pkl


Unnamed: 0,Rank,State,Income,Year
0,,United States,52853.0,2020
1,1.0,District of Columbia,73568.0,2020
2,2.0,Massachusetts,66978.0,2020
3,3.0,Connecticut,67110.0,2020
4,4.0,New Jersey,64031.0,2020
5,5.0,Maryland,57829.0,2020
6,6.0,New York,62773.0,2020
7,7.0,Washington,60468.0,2020
8,8.0,New Hampshire,60715.0,2020
9,9.0,Colorado,56415.0,2020


In [5]:

# Remove US states duplicates and misspellings
states_list = sorted(set(disposable_income_states_df.State).symmetric_difference(set(ssu.us_stats_df.index)))
print(states_list)
doubles_df = ssu.check_4_doubles(states_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
doubles_df[mask_series][columns_list].sort_values('max_similarity', ascending=False)

['United States']


Unnamed: 0,first_item,second_item,max_similarity



## Clean and prepare Countries dataset

In [10]:

if s.pickle_exists('disposable_income_countries_df'): disposable_income_countries_df = s.load_object('disposable_income_countries_df')
else:
    tables_url = 'https://en.wikipedia.org/wiki/Disposable_household_and_per_capita_income'
    page_tables_list = ssu.get_page_tables(tables_url)
    disposable_income_countries_df = page_tables_list[2].copy().iloc[2:].reset_index(drop=True)
    disposable_income_countries_df.columns = ['Rank', 'Country', 'Income', 'Year']
    disposable_income_countries_df.Income = pd.to_numeric(disposable_income_countries_df.Income.map(lambda x: re.sub(r'\D+', '', str(x))), errors='coerce')
    mask_series = ~disposable_income_countries_df.Income.isnull()
    disposable_income_countries_df = disposable_income_countries_df[mask_series]
    mask_series = (disposable_income_countries_df.Country == 'Slovak Republic')
    disposable_income_countries_df.loc[mask_series, 'Country'] = 'Slovakia'
    s.store_objects(disposable_income_countries_df=disposable_income_countries_df)
disposable_income_countries_df

[(2, (45, 4)), (3, (43, 4)), (0, (38, 3)), (1, (36, 3)), (4, (32, 6)), (5, (29, 2)), (13, (13, 2)), (7, (12, 2)), (8, (11, 2)), (9, (8, 2)), (10, (8, 2)), (11, (7, 2)), (15, (7, 2)), (12, (4, 2)), (6, (2, 2)), (14, (2, 2))]
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\disposable_income_countries_df.pkl


Unnamed: 0,Rank,Country,Income,Year
0,1,Luxembourg,47300,2020
1,2,United States,46600,2021
2,3,Norway,41600,2021
3,4,Canada,38500,2020
4,5,Switzerland,37900,2019
5,6,Austria,37000,2020
6,7,Netherlands,35900,2021
7,8,Australia,35700,2020
8,9,Belgium,35300,2020
9,10,Iceland,34300,2017


In [11]:

# Remove country duplicates and misspellings
disposable_income_countries_df.country_name = disposable_income_countries_df.Country.map(lambda x: ssu.country_name_dict.get(x, x))
c = ChoroplethUtilities(iso_3166_2_code='us', one_country_df=ssu.us_stats_df)
countries_list = sorted(set(disposable_income_countries_df.Country).symmetric_difference(set(c.all_countries_df.country_name)))
print(countries_list)
doubles_df = ssu.check_4_doubles(countries_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
doubles_df[mask_series][columns_list].sort_values('max_similarity', ascending=False)

['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antarctica', 'Antigua & Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire, Sint Eustatius & Saba', 'Bosnia & Herzegovina', 'Botswana', 'Bouvet Island', 'British Indian Ocean Territory', 'British Virgin Islands', 'Brunei', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon', 'Cayman Islands', 'Central African Republic', 'Chad', 'Christmas Island', 'Cocos (Keeling) Islands', 'Colombia', 'Comoros', 'Cook Islands', 'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czech Republic', 'Czechia', "Côte d'Ivoire", 'DRC', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Falkland Islands (Malvinas)', 'Faroe Islands', 'Federated States of Micronesia', 'Fiji', 'French Guiana', 'French Polynesia', 'French Southe

Unnamed: 0,first_item,second_item,max_similarity
130,Niger,Nigeria,0.833333
68,Gambia,Zambia,0.833333
107,Malawi,Mali,0.800000
138,Pakistan,Tajikistan,0.777778
41,Cook Islands,Norfolk Island,0.769231
...,...,...,...
131,Nigeria,Serbia,0.615385
51,Dominica,Dominican Republic,0.615385
69,Georgia,Serbia,0.615385
134,North Korea,North Macedonia,0.615385



## Create Equivalence Dictionaries

In [20]:

state_to_country_equivalent_dict, country_to_state_equivalent_dict = ssu.get_country_state_equivalents(
    disposable_income_countries_df, 'Country', 'Income',
    disposable_income_states_df, 'State', 'Income',
    cn_col_explanation='Median Household Net Income by Country', st_col_explanation='Disposable Personal per capita Income by State',
    countries_set=None, states_set=None, verbose=False)

In [21]:

string_column_name = 'Country_Equivalent_Disposable_Income'
ssu.us_stats_df[string_column_name] = ssu.us_stats_df.index.map(lambda x: state_to_country_equivalent_dict.get(x, x))

In [24]:

states_dict = disposable_income_states_df.set_index('State')['Income'].to_dict()
states_min = disposable_income_states_df['Income'].min()
ssu.us_stats_df['Income'] = ssu.us_stats_df.index.map(lambda x: states_dict.get(x, states_min))
column_description_dict = s.load_object('column_description_dict')
column_description_dict['Income'] = 'Disposable Personal per capita Income by State (2022)'
s.store_objects(column_description_dict=column_description_dict)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\column_description_dict.pkl



## Prepare for and Create Choropleth

In [25]:

c = ChoroplethUtilities(iso_3166_2_code='us', one_country_df=ssu.us_stats_df)
dir(c)

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'add_docname', 'all_countries_df', 'axes_str', 'clean_up_district_common_dict', 'clean_up_district_merge_dataframe', 'clean_up_district_unique_dict', 'clean_up_suggestion_list_dict', 'color_distance_from', 'conjunctify_nouns', 'convert_svg_to_dataframe', 'copy_file_name', 'copy_file_path', 'create_country_colored_labeled_map', 'create_country_colored_map', 'create_country_labeled_map', 'create_district_first_dict', 'create_label_line_file', 'create_suggestion_list_dictionary', 'create_svg_file_beginning', 'create_text_tag_xml', 'create_us_google_suggest_labeled_map', 'district_path_str', 'figure_str', 'fill_style_prefix', 'fill_style_str', 'get_co

In [26]:

# Close the Notepad++ window after you're finished in order to completely run this cell
numeric_column_name = 'White_Percent'
string_column_name = 'Country_Equivalent_Disposable_Income'
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
if (numeric_column_name in ssu.us_stats_df.columns) and (string_column_name in ssu.us_stats_df.columns):
    svg_file_path = os.path.abspath(c.create_country_colored_labeled_map(numeric_column_name=numeric_column_name,
                                                                         string_column_name=string_column_name,
                                                                         one_country_df=c.one_country_df))
    !"{text_editor_path}" "{svg_file_path}"