
# Set up the notebook

In [1]:

# %pprint toggles IPython's pretty printing instead of forcing a value, so
# running this cell a second time in the same kernel flips the setting back.
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

# Storage handle used by the cells below to load and pickle saved objects
s = Storage()
# Scraping helper; it receives the same Storage instance through its `s` argument
ssu = StatsScrapingUtilities(s=s)

In [52]:

# Pull the previously saved lookup objects out of storage
column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

# Key the countries table by country code, then pass every country name through
# ssu.country_name_dict (a name without an entry is kept as it is)
all_countries_df = s.load_object('all_countries_df')
all_countries_df = all_countries_df.set_index('country_code', drop=True)
all_countries_df['country_name'] = all_countries_df['country_name'].map(
    lambda name: ssu.country_name_dict.get(name, name)
)

# Save the renamed table back, with country_code restored as an ordinary column
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country Income Inequality Equivalents


## Clean and prepare US States dataset

In [None]:

# Page holding the per-state income inequality figures
url = 'https://worldpopulationreview.com/state-rankings/income-inequality-by-state'
# Fetch the page's tables; the following cell works on tables_list[0].
# NOTE(review): this cell shows no execution count although later cells need
# tables_list -- confirm the notebook survives Restart Kernel -> Run All.
tables_list = ssu.get_page_tables(url)

In [5]:

# Work on a copy of the first scraped table so tables_list itself is not modified
us_states_df = tables_list[0].copy()
us_states_df.columns = ['state_name', 'gini_coefficient']

# Strip everything except digits and dots, then convert to a number; anything
# that still fails to parse becomes NaN because of errors='coerce'
for cn in ['gini_coefficient']:
    us_states_df[cn] = us_states_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x)))
    us_states_df[cn] = pd.to_numeric(us_states_df[cn], errors='coerce', downcast='float')

# Drop the rows that carry no state name
mask_series = us_states_df.state_name.isnull()
us_states_df = us_states_df[~mask_series]

# Preview a few rows. The fixed random_state keeps the displayed sample the same
# on every run, and the size is capped so a short table cannot make sample() raise.
us_states_df.sample(min(5, us_states_df.shape[0]), random_state=0).T

Unnamed: 0,25,4,0,40,23
state_name,West Virginia,Florida,New York,Nebraska,Ohio
gini_coefficient,46.209999,49.0,51.02,44.200001,46.41


In [6]:

# Exclude the District of Columbia row from the states table
mask_series = us_states_df.state_name.eq('District of Columbia')
us_states_df = us_states_df.loc[~mask_series]

In [7]:

# Report possible US state duplicates and misspellings (nothing is modified here).
# Start from the names that appear in only one of the two sources.
scraped_names = set(us_states_df.state_name)
reference_names = set(ssu.us_stats_df.index)
states_list = sorted(scraped_names ^ reference_names)
print(states_list)

# Show the name pairs whose max_similarity exceeds 0.6, highest first
doubles_df = ssu.check_4_doubles(states_list)
columns_list = ['first_item', 'second_item', 'max_similarity']
mask_series = (doubles_df.max_similarity > 0.6)
similar_df = doubles_df[mask_series]
if len(similar_df) > 0:
    display(similar_df[columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose state_name occurs more than once
mask_series = us_states_df.duplicated(subset=['state_name'], keep=False)
repeated_df = us_states_df[mask_series]
if len(repeated_df) > 0:
    display(repeated_df)

['District of Columbia']


In [8]:

# Pickle the cleaned states table under the name income_inequality_us_states_df
s.store_objects(income_inequality_us_states_df=us_states_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\income_inequality_us_states_df.pkl



## Clean and prepare Countries dataset

In [9]:

# Reuse the pickled countries table when it exists; otherwise scrape and clean it.
# NOTE(review): the URL below is the *wealth*-inequality page, while the states
# cell scrapes *income* inequality -- confirm the two Gini figures are comparable.
if s.pickle_exists('income_inequality_countries_df'):
    countries_df = s.load_object('income_inequality_countries_df')
else:
    driver = ssu.get_driver()
    try:
        tables_list = ssu.get_page_tables('https://worldpopulationreview.com/country-rankings/wealth-inequality-by-country', driver=driver)
    finally:
        # Close the driver even when the scrape raises, so it is never left open
        driver.close()
    countries_df = tables_list[0].copy()
    countries_df.columns = ['country_name', 'gini_index', 'country_population_2022']

    # Strip everything except digits and dots, then convert to a number; anything
    # that still fails to parse becomes NaN because of errors='coerce'
    for cn in ['gini_index']:
        countries_df[cn] = pd.to_numeric(countries_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x))), errors='coerce', downcast='float')

    # Drop the rows that carry no country name
    mask_series = countries_df.country_name.isnull()
    countries_df = countries_df[~mask_series]
countries_df.head(9).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
country_name,South Africa,Namibia,Suriname,Zambia,São Tomé & Príncipe,Central African Republic,Eswatini,Mozambique,Brazil
gini_index,63.0,59.099998,57.900002,57.099998,56.299999,56.200001,54.599998,54.0,53.400002
country_population_2022,59893885,2567012,618040,20017675,227380,5579144,1201670,32969518,215313498


In [10]:

# Report possible country duplicates and misspellings (rows are not removed here).
# Names are first passed through ssu.country_name_dict; a name without an entry
# is kept as it is. assign() builds a new frame rather than setting a column on
# countries_df in place: when the frame was just scraped it is a boolean-filtered
# slice, and in-place column assignment on it can raise SettingWithCopyWarning
# on pandas versions before 3.0.
countries_df = countries_df.assign(
    country_name=countries_df.country_name.map(lambda x: ssu.country_name_dict.get(x, x))
)

# Names that appear in only one of the two sources
countries_list = sorted(set(countries_df.country_name).symmetric_difference(set(all_countries_df.country_name)))

# Show the name pairs whose max_similarity exceeds 0.6, highest first
doubles_df = ssu.check_4_doubles(countries_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
similar_df = doubles_df[mask_series]
if similar_df.shape[0]:
    display(similar_df[columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose country_name occurs more than once
mask_series = countries_df.duplicated(subset=['country_name'], keep=False)
repeated_df = countries_df[mask_series]
if repeated_df.shape[0]:
    display(repeated_df)

Unnamed: 0,first_item,second_item,max_similarity
14,British Virgin Islands,US Virgin Islands,0.769231
20,Cook Islands,Norfolk Island,0.769231
34,Greenland,Grenada,0.75
17,Cayman Islands,Åland Islands,0.740741
72,Sint Maarten,St. Martin,0.727273
38,Guernsey,Jersey,0.714286
69,San Marino,St. Martin,0.7
12,Bouvet Island,Faroe Islands,0.692308
28,Faroe Islands,Åland Islands,0.692308
6,Aruba,Cuba,0.666667


In [11]:

# Pickle the countries table under the name income_inequality_countries_df;
# the load-or-scrape cell above reads this pickle back when it exists
s.store_objects(income_inequality_countries_df=countries_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\income_inequality_countries_df.pkl



## Prepare for and Create Choropleth

In [None]:

# Column names shared with the map-drawing cell
equivalence_column_name = 'Country_Equivalent_Income_Inequality'
states_target_column_name = 'gini_coefficient'

# Pass on only the countries listed in ssu.oecd_countries_list. To use every
# country in all_countries_df instead, build the mask with
# countries_df.country_name.isin(all_countries_df.country_name)
mask_series = countries_df.country_name.isin(ssu.oecd_countries_list)
oecd_countries_df = countries_df[mask_series]
ssu.prepare_for_choroplething(
    oecd_countries_df, 'gini_index', us_states_df,
    st_col_name=states_target_column_name,
    st_col_explanation='Gini Coefficient (2022)',
    equivalence_column_name=equivalence_column_name,
    verbose=True,
)

In [50]:

# Build the choropleth helper for the US from the state stats and the countries table
c = ChoroplethUtilities(
    iso_3166_2_code='us',
    one_country_df=ssu.us_stats_df,
    all_countries_df=all_countries_df,
)
c.create_label_line_file()

# Create the map, using the Gini column as the numeric column and the
# equivalence column as the string column, then print where the SVG was written
svg_file_path = c.create_country_colored_labeled_map(
    numeric_column_name=states_target_column_name,
    string_column_name=equivalence_column_name,
    one_country_df=ssu.us_stats_df,
    cmap='summer',
)
print(os.path.abspath(svg_file_path))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_gini_coefficient_Country_Equivalent_Income_Inequality.svg
