
# Set up the notebook

In [1]:

# Switch IPython's pretty printing; the cell output reports the resulting state.
# NOTE(review): %pprint is a toggle, so running this cell a second time in the
# same kernel flips the setting back again.
%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

# Shared helper objects used by every later cell:
#   s   - loads and stores the notebook's saved objects (load_object / store_objects)
#   ssu - scraping and clean-up utilities; it is handed the same storage object
s = Storage()
ssu = StatsScrapingUtilities(s=s)

In [52]:

# Load the previously saved lookup objects
column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

# Load the countries table and key it by country code
all_countries_df = s.load_object('all_countries_df')
all_countries_df = all_countries_df.set_index('country_code', drop=True)

# Pass every country name through ssu.country_name_dict; names without an entry are kept as they are
all_countries_df['country_name'] = all_countries_df['country_name'].map(
    lambda country_name: ssu.country_name_dict.get(country_name, country_name)
)

# Save the table back with country_code restored as an ordinary column
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country Prison Population Equivalents


## Clean and prepare US States dataset

In [4]:

# Page holding the per-state prison population ranking
url = 'https://worldpopulationreview.com/state-rankings/prison-population-by-state'
# Scrape the page; the next cell takes the first element of the result as the states table.
# NOTE(review): presumably a list of DataFrames, one per table found on the page - confirm in ssu
tables_list = ssu.get_page_tables(url)

[(0, (50, 4))]


In [5]:

# Work on a copy of the first scraped table and show its original headers
us_states_df = tables_list[0].copy()
print(us_states_df.columns.tolist())

# Rename the columns by position; compare against the headers printed above
columns_list = ['imprisonment_rate_per_100k', 'total_prison_population', 'state_population_2022']
us_states_df.columns = ['state_name', *columns_list]

# Drop everything except digits and dots, then convert to numbers
# (anything that cannot be parsed becomes NaN)
non_numeric_regex = re.compile(r'[^0-9\.]+')
for cn in columns_list:
    digits_series = us_states_df[cn].map(lambda x: non_numeric_regex.sub('', str(x)))
    us_states_df[cn] = pd.to_numeric(digits_series, errors='coerce', downcast='integer')

# Discard rows that have no state name
mask_series = us_states_df['state_name'].isnull()
us_states_df = us_states_df[~mask_series]

# Show five random rows, transposed so the columns read as rows
us_states_df.sample(5).T

['State', 'Imprisonment Rate (per 100K)', 'Total Prison Population', '2022 Pop.']


Unnamed: 0,38,37,2,11,0
state_name,Alaska,Connecticut,Oklahoma,Wyoming,Louisiana
imprisonment_rate_per_100k,241,242,633,428,674
total_prison_population,1782,8751,25338,2479,31584
state_population_2022,738023,3612314,4000953,579495,4682633


In [6]:

# Check for US states duplicates and misspellings (this cell only reports them; nothing is removed)

# Names that appear in only one of the two sources
scraped_names_set = set(us_states_df.state_name)
reference_names_set = set(ssu.us_stats_df.index)
states_list = sorted(scraped_names_set ^ reference_names_set)

# Show pairs of those names whose similarity score exceeds 0.6, most similar first
doubles_df = ssu.check_4_doubles(states_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
if mask_series.any():
    display(doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose state name occurs more than once
mask_series = us_states_df.duplicated(subset=['state_name'], keep=False)
if mask_series.any():
    display(us_states_df[mask_series])


## Clean and prepare Countries dataset

In [7]:

# Page holding the per-country incarceration rate ranking
url = 'https://worldpopulationreview.com/country-rankings/incarceration-rates-by-country'
# Scrape the page; the next cell takes the first element of the result as the countries table.
# NOTE(review): presumably a list of DataFrames, one per table found on the page - confirm in ssu
tables_list = ssu.get_page_tables(url)

[(0, (220, 5))]


In [8]:

# Work on a copy of the first scraped table and show its original headers
countries_df = tables_list[0].copy()
print(countries_df.columns.tolist())

# Rename the columns by position; compare against the headers printed above
columns_list = ['incarceration_rate',  'total_prison_population', 'male_prison_population_percent', 'female_prison_population_percent']
countries_df.columns = ['country_name'] + columns_list

# Keep only the text before the first '%', drop everything except digits and dots,
# then convert to numbers (anything that cannot be parsed becomes NaN)
for cn in columns_list:
    countries_df[cn] = countries_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x).split('%')[0]))
    countries_df[cn] = pd.to_numeric(countries_df[cn], errors='coerce', downcast='integer')

# Discard rows that have no country name. The explicit .copy() makes the result an
# independent frame, so the column assignment below is no longer a write into a
# boolean-filtered selection (the pattern that raises pandas' SettingWithCopyWarning).
mask_series = countries_df.country_name.isnull()
countries_df = countries_df[~mask_series].copy()

# Keep only the part of each name before the first '*' and trim surrounding whitespace
countries_df['country_name'] = countries_df['country_name'].map(lambda x: str(x).split('*')[0].strip())
countries_df.head(9).T

['Country', 'Incarceration Rate', 'Total Incarcerated', '% Male', '% Female']


Unnamed: 0,0,1,2,3,4,5,6,7,8
country_name,United States,Rwanda,Turkmenistan,El Salvador,Cuba,Palau,British Virgin Islands,Thailand,Panama
incarceration_rate,629.0,580.0,576.0,564.0,510.0,478.0,477.0,445.0,434.0
total_prison_population,2068800.0,76099.0,35000.0,36663.0,57337.0,86.0,143.0,309282.0,18942.0
male_prison_population_percent,90,95,94,93,0,95,96,89,95
female_prison_population_percent,10,5,7,7,0,5,5,12,5


In [9]:

# Check for country duplicates and misspellings (this cell reports them; it removes nothing)

# Pass every country name through ssu.country_name_dict; names without an entry are kept as they are.
# .assign returns a new frame instead of writing a column into the existing one, so this
# cell never assigns into a frame that may be a filtered selection of another
# (the pattern that raises pandas' SettingWithCopyWarning).
countries_df = countries_df.assign(
    country_name=countries_df.country_name.map(lambda x: ssu.country_name_dict.get(x, x))
)

# Names that appear in only one of the two sources
countries_list = sorted(set(countries_df.country_name).symmetric_difference(set(all_countries_df.country_name)))
if countries_list:
    print(countries_list)

# Show pairs of those names whose similarity score exceeds 0.6, most similar first
doubles_df = ssu.check_4_doubles(countries_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
if doubles_df[mask_series].shape[0]:
    display(doubles_df[mask_series][columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose country name occurs more than once
mask_series = countries_df.duplicated(subset=['country_name'], keep=False)
if countries_df[mask_series].shape[0]:
    display(countries_df[mask_series])

['Antarctica', 'Bonaire, Sint Eustatius & Saba', 'Bouvet Island', 'British Indian Ocean Territory', 'Cape Verde', 'Christmas Island', 'Cocos (Keeling) Islands', "Côte d'Ivoire", 'Falkland Islands (Malvinas)', 'Federated States of Micronesia', 'French Southern Territories', 'Heard Island and McDonald Islands', 'Holy See', 'Ivory Coast', 'Micronesia', 'Montserrat', 'Niue', 'Norfolk Island', 'Palestine', 'Pitcairn', 'ROC', 'Réunion', 'Saint Helena, Ascension & Tristan da Cunha', 'South Georgia and the South Sandwich Islands', 'St. Barthélemy', 'St. Martin', 'St. Pierre & Miquelon', 'Svalbard and Jan Mayen', 'Tokelau', 'Turks & Caicos Islands', 'UK', 'United States Minor Outlying Islands', 'Wallis & Futuna', 'Western Sahara', 'Åland Islands']



## Prepare for and Create Choropleth

In [None]:

# Column names that the map-drawing cell also reads
equivalence_column_name = 'Country_Equivalent_Prison_Population'
states_target_column_name = 'total_prison_population'

# Only hand over countries whose names are present in all_countries_df
mask_series = countries_df.country_name.isin(all_countries_df.country_name)
matched_countries_df = countries_df[mask_series]

ssu.prepare_for_choroplething(
    matched_countries_df,
    'total_prison_population',
    us_states_df,
    st_col_name=states_target_column_name,
    st_col_explanation='Total Incarcerated (2020)',
    equivalence_column_name=equivalence_column_name,
    verbose=True,
)

In [12]:

# Build the choropleth helper for the US from the state statistics held by ssu
c = ChoroplethUtilities(
    iso_3166_2_code='us',
    one_country_df=ssu.us_stats_df,
    all_countries_df=all_countries_df,
)
c.create_label_line_file()

# Draw the map: the numeric column and the string column are both passed by name
svg_file_path = c.create_country_colored_labeled_map(
    numeric_column_name=states_target_column_name,
    string_column_name=equivalence_column_name,
    one_country_df=ssu.us_stats_df,
)

# Report where the SVG was written
absolute_svg_file_path = os.path.abspath(svg_file_path)
print(absolute_svg_file_path)

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_total_prison_population_Country_Equivalent_Prison_Population.svg
