
# Set up the notebook

In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at index 1; index 0 is the script path (or '' in the REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

# Shared helper objects used by every cell below:
# `s` loads and stores saved objects (see s.load_object / s.store_objects),
# and `ssu` is the scraping utility, constructed with that same storage instance.
s = Storage()
ssu = StatsScrapingUtilities(s=s)

In [52]:

# Reload the previously saved column descriptions and US stats frame
column_description_dict, us_stats_df = (
    s.load_object(object_name) for object_name in ('column_description_dict', 'us_stats_df')
)

# Load the country table and key it by country code
all_countries_df = s.load_object('all_countries_df')
all_countries_df = all_countries_df.set_index('country_code', drop=True)

# Pass every country name through ssu.country_name_dict; names without an entry are kept as they are
all_countries_df['country_name'] = all_countries_df['country_name'].map(
    lambda country_name: ssu.country_name_dict.get(country_name, country_name)
)

# Save the renamed table again, with country_code restored as an ordinary column
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country Intentional Homicide Equivalents


## Clean and prepare US States dataset

In [4]:

# Wikipedia page holding the intentional homicide rates of US states and territories
url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_intentional_homicide_rate'
# Collect the page's tables into a list that the next cell indexes by position.
# NOTE(review): the printed output looks like (table index, shape) pairs - confirm against get_page_tables
tables_list = ssu.get_page_tables(url)

[(0, (54, 12)), (1, (11, 2))]


In [5]:

# Work on a copy of the first table so the scraped original stays untouched
us_states_df = tables_list[0].copy()

# Rename the columns: the state name, the 2020 victim count, then one rate column per year from 2020 down to 2011
rate_columns = [f'murder_rates_per_100k_people_{year}' for year in range(2020, 2010, -1)]
columns_list = ['murder_victums_2020'] + rate_columns
us_states_df.columns = ['state_name', *columns_list]

# Strip everything except digits and periods from each value, then convert to numbers;
# anything that still fails to parse becomes NaN
non_numeric_regex = re.compile(r'[^0-9\.]+')
for cn in columns_list:
    digits_series = us_states_df[cn].map(lambda x: non_numeric_regex.sub('', str(x)))
    us_states_df[cn] = pd.to_numeric(digits_series, errors='coerce', downcast='float')

# Discard the rows that have no state name
mask_series = us_states_df.state_name.isnull()
us_states_df = us_states_df[~mask_series]

# Show a random handful of rows, transposed so the many columns read vertically
us_states_df.sample(5).T

Unnamed: 0,30,44,29,37,52
state_name,Wisconsin,Wyoming,California,Washington,New Hampshire
murder_victums_2020,308.0,18.0,2203.0,301.0,12.0
murder_rates_per_100k_people_2020,5.3,3.1,5.6,3.9,0.9
murder_rates_per_100k_people_2019,3.2,2.2,4.3,2.7,2.4
murder_rates_per_100k_people_2018,3.0,2.4,4.4,3.1,1.6
murder_rates_per_100k_people_2017,3.3,2.4,4.6,3.0,1.0
murder_rates_per_100k_people_2016,4.0,3.4,4.9,2.9,1.4
murder_rates_per_100k_people_2015,4.2,2.7,4.8,2.5,1.1
murder_rates_per_100k_people_2014,2.8,2.7,4.4,2.3,1.2
murder_rates_per_100k_people_2013,2.8,2.9,4.6,3.1,1.7


In [6]:

# Drop the District of Columbia row from the states table
mask_series = us_states_df.state_name.eq('District of Columbia')
us_states_df = us_states_df.loc[~mask_series]

In [7]:

# Check the US state names for misspellings and duplicates.
# This cell only displays candidates for manual review; it removes nothing.

# Names that appear in exactly one of the two sources (scraped table vs. the ssu.us_stats_df index)
states_list = sorted(set(us_states_df.state_name) ^ set(ssu.us_stats_df.index))

# Show the pairs of unmatched names whose similarity score is above 0.6
doubles_df = ssu.check_4_doubles(states_list)
columns_list = ['first_item', 'second_item', 'max_similarity']
mask_series = (doubles_df.max_similarity > 0.6)
if mask_series.any():
    display(doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose state name occurs more than once
mask_series = us_states_df.duplicated(subset=['state_name'], keep=False)
if mask_series.any():
    display(us_states_df[mask_series])


## Clean and prepare Countries dataset

In [8]:

# Wikipedia page holding the intentional homicide rates of countries
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_intentional_homicide_rate'
# Collect the page's tables into a list that the next cell indexes by position.
# NOTE(review): the printed output looks like (table index, shape) pairs - confirm against get_page_tables
tables_list = ssu.get_page_tables(url)

[(1, (196, 7)), (2, (49, 2)), (3, (16, 2)), (0, (6, 3)), (4, (6, 2))]


In [9]:

# Work on a copy of the second table so the scraped original stays untouched
countries_df = tables_list[1].copy()
countries_df.columns = ['country_name', 'region_name',  'subregion_name', 'homicide_rate',
                        'body_count', 'data_source_year', 'data_source']

# For each numeric column: cut the value at the first '[', strip everything except
# digits and periods, then convert to numbers; anything that still fails to parse becomes NaN
for cn in ['homicide_rate', 'body_count', 'data_source_year']:
    countries_df[cn] = countries_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x).split('[')[0]))
    countries_df[cn] = pd.to_numeric(countries_df[cn], errors='coerce', downcast='integer')

# Discard the rows that have no country name. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not write into a filtered
# slice (which can raise SettingWithCopyWarning).
mask_series = countries_df.country_name.isnull()
countries_df = countries_df[~mask_series].copy()

# Keep only the part of each name before the first '*' and trim surrounding whitespace
countries_df.country_name = countries_df.country_name.map(lambda x: str(x).split('*')[0].strip())
countries_df.head(9).T

Unnamed: 0,1,2,3,4,5,6,7,8,9
country_name,Afghanistan,Albania,Algeria,Andorra,Angola,Anguilla,Antigua and Barbuda,Argentina,Armenia
region_name,Asia,Europe,Africa,Europe,Africa,Americas,Americas,Americas,Asia
subregion_name,Southern Asia,Southern Europe,Northern Africa,Southern Europe,Middle Africa,Caribbean,Caribbean,South America,Western Asia
homicide_rate,6.7,2.1,1.3,2.6,4.8,28.3,11.1,5.3,1.8
body_count,2474.0,61.0,580.0,2.0,1217.0,4.0,10.0,2362.0,52.0
data_source_year,2018.0,2020.0,2020.0,2020.0,2012.0,2014.0,2012.0,2018.0,2020.0
data_source,NSO,NSO,CTS,CTS,NSO,SDG,OAS,MoS,CTS


In [10]:

# Standardize the country names, then check them for misspellings and duplicates.
# Apart from the renaming, this cell only displays candidates for manual review; it removes nothing.

# Pass every name through ssu.country_name_dict; names without an entry are kept as they are.
# .assign builds a new frame rather than writing a column into countries_df, which was
# produced by boolean filtering; writing into such a frame can raise SettingWithCopyWarning.
countries_df = countries_df.assign(
    country_name=countries_df.country_name.map(lambda x: ssu.country_name_dict.get(x, x))
)

# Names that appear in exactly one of the two sources (scraped table vs. all_countries_df)
countries_list = sorted(set(countries_df.country_name).symmetric_difference(set(all_countries_df.country_name)))

# Show the pairs of unmatched names whose similarity score is above 0.6
doubles_df = ssu.check_4_doubles(countries_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
if mask_series.any():
    display(doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose country name occurs more than once
mask_series = countries_df.duplicated(subset=['country_name'], keep=False)
if mask_series.any():
    display(countries_df[mask_series])

Unnamed: 0,first_item,second_item,max_similarity
11,Cook Islands,Norfolk Island,0.769231
7,Channel Islands,Faroe Islands,0.714286
28,Guernsey,Jersey,0.714286
57,Solomon Islands,Åland Islands,0.714286
47,Northern Ireland,Northern Mariana Islands,0.7
3,Bouvet Island,Faroe Islands,0.692308
20,Faroe Islands,Åland Islands,0.692308
46,North Korea,Northern Ireland,0.666667
63,Togo,Tonga,0.666667
9,Cocos (Keeling) Islands,Cook Islands,0.628571



## Prepare for and Create Choropleth

In [11]:

# Column names used here and by the map-drawing cell below
states_target_column_name = 'murder_rates_per_100k_people_2020'
equivalence_column_name = 'Country_Equivalent_Intentional_Homicide_Rate'

# Restrict the countries to those whose names appear in all_countries_df
mask_series = countries_df.country_name.isin(all_countries_df.country_name)
known_countries_df = countries_df[mask_series]

# Hand the country rates and the state rates to the choropleth preparation helper
ssu.prepare_for_choroplething(
    known_countries_df, 'homicide_rate', us_states_df,
    st_col_name=states_target_column_name,
    st_col_explanation='Intentional Homicides per 100,000 Population (2020)',
    equivalence_column_name=equivalence_column_name,
    verbose=False,
)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\us_stats_df.pkl
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\column_description_dict.pkl


In [20]:

# Build the choropleth helper for the US from the prepared state statistics
c = ChoroplethUtilities(
    iso_3166_2_code='us',
    one_country_df=ssu.us_stats_df,
    all_countries_df=all_countries_df,
)
c.create_label_line_file()

# Draw the map: states are colored by the numeric column and labeled with the string column
map_kwargs = dict(
    numeric_column_name=states_target_column_name,
    string_column_name=equivalence_column_name,
    one_country_df=ssu.us_stats_df,
    cmap='summer',
)
svg_file_path = c.create_country_colored_labeled_map(**map_kwargs)

# Report where the SVG was written
print(os.path.abspath(svg_file_path))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_murder_rates_per_100k_people_2020_Country_Equivalent_Intentional_Homicide_Rate.svg
