
# Set up the notebook

In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

s = Storage()
ssu = StatsScrapingUtilities(s=s)

In [52]:

column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

all_countries_df = s.load_object('all_countries_df').set_index('country_code', drop=True)
all_countries_df.country_name = all_countries_df.country_name.map(lambda x: ssu.country_name_dict.get(x, x))
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country Suicide Rate Equivalents


## Clean and prepare US States dataset

In [4]:

url = 'https://www.cdc.gov/nchs/pressroom/sosmap/suicide-mortality/suicide.htm'
tables_list = ssu.get_page_tables(url)
if not tables_list:
    tables_list = [pd.read_csv('../data/csv/suicide_2019.csv', encoding=s.encoding_type)]

No tables found
[]


In [5]:

us_states_df = tables_list[0].copy()
print(us_states_df.columns.tolist())
us_states_df.columns = ['source_year', 'state_abbreviation', 'suicide_rate', 'body_count', 'source_url']
for cn in ['source_year', 'suicide_rate', 'body_count']:
    us_states_df[cn] = us_states_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x)))
    us_states_df[cn] = pd.to_numeric(us_states_df[cn], errors='coerce', downcast='integer')
abbreviation_dict = {v: k for k, v in ssu.us_states_abbreviation_dict.items()}
mask_series = us_states_df.state_abbreviation.isnull()
us_states_df = us_states_df[~mask_series]
us_states_df['state_name'] = us_states_df.state_abbreviation.map(lambda x: abbreviation_dict.get(x, x))
mask_series = (us_states_df.source_year == 2019)
us_states_df = us_states_df[mask_series]
us_states_df.sample(5).T

['YEAR', 'STATE', 'RATE', 'DEATHS', 'URL']


Unnamed: 0,82,71,54,67,57
source_year,2019,2019,2019,2019,2019
state_abbreviation,NC,MI,CA,LA,DE
suicide_rate,12.5,14.3,10.7,15.0,11.3
body_count,1358,1472,4436,704,111
source_url,https://www.cdc.gov/nchs/pressroom/states/nort...,https://www.cdc.gov/nchs/pressroom/states/mich...,https://www.cdc.gov/nchs/pressroom/states/cali...,https://www.cdc.gov/nchs/pressroom/states/loui...,https://www.cdc.gov/nchs/pressroom/states/dela...
state_name,North Carolina,Michigan,California,Louisiana,Delaware


In [6]:

# Remove US states duplicates and misspellings
states_list = sorted(set(us_states_df.state_name).symmetric_difference(set(ssu.us_stats_df.index)))
print(states_list)
doubles_df = ssu.check_4_doubles(states_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
if doubles_df[mask_series].shape[0]:
    display(doubles_df[mask_series][columns_list].sort_values('max_similarity', ascending=False))
mask_series = us_states_df.duplicated(subset=['state_name'], keep=False)
if us_states_df[mask_series].shape[0]:
    display(us_states_df[mask_series])

['District of Columbia']



## Clean and prepare Countries dataset

In [7]:

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_suicide_rate'
tables_list = ssu.get_page_tables(url)

[(1, (190, 4)), (2, (184, 21)), (3, (184, 21)), (4, (184, 21)), (6, (112, 5)), (7, (12, 2)), (0, (10, 1)), (9, (8, 2)), (8, (6, 2)), (5, (5, 5))]


In [8]:

countries_df = tables_list[1].copy()
print(countries_df.columns.tolist())
columns_list = ['suicide_rate', 'male_suicide_rate', 'female_suicide_rate']
countries_df.columns = ['country_name'] + columns_list
for cn in columns_list:
    countries_df[cn] = countries_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x)))
    countries_df[cn] = pd.to_numeric(countries_df[cn], errors='coerce', downcast='float')
mask_series = countries_df.country_name.isnull()
countries_df = countries_df[~mask_series]
countries_df.country_name = countries_df.country_name.map(lambda x: str(x).split('*')[0].strip())
countries_df.head(9).T

['Country', 'All', 'Male', 'Female']


Unnamed: 0,0,1,2,3,4,5,6,7,8
country_name,Afghanistan,Albania,Algeria,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria
suicide_rate,6.0,3.7,2.6,12.6,0.3,8.1,2.7,11.3,10.4
male_suicide_rate,6.2,5.3,3.3,21.700001,0.0,13.5,4.9,17.0,16.6
female_suicide_rate,5.7,2.2,1.9,4.7,0.6,3.3,1.0,5.6,4.6


In [9]:

# Remove country duplicates and misspellings
countries_df.country_name = countries_df.country_name.map(lambda x: ssu.country_name_dict.get(x, x))
countries_list = sorted(set(countries_df.country_name).symmetric_difference(set(all_countries_df.country_name)))
if countries_list:
    print(countries_list)
doubles_df = ssu.check_4_doubles(countries_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
if doubles_df[mask_series].shape[0]:
    display(doubles_df[mask_series][columns_list].sort_values('max_similarity', ascending=False))
mask_series = countries_df.duplicated(subset=['country_name'], keep=False)
if countries_df[mask_series].shape[0]:
    display(countries_df[mask_series])

['Africa', 'American Samoa', 'Americas', 'Andorra', 'Anguilla', 'Antarctica', 'Aruba', 'Bermuda', 'Bonaire, Sint Eustatius & Saba', 'Bouvet Island', 'British Indian Ocean Territory', 'British Virgin Islands', 'Cape Verde', 'Cayman Islands', 'Christmas Island', 'Cocos (Keeling) Islands', 'Cook Islands', 'Curaçao', "Côte d'Ivoire", 'Dominica', 'East Timor', 'Eastern Mediterranean', 'Europe', 'Falkland Islands (Malvinas)', 'Faroe Islands', 'Federated States of Micronesia', 'French Guiana', 'French Polynesia', 'French Southern Territories', 'Gibraltar', 'Global', 'Greenland', 'Guadeloupe', 'Guam', 'Guernsey', 'Heard Island and McDonald Islands', 'Holy See', 'Hong Kong', 'Isle of Man', 'Ivory Coast', 'Jersey', 'Liechtenstein', 'Macau', 'Marshall Islands', 'Martinique', 'Mayotte', 'Micronesia', 'Monaco', 'Montserrat', 'Nauru', 'New Caledonia', 'Niue', 'Norfolk Island', 'Northern Mariana Islands', 'Palau', 'Palestine', 'Pitcairn', 'Puerto Rico', 'ROC', 'Réunion', 'Saint Helena, Ascension & Tr

Unnamed: 0,first_item,second_item,max_similarity
11,British Virgin Islands,US Virgin Islands,0.769231
16,Cook Islands,Norfolk Island,0.769231
13,Cayman Islands,Åland Islands,0.740741
62,Sint Maarten,St. Martin,0.727273
0,Africa,Americas,0.714286
34,Guernsey,Jersey,0.714286
61,San Marino,St. Martin,0.7
9,Bouvet Island,Faroe Islands,0.692308
24,Faroe Islands,Åland Islands,0.692308
1,American Samoa,Americas,0.636364



## Prepare for and Create Choropleth

In [None]:

equivalence_column_name = 'Country_Equivalent_Suicide_Rate'
states_target_column_name = 'suicide_rate'
mask_series = countries_df.country_name.isin(all_countries_df.country_name)
ssu.prepare_for_choroplething(countries_df[mask_series], 'suicide_rate', us_states_df, st_col_name=states_target_column_name,
                              st_col_explanation='Suicide Rate per 100,000 Population (2019)',
                              equivalence_column_name=equivalence_column_name, verbose=False)

In [12]:

c = ChoroplethUtilities(iso_3166_2_code='us', one_country_df=ssu.us_stats_df, all_countries_df=all_countries_df)
c.create_label_line_file()
svg_file_path = c.create_country_colored_labeled_map(numeric_column_name=states_target_column_name,
                                                     string_column_name=equivalence_column_name,
                                                     one_country_df=ssu.us_stats_df, cmap='winter')
print(os.path.abspath(svg_file_path))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_suicide_rate_Country_Equivalent_Suicice_Rate.svg
