
# Set up the notebook

In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

s = Storage()
ssu = StatsScrapingUtilities(s=s)

In [52]:

column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

all_countries_df = s.load_object('all_countries_df').set_index('country_code', drop=True)
all_countries_df.country_name = all_countries_df.country_name.map(lambda x: ssu.country_name_dict.get(x, x))
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country Car Fatality Equivalents


## Clean and prepare US States dataset

In [4]:

# url = 'https://worldpopulationreview.com/state-rankings/fatal-car-accidents-by-state'
url = 'https://crashstats.nhtsa.dot.gov/Api/Public/Publication/812581'
file_name = '2016_State_Traffic_Data_CrashStats_NHTSA.pdf'
tables_list = ssu.get_page_tables(url, pdf_file_name=file_name)

Got stderr: Aug 17, 2022 6:14:27 PM org.apache.pdfbox.pdfparser.COSParser parseXref



[(3, (56, 9)), (7, (56, 6)), (6, (55, 3)), (8, (55, 7)), (9, (55, 7)), (10, (55, 5)), (4, (54, 9)), (5, (54, 7)), (2, (46, 3)), (0, (26, 19)), (11, (21, 2)), (1, (5, 4))]


In [5]:

us_states_df = tables_list[3].copy()
# columns_list = []
# for t1, t2, t3, t4 in zip(us_states_df.columns.tolist(), us_states_df.iloc[0].T.tolist(), us_states_df.iloc[1].T.tolist(),
#                           us_states_df.iloc[2].T.tolist()):
#     column_names_list = []
#     if not t1.startswith('Unnamed'):
#         column_names_list.append(t1.strip().lower().replace(' ', '_').replace(',000', 'k'))
#     for t in [t2, t3, t4]:
#         if str(t) != 'nan':
#             column_names_list.append(t.strip().lower().replace(' ', '_').replace(',000', 'k'))
#     columns_list.append('_'.join(column_names_list))
# print(columns_list)
columns_list = ['traffic_fatalities', 'population_in_thousands', 'licensed_drivers_in_thousands',
                'registered_vehicles_in_thousands', 'vehicle_miles_traveled_in_millions', 'fatality_rates_per_100k_population']
us_states_df.columns = ['state_name'] + columns_list + ['fixme', 'fatality_rates_per_100_million_vehicle_miles_traveled']
us_states_df = us_states_df.iloc[3:]
fixme_columns_list = ['fatality_rates_per_100k_licensed_drivers', 'fatality_rates_per_100k_registered_vehicles']
for i in range(2):
    us_states_df[fixme_columns_list[i]] = us_states_df.fixme.map(lambda x: re.sub(r'[^0-9\.]+', '', x.split(' ')[i]))
for cn in fixme_columns_list:
    us_states_df[cn] = pd.to_numeric(us_states_df[cn], errors='coerce', downcast='float')
columns_list += ['fatality_rates_per_100_million_vehicle_miles_traveled']
for cn in columns_list:
    us_states_df[cn] = us_states_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x)))
    us_states_df[cn] = pd.to_numeric(us_states_df[cn], errors='coerce', downcast='float')
columns_list = ['state_name'] + columns_list + fixme_columns_list
us_states_df[columns_list].head(5).T

Unnamed: 0,3,4,5,6,7
state_name,Alabama,Alaska,Arizona,Arkansas,California
Traffic Fatalities,1038.0,84.0,962.0,545.0,3623.0
Population (thousands),4863.0,742.0,6931.0,2988.0,39250.0
Licensed Drivers (thousands),3943.0,535.0,5082.0,2391.0,26199.0
Registered Vehicles (thousands),5468.0,795.0,5787.0,2808.0,30221.0
Vehicle Miles Traveled (millions),69227.0,5259.0,65786.0,35755.0,340115.0
fatality_rates_per_100k_population,21.34,11.32,13.88,18.24,9.23
Fatality Rates per 100 Million Vehicle Miles Traveled,1.5,1.6,1.46,1.52,1.07
"Fatality Rates per 100,000 Licensed Drivers",26.32,15.71,18.93,22.790001,13.83
"Fatality Rates per 100,000 Registered Vehicles",18.98,10.57,16.620001,19.41,11.99


In [6]:

# Remove US states duplicates and misspellings
states_list = sorted(set(us_states_df.state_name).symmetric_difference(set(ssu.us_stats_df.index)))
doubles_df = ssu.check_4_doubles(states_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
doubles_df[mask_series][columns_list].sort_values('max_similarity', ascending=False)

Unnamed: 0,first_item,second_item,max_similarity



## Clean and prepare Countries dataset

In [7]:

url = 'https://en.wikipedia.org/wiki/List_of_countries_by_traffic-related_death_rate'
tables_list = ssu.get_page_tables(url)

[(1, (189, 7)), (2, (4, 2)), (0, (1, 2))]


In [8]:

countries_df = tables_list[1].copy()
# print(countries_df.columns.tolist())
countries_df.columns = ['country_name', 'continent_name',  'road_deaths_per_100k_inhabitants',
                        'road_deaths_per_100k_motor_vehicles', 'road_deaths_per_100B_vehicle_kms', 'road_deaths_total',
                        'data_source_year']
for cn in ['road_deaths_per_100k_inhabitants', 'road_deaths_per_100k_motor_vehicles', 'road_deaths_per_100B_vehicle_kms',
           'road_deaths_total', 'data_source_year']:
    countries_df[cn] = countries_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', str(x).split('[')[0]))
    countries_df[cn] = pd.to_numeric(countries_df[cn], errors='coerce', downcast='float')
countries_df.head(9).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
country_name,Global,Africa,Eastern Mediterranean,Western Pacific,South-East Asia,Americas,Europe,Afghanistan,Albania
continent_name,,,,,,,,Asia,Europe
road_deaths_per_100k_inhabitants,18.200001,26.6,18.0,16.9,20.700001,15.6,9.3,15.5,15.1
road_deaths_per_100k_motor_vehicles,,574.0,139.0,69.0,101.0,33.0,19.0,722.400024,107.199997
road_deaths_per_100B_vehicle_kms,,,,,,,,,
road_deaths_total,1350000.0,246719.0,122730.0,328591.0,316080.0,153789.0,85629.0,4734.0,478.0
data_source_year,2016.0,2016.0,2016.0,2016.0,2016.0,2016.0,2016.0,2013.0,2016.0


In [9]:

mask_series = (countries_df.country_name == 'Iceland')
countries_df[mask_series].T

Unnamed: 0,76
country_name,Iceland
continent_name,Europe
road_deaths_per_100k_inhabitants,3.8
road_deaths_per_100k_motor_vehicles,7.6
road_deaths_per_100B_vehicle_kms,4.9
road_deaths_total,8.0
data_source_year,5.20162


In [10]:

# Remove country duplicates and misspellings
countries_df.country_name = countries_df.country_name.map(lambda x: ssu.country_name_dict.get(x, x))
countries_list = sorted(set(countries_df.country_name).symmetric_difference(set(all_countries_df.country_name)))
doubles_df = ssu.check_4_doubles(countries_list)
mask_series = (doubles_df.max_similarity > 0.6)
columns_list = ['first_item', 'second_item', 'max_similarity']
doubles_df[mask_series][columns_list].sort_values('max_similarity', ascending=False)

Unnamed: 0,first_item,second_item,max_similarity
12,British Virgin Islands,US Virgin Islands,0.769231
35,Greenland,Grenada,0.75
16,Cayman Islands,Åland Islands,0.740741
65,Sint Maarten,St. Martin,0.727273
0,Africa,Americas,0.714286
39,Guernsey,Jersey,0.714286
10,Bouvet Island,Faroe Islands,0.692308
29,Faroe Islands,Åland Islands,0.692308
1,American Samoa,Americas,0.636364
30,French Guiana,French Polynesia,0.62069


In [11]:

mask_series = countries_df.duplicated(subset=['country_name'], keep=False)
countries_df[mask_series].T

country_name
continent_name
road_deaths_per_100k_inhabitants
road_deaths_per_100k_motor_vehicles
road_deaths_per_100B_vehicle_kms
road_deaths_total
data_source_year



## Prepare for and Create Choropleth

In [12]:

equivalence_column_name = 'Country_Equivalent_Road_Deaths_per_100k_Inhabitants'
states_target_column_name = 'fatality_rates_per_100k_population'
mask_series = countries_df.country_name.isin(all_countries_df.country_name)
ssu.prepare_for_choroplething(countries_df[mask_series], 'road_deaths_per_100k_inhabitants', us_states_df, st_col_name=states_target_column_name,
                              st_col_explanation='Road Deaths per 100,000 Population',
                              equivalence_column_name=equivalence_column_name, verbose=False)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\us_stats_df.pkl
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\column_description_dict.pkl


In [13]:

c = ChoroplethUtilities(iso_3166_2_code='us', one_country_df=ssu.us_stats_df, all_countries_df=all_countries_df)
c.create_label_line_file()
svg_file_path = c.create_country_colored_labeled_map(numeric_column_name=states_target_column_name,
                                                     string_column_name=equivalence_column_name,
                                                     one_country_df=ssu.us_stats_df)
print(os.path.abspath(svg_file_path))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_fatality_rates_per_100k_population_Country_Equivalent_Road_Deaths_per_100k_Inhabitants.svg
