
# Create the Eurasia DataFrame

In [None]:

# Extract the paths from the SVG
import xml.etree.ElementTree as et
import collections

file_path = r'C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\Eurasia_location_map.svg'
root = et.parse(file_path).getroot()
rows_list = []
for el in root.iter():
    rows_list.append(el.attrib)
eurasia_df = pd.DataFrame(rows_list).rename(columns={'{http://www.w3.org/XML/1998/namespace}space': 'namespace',
                                                     '{http://www.inkscape.org/namespaces/inkscape}version': 'inkscape_version',
                                                     '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}docname':'docname',
                                                     '{http://www.inkscape.org/namespaces/inkscape}pageshadow': 'pageshadow',
                                                     '{http://www.inkscape.org/namespaces/inkscape}pageopacity': 'pageopacity',
                                                     '{http://www.inkscape.org/namespaces/inkscape}pagecheckerboard': 'pagecheckerboard',
                                                     '{http://www.inkscape.org/namespaces/inkscape}zoom': 'zoom',
                                                     '{http://www.inkscape.org/namespaces/inkscape}cx': 'cx',
                                                     '{http://www.inkscape.org/namespaces/inkscape}cy': 'cy',
                                                     '{http://www.inkscape.org/namespaces/inkscape}window-width': 'window_width',
                                                     '{http://www.inkscape.org/namespaces/inkscape}window-height': 'window_height',
                                                     '{http://www.inkscape.org/namespaces/inkscape}window-x': 'window_x',
                                                     '{http://www.inkscape.org/namespaces/inkscape}window-y': 'window_y',
                                                     '{http://www.inkscape.org/namespaces/inkscape}window-maximized': 'window_maximized',
                                                     '{http://www.inkscape.org/namespaces/inkscape}current-layer': 'current_layer',
                                                     'id': 'fr_country_name',
                                                     'd': 'outline_d'})
assert all(map(lambda x: x==1, collections.Counter(eurasia_df.columns).values())), "You doubled up the column names somehow"

In [None]:

# Convert the french name to an english name
import numpy as np

eurasia_df = eurasia_df.iloc[4:]
country_names_list = ['Montenegro', 'Slovenia', 'Croatia', 'Serbia', 'North Macedonia', 'Bosnia and Herzegovina', 'Albania', 'Lithuania',
                      'Estonia', 'Latvia', 'Ukraine', 'Belarus', 'Moldova, Republic of', 'Italy', 'Poland', 'Slovakia', 'Czechia', 'Hungary',
                      'Romania', 'Bulgaria', 'Greece', 'United Kingdom of Great Britain and Northern Ireland', 'Ireland', 'Iceland', 'Spain',
                      'Portugal', 'Denmark', 'Germany', 'Switzerland', 'Austria', 'France', 'Netherlands', 'Belgium', 'Luxembourg', 'Sweden',
                      'Norway', 'Finland', 'Russian Federation', 'Turkey', 'Georgia', 'Syrian Arab Republic', 'Jordan', 'Lebanon', 'Israel',
                      'Kuwait', 'Saudi Arabia', 'United Arab Emirates', 'Qatar', 'Yemen', 'Oman', 'Iraq', 'Iran (Islamic Republic of)',
                      'Armenia', 'Kazakhstan', 'Uzbekistan', 'Azerbaijan', 'Turkmenistan', 'Tajikistan', 'Kyrgyzstan', 'Afghanistan',
                      'Pakistan', 'India', 'Sri Lanka', 'Nepal', 'Bhutan', 'Bangladesh', 'Cambodia', 'Myanmar', 'Viet Nam', 'Malaysia',
                      'Thailand', "Lao People's Democratic Republic", 'Brunei Darussalam', 'Indonesia', 'Philippines', 'Papua New Guinea',
                      'Mongolia', 'China', 'Japan', 'Taiwan, Province of China', "Korea (Democratic People's Republic of)",
                      'Korea, Republic of']
fr_en_dict = {fr: en for fr, en in zip(eurasia_df.fr_country_name.tolist(), country_names_list)}
eurasia_df['country_name'] = eurasia_df.fr_country_name.map(lambda x: fr_en_dict.get(x, np.nan))

# Check for duplicate country paths
mask_series = eurasia_df.duplicated(subset=['country_name'], keep=False)
assert eurasia_df[mask_series].shape[0] == 0, "You've duplicated some country names"

mask_series = eurasia_df.country_name.isnull()
eurasia_df = eurasia_df[~mask_series]

In [9]:

# Get the GDP countries dataframe
%run ../load_magic/storage.py

s = Storage()
if s.pickle_exists('gdp_countries_df'):
    gdp_countries_df = s.load_object('gdp_countries_df')
    mask_series = gdp_countries_df.country_name.isin(['Moldova, Republic of', 'Czechia', 'United Kingdom of Great Britain and Northern Ireland',
                                                      'Russian Federation', 'Syrian Arab Republic', 'Iran (Islamic Republic of)', 'Viet Nam',
                                                      "Lao People's Democratic Republic", 'Brunei Darussalam', 'Taiwan, Province of China',
                                                      "Korea (Democratic People's Republic of)", 'Korea, Republic of'])
    if not gdp_countries_df[mask_series].shape[0]:
        rows_list = []

        # The Gross Domestic Product (GDP) in Moldova was worth 11.91 billion US dollars in 2020,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = 'Moldova, Republic of'
        row_dict['wb_estimate'] = 11.91*1_000
        row_dict['wb_year'] = 2020
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in Czech Republic was worth 245.35 billion US dollars in 2020,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = 'Czechia'
        row_dict['wb_estimate'] = 245.35*1_000
        row_dict['wb_year'] = 2020
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in the United Kingdom was worth 2707.74 billion US dollars in 2020,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = 'United Kingdom of Great Britain and Northern Ireland'
        row_dict['wb_estimate'] = 2707.74 * 1_000
        row_dict['wb_year'] = 2020
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in Russia was worth 1483.50 billion US dollars in 2020,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = 'Russian Federation'
        row_dict['wb_estimate'] = 1483.50 * 1_000
        row_dict['wb_year'] = 2020
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in Syria was worth 65 billion US dollars in 2019,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = 'Syrian Arab Republic'
        row_dict['wb_estimate'] = 65 * 1_000
        row_dict['wb_year'] = 2019
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in Iran was worth 191.72 billion US dollars in 2020,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = 'Iran (Islamic Republic of)'
        row_dict['wb_estimate'] = 191.72 * 1_000
        row_dict['wb_year'] = 2020
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in Vietnam was worth 271.16 billion US dollars in 2020,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = 'Viet Nam'
        row_dict['wb_estimate'] = 271.16 * 1_000
        row_dict['wb_year'] = 2020
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in Laos was worth 19.14 billion US dollars in 2020,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = "Lao People's Democratic Republic"
        row_dict['wb_estimate'] = 19.14 * 1_000
        row_dict['wb_year'] = 2020
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in Brunei was worth 12.02 billion US dollars in 2020,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = 'Brunei Darussalam'
        row_dict['wb_estimate'] = 12.02 * 1_000
        row_dict['wb_year'] = 2020
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in Taiwan was worth 668.50 billion US dollars in 2020,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = 'Taiwan, Province of China'
        row_dict['wb_estimate'] = 668.50 * 1_000
        row_dict['wb_year'] = 2020
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in North Korea was worth 18 billion US dollars in 2019,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = "Korea (Democratic People's Republic of)"
        row_dict['wb_estimate'] = 18 * 1_000
        row_dict['wb_year'] = 2019
        rows_list.append(row_dict)

        # The Gross Domestic Product (GDP) in South Korea was worth 1630.53 billion US dollars in 2020,
        # according to official data from the World Bank.
        row_dict = {}
        row_dict['country_name'] = 'Korea, Republic of'
        row_dict['wb_estimate'] = 1630.53 * 1_000
        row_dict['wb_year'] = 2020
        rows_list.append(row_dict)
        gdp_countries_df = pd.concat([gdp_countries_df, pd.DataFrame(rows_list)])
        s.store_objects(gdp_countries_df=gdp_countries_df)

In [10]:

# Get the state equivalent column
eurasia_df = s.load_object('eurasia_df')

# Check for duplicate country names
mask_series = gdp_countries_df.duplicated(subset=['country_name'], keep=False)
assert gdp_countries_df[mask_series].shape[0] == 0, "You've duplicated some country names in the GDP countries"

countries_list = [country_name for country_name in eurasia_df.country_name if str(country_name) != 'nan']
COUNTRY_TUPLES_LIST = [(r.country_name, r.wb_estimate) for i, r in gdp_countries_df.iterrows() if r.country_name in countries_list]

In [None]:

gdp_us_states_df = s.load_object('gdp_us_states_df')

# Check for duplicate state name
mask_series = gdp_us_states_df.duplicated(subset=['state_name'], keep=False)
assert gdp_us_states_df[mask_series].shape[0] == 0, "You've duplicated some state names in the GDP states"

In [None]:

# Get the gdp equivalents
state_tuples_list = [(r.state_name, r.gdp_millions_usd_2021) for i, r in gdp_us_states_df.iterrows() if str(r.state_name) != 'nan']
rows_list = []
for country_tuple in COUNTRY_TUPLES_LIST:
    candidate_tuple = sorted([s for s in state_tuples_list], key=lambda x: abs(x[1] - country_tuple[1]))[0]
    state_name = candidate_tuple[0]
    country_name = country_tuple[0]
    # print(f'{country_name} is close to the GDP of {state_name}')
    row_dict = {}
    row_dict['country_name'] = country_name
    row_dict['state_name'] = state_name
    rows_list.append(row_dict)
country_to_state_gdp_equivalent_dict = pd.DataFrame(rows_list).set_index('country_name').state_name.to_dict()
COUNTRY_TUPLES_LIST = [(r.country_name, r.wb_estimate) for i, r in gdp_countries_df.iterrows() if str(r.country_name) != 'nan']
for state_tuple in state_tuples_list:
    candidate_tuple = sorted([s for s in COUNTRY_TUPLES_LIST], key=lambda x: abs(x[1] - state_tuple[1]))[0]
    country_name = candidate_tuple[0]
    # print(f'{state_name} is close to the GDP of {country_name}')
    row_dict = {}
    state_name = state_tuple[0]
    row_dict['state_name'] = state_name
    row_dict['country_name'] = country_name
    rows_list.append(row_dict)
state_to_country_gdp_equivalent_dict = pd.DataFrame(rows_list).set_index('state_name').country_name.to_dict()
eurasia_df['state_equivalent_gdp'] = eurasia_df.country_name.map(lambda x: country_to_state_gdp_equivalent_dict.get(x, np.nan))

In [None]:

# Merge in the GDP and other country info
import re

suffixes_list = ['_all', '_euro', '_gdp', '_merge']
suffixes_regex = re.compile('_(all|euro|gdp|merge)')

# Check for merge cycling
assert all([all(map(lambda x: not column_name.endswith(x), suffixes_list)) for column_name in eurasia_df.columns])

print(f'eurasia_df has {eurasia_df.country_name.unique().shape[0]} unique countries')
# print(sorted(eurasia_df.columns.tolist(), key=lambda x: x[::-1]))

In [None]:

all_countries_df = s.load_object('all_countries_df')

# Check for duplicate country names
mask_series = all_countries_df.duplicated(subset=['country_name'], keep=False)
assert all_countries_df[mask_series].shape[0] == 0, "You've duplicated some country names in the all countries"

# Check for merge cycling
assert all([all(map(lambda x: not column_name.endswith(x), suffixes_list)) for column_name in all_countries_df.columns])

print(f'all_countries_df has {all_countries_df.country_name.unique().shape[0]} unique countries')
# print(sorted(all_countries_df.columns.tolist(), key=lambda x: x[::-1]))

In [None]:

merge_df = all_countries_df.merge(eurasia_df, how='right', on='country_name', suffixes=('_all', '_euro'))

# Check for duplicate country names
mask_series = merge_df.duplicated(subset=['country_name'], keep=False)
assert merge_df[mask_series].shape[0] == 0, "You've duplicated some country names in the first merge"

# Check for merge cycling
assert all([all(map(lambda x: not column_name.endswith(x), suffixes_list)) for column_name in merge_df.columns])

print(f'merge_df has {merge_df.country_name.unique().shape[0]} unique countries')
# print(sorted(merge_df.columns.tolist(), key=lambda x: x[::-1]))

gdp_countries_df = s.load_object('gdp_countries_df')
print(f'gdp_countries_df has {gdp_countries_df.country_name.unique().shape[0]} unique countries')
# print(sorted(gdp_countries_df.columns.tolist(), key=lambda x: x[::-1]))

In [None]:

merge_df = gdp_countries_df.merge(merge_df, how='right', on='country_name', suffixes=('_gdp', '_merge'))

# Check for duplicate country names
mask_series = merge_df.duplicated(subset=['country_name'], keep=False)
assert merge_df[mask_series].shape[0] == 0, "You've duplicated some country names in the second merge"

# Check for duplicate column names
assert all(map(lambda x: x==1, collections.Counter(merge_df.columns).values())), "You doubled up the column names somehow"

# Check for merge cycling
assert all([all(map(lambda x: not column_name.endswith(x), suffixes_list)) for column_name in merge_df.columns])

print(f'merge_df now has {merge_df.country_name.unique().shape[0]} unique countries')
print(sorted(merge_df.columns.tolist(), key=lambda x: x[::-1]))

In [None]:

# Initialize the inkscape settings
eurasia_df['text_x'] = np.nan
eurasia_df['text_y'] = np.nan
eurasia_df ['font_size'] = 12
eurasia_df ['svg_width'] = 1867.787
eurasia_df ['svg_height'] = 916.003
eurasia_df ['inkscape_cx'] = 341.81217
eurasia_df ['inkscape_cy'] = 167.65197
eurasia_df ['inkscape_zoom'] = 1.9206455
eurasia_df ['legend_transform'] = 'translate(-12.768599,191.52893))'
eurasia_df ['colorbar_transform'] = 'matrix(1.478224,0,0,1.478224,1637.317,265.71569)'
eurasia_df['label_line_d'] = np.nan
def get_country_code(x):
    country_code = str(x).split(':')[-1]
    if country_code == 'nan':
        country_code = np.nan

    return country_code
eurasia_df['country_code'] = eurasia_df.iso_3166_2.map(get_country_code)

In [None]:

# Compute the centroids
from svgpathtools import Line, Path
from shapely.geometry import Polygon
import math

# Convert paths to polygons
def path_to_poly(inpath):
    points = []
    for path in inpath:
        if isinstance(path, Line):
            points.append([path.end.real, path.end.imag])
        else:
            num_segments = math.ceil(path.length() / 1.0)
            for seg_i in range(int(num_segments + 1)):
                points.append([path.point(seg_i / num_segments).real,
                                path.point(seg_i / num_segments).imag])

    return Polygon(points)

import svgpathtools

eurasia_df['centroid_x'] = np.nan
eurasia_df['centroid_y'] = np.nan
mask_series = eurasia_df.outline_d.isnull()
for state_name, row_series in eurasia_df[~mask_series].iterrows():
    path_obj = svgpathtools.parse_path(pathdef=row_series.outline_d, current_pos=0j)
    poly_obj = path_to_poly(path_obj)
    eurasia_df.loc[state_name, 'centroid_x'] = poly_obj.centroid.x
    eurasia_df.loc[state_name, 'centroid_y'] = poly_obj.centroid.y

In [None]:

# Set the text positions for each country
eurasia_df.text_x = eurasia_df.centroid_x
eurasia_df.text_y = eurasia_df.centroid_y
mask_series = (eurasia_df.country_code == 'AL')
eurasia_df.loc[mask_series, 'font_size'] = 8
mask_series = (eurasia_df.country_code == 'BE')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'BY')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'ID')
eurasia_df.loc[mask_series, 'text_x'] = 1250.8167
eurasia_df.loc[mask_series, 'text_y'] = 794.68213
mask_series = (eurasia_df.country_code == 'IT')
eurasia_df.loc[mask_series, 'text_x'] = 312.67813
eurasia_df.loc[mask_series, 'text_y'] = 387.79126
mask_series = (eurasia_df.country_code == 'JP')
eurasia_df.loc[mask_series, 'text_x'] = 1415.007
eurasia_df.loc[mask_series, 'text_y'] = 463.56656
mask_series = (eurasia_df.country_code == 'JO')
eurasia_df.loc[mask_series, 'text_x'] = 534.21313
eurasia_df.loc[mask_series, 'text_y'] = 514.51532
mask_series = (eurasia_df.country_code == 'KW')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'LA')
eurasia_df.loc[mask_series, 'text_x'] = 1096.7466
eurasia_df.loc[mask_series, 'text_y'] = 613.78656
eurasia_df.loc[mask_series, 'font_size'] = 10
mask_series = (eurasia_df.country_code == 'LB')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'LU')
eurasia_df.loc[mask_series, 'text_x'] = 274.6127
eurasia_df.loc[mask_series, 'text_y'] = 346.93298
eurasia_df.loc[mask_series, 'font_size'] = 4
mask_series = (eurasia_df.country_code == 'MY')
eurasia_df.loc[mask_series, 'text_x'] = 1097.296
eurasia_df.loc[mask_series, 'text_y'] = 740.30682
mask_series = (eurasia_df.country_code == 'MD')
eurasia_df.loc[mask_series, 'font_size'] = 8
mask_series = (eurasia_df.country_code == 'ME')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'NP')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'NL')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'MK')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'NO')
eurasia_df.loc[mask_series, 'text_x'] = 296.73975
eurasia_df.loc[mask_series, 'text_y'] = 253.39131
mask_series = (eurasia_df.country_code == 'OM')
eurasia_df.loc[mask_series, 'text_x'] = 695.00201
eurasia_df.loc[mask_series, 'text_y'] = 607.63654
mask_series = (eurasia_df.country_code == 'PH')
eurasia_df.loc[mask_series, 'text_x'] = 1260.7877
eurasia_df.loc[mask_series, 'text_y'] = 630.81744
eurasia_df.loc[mask_series, 'font_size'] = 8
mask_series = (eurasia_df.country_code == 'PT')
eurasia_df.loc[mask_series, 'font_size'] = 8
mask_series = (eurasia_df.country_code == 'QA')
eurasia_df.loc[mask_series, 'font_size'] = 4
mask_series = (eurasia_df.country_code == 'RS')
eurasia_df.loc[mask_series, 'font_size'] = 10
mask_series = (eurasia_df.country_code == 'SK')
eurasia_df.loc[mask_series, 'text_x'] = 383.66235
eurasia_df.loc[mask_series, 'text_y'] = 355.78854
eurasia_df.loc[mask_series, 'font_size'] = 8
mask_series = (eurasia_df.country_code == 'SI')
eurasia_df.loc[mask_series, 'text_x'] = 343.6947
eurasia_df.loc[mask_series, 'text_y'] = 379.68457
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'LK')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'CH')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'TW')
eurasia_df.loc[mask_series, 'font_size'] = 6
mask_series = (eurasia_df.country_code == 'AE')
eurasia_df.loc[mask_series, 'text_x'] = 674.8313
eurasia_df.loc[mask_series, 'text_y'] = 576.52356
eurasia_df.loc[mask_series, 'font_size'] = 10
mask_series = (eurasia_df.country_code == 'GB')
eurasia_df.loc[mask_series, 'text_x'] = 195.78926
eurasia_df.loc[mask_series, 'text_y'] = 327.91925
eurasia_df.loc[mask_series, 'font_size'] = 10
mask_series = (eurasia_df.country_code == 'VN')
eurasia_df.loc[mask_series, 'text_x'] = 1111.8936
eurasia_df.loc[mask_series, 'text_y'] = 590.16235
eurasia_df.loc[mask_series, 'font_size'] = 8
mask_series = (eurasia_df.country_code == 'YE')
eurasia_df.loc[mask_series, 'text_x'] = 629.91254
eurasia_df.loc[mask_series, 'text_y'] = 639.30615

In [None]:

# Get the correct size outlines
file_path = r'C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\AF_codes_state_equivalent_district_abbreviation.svg'
root = et.parse(file_path).getroot()
rows_list = []
for el in root.iter():
    rows_list.append(el.attrib)
df = pd.DataFrame(rows_list).rename(columns={'d': 'outline_d', 'data-name': 'data_name'})
mask_series = df.data_name.isnull()
outline_d_dict = df[~mask_series].set_index('data_name').outline_d.to_dict()
outline_d_series = eurasia_df.country_name.map(lambda x: outline_d_dict.get(x, np.nan))
assert any(outline_d_series.isnull()) == False
eurasia_df.outline_d = outline_d_series
mask_series = eurasia_df.outline_d.isnull()
for state_name, row_series in eurasia_df[~mask_series].iterrows():
    path_obj = svgpathtools.parse_path(pathdef=row_series.outline_d, current_pos=0j)
    poly_obj = path_to_poly(path_obj)
    eurasia_df.loc[state_name, 'centroid_x'] = poly_obj.centroid.x
    eurasia_df.loc[state_name, 'centroid_y'] = poly_obj.centroid.y

In [38]:

# Extract the paths from the SVG
import xml.etree.ElementTree as et
import pandas as pd
import collections

file_path = r'C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\AF_codes_state_equivalent_district_abbreviation.svg'
root = et.parse(file_path).getroot()
rows_list = []
for el in root.iter():
    rows_list.append(el.attrib)
df = pd.DataFrame(rows_list).rename(columns={'{http://www.w3.org/XML/1998/namespace}space': 'namespace',
                                             '{http://www.inkscape.org/namespaces/inkscape}version': 'inkscape_version',
                                             '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}docname':'docname',
                                             '{http://www.inkscape.org/namespaces/inkscape}pageshadow': 'pageshadow',
                                             '{http://www.inkscape.org/namespaces/inkscape}pageopacity': 'pageopacity',
                                             '{http://www.inkscape.org/namespaces/inkscape}pagecheckerboard': 'pagecheckerboard',
                                             '{http://www.inkscape.org/namespaces/inkscape}zoom': 'zoom',
                                             '{http://www.inkscape.org/namespaces/inkscape}cx': 'cx',
                                             '{http://www.inkscape.org/namespaces/inkscape}cy': 'cy',
                                             '{http://www.inkscape.org/namespaces/inkscape}window-width': 'window_width',
                                             '{http://www.inkscape.org/namespaces/inkscape}window-height': 'window_height',
                                             '{http://www.inkscape.org/namespaces/inkscape}window-x': 'window_x',
                                             '{http://www.inkscape.org/namespaces/inkscape}window-y': 'window_y',
                                             '{http://www.inkscape.org/namespaces/inkscape}window-maximized': 'window_maximized',
                                             '{http://www.inkscape.org/namespaces/inkscape}current-layer': 'current_layer',
                                             'id': 'tag_id',
                                             'd': 'outline_d',
                                             '{http://www.inkscape.org/namespaces/inkscape}connector-curvature': 'connector_curvature',
                                             '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': 'about',
                                             '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'resource',
                                             '{http://www.inkscape.org/namespaces/inkscape}label': 'label',
                                             '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}role': 'role',
                                             '{http://www.w3.org/1999/xlink}href': 'href',
                                             '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}insensitive': 'insensitive'})
df.columns = df.columns.map(lambda x: x.replace('-', '_'))
assert all(map(lambda x: x==1, collections.Counter(df.columns).values())), "You doubled up the column names somehow"
sorted(df.columns.tolist())

['about', 'bordercolor', 'borderopacity', 'clip_path', 'connector_curvature', 'current_layer', 'cx', 'cy', 'data_id', 'data_name', 'docname', 'height', 'href', 'inkscape_version', 'label', 'namespace', 'outline_d', 'pagecheckerboard', 'pagecolor', 'pageopacity', 'pageshadow', 'resource', 'role', 'showgrid', 'style', 'tag_id', 'transform', 'type', 'version', 'viewBox', 'width', 'window_height', 'window_maximized', 'window_width', 'window_x', 'window_y', 'x', 'y', 'zoom']

In [None]:

# Compute the centroids
from svgpathtools import Line, Path
from shapely.geometry import Polygon
import math
import svgpathtools

# Convert paths to polygons
def path_to_poly(inpath):
    points = []
    for path in inpath:
        if isinstance(path, Line):
            points.append([path.end.real, path.end.imag])
        else:
            num_segments = math.ceil(path.length() / 1.0)
            for seg_i in range(int(num_segments + 1)):
                points.append([path.point(seg_i / num_segments).real,
                                path.point(seg_i / num_segments).imag])
    
    return Polygon(points)

eurasia_df = s.load_object('eurasia_df')
mask_series = eurasia_df.outline_d.isnull()
for state_name, row_series in eurasia_df[~mask_series].iterrows():
    path_obj = svgpathtools.parse_path(pathdef=row_series.outline_d, current_pos=0j)
    poly_obj = path_to_poly(path_obj)
    eurasia_df.loc[state_name, 'centroid_x'] = poly_obj.centroid.x
    eurasia_df.loc[state_name, 'centroid_y'] = poly_obj.centroid.y
s.store_objects(eurasia_df=eurasia_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_df.pickle


In [42]:

mask_series = df.data_name.isnull()
print(df[~mask_series].data_name.tolist())
df[mask_series].sample(min(8, df[mask_series].shape[0])).dropna(axis='columns', how='all').T

['Yemen', 'Viet Nam', 'Uzbekistan', 'United Kingdom of Great Britain and Northern Ireland', 'United Arab Emirates', 'Ukraine', 'Turkmenistan', 'Turkey', 'Thailand', 'Tajikistan', 'Taiwan, Province of China', 'Syrian Arab Republic', 'Switzerland', 'Sweden', 'Sri Lanka', 'Spain', 'Slovenia', 'Slovakia', 'Serbia', 'Saudi Arabia', 'Russian Federation', 'Romania', 'Qatar', 'Portugal', 'Poland', 'Philippines', 'Papua New Guinea', 'Pakistan', 'Oman', 'Norway', 'North Macedonia', 'Netherlands', 'Nepal', 'Myanmar', 'Montenegro', 'Mongolia', 'Moldova, Republic of', 'Malaysia', 'Luxembourg', 'Lithuania', 'Lebanon', 'Latvia', "Lao People's Democratic Republic", 'Kyrgyzstan', 'Kuwait', 'Korea, Republic of', "Korea (Democratic People's Republic of)", 'Kazakhstan', 'Jordan', 'Japan', 'Italy', 'Israel', 'Ireland', 'Iraq', 'Iran (Islamic Republic of)', 'Indonesia', 'India', 'Iceland', 'Hungary', 'Greece', 'Germany', 'Georgia', 'France', 'Finland', 'Estonia', 'Denmark', 'Czechia', 'Croatia', 'China', 'C

Unnamed: 0,326,362,329,283,348,306,131,356
width,,100%,,,,,,100%
height,,100%,,,,,,100%
tag_id,colorbar_7_text,colorbar_12974_use,DejaVuSans-33,colorbar_12852_defs,DejaVuSans-76,colorbar_12891_g,text-saudi-arabia,colorbar_12962_use
outline_d,,,"m 2597,2516 q 453,-97 707,-404 255,-306 255,-7...",,"M 191,3500 H 800 L 1894,563 2988,3500 h 609 L ...",,,
style,,,,,,,font-style:normal;font-variant:normal;font-wei...,
x,,604.58984,,,,,971.06683,308.10547
y,,0,,,,,645.21332,0
label,,,,,,,Saudi Arabia district abbreviation,
namespace,,,,,,,preserve,
transform,,,scale(0.015625),,scale(0.015625),,,


In [40]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, osp.join(os.pardir, 'py'))

from storage import Storage
from stats_scraping_utils import StatsScrapingUtilities

s = Storage()
ssu = StatsScrapingUtilities(s=s)
column_descriptions_df = ssu.get_column_descriptions(df)
column_descriptions_df.sort_values(['count_uniques'], ascending=[False])

Unnamed: 0,column_name,dtype,count_blanks,count_uniques,count_zeroes,has_dates,min_value,max_value,only_integers
4,tag_id,object,4,373,0,False,DejaVuSans-20,tspan-yemen0,
24,outline_d,object,268,108,0,False,,"m 983.33239,538.83155 c 2.86996,-3.32901 5.327...",
31,label,object,293,84,0,False,Adjacent Country Backgrounds,Yemen district abbreviation,
33,data_name,object,294,83,0,False,Afghanistan,Yemen,
32,data_id,object,294,83,0,False,AE,YE,
25,style,object,198,43,0,False,display:inline;fill:#c8eafb;stroke-width:2.73218,stroke:#000000;stroke-width:0.80000001;stroke-...,
29,x,object,169,37,0,False,0,995.68799,False
38,href,object,334,24,0,False,#DejaVuSans-20,"data:image/png;base64, iVBORw0KGgoAAAANSUhEUgA...",
30,y,object,169,21,0,False,-52,645.21332,False
36,transform,object,345,12,0,False,"matrix(0,-0.1,-0.1,0,96.123438,273.24891)","translate(1665.5921,409.56979)",


In [8]:

eurasia_df = s.load_object('eurasia_df')

column_descriptions_df = ssu.get_column_descriptions(eurasia_df)
# mask_series = (column_descriptions_df.dtype == 'object')
column_descriptions_df.sort_values(['count_uniques'], ascending=[False])

Unnamed: 0,column_name,dtype,count_blanks,count_uniques,count_zeroes,has_dates,min_value,max_value,only_integers
51,country_code,object,0,82,0,False,AE,YE,
15,country_name,object,0,82,0,False,Afghanistan,Yemen,
48,outline_d,object,0,82,0,False,"M 5.3057917,76.885976 C 6.0117246,76.009207 7....","m 99.563364,127.16816 c 0.499796,-0.63533 -0.8...",
25,fr_country_name,object,0,82,0,False,Afghanistan,Yemen,
21,wikipedia_title,object,0,82,0,False,Afghanistan,Yemen,
20,wikipedia_url,object,0,82,0,False,https://en.wikipedia.org/wiki/Afghanistan,https://en.wikipedia.org/wiki/Yemen,
18,iso_3166_2,object,0,82,0,False,ISO 3166-2:AE,ISO 3166-2:YE,
17,cc_tld,object,0,82,0,False,.ae,.ye,
2,imf_estimate,float64,12,71,0,True,2480.0,16862979.0,False
4,un_estimate,float64,12,71,0,True,2483.0,14722801.0,False


In [30]:

eurasia_df = s.load_object('eurasia_df')
mask_series = eurasia_df.state_equivalent_gdp.isnull()
# s.store_objects(eurasia_df=eurasia_df)
print(eurasia_df[mask_series].country_name.tolist())
columns_list = ['country_name', 'country_region', 'imf_estimate', 'imf_year', 'un_estimate', 'un_year', 'wb_estimate', 'wb_year']
eurasia_df[mask_series][columns_list]

[]


Unnamed: 0,country_name,country_region,imf_estimate,imf_year,un_estimate,un_year,wb_estimate,wb_year


In [29]:

eurasia_df = s.load_object('eurasia_df')
mask_series = eurasia_df.state_equivalent_gdp.isnull()
COUNTRY_TUPLES_LIST = [(r.country_name, r.wb_estimate) for i, r in eurasia_df[mask_series].iterrows()]

# Get the gdp equivalents
gdp_us_states_df = s.load_object('gdp_us_states_df')
state_tuples_list = [(r.state_name, r.gdp_millions_usd_2021) for i, r in gdp_us_states_df.iterrows() if str(r.state_name) != 'nan']
for country_tuple in COUNTRY_TUPLES_LIST:
    candidate_tuple = sorted([s for s in state_tuples_list], key=lambda x: abs(x[1] - country_tuple[1]))[0]
    state_name = candidate_tuple[0]
    country_name = country_tuple[0]
    print(f'{country_name} is close to the GDP of {state_name}')
    mask_series = (eurasia_df.country_name == country_name)
    eurasia_df.loc[mask_series, 'state_equivalent_gdp'] = state_name
s.store_objects(eurasia_df=eurasia_df)

Moldova, Republic of is close to the GDP of Vermont
Czechia is close to the GDP of Alabama
United Kingdom of Great Britain and Northern Ireland is close to the GDP of California
Russian Federation is close to the GDP of Florida
Syrian Arab Republic is close to the GDP of Rhode Island
Iran (Islamic Republic of) is close to the GDP of Nevada
Viet Nam is close to the GDP of South Carolina
Lao People's Democratic Republic is close to the GDP of Vermont
Brunei Darussalam is close to the GDP of Vermont
Taiwan, Province of China is close to the GDP of Washington
Korea (Democratic People's Republic of) is close to the GDP of Vermont
Korea, Republic of is close to the GDP of New York
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_df.pickle


In [19]:

eurasia_df = s.load_object('eurasia_df')
mask_series = (eurasia_df.state_equivalent_gdp == 'District of Columbia')
COUNTRY_TUPLES_LIST = [(r.country_name, r.wb_estimate) for i, r in eurasia_df[mask_series].iterrows()]

# Get the gdp equivalents
gdp_us_states_df = s.load_object('gdp_us_states_df')
state_tuples_list = [(r.state_name, r.gdp_millions_usd_2021) for i, r in gdp_us_states_df.iterrows() if str(r.state_name) != 'District of Columbia']
for country_tuple in COUNTRY_TUPLES_LIST:
    candidate_tuple = sorted([s for s in state_tuples_list], key=lambda x: abs(x[1] - country_tuple[1]))[0]
    state_name = candidate_tuple[0]
    country_name = country_tuple[0]
    print(f'{country_name} is close to the GDP of {state_name}')
    mask_series = (eurasia_df.country_name == country_name)
    eurasia_df.loc[mask_series, 'state_equivalent_gdp'] = state_name
s.store_objects(eurasia_df=eurasia_df)

Ukraine is close to the GDP of Nebraska
Hungary is close to the GDP of Nebraska
Iraq is close to the GDP of Nebraska
Kazakhstan is close to the GDP of Nebraska
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_df.pickle


In [31]:

import numpy as np

eurasia_df = s.load_object('eurasia_df')
color_names_dict = {'green': '#aad5aa', 'purple': '#ffd5aa', 'orange': '#c1d5ff', 'pink': '#ffffaa', 'yellow': '#d5ffff'}
eurasia_df['country_color'] = np.nan
s.store_objects(eurasia_df=eurasia_df, map_color_names_dict=color_names_dict)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_df.pickle
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\map_color_names_dict.pickle


In [9]:

# Extract the paths from the SVG
import xml.etree.ElementTree as et
import pandas as pd

file_path = r'C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\AF_district_abbreviation.svg'
root = et.parse(file_path).getroot()
rows_list = []
for el in root.iter():
    rows_list.append(el.attrib)
df = pd.DataFrame(rows_list).rename(columns={
    'width': 'svg_width',
    '{http://www.inkscape.org/namespaces/inkscape}version': 'inkscape_version',
    'height': 'svg_height',
    'id': 'svg_id',
    '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}docname': 'sodipodi_docname',
    'version': 'svg_version',
    'viewBox': 'svg_viewBox',
    'bordercolor': 'svg_bordercolor',
    'borderopacity': 'svg_borderopacity',
    '{http://www.inkscape.org/namespaces/inkscape}current-layer': 'inkscape_current_layer',
    '{http://www.inkscape.org/namespaces/inkscape}cx': 'inkscape_cx',
    '{http://www.inkscape.org/namespaces/inkscape}cy': 'inkscape_cy',
    '{http://www.inkscape.org/namespaces/inkscape}pagecheckerboard': 'inkscape_pagecheckerboard',
    '{http://www.inkscape.org/namespaces/inkscape}pageopacity': 'inkscape_pageopacity',
    '{http://www.inkscape.org/namespaces/inkscape}pageshadow': 'inkscape_pageshadow',
    '{http://www.inkscape.org/namespaces/inkscape}window-height': 'inkscape_window_height',
    '{http://www.inkscape.org/namespaces/inkscape}window-maximized': 'inkscape_window_maximized',
    '{http://www.inkscape.org/namespaces/inkscape}window-width': 'inkscape_window_width',
    '{http://www.inkscape.org/namespaces/inkscape}window-x': 'inkscape_window_x',
    '{http://www.inkscape.org/namespaces/inkscape}window-y': 'inkscape_window_y',
    '{http://www.inkscape.org/namespaces/inkscape}zoom': 'inkscape_zoom',
    'pagecolor': 'svg_pagecolor',
    'showgrid': 'svg_showgrid',
    'type': 'svg_type',
    'd': 'svg_d',
    'style': 'svg_style',
    '{http://www.inkscape.org/namespaces/inkscape}connector-curvature': 'inkscape_connector_curvature',
    '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': 'rdf_about',
    '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'rdf_resource',
    'x': 'svg_x',
    'y': 'svg_y',
    '{http://www.inkscape.org/namespaces/inkscape}label': 'inkscape_label',
    '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}insensitive': 'sodipodi_insensitive',
    'data-id': 'svg_data_id',
    'data-name': 'svg_data_name',
    '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}role': 'sodipodi_role'})

In [21]:

%run ../load_magic/storage.py
%pprint
s = Storage()

Pretty printing has been turned OFF


In [23]:

mask_series = df.svg_data_name.isnull()
country_color_dict = df[~mask_series].set_index('svg_data_name').svg_style.map(lambda x: str(x).split('fill:')[-1].split(';')[0]).to_dict()
s.store_objects(eurasia_country_name_color_dict=country_color_dict)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_country_name_color_dict.pickle


In [27]:

mask_series = df.svg_data_name.isnull()
country_color_dict = df[~mask_series].set_index('svg_data_id').svg_style.map(lambda x: str(x).split('fill:')[-1].split(';')[0]).to_dict()
s.store_objects(eurasia_country_code_color_dict=country_color_dict)
country_color_dict

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_country_code_color_dict.pickle


{'YE': '#fef9b7', 'VN': '#fad5dc', 'UZ': '#fef9b7', 'GB': '#f1c8b2', 'AE': '#d5c9e1', 'UA': '#fef9b7', 'TM': '#fad5dc', 'TR': '#fad5dc', 'TH': '#fef9b7', 'TJ': '#f1c8b2', 'TW': '#aad5aa', 'SY': '#fef9b7', 'CH': '#aad5aa', 'SE': '#fad5dc', 'LK': '#aad5aa', 'ES': '#fef9b7', 'SI': '#f1c8b2', 'SK': '#aad5aa', 'RS': '#fad5dc', 'SA': '#aad5aa', 'RU': '#aad5aa', 'RO': '#f1c8b2', 'QA': '#fad5dc', 'PT': '#f1c8b2', 'PL': '#d5c9e1', 'PH': '#d5c9e1', 'PG': '#fef9b7', 'PK': '#d5c9e1', 'OM': '#f1c8b2', 'NO': '#d5c9e1', 'MK': '#d5c9e1', 'NL': '#fad5dc', 'NP': '#d5c9e1', 'MM': '#d5c9e1', 'ME': '#aad5aa', 'MN': '#f1c8b2', 'MD': '#d5c9e1', 'MY': '#aad5aa', 'LU': '#fef9b7', 'LT': '#f1c8b2', 'LB': '#aad5aa', 'LV': '#d5c9e1', 'LA': '#aad5aa', 'KG': '#fad5dc', 'KW': '#d5c9e1', 'KR': '#fad5dc', 'KP': '#d5c9e1', 'KZ': '#d5c9e1', 'JO': '#d5c9e1', 'JP': '#f1c8b2', 'IT': '#fad5dc', 'IL': '#fad5dc', 'IE': '#fef9b7', 'IQ': '#f1c8b2', 'IR': '#fef9b7', 'ID': '#f1c8b2', 'IN': '#fad5dc', 'IS': '#aad5aa', 'HU': '#d5c9e

In [28]:

df[~mask_series].sample(6).dropna(axis='columns', how='all').T

Unnamed: 0,87,52,46,84,89,49
svg_id,district-belarus,district-lebanon,district-montenegro,district-bosnia-and-herzegovina,district-azerbaijan,district-malaysia
svg_d,"m 423.80163,309.65117 c 1.86701,0.37418 3.9886...","m 529.42316,476.15526 c 1.32312,0.31631 5.0494...","m 388.11997,413.11639 c -2.2605,-0.22377 -2.49...","m 373.36512,404.39078 c -0.81007,-0.54775 0.82...","m 640.63035,443.43616 c -1.73587,-1.28068 -4.1...","m 1097.0816,719.07212 c 2.8044,0.0385 3.9694,2..."
svg_style,stroke-width:1;fill:#fad5dc;fill-opacity:1,stroke-width:1;fill:#aad5aa;fill-opacity:1,stroke-width:1;fill:#aad5aa;fill-opacity:1,stroke-width:1;fill:#fef9b7;fill-opacity:1,stroke-width:1;fill:#f1c8b2;fill-opacity:1,fill:#aad5aa;fill-opacity:1;stroke-width:1
inkscape_connector_curvature,0,0,0,0,0,0
sodipodi_insensitive,,,,,,true
svg_data_id,BY,LB,ME,BA,AZ,MY
svg_data_name,Belarus,Lebanon,Montenegro,Bosnia and Herzegovina,Azerbaijan,Malaysia


In [29]:

eurasia_df = s.load_object('eurasia_df')
eurasia_df.country_color = eurasia_df.country_code.map(lambda x: country_color_dict.get(x, '#f9f9f9'))
s.store_objects(eurasia_df=eurasia_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_df.pickle


In [56]:

columns_list = ['country_name', 'country_code', 'state_equivalent_gdp', 'country_color']
eurasia_df[columns_list].groupby(['state_equivalent_gdp',
                                  'country_color']).count().reset_index().groupby(['state_equivalent_gdp']).agg({'country_color': 'max'})

Unnamed: 0_level_0,country_color
state_equivalent_gdp,Unnamed: 1_level_1
Alabama,#fad5dc
Alaska,#fad5dc
Arizona,#fad5dc
Arkansas,#fad5dc
California,#fef9b7
Colorado,#fef9b7
Connecticut,#aad5aa
Delaware,#aad5aa
Florida,#fef9b7
Georgia,#aad5aa


In [36]:

columns_list = ['country_name', 'country_code', 'state_equivalent_gdp', 'country_color']
eurasia_df.groupby(['state_equivalent_gdp', 'country_color']).count().sort_values(['country_name'], ascending=False).country_name

state_equivalent_gdp  country_color
Vermont           #aad5aa          6
                  #d5c9e1          6
                  #fef9b7          5
                  #f1c8b2          5
California        #f1c8b2          3
Nebraska          #d5c9e1          2
Alaska            #f1c8b2          2
Missouri          #d5c9e1          2
New York          #fad5dc          2
Ohio              #fad5dc          1
                  #aad5aa          1
North Dakota      #fef9b7          1
New Mexico        #aad5aa          1
Rhode Island      #fef9b7          1
Nevada            #fef9b7          1
                  #aad5aa          1
Nebraska          #fef9b7          1
Oregon            #d5c9e1          1
Alabama           #f1c8b2          1
South Carolina    #f1c8b2          1
                  #fad5dc          1
Tennessee         #d5c9e1          1
Nebraska          #f1c8b2          1
Texas             #d5c9e1          1
                  #fad5dc          1
Vermont           #fad5dc          1
Vi


----
# Add military expenditures

In [None]:

from pandas import DataFrame

In [59]:

import pandas as pd

url = 'https://worldpopulationreview.com/5a290b41-948f-47a6-aebb-61fea60e1580'
file_path = '../data/csv/military_expendatures_by_country_2021.csv'
military_expenditures_df = pd.read_csv(file_path).rename(columns={
    'country': 'country_name',
    'spending': 'military_spending_usd_2021',
    'pop2022': 'population_2022'})
mask_series = (military_expenditures_df.country_name == 'Vietnam')
military_expenditures_df.loc[mask_series, 'country_name'] = 'Viet Nam'
mask_series = (military_expenditures_df.country_name == 'Czech Republic')
military_expenditures_df.loc[mask_series, 'country_name'] = 'Czechia'
mask_series = (military_expenditures_df.country_name == 'Syria')
military_expenditures_df.loc[mask_series, 'country_name'] = 'Syrian Arab Republic'
mask_series = (military_expenditures_df.country_name == 'Iran')
military_expenditures_df.loc[mask_series, 'country_name'] = 'Iran (Islamic Republic of)'
mask_series = (military_expenditures_df.country_name == 'South Korea')
military_expenditures_df.loc[mask_series, 'country_name'] = 'Korea, Republic of'
mask_series = (military_expenditures_df.country_name == 'North Korea')
military_expenditures_df.loc[mask_series, 'country_name'] = "Korea (Democratic People's Republic of)"
mask_series = (military_expenditures_df.country_name == 'Moldova')
military_expenditures_df.loc[mask_series, 'country_name'] = 'Moldova, Republic of'
mask_series = (military_expenditures_df.country_name == 'Laos')
military_expenditures_df.loc[mask_series, 'country_name'] = "Lao People's Democratic Republic"
mask_series = (military_expenditures_df.country_name == 'Russia')
military_expenditures_df.loc[mask_series, 'country_name'] = 'Russian Federation'
mask_series = (military_expenditures_df.country_name == 'United Kingdom')
military_expenditures_df.loc[mask_series, 'country_name'] = 'United Kingdom of Great Britain and Northern Ireland'
mask_series = (military_expenditures_df.country_name == 'Taiwan')
military_expenditures_df.loc[mask_series, 'country_name'] = 'Taiwan, Province of China'
military_expenditures_df = pd.concat([military_expenditures, DataFrame([
    {'country_name': 'Iceland', 'military_spending_usd_2021': 0, 'population_2022': 344_727},
    {'country_name': 'Brunei Darussalam', 'military_spending_usd_2021': 410_000_000, 'population_2022': 444_480},
    {'country_name': 'Luxembourg', 'military_spending_usd_2021': 430_000_000, 'population_2022': 625_978},

    # Percent of GDP: 1.4 (2005 est.)
    {'country_name': 'Papua New Guinea', 'military_spending_usd_2021': 4_870_000_000*0.014, 'population_2022': 8_950_000}
    
]])).reset_index(drop=True)
s.store_objects(military_expenditures_df=military_expenditures_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\military_expenditures_df.pickle


In [60]:

military_expenditures_df

Unnamed: 0,country_name,military_spending_usd_2021,population_2022
0,United States,7.500000e+11,334805.269
1,China,2.370000e+11,1448471.400
2,Saudi Arabia,6.760000e+10,35844.909
3,India,6.100000e+10,1406631.776
4,United Kingdom of Great Britain and Northern I...,5.510000e+10,68497.907
...,...,...,...
136,Belarus,6.237000e+02,9432.800
137,Iceland,0.000000e+00,344727.000
138,Brunei Darussalam,4.100000e+08,444480.000
139,Luxembourg,4.300000e+08,625978.000


In [62]:

%run ../load_magic/lists.py
df = check_for_typos(eurasia_df.country_name.tolist(), military_expenditures_df.country_name.tolist(), verbose=False)
mask_series = (df.max_similarity < 1.0)
df[mask_series].sort_values('max_similarity', ascending=False).head(20)

Unnamed: 0,left_item,right_item,max_similarity


In [63]:

eurasia_df = s.load_object('eurasia_df')
eurasia_df = eurasia_df.merge(military_expenditures_df, on=['country_name'], how='left')
s.store_objects(eurasia_df=eurasia_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_df.pickle


In [57]:

eurasia_df = s.load_object('eurasia_df').rename(columns={
    'state_equivalent': 'state_equivalent_gdp'})
s.store_objects(eurasia_df=eurasia_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_df.pickle


In [58]:

eurasia_df = s.load_object('eurasia_df').rename(columns={'spending_usd_2021': 'military_spending_usd_2021'})
s.store_objects(eurasia_df=eurasia_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_df.pickle



----
# Create a Military-expenditures Equivalent Column

In [18]:

eurasia_df = s.load_object('eurasia_df')
# eurasia_df.columns.tolist()
columns_list = ['country_name', 'country_region', 'military_spending_usd_2021']
eurasia_df[columns_list]

Unnamed: 0,country_name,country_region,military_spending_usd_2021
0,Montenegro,Europe,6.500000e+07
1,Slovenia,Europe,5.810000e+08
2,Croatia,Europe,8.000000e+08
3,Serbia,Europe,9.070000e+08
4,North Macedonia,Europe,1.081500e+08
...,...,...,...
77,China,Asia,2.370000e+11
78,Japan,Asia,4.900000e+10
79,"Taiwan, Province of China",,1.072500e+10
80,Korea (Democratic People's Republic of),,1.600000e+09


In [68]:

# Get the military expenditure equivalents
import numpy as np

eurasia_df = s.load_object('eurasia_df')
COUNTRY_TUPLES_LIST = [(r.country_name, r.military_spending_usd_2021) for i, r in eurasia_df.iterrows()]
us_stats_df = s.load_object('us_stats_df')
state_tuples_list = [(r.state_name, r.gdp_proportion_of_military_expenditures_2021) for i,
                     r in us_stats_df.reset_index().iterrows() if r.state_name != 'District of Columbia']
rows_list = []
for country_tuple in COUNTRY_TUPLES_LIST:
    candidate_tuple = sorted([s for s in state_tuples_list], key=lambda x: abs(x[1] - country_tuple[1]))[0]
    state_name = candidate_tuple[0]
    country_name = country_tuple[0]
    # print(f'{country_name} is close to the military expenditure of {state_name}')
    row_dict = {}
    row_dict['country_name'] = country_name
    row_dict['state_name'] = state_name
    rows_list.append(row_dict)
country_to_state_mee_dict = pd.DataFrame(rows_list).set_index('country_name').state_name.to_dict()
eurasia_df['state_equivalent_military_expenditure'] = eurasia_df.country_name.map(lambda x: country_to_state_mee_dict.get(x, np.nan))
s.store_objects(eurasia_df=eurasia_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_df.pickle


In [67]:

eurasia_df = s.load_object('eurasia_df')
columns_list = ['country_name', 'state_equivalent_gdp', 'state_equivalent_military_expenditure']
eurasia_df.groupby(['state_equivalent_gdp', 'state_equivalent_military_expenditure']).count().country_name.sort_values(ascending=False)

state_equivalent_gdp  state_equivalent_military_expenditure
Vermont               Vermont                                  17
Alaska                Vermont                                   4
Vermont               Alaska                                    2
Montana               Vermont                                   2
California            Florida                                   2
Vermont               Wyoming                                   2
Alabama               District of Columbia                      1
Oregon                Missouri                                  1
Ohio                  Virginia                                  1
                      Nebraska                                  1
North Dakota          Vermont                                   1
New York              Florida                                   1
                      Pennsylvania                              1
South Carolina        District of Columbia                      1
New Mexico      

In [51]:

import numpy as np

eurasia_df = s.load_object('eurasia_df')
all_countries_list = eurasia_df.country_name.tolist()
gdp_countries_df = s.load_object('gdp_countries_df')
COUNTRY_TUPLES_LIST = [(r.country_name, r.wb_estimate) for i, r in gdp_countries_df.iterrows() if r.country_name in all_countries_list]
def get_gdp_rows_list(tuples_list, country_tuples_list=COUNTRY_TUPLES_LIST, country_column_name='country_name',
                  column_name='city_or_state_name', verbose=True):
    rows_list = []
    for country_tuple in country_tuples_list:
        candidate_tuple = sorted([s for s in tuples_list], key=lambda x: abs(x[1] - country_tuple[1]))[0]
        city_or_state_name = candidate_tuple[0]
        country_name = country_tuple[0]
        print(f'{country_name} is close to the GDP of {city_or_state_name}')
        row_dict = {}
        row_dict[country_column_name] = country_name
        row_dict[column_name] = city_or_state_name
        rows_list.append(row_dict)
    
    return rows_list

In [52]:

import numpy as np
eurasia_df = s.load_object('eurasia_df')
if 'city_equivalent_gdp' not in eurasia_df.columns:
    
    # Get the city and country lists
    all_countries_list = eurasia_df.country_name.tolist()
    gdp_countries_df = s.load_object('gdp_countries_df')
    COUNTRY_TUPLES_LIST = [(r.country_name, r.wb_estimate) for i, r in gdp_countries_df.iterrows() if r.country_name in all_countries_list]
    gdp_cities_df = s.load_object('gdp_us_cities_df')
    city_tuples_list = [(r.city_name, r.gdp_2018) for i, r in gdp_cities_df.iterrows() if str(r.city_name) != 'nan']
    
    # Get the gdp equivalents
    rows_list = get_gdp_rows_list(city_tuples_list, column_name='city_name', verbose=False)
    country_to_city_gdp_equivalent_dict = pd.DataFrame(rows_list).set_index('country_name').city_name.to_dict()
    eurasia_df['city_equivalent_gdp'] = eurasia_df.country_name.map(lambda x: country_to_city_gdp_equivalent_dict.get(x, np.nan).split(', ')[0])
    s.store_objects(eurasia_df=eurasia_df)

In [54]:

eurasia_df = s.load_object('eurasia_df')
if 'any_equivalent_gdp' not in eurasia_df.columns:
    
    # Get the city list
    gdp_cities_df = s.load_object('gdp_us_cities_df')
    city_tuples_list = [(str(r.city_name).split(', ')[0], r.gdp_2018) for i, r in gdp_cities_df.iterrows() if str(r.city_name) != 'nan']
    
    # Get the state list
    gdp_us_states_df = s.load_object('gdp_us_states_df')
    state_tuples_list = [(r.state_name,
                          r.gdp_millions_usd_2021) for i, r in gdp_us_states_df.iterrows() if str(r.state_name) != 'District of Columbia']
    
    # Get the country list
    all_countries_list = eurasia_df.country_name.tolist()
    gdp_countries_df = s.load_object('gdp_countries_df')
    COUNTRY_TUPLES_LIST = [(r.country_name, r.wb_estimate) for i, r in gdp_countries_df.iterrows() if r.country_name in all_countries_list]
    
    # Get the gdp equivalents
    city_state_tuples_list = city_tuples_list + state_tuples_list
    import numpy as np
    rows_list = get_gdp_rows_list(city_state_tuples_list, verbose=True)
    # country_to_city_gdp_equivalent_dict = pd.DataFrame(rows_list).set_index('country_name').city_or_state_name.to_dict()
    # eurasia_df['any_equivalent_gdp'] = eurasia_df.country_name.map(lambda x: country_to_city_gdp_equivalent_dict.get(x, np.nan).split(', ')[0])
    # s.store_objects(eurasia_df=eurasia_df)

China is close to the GDP of California
Japan is close to the GDP of California
Germany is close to the GDP of California
India is close to the GDP of Texas
France is close to the GDP of Texas
Italy is close to the GDP of New York
Spain is close to the GDP of Florida
Indonesia is close to the GDP of Los Angeles
Netherlands is close to the GDP of Illinois
Saudi Arabia is close to the GDP of Chicago
Switzerland is close to the GDP of Ohio
Turkey is close to the GDP of Ohio
Poland is close to the GDP of Virginia
Sweden is close to the GDP of Washington
Belgium is close to the GDP of Dallas
Thailand is close to the GDP of Dallas
Ireland is close to the GDP of Tennessee
Austria is close to the GDP of Colorado
Israel is close to the GDP of Atlanta
Norway is close to the GDP of Missouri
United Arab Emirates is close to the GDP of Tennessee
Denmark is close to the GDP of Miami
Philippines is close to the GDP of Missouri
Malaysia is close to the GDP of San Jose
Bangladesh is close to the GDP of

In [67]:

if s.pickle_exists('geo_fips_df'):
    geo_fips_df = s.load_object('geo_fips_df')
else:
    try:
        file_name = 'CAGDP2__ALL_AREAS_2001_2020.csv'
        # file_path = os.path.join(s.data_csv_folder, file_name)
        geo_fips_df = s.load_csv(file_name, folder_path=s.data_folder)
    except Exception as e:
        print(f'{e.__class__.__name__} error: {str(e).strip()}')
        excel_path = '../data/xlsx/CAGDP2__ALL_AREAS_2001_2020.xlsx'
        geo_fips_df = pd.read_excel(excel_path)
    # columns_list = ['GeoName', 2020]
    geo_fips_df = geo_fips_df.rename(columns={
        'GeoFIPS': 'geo_fips',
        'GeoName': 'geo_name',
        'Region': 'geo_region',
        'TableName': 'table_name',
        'LineCode': 'line_code',
        'IndustryClassification': 'industry_classification',
        'Description': 'geo_description',
        'Unit': 'geo_unit',
        2001: 'gdp_2001',
        2002: 'gdp_2002',
        2003: 'gdp_2003',
        2004: 'gdp_2004',
        2005: 'gdp_2005',
        2006: 'gdp_2006',
        2007: 'gdp_2007',
        2008: 'gdp_2008',
        2009: 'gdp_2009',
        2010: 'gdp_2010',
        2011: 'gdp_2011',
        2012: 'gdp_2012',
        2013: 'gdp_2013',
        2014: 'gdp_2014',
        2015: 'gdp_2015',
        2016: 'gdp_2016',
        2017: 'gdp_2017',
        2018: 'gdp_2018',
        2019: 'gdp_2019',
        2020: 'gdp_2020',
    })
    print(geo_fips_df.shape)
    mask_series = (geo_fips_df.geo_description == 'All industry total')
    for i in range(2001, 2021):
        mask_series = mask_series & ~geo_fips_df[f'gdp_{i}'].isnull() & (geo_fips_df[f'gdp_{i}'] != '(NA)')
    geo_fips_df = geo_fips_df[mask_series]
    print(geo_fips_df.shape)
    print(geo_fips_df.columns.tolist())
    for i in range(2001, 2021):
        geo_fips_df[f'gdp_{i}'] = geo_fips_df[f'gdp_{i}'].map(lambda x: int(x)/1_000)
    s.store_objects(geo_fips_df=geo_fips_df)

<class 'UnicodeDecodeError'> error: 'utf-8' codec can't decode byte 0xf1 in position 86565: invalid continuation byte
(108052, 28)
(3166, 28)
['geo_fips', 'geo_name', 'geo_region', 'table_name', 'line_code', 'industry_classification', 'geo_description', 'geo_unit', 'gdp_2001', 'gdp_2002', 'gdp_2003', 'gdp_2004', 'gdp_2005', 'gdp_2006', 'gdp_2007', 'gdp_2008', 'gdp_2009', 'gdp_2010', 'gdp_2011', 'gdp_2012', 'gdp_2013', 'gdp_2014', 'gdp_2015', 'gdp_2016', 'gdp_2017', 'gdp_2018', 'gdp_2019', 'gdp_2020']
Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\geo_fips_df.pkl


In [68]:

# Get the gdp equivalents
gdp_countries_df = s.load_object('gdp_countries_df')
country_tuples_list = sorted([(r.country_name, r.wb_estimate) for i, r in gdp_countries_df.iterrows()], key=lambda x: x[1])
geo_tuples_list = [(r.geo_name, r.gdp_2020) for i, r in df.iterrows() if str(r.geo_name) != 'nan']
rows_list = get_gdp_rows_list(geo_tuples_list, country_tuples_list=country_tuples_list, column_name='geo_name', verbose=True)

New Caledonia is close to the GDP of Roane, TN
Liberia is close to the GDP of Columbia, WI
Djibouti is close to the GDP of Payne, OK
French Polynesia is close to the GDP of Paulding, GA
Sierra Leone is close to the GDP of Faulkner, AR
Eswatini is close to the GDP of Adams, PA
Maldives is close to the GDP of Franklin, WA
Barbados is close to the GDP of Mercer, PA
Fiji is close to the GDP of Mercer, PA
Montenegro is close to the GDP of Chelan, WA
Somalia is close to the GDP of Douglas, GA
Guyana is close to the GDP of Hancock, OH
Guam is close to the GDP of Dallas, IA
Cayman Islands is close to the GDP of Pennington, SD
Liechtenstein is close to the GDP of Jackson, MI
Monaco is close to the GDP of Houston, GA
Bermuda is close to the GDP of Kenosha, WI
Togo is close to the GDP of St. Charles, LA
Kosovo is close to the GDP of Broomfield, CO*
Kyrgyzstan is close to the GDP of Johnston, NC
Mauritania is close to the GDP of Monroe, PA
Tajikistan is close to the GDP of St. Mary's, MD
Equatoria