In [None]:

# Extract the paths from the SVG
import xml.etree.ElementTree as et
import collections

# pandas is needed below; importing it here lets the cell run on a fresh kernel
import pandas as pd

file_path = r'C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\Eurasia_location_map.svg'
root = et.parse(file_path).getroot()

# One row per XML element; the columns are the union of every attribute name seen
rows_list = [el.attrib for el in root.iter()]

# Short column names for the namespaced attributes; 'id' and 'd' get descriptive names
column_names_dict = {
    '{http://www.w3.org/XML/1998/namespace}space': 'namespace',
    '{http://www.inkscape.org/namespaces/inkscape}version': 'inkscape_version',
    '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}docname':'docname',
    '{http://www.inkscape.org/namespaces/inkscape}pageshadow': 'pageshadow',
    '{http://www.inkscape.org/namespaces/inkscape}pageopacity': 'pageopacity',
    '{http://www.inkscape.org/namespaces/inkscape}pagecheckerboard': 'pagecheckerboard',
    '{http://www.inkscape.org/namespaces/inkscape}zoom': 'zoom',
    '{http://www.inkscape.org/namespaces/inkscape}cx': 'cx',
    '{http://www.inkscape.org/namespaces/inkscape}cy': 'cy',
    '{http://www.inkscape.org/namespaces/inkscape}window-width': 'window_width',
    '{http://www.inkscape.org/namespaces/inkscape}window-height': 'window_height',
    '{http://www.inkscape.org/namespaces/inkscape}window-x': 'window_x',
    '{http://www.inkscape.org/namespaces/inkscape}window-y': 'window_y',
    '{http://www.inkscape.org/namespaces/inkscape}window-maximized': 'window_maximized',
    '{http://www.inkscape.org/namespaces/inkscape}current-layer': 'current_layer',
    'id': 'fr_country_name',
    'd': 'outline_d',
}
eurasia_df = pd.DataFrame(rows_list).rename(columns=column_names_dict)

# The rename must not collapse two attributes onto the same column name
assert eurasia_df.columns.is_unique, "You doubled up the column names somehow"

In [None]:

# Convert the french name to an english name
import numpy as np

# Drop the first four rows by position and take a copy, so the column added below is
# written to an independent frame instead of a slice of the original.
# NOTE(review): presumably the first four elements are non-country SVG nodes -- confirm against the SVG.
# NOTE: this positional slice is not idempotent; re-running the cell drops four more rows.
eurasia_df = eurasia_df.iloc[4:].copy()
country_names_list = ['Montenegro', 'Slovenia', 'Croatia', 'Serbia', 'North Macedonia', 'Bosnia and Herzegovina', 'Albania', 'Lithuania',
                      'Estonia', 'Latvia', 'Ukraine', 'Belarus', 'Moldova, Republic of', 'Italy', 'Poland', 'Slovakia', 'Czechia', 'Hungary',
                      'Romania', 'Bulgaria', 'Greece', 'United Kingdom of Great Britain and Northern Ireland', 'Ireland', 'Iceland', 'Spain',
                      'Portugal', 'Denmark', 'Germany', 'Switzerland', 'Austria', 'France', 'Netherlands', 'Belgium', 'Luxembourg', 'Sweden',
                      'Norway', 'Finland', 'Russian Federation', 'Turkey', 'Georgia', 'Syrian Arab Republic', 'Jordan', 'Lebanon', 'Israel',
                      'Kuwait', 'Saudi Arabia', 'United Arab Emirates', 'Qatar', 'Yemen', 'Oman', 'Iraq', 'Iran (Islamic Republic of)',
                      'Armenia', 'Kazakhstan', 'Uzbekistan', 'Azerbaijan', 'Turkmenistan', 'Tajikistan', 'Kyrgyzstan', 'Afghanistan',
                      'Pakistan', 'India', 'Sri Lanka', 'Nepal', 'Bhutan', 'Bangladesh', 'Cambodia', 'Myanmar', 'Viet Nam', 'Malaysia',
                      'Thailand', "Lao People's Democratic Republic", 'Brunei Darussalam', 'Indonesia', 'Philippines', 'Papua New Guinea',
                      'Mongolia', 'China', 'Japan', 'Taiwan, Province of China', "Korea (Democratic People's Republic of)",
                      'Korea, Republic of']

# Names are paired with the remaining rows purely by position; zip stops at the shorter
# of the two sequences, so any surplus rows end up without an english name
fr_en_dict = dict(zip(eurasia_df.fr_country_name.tolist(), country_names_list))
eurasia_df['country_name'] = eurasia_df.fr_country_name.map(lambda x: fr_en_dict.get(x, np.nan))

# Drop the rows without an english name BEFORE looking for duplicates: duplicated()
# treats missing values as equal to each other, so two unmapped rows would otherwise
# be reported as a duplicated country name
mask_series = eurasia_df.country_name.isnull()
eurasia_df = eurasia_df[~mask_series].copy()

# Check for duplicate country paths
mask_series = eurasia_df.duplicated(subset=['country_name'], keep=False)
assert eurasia_df[mask_series].shape[0] == 0, "You've duplicated some country names"

In [None]:

# Get the state equivalent column
# Country names on the map, leaving out anything whose string form is 'nan'
countries_list = list(filter(lambda name: str(name) != 'nan', eurasia_df.country_name))

gdp_countries_df = s.load_object('gdp_countries_df')

# Check for duplicate country names
mask_series = gdp_countries_df.duplicated(subset=['country_name'], keep=False)
assert not mask_series.any(), "You've duplicated some country names in the GDP countries"

# (country name, World Bank estimate) pairs, restricted to the countries on the map
country_tuples_list = [
    (row_series.country_name, row_series.wb_estimate)
    for _, row_series in gdp_countries_df.iterrows()
    if row_series.country_name in countries_list
]

In [None]:

# Load the per-state GDP table from the project's object store
gdp_us_states_df = s.load_object('gdp_us_states_df')

# Check for duplicate state name: every row sharing its state_name with another row is flagged
mask_series = gdp_us_states_df.duplicated(subset=['state_name'], keep=False)
assert not mask_series.any(), "You've duplicated some state names in the GDP states"

In [None]:

# Get the gdp equivalents
def get_closest_gdp_name(target_gdp, name_gdp_tuples):
    """
    Return the name whose GDP is nearest to target_gdp.

    Parameters
    ----------
    target_gdp : number
        The GDP value to match.
    name_gdp_tuples : list of (name, gdp) tuples
        The candidates to choose from; must not be empty.

    Returns
    -------
    The name (first tuple element) of the candidate with the smallest absolute
    GDP difference. Ties go to the candidate that appears first in the list.
    """
    return min(name_gdp_tuples, key=lambda x: abs(x[1] - target_gdp))[0]

state_tuples_list = [(r.state_name, r.gdp_millions_usd_2021) for i, r in gdp_us_states_df.iterrows() if str(r.state_name) != 'nan']

# For each country on the map, the US state with the closest GDP.
# NOTE(review): wb_estimate is compared directly with gdp_millions_usd_2021 -- presumably
# both are in millions of USD; confirm.
country_to_state_gdp_equivalent_dict = {
    country_name: get_closest_gdp_name(country_gdp, state_tuples_list)
    for country_name, country_gdp in country_tuples_list
}

# Widen the candidate pool to every named country in the GDP table (not only those on the map)
country_tuples_list = [(r.country_name, r.wb_estimate) for i, r in gdp_countries_df.iterrows() if str(r.country_name) != 'nan']

# For each US state, the country with the closest GDP. This mapping is built on its own:
# the earlier version appended to the same rows list as the country loop, so country rows
# leaked into the state frame and were only masked because every key was overwritten later.
state_to_country_gdp_equivalent_dict = {
    state_name: get_closest_gdp_name(state_gdp, country_tuples_list)
    for state_name, state_gdp in state_tuples_list
}

# Countries without an entry in the mapping get NaN
eurasia_df['state_equivalent_gdp'] = eurasia_df.country_name.map(lambda x: country_to_state_gdp_equivalent_dict.get(x, np.nan))

In [None]:

# Merge in the GDP and other country info
import re

# Suffixes the merges below attach to overlapping column names
suffixes_list = ['_all', '_euro', '_gdp', '_merge']
suffixes_regex = re.compile('_(all|euro|gdp|merge)')

# Check for merge cycling: no column may already end with one of the merge suffixes
assert not any(column_name.endswith(tuple(suffixes_list)) for column_name in eurasia_df.columns)

print(f'eurasia_df has {eurasia_df.country_name.unique().shape[0]} unique countries')
# print(sorted(eurasia_df.columns.tolist(), key=lambda x: x[::-1]))

In [None]:

# Load the table of all countries from the project's object store
all_countries_df = s.load_object('all_countries_df')

# Check for duplicate country names
mask_series = all_countries_df.duplicated(subset=['country_name'], keep=False)
assert not mask_series.any(), "You've duplicated some country names in the all countries"

# Check for merge cycling: no column may already end with one of the merge suffixes
assert not any(column_name.endswith(tuple(suffixes_list)) for column_name in all_countries_df.columns)

print(f'all_countries_df has {all_countries_df.country_name.unique().shape[0]} unique countries')
# print(sorted(all_countries_df.columns.tolist(), key=lambda x: x[::-1]))

In [None]:

# Right merge: keep every row of eurasia_df and pull in the matching all_countries_df columns
merge_df = all_countries_df.merge(
    eurasia_df,
    how='right',
    on='country_name',
    suffixes=('_all', '_euro'),
)

# Check for duplicate country names
mask_series = merge_df.duplicated(subset=['country_name'], keep=False)
assert not mask_series.any(), "You've duplicated some country names in the first merge"

# Check for merge cycling: a suffixed column here means the two frames shared a column
# name other than country_name
assert not any(column_name.endswith(tuple(suffixes_list)) for column_name in merge_df.columns)

print(f'merge_df has {merge_df.country_name.unique().shape[0]} unique countries')
# print(sorted(merge_df.columns.tolist(), key=lambda x: x[::-1]))

# Reload the GDP table for the second merge
gdp_countries_df = s.load_object('gdp_countries_df')
print(f'gdp_countries_df has {gdp_countries_df.country_name.unique().shape[0]} unique countries')
# print(sorted(gdp_countries_df.columns.tolist(), key=lambda x: x[::-1]))

In [None]:

# Right merge: keep every row of the first merge and pull in the matching GDP columns
merge_df = gdp_countries_df.merge(
    merge_df,
    how='right',
    on='country_name',
    suffixes=('_gdp', '_merge'),
)

# Check for duplicate country names
mask_series = merge_df.duplicated(subset=['country_name'], keep=False)
assert not mask_series.any(), "You've duplicated some country names in the second merge"

# Check for duplicate column names
assert merge_df.columns.is_unique, "You doubled up the column names somehow"

# Check for merge cycling: a suffixed column here means the two frames shared a column
# name other than country_name
assert not any(column_name.endswith(tuple(suffixes_list)) for column_name in merge_df.columns)

print(f'merge_df now has {merge_df.country_name.unique().shape[0]} unique countries')
# Column names sorted by their reversed text, which groups names with a common ending
print(sorted(merge_df.columns.tolist(), key=lambda x: x[::-1]))

In [None]:

# Initialize the inkscape settings
# Every row receives the same constant; the NaN columns are placeholders to be filled in later.
# NOTE(review): both transform strings end in '))' -- one more closing parenthesis than
# opening; confirm this is what the consumer of these columns expects.
inkscape_settings_dict = {
    'text_x': np.nan,
    'text_y': np.nan,
    'font_size': 12,
    'svg_width': 683.62434,
    'svg_height': 335.26391,
    'inkscape_cx': 341.81217,
    'inkscape_cy': 167.65197,
    'inkscape_zoom': 1.9206455,
    'legend_transform': 'translate(-12.768599,191.52893))',
    'colorbar_transform': 'translate(-12.768599,191.52893))',
    'label_line_d': np.nan,
}
for setting_name, setting_value in inkscape_settings_dict.items():
    eurasia_df[setting_name] = setting_value
def get_country_code(x):
    """
    Extract the country code from a colon-delimited value.

    Parameters
    ----------
    x : object
        The value to parse. Anything other than None is converted with str().

    Returns
    -------
    str or float
        The text after the last ':' in str(x), or all of str(x) when it has no colon.
        np.nan is returned when x is None or when that text is the string 'nan'
        (which is what a missing float value turns into under str()).
    """
    # None would otherwise be stringified into the bogus code 'None'
    if x is None:
        return np.nan

    country_code = str(x).rsplit(':', 1)[-1]

    return np.nan if country_code == 'nan' else country_code
# Derive each row's country code from its iso_3166_2 value.
# NOTE(review): none of the cells visible here adds an iso_3166_2 column to eurasia_df
# (the merges write to merge_df) -- confirm where this column comes from.
eurasia_df['country_code'] = eurasia_df.iso_3166_2.map(get_country_code)

In [None]:

# Compute the centroids
from svgpathtools import Line, Path
from shapely.geometry import Polygon
import math

# Convert paths to polygons
def path_to_poly(inpath):
    """
    Approximate a parsed SVG path with a shapely Polygon.

    A Line segment contributes only its end point. Any other segment is sampled
    at evenly spaced parameter values from 0 to 1; the number of steps is the
    segment's length rounded up, with a minimum of one.

    All points go into a single ring, so a path made of several subpaths is
    not split into separate polygons.

    Parameters
    ----------
    inpath : iterable of path segments
        Iterated segment by segment. Non-Line segments must provide length()
        and point(t), where point returns a complex number (x + y*1j).

    Returns
    -------
    Polygon
        Built from the collected [x, y] points, in the order they were visited.
    """
    points = []
    for segment in inpath:
        if isinstance(segment, Line):
            points.append([segment.end.real, segment.end.imag])
        else:
            # At least one step, so a zero-length segment cannot divide by zero below
            num_segments = max(1, math.ceil(segment.length() / 1.0))
            for seg_i in range(num_segments + 1):
                # Evaluate the segment once per sample and reuse it for both coordinates
                sample_point = segment.point(seg_i / num_segments)
                points.append([sample_point.real, sample_point.imag])

    return Polygon(points)

import svgpathtools

# Start with empty centroid columns; only rows that have an outline get values
for centroid_column in ['centroid_x', 'centroid_y']:
    eurasia_df[centroid_column] = np.nan

mask_series = eurasia_df.outline_d.isnull()
for row_index, outline_d in eurasia_df[~mask_series].outline_d.items():

    # Parse the path data, approximate it with a polygon, and take that polygon's centroid
    outline_path = svgpathtools.parse_path(pathdef=outline_d, current_pos=0j)
    centroid_obj = path_to_poly(outline_path).centroid
    eurasia_df.loc[row_index, 'centroid_x'] = centroid_obj.x
    eurasia_df.loc[row_index, 'centroid_y'] = centroid_obj.y

In [None]:

# Set the text positions for each country
# (text_x, text_y) label coordinates keyed by country code.
# NOTE(review): 'AF' used to be assigned twice in a row, so only the second pair
# (781.2049, 487.15378) ever took effect. The overwritten first pair
# (285.10144, 182.34619) is therefore not applied here -- confirm it was not
# meant for a different country.
text_positions_dict = {
    'AF': (781.2049, 487.15378),
    'AL': (393.65604, 425.67667),
    'AM': (601.62408, 415.94672),
    'AT': (349.4585, 367.03894),
    'AZ': (632.58801, 430.56351),
    'BD': (992.42297, 569.45941),
    'BY': (465.46729, 319.43814),
    'BE': (262.29007, 340.96271),
    'BT': (995.68799, 540.8139),
    'BA': (375.56683, 396.71332),
}
for country_code, (text_x, text_y) in text_positions_dict.items():
    mask_series = (eurasia_df.country_code == country_code)
    eurasia_df.loc[mask_series, 'text_x'] = text_x
    eurasia_df.loc[mask_series, 'text_y'] = text_y

In [None]:

# Get the correct size outlines
file_path = r'C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\AF_codes_state_equivalent_district_abbreviation.svg'
root = et.parse(file_path).getroot()

# One row per XML element; the columns are the union of every attribute name seen
rows_list = [el.attrib for el in root.iter()]
df = pd.DataFrame(rows_list).rename(columns={'d': 'outline_d', 'data-name': 'data_name'})

# Path data keyed by the element's data-name attribute (elements without one are skipped)
mask_series = df.data_name.isnull()
outline_d_dict = df[~mask_series].set_index('data_name').outline_d.to_dict()

# Look up each country's outline by name; every country must have one
outline_d_series = eurasia_df.country_name.map(lambda x: outline_d_dict.get(x, np.nan))
assert not outline_d_series.isnull().any(), "Some countries have no outline in the SVG"

# Item assignment always writes the column; attribute assignment would only set a plain
# attribute on the frame if the column did not already exist
eurasia_df['outline_d'] = outline_d_series

# Recompute the centroids from the replacement outlines
mask_series = eurasia_df.outline_d.isnull()
for row_index, outline_d in eurasia_df[~mask_series].outline_d.items():
    outline_path = svgpathtools.parse_path(pathdef=outline_d, current_pos=0j)
    centroid_obj = path_to_poly(outline_path).centroid
    eurasia_df.loc[row_index, 'centroid_x'] = centroid_obj.x
    eurasia_df.loc[row_index, 'centroid_y'] = centroid_obj.y

In [38]:

# Extract the paths from the SVG
import xml.etree.ElementTree as et
import pandas as pd
import collections

file_path = r'C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\AF_codes_state_equivalent_district_abbreviation.svg'
root = et.parse(file_path).getroot()

# Collect the attribute dictionary of every element in the tree, one row each
rows_list = [el.attrib for el in root.iter()]

# Short column names for the namespaced attributes; 'id' and 'd' get descriptive names
column_names_dict = {
    '{http://www.w3.org/XML/1998/namespace}space': 'namespace',
    '{http://www.inkscape.org/namespaces/inkscape}version': 'inkscape_version',
    '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}docname':'docname',
    '{http://www.inkscape.org/namespaces/inkscape}pageshadow': 'pageshadow',
    '{http://www.inkscape.org/namespaces/inkscape}pageopacity': 'pageopacity',
    '{http://www.inkscape.org/namespaces/inkscape}pagecheckerboard': 'pagecheckerboard',
    '{http://www.inkscape.org/namespaces/inkscape}zoom': 'zoom',
    '{http://www.inkscape.org/namespaces/inkscape}cx': 'cx',
    '{http://www.inkscape.org/namespaces/inkscape}cy': 'cy',
    '{http://www.inkscape.org/namespaces/inkscape}window-width': 'window_width',
    '{http://www.inkscape.org/namespaces/inkscape}window-height': 'window_height',
    '{http://www.inkscape.org/namespaces/inkscape}window-x': 'window_x',
    '{http://www.inkscape.org/namespaces/inkscape}window-y': 'window_y',
    '{http://www.inkscape.org/namespaces/inkscape}window-maximized': 'window_maximized',
    '{http://www.inkscape.org/namespaces/inkscape}current-layer': 'current_layer',
    'id': 'tag_id',
    'd': 'outline_d',
    '{http://www.inkscape.org/namespaces/inkscape}connector-curvature': 'connector_curvature',
    '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}about': 'about',
    '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}resource': 'resource',
    '{http://www.inkscape.org/namespaces/inkscape}label': 'label',
    '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}role': 'role',
    '{http://www.w3.org/1999/xlink}href': 'href',
    '{http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd}insensitive': 'insensitive',
}
df = pd.DataFrame(rows_list).rename(columns=column_names_dict)

# Any remaining hyphenated attribute names (e.g. data-name) become underscore names
df.columns = df.columns.str.replace('-', '_', regex=False)

# Neither renaming step may collapse two attributes onto the same column name
assert df.columns.is_unique, "You doubled up the column names somehow"
sorted(df.columns)

['about', 'bordercolor', 'borderopacity', 'clip_path', 'connector_curvature', 'current_layer', 'cx', 'cy', 'data_id', 'data_name', 'docname', 'height', 'href', 'inkscape_version', 'label', 'namespace', 'outline_d', 'pagecheckerboard', 'pagecolor', 'pageopacity', 'pageshadow', 'resource', 'role', 'showgrid', 'style', 'tag_id', 'transform', 'type', 'version', 'viewBox', 'width', 'window_height', 'window_maximized', 'window_width', 'window_x', 'window_y', 'x', 'y', 'zoom']

In [None]:

# Compute the centroids
from svgpathtools import Line, Path
from shapely.geometry import Polygon
import math
import svgpathtools

# Convert paths to polygons
def path_to_poly(inpath):
    """
    Approximate a parsed SVG path with a shapely Polygon.

    A Line segment contributes only its end point. Any other segment is sampled
    at evenly spaced parameter values from 0 to 1; the number of steps is the
    segment's length rounded up, with a minimum of one.

    All points go into a single ring, so a path made of several subpaths is
    not split into separate polygons.

    Parameters
    ----------
    inpath : iterable of path segments
        Iterated segment by segment. Non-Line segments must provide length()
        and point(t), where point returns a complex number (x + y*1j).

    Returns
    -------
    Polygon
        Built from the collected [x, y] points, in the order they were visited.
    """
    points = []
    for segment in inpath:
        if isinstance(segment, Line):
            points.append([segment.end.real, segment.end.imag])
        else:
            # At least one step, so a zero-length segment cannot divide by zero below
            num_segments = max(1, math.ceil(segment.length() / 1.0))
            for seg_i in range(num_segments + 1):
                # Evaluate the segment once per sample and reuse it for both coordinates
                sample_point = segment.point(seg_i / num_segments)
                points.append([sample_point.real, sample_point.imag])

    return Polygon(points)

# Reload the saved frame, recompute the centroid of every row that has an outline, and save it back
eurasia_df = s.load_object('eurasia_df')
mask_series = eurasia_df.outline_d.isnull()
for row_index, outline_d in eurasia_df[~mask_series].outline_d.items():

    # Parse the path data, approximate it with a polygon, and take that polygon's centroid
    outline_path = svgpathtools.parse_path(pathdef=outline_d, current_pos=0j)
    centroid_obj = path_to_poly(outline_path).centroid
    eurasia_df.loc[row_index, 'centroid_x'] = centroid_obj.x
    eurasia_df.loc[row_index, 'centroid_y'] = centroid_obj.y

s.store_objects(eurasia_df=eurasia_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pickle\eurasia_df.pickle


In [42]:

# Split the elements on whether they carry a data_name value
mask_series = df.data_name.isnull()
named_elements_df = df[~mask_series]
unnamed_elements_df = df[mask_series]

# List every data_name, then show a random sample (at most 8 rows) of the elements
# without one, transposed and with the all-empty columns removed
print(named_elements_df.data_name.tolist())
sample_size = min(8, unnamed_elements_df.shape[0])
unnamed_elements_df.sample(sample_size).dropna(axis='columns', how='all').T

['Yemen', 'Viet Nam', 'Uzbekistan', 'United Kingdom of Great Britain and Northern Ireland', 'United Arab Emirates', 'Ukraine', 'Turkmenistan', 'Turkey', 'Thailand', 'Tajikistan', 'Taiwan, Province of China', 'Syrian Arab Republic', 'Switzerland', 'Sweden', 'Sri Lanka', 'Spain', 'Slovenia', 'Slovakia', 'Serbia', 'Saudi Arabia', 'Russian Federation', 'Romania', 'Qatar', 'Portugal', 'Poland', 'Philippines', 'Papua New Guinea', 'Pakistan', 'Oman', 'Norway', 'North Macedonia', 'Netherlands', 'Nepal', 'Myanmar', 'Montenegro', 'Mongolia', 'Moldova, Republic of', 'Malaysia', 'Luxembourg', 'Lithuania', 'Lebanon', 'Latvia', "Lao People's Democratic Republic", 'Kyrgyzstan', 'Kuwait', 'Korea, Republic of', "Korea (Democratic People's Republic of)", 'Kazakhstan', 'Jordan', 'Japan', 'Italy', 'Israel', 'Ireland', 'Iraq', 'Iran (Islamic Republic of)', 'Indonesia', 'India', 'Iceland', 'Hungary', 'Greece', 'Germany', 'Georgia', 'France', 'Finland', 'Estonia', 'Denmark', 'Czechia', 'Croatia', 'China', 'C

Unnamed: 0,326,362,329,283,348,306,131,356
width,,100%,,,,,,100%
height,,100%,,,,,,100%
tag_id,colorbar_7_text,colorbar_12974_use,DejaVuSans-33,colorbar_12852_defs,DejaVuSans-76,colorbar_12891_g,text-saudi-arabia,colorbar_12962_use
outline_d,,,"m 2597,2516 q 453,-97 707,-404 255,-306 255,-7...",,"M 191,3500 H 800 L 1894,563 2988,3500 h 609 L ...",,,
style,,,,,,,font-style:normal;font-variant:normal;font-wei...,
x,,604.58984,,,,,971.06683,308.10547
y,,0,,,,,645.21332,0
label,,,,,,,Saudi Arabia district abbreviation,
namespace,,,,,,,preserve,
transform,,,scale(0.015625),,scale(0.015625),,,


In [40]:

# Run the project's dataframes.py script so that its definitions are available in this notebook.
# NOTE(review): get_column_descriptions is presumably defined by that script -- confirm.
%run ../load_magic/dataframes.py

# Build a per-column summary of df and show it with the columns that have the
# most unique values first
column_descriptions_df = get_column_descriptions(df)
column_descriptions_df.sort_values(['count_uniques'], ascending=[False])

Unnamed: 0,column_name,dtype,count_blanks,count_uniques,count_zeroes,has_dates,min_value,max_value,only_integers
4,tag_id,object,4,373,0,False,DejaVuSans-20,tspan-yemen0,
24,outline_d,object,268,108,0,False,,"m 983.33239,538.83155 c 2.86996,-3.32901 5.327...",
31,label,object,293,84,0,False,Adjacent Country Backgrounds,Yemen district abbreviation,
33,data_name,object,294,83,0,False,Afghanistan,Yemen,
32,data_id,object,294,83,0,False,AE,YE,
25,style,object,198,43,0,False,display:inline;fill:#c8eafb;stroke-width:2.73218,stroke:#000000;stroke-width:0.80000001;stroke-...,
29,x,object,169,37,0,False,0,995.68799,False
38,href,object,334,24,0,False,#DejaVuSans-20,"data:image/png;base64, iVBORw0KGgoAAAANSUhEUgA...",
30,y,object,169,21,0,False,-52,645.21332,False
36,transform,object,345,12,0,False,"matrix(0,-0.1,-0.1,0,96.123438,273.24891)","translate(1665.5921,409.56979)",
