In [1]:

# Switch pretty printing off explicitly: the bare %pprint magic only toggles the
# current setting, so running this cell a second time would turn it back on
get_ipython().display_formatter.formatters['text/plain'].pprint = False

Pretty printing has been turned OFF


In [2]:

import os
import re
import subprocess
import sys
from os import path as osp

import numpy as np
import pandas as pd
from IPython.display import HTML

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from StatsByUSState import nu, cu, scu, ssu

In [3]:

# Display what the project's nu helper reports; the recorded output is a list of data frame names
nu.list_dfs_in_folder()

['all_countries_df', 'column_descriptions_df', 'counties_df', 'county_populations_df', 'gdp_us_cities_df', 'gdp_us_states_df', 'income_inequality_us_states_df', 'one_country_df', 'race_homicide_rate_by_state_year_df', 'state_merge_df', 'state_race_df', 'usa_df', 'us_murder_df', 'us_perps_df', 'us_states_df', 'us_stats_df']

In [4]:

# Collect the public (non-underscore) attribute names of each utility object,
# qualified with the alias it was imported under
nus_list, cus_list, scus_list, ssus_list = (
    [f'{alias}.{attribute_name}' for attribute_name in dir(utility_obj) if not attribute_name.startswith('_')]
    for alias, utility_obj in (('nu', nu), ('cu', cu), ('scu', scu), ('ssu', ssu))
)

# Sort on the reversed string so that names sharing a suffix end up next to each other
sorted([*nus_list, *cus_list, *scus_list, *ssus_list], key=lambda qualified_name: qualified_name[::-1])

['scu.get_text_rgba', 'cu.get_greatest_area', 'nu.format_timedelta', 'cu.get_fill_color_rgb', 'nu.open_path_in_notepad', 'nu.ensure_module_installed', 'scu.population_pyramid', 'nu.get_euclidean_distance', 'nu.get_turbulence', 'nu.plot_sequence', 'cu.iso_3166_2_code', 'nu.get_dir_tree', 'nu.plot_inauguration_age', 'scu.make_a_movie', 'nu.download_file', 'cu.create_label_line_file', 'nu.attempt_to_pickle', 'cu.copy_file_name', 'cu.add_docname', 'nu.convert_to_data_frame', 'nu.get_regexed_dataframe', 'cu.clean_up_district_merge_dataframe', 'cu.convert_svg_to_dataframe', 'nu.get_shape', 'nu.decoding_type', 'nu.encoding_type', 'ssu.get_similarity_measure', 'scu.get_fontsize', 'cu.all_countries_df', 'ssu.us_stats_df', 'cu.one_country_df', 'ssu.prepare_for_choroplething', 'cu.create_svg_file_beginning', 'cu.indexize_string', 'nu.show_dupl_fn_defs_search_string', 'nu.show_duplicated_util_fns_search_string', 'cu.trim_d_path', 'cu.label_line_file_path', 'nu.get_function_file_path', 'cu.gradient

In [5]:

# Keep only the utility names that mention 'wiki', ordered by their reversed spelling
sorted(
    (qualified_name for qualified_name in nus_list + cus_list + scus_list + ssus_list if 'wiki' in qualified_name),
    key=lambda qualified_name: qualified_name[::-1],
)

['nu.get_wiki_tables']

In [6]:

# Keep only the utility names that end in '_df', ordered by their reversed spelling
sorted(
    (qualified_name for qualified_name in nus_list + cus_list + scus_list + ssus_list if qualified_name.endswith('_df')),
    key=lambda qualified_name: qualified_name[::-1],
)

['cu.all_countries_df', 'ssu.us_stats_df', 'cu.one_country_df']


----
# US State Temperature Extremes


In [7]:

def _clean_wiki_cell(cell_value):
    """Return the cell as text, cut off at the first '[' and with ASCII minus signs."""
    return str(cell_value).split('[')[0].strip().replace('−', '-')


def _get_nth_number(temperature_str, n):
    """Return the nth (zero-based) number in a string such as '120 °F (48.9 °C)' as a float."""

    # Splitting on a capturing group leaves the matched numbers at the odd positions
    return float(re.split(r'([\d.-]+)', str(temperature_str))[2 * n + 1])


# Load the cached table if it has been pickled, otherwise rebuild it from Wikipedia
if nu.pickle_exists('extreme_temperatures_states_df'):
    extreme_temperatures_states_df = nu.load_object('extreme_temperatures_states_df')
else:
    tables_url = 'https://en.wikipedia.org/wiki/U.S._state_and_territory_temperature_extremes'
    page_tables_list = nu.get_wiki_tables(tables_url, verbose=False)

    # Clean column by column; DataFrame.applymap is deprecated since pandas 2.1
    extreme_temperatures_states_df = page_tables_list[0].apply(
        lambda column_series: column_series.map(_clean_wiki_cell)
    )
    extreme_temperatures_states_df.columns = ['state_name', 'record_high', 'high_date', 'high_place', 'record_low', 'low_date', 'low_place', 'temperature_diff']

    # Each temperature cell holds the Fahrenheit value first and the Celsius value second
    for cn in ['record_high', 'record_low', 'temperature_diff']:
        extreme_temperatures_states_df[f'{cn}_fahrenheit'] = extreme_temperatures_states_df[cn].map(lambda x: _get_nth_number(x, 0))
        extreme_temperatures_states_df[f'{cn}_celsius'] = extreme_temperatures_states_df[cn].map(lambda x: _get_nth_number(x, 1))
    extreme_temperatures_states_df = extreme_temperatures_states_df.set_index('state_name')

    # Pickle the result so that the check at the top of this cell succeeds on the next run
    nu.store_objects(extreme_temperatures_states_df=extreme_temperatures_states_df)
extreme_temperatures_states_df.sample(13).T

state_name,Arkansas,Rhode Island,Montana,Minnesota,North Dakota,Wyoming,South Carolina,Illinois,District of Columbia,Hawaii,Missouri,Kansas,Wisconsin
record_high,120 °F (48.9 °C),104 °F (40 °C),117 °F (47.2 °C),115 °F (46.1 °C),121 °F (49.4 °C),115 °F (46.1 °C),113 °F (45 °C),117 °F (47.2 °C),106 °F (41.1 °C),100 °F (37.8 °C),118 °F (47.8 °C),121 °F (49.4 °C),114 °F (45.6 °C)
high_date,"August 10, 1936","August 2, 1975","July 5, 1937","July 29, 1917","July 6, 1936","August 8, 1983","June 29, 2012","July 14, 1954","July 20, 1930","April 27, 1931","July 14, 1954","July 24, 1936","July 13, 1936"
high_place,Ozark,Providence,Medicine Lake,Beardsley,Steele,Basin,Columbia,East Saint Louis,Washington,Pahala,Warsaw,Alton,Wisconsin Dells
record_low,-29 °F (-33.9 °C),-28 °F (-33.3 °C),-70 °F (-56.7 °C),-60 °F (-51.1 °C),-60 °F (-51.1 °C),-63 °F (-52.8 °C),-22 °F (-30 °C),-38 °F (-38.9 °C),-15 °F (-26.1 °C),15 °F (-9.4 °C),-40 °F (-40 °C),-40 °F (-40.0 °C),-55 °F (-48.3 °C)
low_date,"February 13, 1905","January 17, 1942","January 20, 1954","February 2, 1996","February 15, 1936","February 9, 1933","January 21, 1985","January 31, 2019","February 11, 1899","January 5, 1975","February 13, 1905","February 13, 1905","February 4, 1996"
low_place,Gravette,Richmond,Lincoln (Rogers Pass),Tower,Parshall,Moran,Landrum (Hogback Mountain),Mount Carroll,Washington,Mauna Kea Observatories,Warsaw,Lebanon,Couderay
temperature_diff,149 °F (82.8 °C),132 °F (73.3 °C),187 °F (103.9 °C),175 °F (97.2 °C),181 °F (100.6 °C),178 °F (98.9 °C),135 °F (75 °C),155 °F (86.1 °C),121 °F (67.2 °C),83 °F (46 °C),158 °F (87.8 °C),161 °F (89.4 °C),169 °F (93.9 °C)
record_high_fahrenheit,120.0,104.0,117.0,115.0,121.0,115.0,113.0,117.0,106.0,100.0,118.0,121.0,114.0
record_high_celsius,48.9,40.0,47.2,46.1,49.4,46.1,45.0,47.2,41.1,37.8,47.8,49.4,45.6
record_low_fahrenheit,-29.0,-28.0,-70.0,-60.0,-60.0,-63.0,-22.0,-38.0,-15.0,15.0,-40.0,-40.0,-55.0


In [8]:

# Remove US states duplicates and misspellings
states_list = sorted(set(extreme_temperatures_states_df.index).symmetric_difference(set(ssu.us_stats_df.index)))
print(states_list)

['American Samoa', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'Virgin Islands (U.S.)']


In [9]:

# Compare the unmatched names with each other and show any pair whose similarity exceeds 0.6
doubles_df = nu.check_4_doubles(states_list)
mask_series = doubles_df['max_similarity'] > 0.6
columns_list = ['first_item', 'second_item', 'max_similarity']
doubles_df.loc[mask_series, columns_list].sort_values(by='max_similarity', ascending=False)

Unnamed: 0,first_item,second_item,max_similarity


In [10]:

# Merge only once: skip the cell when any temperature column is already in the country data frame
if not set(cu.one_country_df.columns).intersection(set(extreme_temperatures_states_df.columns)):

    # A left join keeps every existing row of cu.one_country_df; the default inner join would
    # drop the rows without a temperature record, and the pickle below would make that permanent
    cu.one_country_df = cu.one_country_df.merge(
        extreme_temperatures_states_df, how='left', left_index=True, right_index=True
    )
    nu.store_objects(one_country_df=cu.one_country_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByUSState\saves\pkl\one_country_df.pkl



## Prepare for and Create Choropleths

In [14]:

# Close the Notepad++ window after you're finished in order to completely run this cell
numeric_column_name = 'temperature_diff_celsius'
string_column_name = 'district_abbreviation'
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
if (numeric_column_name in cu.one_country_df.columns) and (string_column_name in cu.one_country_df.columns):

    # Create the colored, labeled map and remember where it was written for the following cells
    svg_file_path = os.path.abspath(cu.create_country_colored_labeled_map(
        numeric_column_name,
        string_column_name=string_column_name,
        one_country_df=cu.one_country_df,
        cmap='viridis',
    ))
    print(svg_file_path)

    # Only launch the editor where it is installed, so a missing Notepad++ cannot abort
    # the cell after the SVG has already been created; the call blocks until it is closed
    if osp.isfile(text_editor_path):
        print(subprocess.call([text_editor_path, svg_file_path]))
    else:
        print(f'Text editor not found at {text_editor_path}; skipping the manual review of the SVG')
else:

    # Say why no map was made instead of silently leaving svg_file_path undefined
    print(f'Add {numeric_column_name} and {string_column_name} to cu.one_country_df before creating the map')

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByUSState\saves\svg\US_temperature_diff_celsius_district_abbreviation.svg
0


In [15]:

inkscape_path = r'C:\Program Files\Inkscape\bin\inkscape.exe'

# Swap only the containing 'svg' folder and the file extension; a blanket
# str.replace('svg', 'png') would also rewrite any other 'svg' in the absolute path
svg_folder, svg_file_name = osp.split(svg_file_path)
png_folder = osp.join(osp.dirname(svg_folder), 'png') if osp.basename(svg_folder) == 'svg' else svg_folder
png_file_path = osp.join(png_folder, osp.splitext(svg_file_name)[0] + '.png')
os.makedirs(osp.dirname(png_file_path), exist_ok=True)
print(png_file_path)

# The displayed value is the Inkscape exit code (0 means success)
subprocess.call([inkscape_path, svg_file_path, f'--export-filename={png_file_path}'])

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByUSState\saves\png\US_temperature_diff_celsius_district_abbreviation.png


0

In [21]:

# Reference the image relative to the current working directory. osp.relpath works on whole
# path components, whereas osp.commonprefix compares character by character and can split a
# folder name in half; it also supplies the right number of '..' segments
relative_path = osp.relpath(png_file_path, start=os.getcwd())
HTML('<img src="' + relative_path.replace(os.sep, '/') + '" style="width:50%"/>')