In [1]:

# Turn pretty printing OFF explicitly. The %pprint magic is a toggle, so
# re-running that cell would silently switch pretty printing back ON;
# assigning the flag directly is idempotent under Restart & Run All.
get_ipython().display_formatter.formatters['text/plain'].pprint = False

Pretty printing has been turned OFF


In [2]:

import os
import re
import subprocess
import sys
from os import path as osp

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

import numpy as np
import pandas as pd

from StatsByUSState import nu, cu, scu, ssu

In [3]:

# List the dataframe names reported by the notebook utilities
# (presumably the pickles available in the saves folder -- confirm in nu)
nu.list_dfs_in_folder()

['all_countries_df', 'column_descriptions_df', 'counties_df', 'county_populations_df', 'gdp_us_cities_df', 'gdp_us_states_df', 'income_inequality_us_states_df', 'one_country_df', 'race_homicide_rate_by_state_year_df', 'state_merge_df', 'state_race_df', 'usa_df', 'us_murder_df', 'us_perps_df', 'us_states_df', 'us_stats_df']

In [4]:

# Build one "alias.attribute" list per utility object, skipping names that
# start with an underscore
nus_list, cus_list, scus_list, ssus_list = (
    [f'{alias}.{attribute_name}' for attribute_name in dir(utility) if not attribute_name.startswith('_')]
    for alias, utility in (('nu', nu), ('cu', cu), ('scu', scu), ('ssu', ssu))
)

# Sort on the reversed string so that names sharing a suffix sit together
sorted([*nus_list, *cus_list, *scus_list, *ssus_list], key=lambda qualified_name: qualified_name[::-1])

['scu.get_text_rgba', 'cu.get_greatest_area', 'nu.format_timedelta', 'cu.get_fill_color_rgb', 'nu.open_path_in_notepad', 'nu.ensure_module_installed', 'scu.population_pyramid', 'nu.get_euclidean_distance', 'nu.get_turbulence', 'nu.plot_sequence', 'cu.iso_3166_2_code', 'nu.get_dir_tree', 'nu.plot_inauguration_age', 'scu.make_a_movie', 'nu.download_file', 'cu.create_label_line_file', 'nu.attempt_to_pickle', 'cu.copy_file_name', 'cu.add_docname', 'nu.convert_to_data_frame', 'nu.get_regexed_dataframe', 'cu.clean_up_district_merge_dataframe', 'cu.convert_svg_to_dataframe', 'nu.get_shape', 'nu.decoding_type', 'nu.encoding_type', 'ssu.get_similarity_measure', 'scu.get_fontsize', 'cu.all_countries_df', 'ssu.us_stats_df', 'cu.one_country_df', 'ssu.prepare_for_choroplething', 'cu.create_svg_file_beginning', 'cu.indexize_string', 'nu.show_dupl_fn_defs_search_string', 'nu.show_duplicated_util_fns_search_string', 'cu.trim_d_path', 'cu.label_line_file_path', 'nu.get_function_file_path', 'cu.gradient

In [5]:

# Utility names mentioning "wiki", ordered by their reversed spelling
sorted(
    (qualified_name for qualified_name in nus_list + cus_list + scus_list + ssus_list if 'wiki' in qualified_name),
    key=lambda qualified_name: qualified_name[::-1],
)

['nu.get_wiki_tables']

In [6]:

# Utility attributes whose name ends in "_df", ordered by their reversed spelling
sorted(
    (qualified_name for qualified_name in nus_list + cus_list + scus_list + ssus_list if qualified_name.endswith('_df')),
    key=lambda qualified_name: qualified_name[::-1],
)

['cu.all_countries_df', 'ssu.us_stats_df', 'cu.one_country_df']


----
# US State Temperature Extremes


In [7]:

# Load the cached temperature extremes when a pickle exists; otherwise scrape
# the Wikipedia table, parse it, and cache the result so the next run can
# take the fast path
if nu.pickle_exists('extreme_temperatures_states_df'):
    extreme_temperatures_states_df = nu.load_object('extreme_temperatures_states_df')
else:
    tables_url = 'https://en.wikipedia.org/wiki/U.S._state_and_territory_temperature_extremes'
    page_tables_list = nu.get_wiki_tables(tables_url, verbose=False)

    def clean_cell(cell_value):
        """Cut the text at the first "[" (footnote marker), trim whitespace, and turn the Unicode minus into "-"."""
        return str(cell_value).split('[')[0].strip().replace('−', '-')

    def get_temperature(cell_value, position):
        """Return the number at the given position in a cell such as "110 °F (43.3 °C)" as a float."""
        return float(re.findall(r'[\d.-]+', str(cell_value))[position])

    # Mapping each column is element-wise like DataFrame.applymap, without
    # relying on a method that newer pandas versions deprecate
    extreme_temperatures_states_df = page_tables_list[0].apply(
        lambda column_series: column_series.map(clean_cell)
    )
    extreme_temperatures_states_df.columns = [
        'state_name', 'record_high', 'high_date', 'high_place',
        'record_low', 'low_date', 'low_place', 'temperature_diff',
    ]

    # The first number in each cell is the Fahrenheit value, the second is Celsius
    for cn in ['record_high', 'record_low', 'temperature_diff']:
        extreme_temperatures_states_df[f'{cn}_fahrenheit'] = extreme_temperatures_states_df[cn].map(
            lambda x: get_temperature(x, 0)
        )
        extreme_temperatures_states_df[f'{cn}_celsius'] = extreme_temperatures_states_df[cn].map(
            lambda x: get_temperature(x, 1)
        )
    extreme_temperatures_states_df = extreme_temperatures_states_df.set_index('state_name')

    # Without this the pickle_exists check above can never succeed
    nu.store_objects(extreme_temperatures_states_df=extreme_temperatures_states_df)
extreme_temperatures_states_df.sample(13).T

state_name,New Jersey,New Hampshire,Alabama,Kentucky,American Samoa,Northern Mariana Islands,Puerto Rico,Nevada,New York,Washington,Ohio,Kansas,North Carolina
record_high,110 °F (43.3 °C),106 °F (41.1 °C),112 °F (44.4 °C),116 °F (46.7 °C),99 °F (37.2 °C),99 °F (37.2 °C),104 °F (40 °C),125 °F (51.7 °C),108 °F (42.2 °C),120 °F (48.9 °C),113 °F (45 °C),121 °F (49.4 °C),110 °F (43.3 °C)
high_date,"July 10, 1936","July 4, 1911","September 6, 1925","July 28, 1930","April 27, 1972","July 8, 2010","July 2, 1996","June 29, 1994","July 22, 1926","June 29, 2021","July 21, 1934","July 24, 1936","August 21, 1983"
high_place,Runyon,Nashua,Centreville,Louisville,Malaeloa/Aitulagi,Saipan,Isla de Mona (Mayagüez),Laughlin,Troy,Hanford Site,Gallipolis,Alton,Fayetteville
record_low,-34 °F (-36.7 °C),-50 °F (-45.6 °C),-27 °F (-32.8 °C),-37 °F (-38.3 °C),53 °F (11.7 °C),62 °F (16.7 °C),40 °F (4.4 °C),-50 °F (-45.6 °C),-52 °F (-46.7 °C),-48 °F (-44.4 °C),-39 °F (-39.4 °C),-40 °F (-40.0 °C),-34 °F (-36.7 °C)
low_date,"January 5, 1904","January 22, 1885","January 30, 1966","January 19, 1994","March 28, 1962","December 15, 2000","January 24, 1966","January 8, 1937","February 18, 1979","December 30, 1968","February 10, 1899","February 13, 1905","January 21, 1985"
low_place,River Vale,Mount Washington,New Market,Shelbyville,Malaeloa/Aitulagi,Saipan,San Sebastián,San Jacinto,Old Forge,Winthrop,Milligan,Lebanon,Burnsville
temperature_diff,144 °F (80 °C),156 °F (86.7 °C),139 °F (77.2 °C),151 °F (83.9 °C),46 °F (25.6 °C),37 °F (20.6 °C),64 °F (35.6 °C),175 °F (97.2 °C),160 °F (88.9 °C),168 °F (93.3 °C),152 °F (84.4 °C),161 °F (89.4 °C),144 °F (80 °C)
record_high_fahrenheit,110.0,106.0,112.0,116.0,99.0,99.0,104.0,125.0,108.0,120.0,113.0,121.0,110.0
record_high_celsius,43.3,41.1,44.4,46.7,37.2,37.2,40.0,51.7,42.2,48.9,45.0,49.4,43.3
record_low_fahrenheit,-34.0,-50.0,-27.0,-37.0,53.0,62.0,40.0,-50.0,-52.0,-48.0,-39.0,-40.0,-34.0


In [8]:

# List the names that appear in only one of the two indexes, as candidates
# for duplicate or misspelled state names
temperature_state_names = set(extreme_temperatures_states_df.index)
stats_state_names = set(ssu.us_stats_df.index)
states_list = sorted(temperature_state_names ^ stats_state_names)
print(states_list)

['American Samoa', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'Virgin Islands (U.S.)']


In [9]:

# Compare the unmatched names with each other and show only the pairs whose
# similarity score is above 0.6, most similar first
doubles_df = nu.check_4_doubles(states_list)
columns_list = ['first_item', 'second_item', 'max_similarity']
mask_series = doubles_df.max_similarity > 0.6
doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False)

Unnamed: 0,first_item,second_item,max_similarity


In [10]:

# Add the temperature columns to the country dataframe once; the column
# overlap check keeps a re-run from merging (and suffixing) a second time
if not set(cu.one_country_df.columns).intersection(set(extreme_temperatures_states_df.columns)):

    # A left join keeps every existing district row. The default inner join
    # would drop any district lacking a temperature record, and the reduced
    # frame would then be pickled over the original.
    cu.one_country_df = cu.one_country_df.merge(
        extreme_temperatures_states_df, how='left', left_index=True, right_index=True
    )
    nu.store_objects(one_country_df=cu.one_country_df)

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByUSState\saves\pkl\one_country_df.pkl



## Prepare for and Create Choropleths

In [11]:

# Display the public cu names collected from dir(cu) to find the map-creation helpers
cus_list

['cu.add_docname', 'cu.all_countries_df', 'cu.axes_str', 'cu.clean_up_district_common_dict', 'cu.clean_up_district_merge_dataframe', 'cu.clean_up_district_unique_dict', 'cu.clean_up_suggestion_list_dict', 'cu.color_distance_from', 'cu.convert_svg_to_dataframe', 'cu.copy_file_name', 'cu.copy_file_path', 'cu.create_country_colored_labeled_map', 'cu.create_country_colored_map', 'cu.create_country_labeled_map', 'cu.create_district_first_dict', 'cu.create_label_line_file', 'cu.create_suggestion_list_dictionary', 'cu.create_svg_file_beginning', 'cu.create_text_tag_xml', 'cu.create_us_google_suggest_labeled_map', 'cu.district_path_str', 'cu.figure_str', 'cu.fill_style_prefix', 'cu.fill_style_str', 'cu.get_colorbar_xml', 'cu.get_column_description', 'cu.get_fill_color', 'cu.get_fill_color_rgb', 'cu.get_google_suggestion_list', 'cu.get_greatest_area', 'cu.get_legend_xml', 'cu.get_style_list', 'cu.get_tfidf_lists', 'cu.get_tspan_list', 'cu.gradient_file_path', 'cu.gradient_str', 'cu.has_no_limit

In [12]:

# Create the colored choropleth SVG for one numeric column, then open it in
# the text editor when that editor is installed.
# Close the Notepad++ window after you're finished in order to completely run this cell
numeric_column_name = 'temperature_diff_celsius'
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
if (numeric_column_name in cu.one_country_df.columns):
    svg_file_path = os.path.abspath(cu.create_country_colored_map(
        numeric_column_name,
        one_country_df=cu.one_country_df,
        cmap='viridis',
        min=None,
        max=None,
    ))

    # An argument list avoids shell quoting of the interpolated paths; the
    # existence check keeps the cell from failing where the editor is absent
    if osp.isfile(text_editor_path):
        subprocess.call([text_editor_path, svg_file_path])

In [26]:

# Column names that mention an abbreviation, matched case-insensitively
list(filter(lambda column_name: 'abbrev' in column_name.lower(), cu.one_country_df.columns))

['district_abbreviation']

In [30]:

# Create the colored and labeled choropleth SVG, print its path, then open it
# in the text editor when that editor is installed.
# Close the Notepad++ window after you're finished in order to completely run this cell
numeric_column_name = 'temperature_diff_celsius'
string_column_name = 'district_abbreviation'
text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
if (numeric_column_name in cu.one_country_df.columns) and (string_column_name in cu.one_country_df.columns):
    svg_file_path = os.path.abspath(cu.create_country_colored_labeled_map(
        numeric_column_name,
        string_column_name=string_column_name,
        one_country_df=cu.one_country_df,
        cmap='viridis',
    ))
    print(svg_file_path)

    # subprocess.call raises FileNotFoundError for a missing executable, so
    # only launch the editor (and print its exit code) when it exists
    if osp.isfile(text_editor_path):
        print(subprocess.call([text_editor_path, svg_file_path]))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByUSState\saves\svg\US_temperature_diff_celsius_district_abbreviation.svg
0


In [31]:

# Render the SVG created above to a PNG with Inkscape
inkscape_path = r'C:\Program Files\Inkscape\bin\inkscape.exe'

# Swap only the enclosing "svg" folder and the file extension; a blanket
# str.replace('svg', 'png') would also rewrite any other "svg" found in the
# absolute path or in the file name
svg_folder, svg_file_name = osp.split(svg_file_path)
if osp.basename(svg_folder) == 'svg':
    png_folder = osp.join(osp.dirname(svg_folder), 'png')
else:
    png_folder = svg_folder
png_file_path = osp.join(png_folder, osp.splitext(svg_file_name)[0] + '.png')

os.makedirs(png_folder, exist_ok=True)
print(png_file_path)

# The exit code of the Inkscape process is shown as the cell's output
subprocess.call([inkscape_path, svg_file_path, f'--export-filename={png_file_path}'])

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByUSState\saves\png\US_temperature_diff_celsius_district_abbreviation.png


0


<img src="../saves/png/US_temperature_diff_celsius_district_abbreviation.png" style="width:50%"/>