
# Set up the notebook

In [1]:

%pprint

Pretty printing has been turned OFF


In [2]:

import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage

# Shared helpers: `s` loads/stores pickled objects and CSVs, `ssu` provides the table-scraping
# and column-description utilities used throughout the notebook.
# NOTE(review): later cells call methods on an object named `c` (e.g. c.one_country_df,
# c.create_country_colored_map), but no cell in this notebook creates it. ChoroplethUtilities is
# imported above and never instantiated -- presumably `c` should be built here; confirm its
# constructor arguments before adding it.
s = Storage()
ssu = StatsScrapingUtilities(s=s)

In [52]:

# Load the cached column-description lookup and the per-state statistics frame
column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

# Re-key the countries frame by country_code, normalize the country names through
# ssu.country_name_dict (names without an entry pass through unchanged), and persist
# the result with country_code restored as an ordinary column
all_countries_df = s.load_object('all_countries_df').set_index('country_code', drop=True)
all_countries_df['country_name'] = [
    ssu.country_name_dict.get(country_name, country_name)
    for country_name in all_countries_df['country_name']
]
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



---
# Choropleths

In [13]:

# Describe every column of the single-country frame, then print the names of the
# object-typed columns, a blank line, and the names of all remaining columns
column_descriptions_df = ssu.get_column_descriptions(c.one_country_df)
mask_series = column_descriptions_df['dtype'] == 'object'
print(
    column_descriptions_df.loc[mask_series, 'column_name'].tolist(),
    '',
    column_descriptions_df.loc[~mask_series, 'column_name'].tolist(),
    sep='\n',
)

['district_abbreviation', 'State_Region', 'Google_Suggest_Unique', 'label_line_d', 'Google_Suggest_Common', 'Google_Suggest_First', 'outline_d', 'Country_Equivalent_GDP', 'Country_Equivalent_Military_Expenditure', 'state_color', 'centroid_id']

['White_Percent', 'Black_Percent', 'Hispanic_Percent', 'Asian_Percent', 'Native_Percent', 'Islander_Percent', 'Multi_Percent', 'Gini_Index', 'Effectiveness_Rank', 'Health_Care_Score', 'Education_Score', 'Economy_Score', 'Infrastructure_Score', 'Opportunity_Score', 'Fiscal_Stability_Score', 'Crime_Corrections_Score', 'Natural_Environment_Score', 'GDP_Rank', 'GDP_2018', 'GDP_Percent', 'Homicide_Rate_2018', 'Homicide_Rate_2017', 'Homicide_Rate_2014', 'Homicide_Rate_2010', 'Homicide_Rate_2005', 'Homicide_Rate_2000', 'Homicide_Rate_1996', 'Guns_Rank', 'Guns_Per_Capita', 'Guns_Registered', 'Suicide_Rate_2017', 'Suicide_Deaths_2017', 'Suicide_Rate_2016', 'Suicide_Deaths_2016', 'Suicide_Rate_2015', 'Suicide_Deaths_2015', 'Suicide_Rate_2014', 'Suicide_De

In [10]:

# Report every pickled data frame in the saves folder that carries a district_abbreviation column
for file_name in os.listdir(s.saves_pickle_folder):
    if not file_name.endswith('_df.pkl'):
        continue
    df_name = file_name.split('.')[0]
    df = s.load_object(df_name)
    columns_list = df.columns.tolist()
    if 'district_abbreviation' not in columns_list:
        continue
    print(f'{df_name}: {columns_list}')

us_stats_df: ['White_Percent', 'Black_Percent', 'Hispanic_Percent', 'Asian_Percent', 'Native_Percent', 'Islander_Percent', 'Multi_Percent', 'Gini_Index', 'Effectiveness_Rank', 'Health_Care_Score', 'Education_Score', 'Economy_Score', 'Infrastructure_Score', 'Opportunity_Score', 'Fiscal_Stability_Score', 'Crime_Corrections_Score', 'Natural_Environment_Score', 'district_abbreviation', 'GDP_Rank', 'GDP_2018', 'GDP_Percent', 'State_Region', 'Homicide_Rate_2018', 'Homicide_Rate_2017', 'Homicide_Rate_2014', 'Homicide_Rate_2010', 'Homicide_Rate_2005', 'Homicide_Rate_2000', 'Homicide_Rate_1996', 'Guns_Rank', 'Guns_Per_Capita', 'Guns_Registered', 'Suicide_Rate_2017', 'Suicide_Deaths_2017', 'Suicide_Rate_2016', 'Suicide_Deaths_2016', 'Suicide_Rate_2015', 'Suicide_Deaths_2015', 'Suicide_Rate_2014', 'Suicide_Deaths_2014', 'Suicide_Rate_2005', 'Suicide_Deaths_2005', 'Total_Inhabitants_2010', 'Inhabitants_Per_Square_Mile_2010', 'Total_Murder_Deaths_2010', 'Total_Gun_Murder_Deaths_2010', 'Gun_Ownershi

In [11]:

text_editor_path = r'C:\Program Files\Notepad++\notepad++.exe'
inkscape_path = r'C:\Program Files\Inkscape\bin\inkscape.exe'

In [None]:

column_name = 'Estimated_IQ'
if column_name in us_stats_df.columns:
    svg_file_path = c.create_country_colored_map(column_name)
    !"{text_editor_path}" "{os.path.abspath(svg_file_path)}"

In [None]:

column_name = 'White_Percent'
if column_name in us_stats_df.columns:
    svg_file_path = os.path.abspath(c.create_country_colored_map(column_name))
    !"{text_editor_path}" "{svg_file_path}"
    # !"{inkscape_path}" window-open "{svg_file_path}"

In [None]:

column_name = 'Percent_Whites_in_Non_Public_Education'
if column_name in us_stats_df.columns:
    svg_file_path = os.path.abspath(c.create_country_colored_map(column_name))
    !"{text_editor_path}" "{svg_file_path}"
    # !"{inkscape_path}" window-open "{svg_file_path}"

In [None]:

if 'centroid_id' not in us_stats_df.columns:
    us_stats_df['centroid_id'] = us_stats_df.index.map(lambda x: ('district-' + c.indexize_string(x)).replace('-district', ''))
    s.store_objects(us_stats_df=us_stats_df)

In [None]:

if 'dy' not in us_stats_df.columns:
    us_stats_df['dy'] = np.nan
    s.store_objects(us_stats_df=us_stats_df)

In [None]:

string_column_name = 'State_Region'
if string_column_name in us_stats_df.columns:
    c.create_label_line_file()
    svg_file_path = os.path.abspath(c.create_country_labeled_map(string_column_name=string_column_name,
                                                                 one_country_df=c.one_country_df))
    !"{text_editor_path}" "{svg_file_path}"
    !"{inkscape_path}" window-open "{svg_file_path}"

In [14]:

state_color_dict = s.load_object('us_state_name_color_dict')
string_column_name = 'Google_Suggest_First'
if string_column_name in us_stats_df.columns:
    svg_file_path = os.path.abspath(c.create_country_labeled_map(string_column_name=string_column_name,
                                                                 one_country_df=c.one_country_df))
    !"{text_editor_path}" "{svg_file_path}"

In [None]:

import matplotlib.pyplot as plt

# Build a viridis colormap with one color per distinct State_Budget_Processes value.
# pyplot.get_cmap is used because matplotlib.cm.get_cmap was deprecated in 3.7 and later removed.
budget_series = c.one_country_df.State_Budget_Processes
ListedColormap_obj = plt.get_cmap('viridis', len(budget_series.unique()))

# Min-max normalize to [0, 1]; the bounds get their own names so the builtins
# min() and max() are not rebound for the rest of the notebook session
min_value = budget_series.min()
max_value = budget_series.max()
normed_series = (budget_series - min_value) / (max_value - min_value)

# Show the RGBA tuple and its #rrggbbaa hex form for one randomly chosen state (skipped if it is NaN)
sample_value = normed_series.sample(1).tolist()[0]
if pd.notna(sample_value):
    rgba_tuple = ListedColormap_obj(sample_value)
    print(rgba_tuple, '#{:02x}{:02x}{:02x}{:02x}'.format(*tuple(int(x*255) for x in rgba_tuple)))

In [None]:

print(['c.one_country_df.{}'.format(fn) for fn in dir(c.one_country_df) if 'sort' in fn.lower()])

In [None]:

!start %windir%\explorer.exe "{c.svg_dir}"

In [None]:

for column_name in c.one_country_df.columns:
    svg_file_path = c.create_country_colored_map(column_name=column_name)

In [None]:

print(['c.one_country_df.{}'.format(cn) for cn in c.one_country_df.columns if ('gun' in cn.lower()) and ('murder' in cn.lower())])


---
# O Canada!

In [9]:

page_tables_list = ssu.get_page_tables('https://en.wikipedia.org/wiki/Demographics_of_Canada')

[(22, (140, 8)), (14, (122, 9)), (26, (78, 2)), (27, (37, 2)), (4, (28, 11)), (0, (26, 2)), (5, (25, 12)), (28, (25, 2)), (17, (24, 3)), (13, (22, 5)), (35, (21, 2)), (12, (18, 11)), (25, (18, 3)), (18, (17, 11)), (1, (14, 11)), (41, (13, 2)), (16, (11, 3)), (2, (10, 10)), (3, (10, 10)), (6, (8, 3)), (20, (8, 5)), (10, (7, 3)), (7, (6, 3)), (8, (6, 3)), (23, (6, 3)), (24, (6, 3)), (32, (6, 2)), (11, (5, 3)), (21, (5, 5)), (29, (5, 2)), (31, (5, 2)), (9, (4, 3)), (19, (4, 11)), (34, (4, 2)), (15, (3, 4)), (30, (3, 2)), (33, (3, 2)), (36, (2, 2)), (37, (2, 2)), (38, (2, 2)), (39, (2, 2)), (40, (2, 2)), (42, (2, 2))]


In [None]:

# Table 14 of the scraped page: flatten its two-level header and strip footnote markers ("[...]")
canada_races_df = page_tables_list[14].copy()
canada_races_df.columns = [
    column_label.split('[')[0].strip() for column_label in canada_races_df.columns.droplevel(0)
]

# Index by province/territory name with the footnote markers removed
canada_races_df['Province/territory'] = [
    province_name.split('[')[0] for province_name in canada_races_df['Province/territory']
]
canada_races_df = canada_races_df.set_index('Province/territory', drop=True)

# Parse "12.3%"-style strings to floats and derive the complementary percentage
canada_races_df['Percent visible minority'] = [
    float(str(cell_value).split('%')[0]) for cell_value in canada_races_df['Percent visible minority']
]
canada_races_df['Percent_White'] = 100.0 - canada_races_df['Percent visible minority']

canada_races_df = canada_races_df.dropna(axis='columns', how='all')
canada_races_df.sample(5).T

In [None]:

s.store_objects(canada_races_df=canada_races_df)


---
# Add New Columns

In [None]:

# Relabel every column of us_stats_df by POSITION and persist the result.
# NOTE(review): this assignment does not match on existing names -- pandas raises if the number of
# labels differs from the number of columns, and silently mislabels data if the stored column order
# ever differs from this list. Other cells add columns such as 'centroid_id' and 'dy' that are not
# listed here, so confirm the stored frame's columns before re-running this cell.
us_stats_df.columns = ['White_Percent', 'Black_Percent', 'Hispanic_Percent', 'Asian_Percent', 'Native_Percent', 'Islander_Percent',
                       'Multi_Percent', 'Gini_Index', 'Effectiveness_Rank', 'Health_Care_Score', 'Education_Score',
                       'Economy_Score', 'Infrastructure_Score', 'Opportunity_Score', 'Fiscal_Stability_Score',
                       'Crime_Corrections_Score', 'Natural_Environment_Score', 'district_abbreviation', 'GDP_Rank', 'GDP_2018',
                       'GDP_Percent', 'State_Region', 'Homicide_Rate_2018', 'Homicide_Rate_2017', 'Homicide_Rate_2014',
                       'Homicide_Rate_2010', 'Homicide_Rate_2005', 'Homicide_Rate_2000', 'Homicide_Rate_1996', 'Guns_Rank',
                       'Guns_Per_Capita', 'Guns_Registered', 'Suicide_Rate_2017', 'Suicide_Deaths_2017', 'Suicide_Rate_2016',
                       'Suicide_Deaths_2016', 'Suicide_Rate_2015', 'Suicide_Deaths_2015', 'Suicide_Rate_2014',
                       'Suicide_Deaths_2014', 'Suicide_Rate_2005', 'Suicide_Deaths_2005',
                       'Total_Inhabitants_2010', 'Inhabitants_Per_Square_Mile_2010', 'Total_Murder_Deaths_2010',
                       'Total_Gun_Murder_Deaths_2010', 'Gun_Ownership_Percent_2013', 'Murder_Rate_2010', 'Gun_Murder_Rate_2010',
                       'State_FIPS', 'State_Population', 'Gun_Suicide_Deaths', 'Gun_Suicide_Rate', 'Google_Suggest_Unique', 'text_x',
                       'text_y', 'label_line_d', 'Google_Suggest_Common', 'Google_Suggest_First',
                       'Public_Access_to_Information', 'Political_Financing', 'Electoral_Oversight',
                       'Executive_Accountability', 'Legislative_Accountability', 'Judicial_Accountability',
                       'State_Budget_Processes', 'State_Civil_Service_Management', 'Procurement', 'Internal_Auditing',
                       'Lobbying_Disclosure', 'Ethics_Enforcement_Entities', 'State_Pension_Fund_Management', 'outline_d',
                       'centroid_x', 'centroid_y']
s.store_objects(us_stats_df=us_stats_df)

In [None]:

import xml.etree.ElementTree as et

# Read the US map SVG and collect each top-level <path>'s outline (its "d" attribute),
# keyed by the state name held in the path's data-name attribute
file_path = os.path.join(s.data_folder, 'svg', 'us.svg')
us_stats_df = s.load_object('us_stats_df')
root = et.parse(file_path).getroot()
outline_d_dict = {}
for tag in root:
    # Tags are namespace-qualified ("{namespace}path"), so compare only the local name
    if tag.tag.split('}')[-1] == 'path':
        outline_d_dict[tag.attrib['data-name']] = tag.attrib['d']

# Attach the outlines as a new column aligned on the state-name index. pd.concat replaces the
# original transpose/append/transpose round-trip: DataFrame.append was removed in pandas 2.0,
# and transposing a mixed-type frame cast every existing column to object dtype.
df = pd.Series(outline_d_dict, name='outline_d').to_frame()
us_stats_df = pd.concat([us_stats_df, df], axis='columns')

# Peek at the five right-most columns for seven random states
us_stats_df.iloc[:, -5:].sample(7).T

In [None]:

s.store_objects(us_stats_df=us_stats_df)


---

In [None]:

file_name = 'State_Integrity_2015_Full_Dataset.xlsx'
excel_path = os.path.join(s.data_folder, 'xlsx', file_name)
sheet_df_dict = pd.read_excel(excel_path, sheet_name=None)

In [None]:

# Turn each State Integrity worksheet into one numeric column of us_stats_df:
# the mean of that state's scored answers on the sheet.
us_stats_df = s.load_object('us_stats_df')
answer_levels_list = ['no', 'moderate', 'yes']  # scored as 0, 50 and 100 respectively
for sheet_name, sheet_df in sheet_df_dict.items():
    column_name = '_'.join(sheet_name.strip().split(' '))
    df = sheet_df.copy()

    # The first data row holds the real labels of the three leading (unnamed) columns
    index_columns = df.loc[0, ['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2']].tolist()
    df.columns = index_columns + df.columns.tolist()[3:]
    df = df.set_index(keys=index_columns)

    cn_dict = {state_name: [] for state_name in us_stats_df.index}

    # The original cell read an undefined `states_list`; derive it from the sheet itself:
    # the columns whose (whitespace-stripped) label is a state present in us_stats_df
    states_list = [cn for cn in df.columns if str(cn).strip() in cn_dict]

    for index_tuple, row_series in df[states_list].iterrows():
        # Only rows whose second index level is a number are scored
        if not str(index_tuple[1]).isdigit():
            continue
        # Series.iteritems was removed in pandas 2.0; items() is the supported spelling
        for state_name, column_value in row_series.items():
            state_name = str(state_name).strip()
            # Cells may be ints, floats or NaN as well as text, so normalize to text first
            column_value = str(column_value).strip()
            if column_value.isdigit():
                # Store a number, not the digit string, so the mean below can be computed
                cn_dict[state_name].append(int(column_value))
            elif column_value.lower() in answer_levels_list:
                cn_dict[state_name].append(answer_levels_list.index(column_value.lower())*50)

    # Mean score per state; a state with no scored answers gets NaN rather than a misleading 0
    cn_dict = {state_name: (sum(scores_list) / len(scores_list) if scores_list else np.nan)
               for state_name, scores_list in cn_dict.items()}
    df = pd.Series(cn_dict, name=column_name).to_frame()

    # DataFrame.append was removed in pandas 2.0; concat also keeps the existing column dtypes
    us_stats_df = pd.concat([us_stats_df, df], axis='columns')

In [None]:

us_stats_df.T.tail(20).T.sample(8).T


---
# Get Correlations ("P-Hunting")

In [None]:

def get_correlation_dataframe(numeric_columns_list, y_column_list=None, stats_df=None):
    """Find the column pairs whose Pearson correlation is statistically significant.

    Every column in ``numeric_columns_list`` is paired with every column in
    ``y_column_list`` (a column is never paired with itself). Rows holding a missing
    value in either column are dropped before the correlation is computed, and a pair
    is kept only when its p-value is below 0.05.

    Parameters
    ----------
    numeric_columns_list : list of str
        Column names used as the left-hand side of each pair.
    y_column_list : list of str, optional
        Column names used as the right-hand side. Defaults to ``numeric_columns_list``,
        i.e. all ordered pairs within one list (so both (a, b) and (b, a) are reported).
    stats_df : pandas.DataFrame, optional
        Frame holding the columns. Defaults to the notebook-level ``us_stats_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_column``, ``right_column`` and ``pearson_r`` (the absolute
        value of r), one row per significant pair.
    """
    # Imported here so the function works even when called before the cell that
    # imports scipy.stats; previously the resulting NameError was swallowed below
    from scipy import stats

    if y_column_list is None:
        y_column_list = numeric_columns_list
    if stats_df is None:
        stats_df = us_stats_df

    rows_list = []
    for x_column in numeric_columns_list:
        for y_column in y_column_list:
            if x_column == y_column:
                continue
            df = stats_df[[x_column, y_column]].dropna()
            try:
                r_tuple = stats.pearsonr(df[x_column].values, df[y_column].values)
            except Exception as e:
                # Best effort: report pairs that cannot be correlated (e.g. too few rows) and move on
                print('{} and {} get an error: {}'.format(x_column, y_column, e))
                continue
            if r_tuple[1] < 0.05:
                rows_list.append({'left_column': x_column, 'right_column': y_column,
                                  'pearson_r': abs(r_tuple[0])})

    return pd.DataFrame(rows_list, columns=['left_column', 'right_column', 'pearson_r'])

In [None]:

column_descriptions_df = ssu.get_column_descriptions(df=us_stats_df)
column_descriptions_df.columns

In [None]:

# Correlate every pair of columns whose described dtype is float32
# NOTE(review): later cells select 'int64'/'float64' instead -- confirm 'float32' is what
# get_column_descriptions actually reports for these columns
mask_series = column_descriptions_df['dtype'] == 'float32'
columns_list = column_descriptions_df.loc[mask_series, 'column_name'].tolist()
correlation_df = get_correlation_dataframe(columns_list)

In [None]:

mask_series = (correlation_df.pearson_r > 0.95)
correlation_df[mask_series].sort_values('pearson_r', ascending=False).left_column.tolist()

In [None]:

def get_correlation_dataframe(x_column_list, y_column_list=None, stats_df=None):
    """Find the (x, y) column pairs whose Pearson correlation is statistically significant.

    Every column in ``x_column_list`` is paired with every column in ``y_column_list``
    (a column is never paired with itself). Rows holding a missing value in either
    column are dropped before the correlation is computed, and a pair is kept only
    when its p-value is below 0.05.

    Parameters
    ----------
    x_column_list : list of str
        Column names used as the left-hand side of each pair.
    y_column_list : list of str, optional
        Column names used as the right-hand side. Defaults to ``x_column_list`` so the
        single-list call used earlier in the notebook keeps working after this
        definition replaces the previous one.
    stats_df : pandas.DataFrame, optional
        Frame holding the columns. Defaults to the notebook-level ``us_stats_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``left_column``, ``right_column`` and ``pearson_r`` (the absolute
        value of r), one row per significant pair.
    """
    # Imported here so the function works even when called before the cell that
    # imports scipy.stats; previously the resulting NameError was swallowed below
    from scipy import stats

    if y_column_list is None:
        y_column_list = x_column_list
    if stats_df is None:
        stats_df = us_stats_df

    rows_list = []
    for x_column in x_column_list:
        for y_column in y_column_list:
            if x_column == y_column:
                continue
            df = stats_df[[x_column, y_column]].dropna()
            try:
                r_tuple = stats.pearsonr(df[x_column].values, df[y_column].values)
            except Exception as e:
                # Best effort: report pairs that cannot be correlated (e.g. too few rows) and move on
                print('{} and {} get an error: {}'.format(x_column, y_column, e))
                continue
            if r_tuple[1] < 0.05:
                rows_list.append({'left_column': x_column, 'right_column': y_column,
                                  'pearson_r': abs(r_tuple[0])})

    return pd.DataFrame(rows_list, columns=['left_column', 'right_column', 'pearson_r'])

In [None]:

# Correlate the 13 most recently added (right-most) columns against every other float32 column
mask_series = column_descriptions_df['dtype'] == 'float32'
numeric_columns_list = column_descriptions_df.loc[mask_series, 'column_name'].tolist()
new_columns_list = us_stats_df.columns[-13:].tolist()
old_columns_list = list(set(numeric_columns_list) - set(new_columns_list))
correlation_df = get_correlation_dataframe(new_columns_list, old_columns_list)
correlation_df.sort_values('pearson_r', ascending=False)


---
# Linear Scatterplots

In [None]:

def show_linear_scatterplot(merged_df, columns_list, ev_min_str=None, ev_max_str=None, rv_min_str=None, rv_max_str=None):
    """Draw a regression scatterplot of two columns and annotate the extreme points.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Frame holding both columns; its index values are used as the annotation labels.
    columns_list : list of str
        ``columns_list[0]`` is the explanatory (x) column, ``columns_list[1]`` the response (y) column.
    ev_min_str, ev_max_str, rv_min_str, rv_max_str : str, optional
        Text shown in parentheses next to the labelled minimum/maximum points. Each defaults to
        'minimum <description>' or 'maximum <description>' of the corresponding column.

    Notes
    -----
    Besides its arguments the function reads these notebook-level names at call time:
    ``get_column_description``, ``plt``, ``sns``, ``stats``, ``kwargs`` (keyword arguments passed
    to every ``plt.annotate`` call) and the text offsets ``ev_min_xytext``, ``ev_max_xytext``,
    ``rv_min_xytext``, ``rv_max_xytext`` and ``az_xytext``. Each offset is only looked up when
    the matching annotation is actually drawn. Nothing is returned; the figure is drawn through pyplot.
    """
    ev_column_name = columns_list[0]
    rv_column_name = columns_list[1]
    explanatory_variable = get_column_description(ev_column_name)
    response_variable = get_column_description(rv_column_name)
    if (ev_min_str is None):
        ev_min_str = 'minimum {}'.format(explanatory_variable)
    if (ev_max_str is None):
        ev_max_str = 'maximum {}'.format(explanatory_variable)
    if (rv_min_str is None):
        rv_min_str = 'minimum {}'.format(response_variable)
    if (rv_max_str is None):
        rv_max_str = 'maximum {}'.format(response_variable)
    
    # Work on a copy restricted to the two columns, without rows missing either value
    df = merged_df.copy()
    columns_list = [ev_column_name, rv_column_name]
    df = df[columns_list].dropna()
    ev_max = df[ev_column_name].max()
    ev_min = df[ev_column_name].min()
    rv_min = df[rv_column_name].min()
    rv_max = df[rv_column_name].max()
    # Flags ensuring each extreme is annotated once even when several rows tie for it
    ev_max_labeled = False
    ev_min_labeled = False
    rv_min_labeled = False
    rv_max_labeled = False
    
    # First order (linear) scatterplot
    fig1_fig = plt.figure(figsize=(12, 8))
    merge_axes_subplot = sns.regplot(x=ev_column_name, y=rv_column_name,
                                     scatter=True, data=df)
    xlabel_text = plt.xlabel('{} (explanatory variable)'.format(explanatory_variable))
    ylabel_text = plt.ylabel('{} (response variable)'.format(response_variable))
    
    # Add annotations
    # The elif chain means a point receives at most one label, with precedence
    # x-min, x-max, y-min, y-max and finally the 'Arizona' label
    for label, x, y in zip(df.index, df[ev_column_name], df[rv_column_name]):
        if (x == ev_min):
            if not ev_min_labeled:
                ev_min_labeled = True
                annotation = plt.annotate('{} ({})'.format(label, ev_min_str), xy=(x, y), xytext=ev_min_xytext, **kwargs)
        elif (x == ev_max):
            if not ev_max_labeled:
                ev_max_labeled = True
                annotation = plt.annotate('{} ({})'.format(label, ev_max_str), xy=(x, y), xytext=ev_max_xytext, **kwargs)
        elif (y == rv_min):
            if not rv_min_labeled:
                rv_min_labeled = True
                annotation = plt.annotate('{} ({})'.format(label, rv_min_str), xy=(x, y), xytext=rv_min_xytext, **kwargs)
        elif (y == rv_max):
            if not rv_max_labeled:
                rv_max_labeled = True
                annotation = plt.annotate('{} ({})'.format(label, rv_max_str), xy=(x, y), xytext=rv_max_xytext, **kwargs)
        elif (label == 'Arizona'):
            annotation = plt.annotate('{} (my home state)'.format(label), xy=(x, y), xytext=az_xytext, **kwargs)
    
    # Add r-squared
    # The squared Pearson r is written near the top-right corner, positioned in axes coordinates
    x = df[ev_column_name].values
    y = df[rv_column_name].values
    plt.text(0.92, 0.965, r'$r^2 = {0:.2}$'.format(stats.pearsonr(x, y)[0] ** 2), fontsize=20, alpha=0.25,
             horizontalalignment='center', verticalalignment='center', transform=merge_axes_subplot.transAxes)

In [None]:

def get_column_description(column_name):
    """Return a human-readable description for a column name.

    A description already present in the notebook-level ``column_description_dict`` is
    returned as-is. Otherwise one is derived from the name: a leading ``pf_``, ``hf_``
    or ``ef_`` becomes 'Personal Freedom:', 'Human Freedom:' or 'Economic Freedom:',
    underscores become spaces and the first letter of every word is upper-cased.
    A derived description is added to ``column_description_dict`` and the dictionary
    is persisted through ``s.store_objects``.

    Parameters
    ----------
    column_name : str
        The column name to describe (non-strings are converted with ``str``).

    Returns
    -------
    str
        The stored or newly derived description.
    """
    if column_name in column_description_dict:
        return column_description_dict[column_name]

    column_description = str(column_name)
    prefix_pairs = (('^pf_', 'Personal Freedom:_'),
                    ('^hf_', 'Human Freedom:_'),
                    ('^ef_', 'Economic Freedom:_'))
    for prefix_regex, replacement in prefix_pairs:
        # count is passed by keyword; passing it positionally is deprecated in recent Python versions
        column_description = re.sub(prefix_regex, replacement, column_description, count=1)

    # word[:1] rather than word[0]: consecutive, leading or trailing underscores yield
    # empty words, which used to raise IndexError
    column_description = ' '.join(word[:1].upper() + word[1:] for word in column_description.split('_'))

    column_description_dict[column_name] = column_description
    s.store_objects(column_description_dict=column_description_dict)

    return column_description

In [None]:

from scipy import stats

# Plot State_Budget_Processes (explanatory) against Homicide_Rate_2018 (response).
# `kwargs` and the *_xytext offsets below are not passed as arguments: show_linear_scatterplot
# reads them as notebook-level names when it draws each annotation.
kwargs = dict(textcoords='offset points', ha='left', va='bottom',
              bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
              arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

ev_column_name = 'State_Budget_Processes'

rv_column_name = 'Homicide_Rate_2018'

# Wording for the labelled extremes
ev_min_str = 'worst process'
ev_max_str = 'best process'
rv_min_str = 'least murderous'
rv_max_str = 'most murderous'
# Label offsets (in points, per textcoords above) relative to the annotated point
ev_min_xytext = (-5, -40)
ev_max_xytext = (-100, -40)
rv_min_xytext = (-45, -35)
rv_max_xytext = (-100, -50)
az_xytext = (0, 30)
columns_list = [ev_column_name, rv_column_name]
show_linear_scatterplot(us_stats_df, columns_list, ev_min_str=ev_min_str,
                        ev_max_str=ev_max_str, rv_min_str=rv_min_str, rv_max_str=rv_max_str)

In [None]:

correlation_df = get_correlation_dataframe(new_columns_list, new_columns_list)
correlation_df.sort_values('pearson_r', ascending=False)

In [None]:

from scipy import stats

kwargs = dict(textcoords='offset points', ha='left', va='bottom',
              bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
              arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

ev_column_name = 'Executive_Accountability'

rv_column_name = 'Legislative_Accountability'

ev_min_str = 'least executive accountability'
ev_max_str = 'most executive accountability'
rv_min_str = 'least legislative accountability'
rv_max_str = 'most legislative accountability'
ev_min_xytext = (15, 5)
ev_max_xytext = (-210, -30)
rv_min_xytext = (-45, -35)
rv_max_xytext = (-100, -50)
az_xytext = (-15, 10)
columns_list = [ev_column_name, rv_column_name]
show_linear_scatterplot(us_stats_df, columns_list, ev_min_str=ev_min_str,
                        ev_max_str=ev_max_str, rv_min_str=rv_min_str, rv_max_str=rv_max_str)

In [None]:

# Annotation styling and label offsets; show_linear_scatterplot reads these notebook-level names
kwargs = {
    'textcoords': 'offset points', 'ha': 'left', 'va': 'bottom',
    'bbox': {'boxstyle': 'round,pad=0.5', 'fc': 'yellow', 'alpha': 0.5},
    'arrowprops': {'arrowstyle': '->', 'connectionstyle': 'arc3,rad=0'},
}
ev_min_xytext = (50, -10)
ev_max_xytext = (-150, -60)
rv_min_xytext = (20, -15)
rv_max_xytext = (100, -50)
az_xytext = (60, 50)

# Registered guns (explanatory) versus 2017 suicide deaths (response)
ev_column_name = 'Guns_Registered'
rv_column_name = 'Suicide_Deaths_2017'
ev_min_str = 'least gun-nutty'
ev_max_str = 'most gun-nutty'
rv_min_str = 'least suicidal'
rv_max_str = 'most suicidal'
columns_list = [ev_column_name, rv_column_name]

# Express both counts per inhabitant (2010 population) and leave Wyoming out of the plot
df = us_stats_df[columns_list+['Total_Inhabitants_2010']].copy()
df[columns_list] = df[columns_list].div(df['Total_Inhabitants_2010'], axis='index')
mask_series = (df.index == 'Wyoming')
show_linear_scatterplot(df.loc[~mask_series], columns_list, ev_min_str=ev_min_str,
                        ev_max_str=ev_max_str, rv_min_str=rv_min_str, rv_max_str=rv_max_str)

In [10]:

us_stats_df = s.load_object('us_stats_df')
tables_url = 'https://en.wikipedia.org/wiki/Firearm_death_rates_in_the_United_States_by_state'
tables_df_list = ssu.get_page_tables(tables_url, verbose=True)

[(1, (52, 8)), (0, (50, 4))]


In [None]:

us_stats_df.district_abbreviation.to_dict()

In [None]:

# The scrape above returned only two tables (indices 0 and 1), so index 4 was out of range.
# Table 1 is the 8-column one: 'State' plus the seven columns renamed below.
# NOTE(review): confirm against the live page that table 1 is still the gun-murder table.
gun_murders_df = tables_df_list[1].copy()
gun_murders_df.set_index('State', inplace=True)
print(gun_murders_df.columns.tolist())
gun_murders_df.columns = ['Total_Inhabitants_2010', 'Inhabitants_Per_Square_Mile_2010', 'Total_Murder_Deaths_2010',
                          'Total_Gun_Murder_Deaths_2010', 'Gun_Ownership_Percent_2013', 'Murder_Rate_2010',
                          'Gun_Murder_Rate_2010']

# Parse "12.3%"-style strings to floats
gun_murders_df.Gun_Ownership_Percent_2013 = gun_murders_df.Gun_Ownership_Percent_2013.map(lambda x: float(str(x).split('%')[0]))
gun_murders_df.sample(5).T

In [None]:

# Load the gun-suicide CSV, drop fully empty rows/columns, and skip the first 11 remaining rows
gun_suicides_df = s.load_csv(csv_name='gun_suicides_by_state', folder_path=s.data_folder)
gun_suicides_df = gun_suicides_df.dropna(axis=0, how='all').dropna(axis='columns', how='all')
gun_suicides_df = gun_suicides_df.iloc[11:].dropna(axis='columns', how='all')

# Translate the two-letter codes in ST to the state names used as the us_stats_df index
# (an unknown code raises KeyError)
abbrev_dict = dict(zip(us_stats_df.district_abbreviation, us_stats_df.index))
abbrev_dict['DC'] = 'District of Columbia'
gun_suicides_df.ST = [abbrev_dict[abbreviation] for abbreviation in gun_suicides_df.ST]

gun_suicides_df.columns = ['State', 'State_FIPS', 'State_Population', 'Gun_Suicide_Deaths', 'Gun_Suicide_Rate']
gun_suicides_df = gun_suicides_df.set_index('State')

# Cast the rate to float and the three count/code columns to int
gun_suicides_df.Gun_Suicide_Rate = gun_suicides_df.Gun_Suicide_Rate.map(float)
for column_name in ['State_FIPS', 'State_Population', 'Gun_Suicide_Deaths']:
    gun_suicides_df[column_name] = gun_suicides_df[column_name].map(int)
gun_suicides_df.sample(5).T

In [None]:

# gun_merge_df was used here without being created by any cell in the notebook.
# NOTE(review): presumably it is the state-indexed join of the two gun tables built above --
# the stored us_stats_df lists the gun-murder columns immediately followed by the
# gun-suicide columns -- confirm the intended join type.
gun_merge_df = pd.merge(left=gun_murders_df, right=gun_suicides_df, left_index=True, right_index=True)

# Columns that would collide when gun_merge_df is merged into us_stats_df
set(gun_merge_df.columns).intersection(set(us_stats_df.columns))

In [None]:

columns_list = ['White_Percent', 'Black_Percent', 'Hispanic_Percent', 'Asian_Percent', 'Native_Percent', 'Islander_Percent',
                'Multi_Percent', 'Gini_Index', 'Effectiveness_Rank', 'Health_Care_Score', 'Education_Score', 'Economy_Score',
                'Infrastructure_Score', 'Opportunity_Score', 'Fiscal_Stability_Score', 'Crime_Corrections_Score',
                'Natural_Environment_Score', 'district_abbreviation', 'GDP_Rank', 'GDP_2018', 'GDP_Percent', 'State_Region',
                'Homicide_Rate_2018', 'Homicide_Rate_2017', 'Homicide_Rate_2014', 'Homicide_Rate_2010', 'Homicide_Rate_2005',
                'Homicide_Rate_2000', 'Homicide_Rate_1996', 'Guns_Rank', 'Guns_Per_Capita', 'Guns_Registered',
                'Suicide_Rate_2017', 'Suicide_Deaths_2017', 'Suicide_Rate_2016', 'Suicide_Deaths_2016', 'Suicide_Rate_2015',
                'Suicide_Deaths_2015', 'Suicide_Rate_2014', 'Suicide_Deaths_2014', 'Suicide_Rate_2005', 'Suicide_Deaths_2005']
us_stats_df = pd.merge(left=us_stats_df[columns_list], right=gun_merge_df, left_index=True,
                          right_index=True, suffixes=('_merge', '_guns'))

In [None]:

us_stats_df.columns.tolist()

In [None]:

column_descriptions_df = ssu.get_column_descriptions(df=us_stats_df, column_list=us_stats_df.columns)
mask_series = (column_descriptions_df.dtype.isin(['int64', 'float64']))
print(column_descriptions_df[~mask_series].column_name.tolist())
column_descriptions_df[~mask_series]

In [None]:

column_descriptions_df = ssu.get_column_descriptions(us_stats_df)
column_descriptions_df['dtype'].unique()

In [None]:

mask_series = (column_descriptions_df['dtype'].isin(['int64', 'float64']))
numeric_columns_list = column_descriptions_df[mask_series]['column_name'].tolist()

In [None]:

for column_name in numeric_columns_list:
    us_stats_df[column_name] = pd.to_numeric(us_stats_df[column_name])

In [None]:

from scipy import stats

# Collect ('x_column/y_column', |r|) for every ordered pair of numeric columns whose
# Pearson correlation has a p-value below 0.05
r_columns_list = []
for x_column in numeric_columns_list:
    for y_column in numeric_columns_list:
        if x_column != y_column:
            columns_list = [x_column, y_column]
            df = us_stats_df[columns_list].dropna()
            x = df[x_column].values
            y = df[y_column].values
            try:
                r_tuple = stats.pearsonr(x, y)
                if r_tuple[1] < 0.05:
                    # Use the coefficient just computed; the original read an undefined
                    # `row_dict`, and the resulting NameError was swallowed by the except
                    # clause, leaving r_columns_list empty
                    c_tuple = ('/'.join(columns_list), abs(r_tuple[0]))
                    r_columns_list.append(c_tuple)
            except Exception as e:
                print('{} and {} get an error: {}'.format(x_column, y_column, e))

In [None]:

column_pairs_list = sorted(r_columns_list, key=lambda x: x[1], reverse=True)
[column_pairs_list[0][0].split('/')[0], column_pairs_list[0][0].split('/')[1]]

In [None]:

us_stats_df = s.load_object('us_stats_df')
kwargs = dict(textcoords='offset points', ha='left', va='bottom',
              bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
              arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

ev_column_name = 'Guns_Registered'
column_description_dict[ev_column_name] = 'Number of Guns Registered'
#s.store_objects(column_description_dict=column_description_dict)

rv_column_name = 'Gun_Murder_Rate_2010'
column_description_dict[rv_column_name] = 'Gun Murder Rate'
s.store_objects(column_description_dict=column_description_dict)

ev_min_str = 'least gun-nutty'
ev_max_str = 'most gun-nutty'
rv_min_str = 'least murderous'
rv_max_str = 'most murderous'
ev_min_xytext = (-5, 90)
ev_max_xytext = (-130, -60)
rv_min_xytext = (20, -15)
rv_max_xytext = (-100, -50)
az_xytext = (-60, 50)
columns_list = [ev_column_name, rv_column_name]
show_linear_scatterplot(us_stats_df, columns_list, ev_min_str=ev_min_str,
                        ev_max_str=ev_max_str, rv_min_str=rv_min_str, rv_max_str=rv_max_str)

In [None]:

import re
from scipy import stats
%run ../load_magic/dataframes.py

us_stats_df = s.load_object('us_stats_df')
kwargs = dict(textcoords='offset points', ha='left', va='bottom',
              bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
              arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

ev_column_name = 'Guns_Registered'
column_description_dict[ev_column_name] = 'Number of Guns Registered'
#s.store_objects(column_description_dict=column_description_dict)

rv_column_name = 'Gun_Suicide_Deaths'
column_description_dict[rv_column_name] = 'Gun Suicide Deaths'
s.store_objects(column_description_dict=column_description_dict)

ev_min_str = 'least gun-nutty'
ev_max_str = 'most gun-nutty'
rv_min_str = 'least suicidal'
rv_max_str = 'most suicidal'
ev_min_xytext = (-5, 110)
ev_max_xytext = (-130, -60)
rv_min_xytext = (20, -30)
rv_max_xytext = (-100, -50)
az_xytext = (-60, 50)
columns_list = [ev_column_name, rv_column_name]
show_linear_scatterplot(us_stats_df, columns_list, ev_min_str=ev_min_str,
                        ev_max_str=ev_max_str, rv_min_str=rv_min_str, rv_max_str=rv_max_str)

In [None]:

# Keep the significant correlations in which either column name mentions "gun",
# strongest first, and show the top 20
rows_list = [
    row_series.to_dict()
    for row_index, row_series in correlation_df.sort_values('pearson_r', ascending=False).iterrows()
    if any('gun' in row_series[side].lower() for side in ('left_column', 'right_column'))
]
pd.DataFrame(rows_list).head(20)

In [None]:

# NOTE(review): this cell repeats the preceding cell statement for statement; one of the two can go.
# It walks correlation_df from the strongest correlation down and keeps the pairs in which
# either column name contains "gun" (case-insensitive), then shows the first 20.
rows_list = []
for row_index, row_series in correlation_df.sort_values('pearson_r', ascending=False).iterrows():
    left_column = row_series['left_column']
    right_column = row_series['right_column']
    if ('gun' in left_column.lower()) or ('gun' in right_column.lower()):
        rows_list.append(row_series.to_dict())
pd.DataFrame(rows_list).head(20)

In [None]:

#print(['row_series.{}'.format(fn) for fn in dir(row_series) if 'dict' in fn.lower()])

In [11]:

tables_url = 'https://www.thoughtco.com/gun-owners-percentage-of-state-populations-3325153'
tables_df_list = ssu.get_page_tables(tables_url, verbose=True)

[(0, (52, 3))]


In [None]:

# Clean the scraped gun-ownership table: drop empty rows/columns, name the columns,
# discard the first row and index by state.
# NOTE(review): the scrape output above reports this table as (52, 3), yet four labels are
# assigned here -- confirm the table's current shape.
nutty_df = tables_df_list[0].dropna(axis=0, how='all').dropna(axis='columns', how='all')
nutty_df.columns = ['Guns_Rank', 'State', 'Guns_Per_Capita', 'Guns_Registered']
nutty_df = nutty_df.iloc[1:].set_index('State')

# Join onto the state statistics by state name and persist
us_stats_df = us_stats_df.merge(nutty_df, left_index=True, right_index=True, suffixes=('_merge', '_nutty'))
s.store_objects(us_stats_df=us_stats_df)

In [None]:

# Load the suicide-mortality CSV, drop fully empty rows/columns, name the columns,
# discard the URL column and make the year an integer
suicide_df = s.load_csv(csv_name='Suicide Mortality by State',
                        folder_path=s.data_folder).dropna(axis=0, how='all').dropna(axis='columns', how='all')
columns_list = ['Suicide_Year', 'district_abbreviation', 'Suicide_Rate', 'Suicide_Deaths']
suicide_df.columns = columns_list + ['URL']
suicide_df = suicide_df[columns_list].copy()
suicide_df.Suicide_Year = suicide_df.Suicide_Year.map(lambda x: int(x))

# The original cell reloaded the raw CSV at this point, discarding the renaming and column
# selection above; the next cell needs the Suicide_Year column and the four-column layout
suicide_df

In [None]:

# Pivot the long suicide table into per-year columns: for each year, relabel that year's
# rows positionally as (Suicide_Year, district_abbreviation, Suicide_Rate_<year>,
# Suicide_Deaths_<year>) and inner-join the last three onto us_stats_df by abbreviation
for year in suicide_df.Suicide_Year.unique():
    columns_list = ['district_abbreviation', f'Suicide_Rate_{year}', f'Suicide_Deaths_{year}']
    mask_series = (suicide_df.Suicide_Year == year)
    df = suicide_df[mask_series].set_axis(['Suicide_Year'] + columns_list, axis='columns')[columns_list]
    us_stats_df = us_stats_df.merge(df, how='inner', on='district_abbreviation', suffixes=('_merge', '_suicide'))

In [None]:

print(us_stats_df.columns.tolist())

In [None]:

import re

# Gun rank and registration counts are whole numbers; per-capita is fractional
for column_name in ['Guns_Rank', 'Guns_Registered']:
    us_stats_df[column_name] = us_stats_df[column_name].map(lambda x: int(x))
us_stats_df.Guns_Per_Capita = us_stats_df.Guns_Per_Capita.map(lambda x: float(x))

# Suicide rates are fractional and deaths are counts. The cleanup regex keeps the
# decimal point, so calling int() directly on the cleaned text raised ValueError for
# values such as '14.5' (or a count that was read in as '1234.0'). Rates are therefore
# parsed as float, and deaths go through float before being truncated to int.
suicide_casts = {'Rate': float, 'Deaths': lambda text: int(float(text))}
for year in [2005, 2014, 2015, 2016, 2017]:
    for infix in ['Rate', 'Deaths']:
        column_name = 'Suicide_{}_{}'.format(infix, year)
        cast = suicide_casts[infix]

        # Strip thousands separators and any other non-numeric decoration before casting
        us_stats_df[column_name] = us_stats_df[column_name].map(lambda x: cast(re.sub(r'[^0-9\.]+', '', str(x))))

In [None]:

us_stats_df.Suicide_Deaths_2017.unique()

In [12]:

#us_stats_df['district_abbreviation']
file_path = os.path.join(s.data_folder, 'html', 'us_state_abbreviations.html')
#tables_url = 'https://www.50states.com/abbreviations.htm'
tables_df_list = ssu.get_page_tables(file_path, verbose=True)

[(0, (60, 2))]


In [None]:

# Clean the state-abbreviation lookup table: discard fully-empty rows and columns
abbrevs_df = tables_df_list[0].dropna(axis='index', how='all').dropna(axis=1, how='all')
abbrevs_df.columns = ['State', 'district_abbreviation']

# Attach the full state name via the abbreviation, then index the statistics by state
us_stats_df = us_stats_df.merge(abbrevs_df, on='district_abbreviation',
                                suffixes=('_merge', '_abbrevs')).set_index('State')
us_stats_df.sample(5).T

In [None]:

s.store_objects(us_stats_df=us_stats_df)

In [None]:

# The three gun columns followed by a rate/deaths pair for each year, newest first
gun_columns = ['Guns_Rank', 'Guns_Per_Capita', 'Guns_Registered']
suicide_columns = ['Suicide_{}_{}'.format(infix, year)
                   for year in (2017, 2016, 2015, 2014, 2005)
                   for infix in ('Rate', 'Deaths')]
columns_list = gun_columns + suicide_columns
ssu.get_column_descriptions(df=us_stats_df, column_list=columns_list)

In [None]:

get_max_rsquared_adj(df=us_stats_df, columns_list=columns_list,
                     verbose=False).sort_values('max_similarity', ascending=False)

In [None]:

import matplotlib.pyplot as plt

# Use the following only if you are on a high definition device
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('retina')

import seaborn as sns
import matplotlib.colors as mcolors

basecolor_list = list(mcolors.BASE_COLORS.keys())

In [None]:

import re
from scipy import stats
%run ../load_magic/dataframes.py

# Reload the persisted state statistics so the plot reflects what is on disk
us_stats_df = s.load_object('us_stats_df')

# Styling shared by every annotation drawn by show_linear_scatterplot
kwargs = dict(textcoords='offset points', ha='left', va='bottom',
              bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
              arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

# Register human-readable descriptions for both plotted columns, then persist
# the dictionary once (it was previously pickled after each assignment)
ev_column_name = 'Guns_Registered'
rv_column_name = 'Suicide_Deaths_2017'
column_description_dict[ev_column_name] = 'Number of Guns Registered'
column_description_dict[rv_column_name] = 'Suicide Deaths in 2017'
s.store_objects(column_description_dict=column_description_dict)

# Wording and offsets (in points) for the extreme-value annotations; the
# *_xytext values are read by show_linear_scatterplot as notebook globals
ev_min_str = 'least gun-nutty'
ev_max_str = 'most gun-nutty'
rv_min_str = 'least suicidal'
rv_max_str = 'most suicidal'
ev_min_xytext = (-5, 150)
ev_max_xytext = (-130, -100)
rv_min_xytext = (20, -30)
rv_max_xytext = (-100, -50)
az_xytext = (-80, 50)
columns_list = [ev_column_name, rv_column_name]
show_linear_scatterplot(us_stats_df, columns_list, ev_min_str=ev_min_str,
                        ev_max_str=ev_max_str, rv_min_str=rv_min_str, rv_max_str=rv_max_str)

In [None]:

s.store_objects(us_stats_df=us_stats_df)

In [None]:

# Pull every HTML table from the Wikipedia GDP-by-state page
tables_url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_GDP'
tables_df_list = pd.read_html(tables_url)

# List the (position, shape) of tables with more than 50 rows so the right one can be picked by index
print([(i, df.shape) for (i, df) in enumerate(tables_df_list) if df.shape[0] > 50])

In [None]:

# Clean the GDP table: discard fully-empty rows and columns, name the columns,
# skip the first row and index by state
state_gdps_df = tables_df_list[2].dropna(axis='index', how='all').dropna(axis=1, how='all')
state_gdps_df.columns = ['Rank', 'State', '2018', '% of Nation', 'Region']
state_gdps_df = state_gdps_df.iloc[1:].set_index('State')

# Cut each cell at the first '[' (footnote marker) and cast it to the column's type
caster_by_column = {'Rank': int, '2018': int, '% of Nation': float, 'Region': str}
for column_name, caster in caster_by_column.items():
    state_gdps_df[column_name] = state_gdps_df[column_name].map(lambda x: caster(str(x).split('[')[0]))
state_gdps_df

In [None]:

# Reload through the Storage instance, as the rest of the notebook does; the
# bare `load_object` name is only available if a legacy helper script was run
us_stats_df = s.load_object('us_stats_df')

# Print shapes before and after so rows dropped by the (inner) index join are visible
print(us_stats_df.shape, state_gdps_df.shape)
us_stats_df = pd.merge(left=us_stats_df, right=state_gdps_df, left_index=True, right_index=True, suffixes=('_merge', '_gdp'))
print(us_stats_df.shape)
us_stats_df.columns.tolist()

In [None]:

# Rename every column positionally; the list must match the merged frame's
# column order exactly (pandas raises on a length mismatch, but not on a
# misordering), so check the previous cell's column listing before running this
us_stats_df.columns = ['White_Percent', 'Black_Percent', 'Hispanic_Percent', 'Asian_Percent', 'Native_Percent', 'Islander_Percent',
                          'Multi_Percent', 'Gini_Index', 'Effectiveness_Rank', 'Health_Care_Score', 'Education_Score', 'Economy_Score',
                          'Infrastructure_Score', 'Opportunity_Score', 'Fiscal_Stability_Score', 'Crime_Corrections_Score',
                          'Natural_Environment_Score', 'district_abbreviation', 'GDP_Rank', 'GDP_2018', 'GDP_Percent', 'State_Region']

# Persist through the Storage instance, consistent with the rest of the notebook
s.store_objects(us_stats_df=us_stats_df)

In [None]:

us_stats_df.sample(10).T

In [None]:

import matplotlib.pyplot as plt

# Use the following only if you are on a high definition device
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('retina')

import seaborn as sns

explanatory_variable = 'Effectiveness Rank'
ev_column_name = 'Effectiveness_Rank'
response_variable = 'GDP Rank'
rv_column_name = 'GDP_Rank'

In [None]:

# Wording and offsets (in points) for the annotation attached to each extreme
ev_min_str, ev_max_str = 'most effective', 'least effective'
rv_min_str, rv_max_str = 'highest GDP', 'lowest GDP'
ev_min_xytext, ev_max_xytext = (-5, 150), (-135, -30)
rv_min_xytext, rv_max_xytext = (20, -10), (-100, -50)
us_xytext = (-90, 40)

# Keep only the two plotted columns, without rows missing either value
fig1_fig = plt.figure(figsize=(12,8))
columns_list = [ev_column_name, rv_column_name]
df = us_stats_df[columns_list].dropna()

# First order (linear) scatterplot
merge_axes_subplot = sns.regplot(x=ev_column_name, y=rv_column_name,
                                 scatter=True, data=df)
xlabel_text = plt.xlabel('{} (explanatory variable)'.format(explanatory_variable))
ylabel_text = plt.ylabel('{} (response variable)'.format(response_variable))
kwargs = dict(textcoords='offset points', ha='left', va='bottom',
              bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
              arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
ev_min, ev_max = df[ev_column_name].min(), df[ev_column_name].max()
rv_min, rv_max = df[rv_column_name].min(), df[rv_column_name].max()

# Label each point with the first extreme it matches, checked in the order
# explanatory min, explanatory max, response min, response max
for label, x, y in zip(df.index, df[ev_column_name], df[rv_column_name]):
    candidates = [(x == ev_min, ev_min_str, ev_min_xytext),
                  (x == ev_max, ev_max_str, ev_max_xytext),
                  (y == rv_min, rv_min_str, rv_min_xytext),
                  (y == rv_max, rv_max_str, rv_max_xytext)]
    for is_extreme, extreme_str, xytext in candidates:
        if is_extreme:
            annotation = plt.annotate('{} ({})'.format(label, extreme_str), xy=(x, y), xytext=xytext, **kwargs)
            break

In [None]:

# Pull every HTML table from the Wikipedia homicide-rate-by-state page
tables_url = 'https://en.wikipedia.org/wiki/List_of_U.S._states_by_homicide_rate'
tables_df_list = pd.read_html(tables_url)

# List the (position, shape) of tables with at least 50 rows so the right one can be picked by index
print([(i, df.shape) for (i, df) in enumerate(tables_df_list) if df.shape[0] >= 50])

In [None]:

# Index by state and make every rate numeric. set_index returns a new frame here:
# the previous in-place version mutated the DataFrame still held in tables_df_list,
# so running this cell a second time failed with KeyError: 'State'.
homicide_df = tables_df_list[0].set_index('State').astype(float)

# The remaining column labels are prefixed so they read e.g. Homicide_Rate_2014 after merging
homicide_df.columns = ['Homicide_Rate_{}'.format(cn) for cn in homicide_df.columns]
homicide_df.sample(5)

In [None]:

# Show shapes before and after the index join so any dropped rows are visible
print(us_stats_df.shape, homicide_df.shape)
us_stats_df = us_stats_df.merge(homicide_df, left_index=True, right_index=True,
                                suffixes=('_merge', '_homicide'))
print(us_stats_df.shape)
us_stats_df.columns.tolist()

In [None]:

# Persist through the Storage instance, consistent with the rest of the notebook;
# the bare `store_objects` name is only available if a legacy helper script was run
s.store_objects(us_stats_df=us_stats_df)

In [None]:

us_stats_df.sample(10).T

In [None]:

import matplotlib.pyplot as plt

# Use the following only if you are on a high definition device
from matplotlib_inline.backend_inline import set_matplotlib_formats
set_matplotlib_formats('retina')

import seaborn as sns

explanatory_variable = 'Percent Black'
ev_column_name = 'Black_Percent'
response_variable = 'Homicide Rate 2014'
rv_column_name = 'Homicide_Rate_2014'

In [None]:

def show_linear_scatterplot(merged_df, columns_list, ev_min_str=None, ev_max_str=None, rv_min_str=None, rv_max_str=None,
                            ev_min_xytext=None, ev_max_xytext=None, rv_min_xytext=None, rv_max_xytext=None,
                            az_xytext=None, annotate_kwargs=None):
    """
    Draw a first-order regression scatterplot of two columns and annotate the extremes.
    
    The first point found at each of the explanatory minimum/maximum and response
    minimum/maximum is labeled with the row's index label; a row labeled 'Arizona'
    that matches none of the extremes is labeled as well. The squared Pearson
    correlation is written in the upper-right corner of the axes.
    
    Relies on `plt`, `sns`, `stats` and `get_column_description` being available
    in the notebook namespace.
    
    Parameters
    ----------
    merged_df : DataFrame containing both columns; rows missing either value are ignored.
    columns_list : sequence whose first item is the explanatory column name and
        whose second item is the response column name.
    ev_min_str, ev_max_str, rv_min_str, rv_max_str : optional annotation wording;
        each defaults to 'minimum ...' or 'maximum ...' plus the column description.
    ev_min_xytext, ev_max_xytext, rv_min_xytext, rv_max_xytext, az_xytext : optional
        (x, y) annotation offsets in points.
    annotate_kwargs : optional dict of extra keyword arguments for plt.annotate.
    
    Any offset or annotate_kwargs left as None falls back to the notebook global of
    the same name (`kwargs` for annotate_kwargs), which is how existing cells
    configure this function, and then to a built-in default, so the function no
    longer raises NameError when those globals have not been defined.
    
    Returns
    -------
    None
    """
    
    def _resolve(value, global_name, default):
        # An explicit argument wins, then the notebook global, then the built-in default
        if value is not None:
            return value
        return globals().get(global_name, default)
    
    ev_min_xytext = _resolve(ev_min_xytext, 'ev_min_xytext', (-5, 150))
    ev_max_xytext = _resolve(ev_max_xytext, 'ev_max_xytext', (-130, -100))
    rv_min_xytext = _resolve(rv_min_xytext, 'rv_min_xytext', (20, -10))
    rv_max_xytext = _resolve(rv_max_xytext, 'rv_max_xytext', (-100, -50))
    az_xytext = _resolve(az_xytext, 'az_xytext', (-80, 60))
    annotate_kwargs = _resolve(
        annotate_kwargs, 'kwargs',
        dict(textcoords='offset points', ha='left', va='bottom',
             bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
             arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
    )
    
    ev_column_name = columns_list[0]
    rv_column_name = columns_list[1]
    explanatory_variable = get_column_description(ev_column_name)
    response_variable = get_column_description(rv_column_name)
    if (ev_min_str is None):
        ev_min_str = 'minimum {}'.format(explanatory_variable)
    if (ev_max_str is None):
        ev_max_str = 'maximum {}'.format(explanatory_variable)
    if (rv_min_str is None):
        rv_min_str = 'minimum {}'.format(response_variable)
    if (rv_max_str is None):
        rv_max_str = 'maximum {}'.format(response_variable)
    
    # Work on just the two plotted columns, without rows missing either value
    df = merged_df[[ev_column_name, rv_column_name]].dropna()
    ev_max = df[ev_column_name].max()
    ev_min = df[ev_column_name].min()
    rv_min = df[rv_column_name].min()
    rv_max = df[rv_column_name].max()
    
    # Each extreme is annotated once, even when several rows tie for it
    ev_max_labeled = False
    ev_min_labeled = False
    rv_min_labeled = False
    rv_max_labeled = False
    
    # First order (linear) scatterplot
    plt.figure(figsize=(12,8))
    merge_axes_subplot = sns.regplot(x=ev_column_name, y=rv_column_name,
                                     scatter=True, data=df)
    plt.xlabel('{} (explanatory variable)'.format(explanatory_variable))
    plt.ylabel('{} (response variable)'.format(response_variable))
    
    # Add annotations
    for label, x, y in zip(df.index, df[ev_column_name], df[rv_column_name]):
        if (x == ev_min):
            if not ev_min_labeled:
                ev_min_labeled = True
                plt.annotate('{} ({})'.format(label, ev_min_str), xy=(x, y), xytext=ev_min_xytext, **annotate_kwargs)
        elif (x == ev_max):
            if not ev_max_labeled:
                ev_max_labeled = True
                plt.annotate('{} ({})'.format(label, ev_max_str), xy=(x, y), xytext=ev_max_xytext, **annotate_kwargs)
        elif (y == rv_min):
            if not rv_min_labeled:
                rv_min_labeled = True
                plt.annotate('{} ({})'.format(label, rv_min_str), xy=(x, y), xytext=rv_min_xytext, **annotate_kwargs)
        elif (y == rv_max):
            if not rv_max_labeled:
                rv_max_labeled = True
                plt.annotate('{} ({})'.format(label, rv_max_str), xy=(x, y), xytext=rv_max_xytext, **annotate_kwargs)
        elif (label == 'Arizona'):
            plt.annotate('{} (my home state)'.format(label), xy=(x, y), xytext=az_xytext, **annotate_kwargs)
    
    # Add r-squared
    x = df[ev_column_name].values
    y = df[rv_column_name].values
    plt.text(0.92, 0.965, r'$r^2 = {0:.2}$'.format(stats.pearsonr(x, y)[0] ** 2), fontsize=20, alpha=0.25,
             horizontalalignment='center', verticalalignment='center', transform=merge_axes_subplot.transAxes)

In [None]:

import re
from scipy import stats

# Reload through the Storage instance, as the rest of the notebook does; the
# bare `load_object` name is only available if a legacy helper script was run
us_stats_df = s.load_object('us_stats_df')

# Styling shared by every annotation drawn by show_linear_scatterplot
kwargs = dict(textcoords='offset points', ha='left', va='bottom',
              bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
              arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

# Wording and offsets (in points) for the extreme-value annotations; the
# *_xytext values are read by show_linear_scatterplot as notebook globals
ev_min_str = 'least black'
ev_max_str = 'most black'
rv_min_str = 'least murderous'
rv_max_str = 'most murderous'
ev_min_xytext = (-5, 150)
ev_max_xytext = (-130, -100)
rv_min_xytext = (20, -10)
rv_max_xytext = (-100, -50)
az_xytext = (-80, 60)
columns_list = [ev_column_name, rv_column_name]
show_linear_scatterplot(us_stats_df, columns_list, ev_min_str=ev_min_str, ev_max_str=ev_max_str, rv_min_str=rv_min_str, rv_max_str=rv_max_str)


----

In [None]:

column_name = 'Total_Gun_Murder_Deaths_2010'
cb1 = c.show_colorbar(column_name)

In [None]:

from matplotlib.colorbar import ColorbarBase
from matplotlib.pyplot import savefig

savefig??

In [None]:

print([f'cb1.{fn}' for fn in dir(cb1) if 'fig' in fn.lower()])
print([f'cb1.{fn}' for fn in dir(cb1) if 'ax' in fn.lower()])

In [None]:

fig, ax = plt.subplots(figsize=(1, 6))
print([f'plt.{fn}' for fn in dir(plt) if 'fig' in fn.lower()])
print([f'plt.{fn}' for fn in dir(plt) if 'ax' in fn.lower()])

In [None]:

from matplotlib.backends.backend_agg import FigureCanvasAgg as fga

canvas_obj = fga(figure=fig)
print([f'fga.{fn}' for fn in dir(fga) if not fn.startswith('_')])
print([f'canvas_obj.{fn}' for fn in dir(canvas_obj) if not fn.startswith('_')])


----

In [11]:

# Merge the per-state IQ table in only when it has not been merged yet and its CSV is available
needs_iq_columns = 'Estimated_IQ' not in us_stats_df.columns
if needs_iq_columns and s.csv_exists('iq_by_us_state'):
    iq_by_us_state_df = s.load_csv('iq_by_us_state').set_index('State', drop=True)
    iq_by_us_state_df.columns = ['Estimated_IQ', 'Percent_Whites_in_Non_Public_Education', 'Gross_Product', 'Health', 'Violent_Crime',
                                 'Government_Effectiveness']

    # Outer join on the state index so states missing from either side are kept
    us_stats_df = us_stats_df.merge(iq_by_us_state_df, how='outer', left_index=True,
                                    right_index=True, suffixes=('_stats', '_iq'))
    s.store_objects(us_stats_df=us_stats_df)

In [4]:

# Open the folder containing this notebook in Windows Explorer.
# NOTE(review): `notebook_path` is not defined in the visible cells — confirm where it is set;
# the `start %windir%\explorer.exe` shell command only works on Windows.
if notebook_path is not None:
    !start %windir%\explorer.exe "{os.path.abspath(os.path.dirname(notebook_path))}"