
# Set up the notebook

In [1]:

# Turn pretty printing off explicitly. The bare `%pprint` magic is a toggle, so
# running the cell a second time would switch pretty printing back on; setting
# the plain-text formatter's flag directly gives the same state on every run.
get_ipython().display_formatter.formatters['text/plain'].pprint = False
print('Pretty printing has been turned OFF')

Pretty printing has been turned OFF


In [2]:

import sys

# Insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../py')

from choropleth_utils import ChoroplethUtilities
from stats_scraping_utils import StatsScrapingUtilities
from storage import Storage
import pandas as pd
import re
import os
import numpy as np

# Shared helper objects used by the cells below: `s` loads and stores saved
# objects and CSV files, `ssu` supplies the scraping, name-checking and
# choropleth-preparation utilities (it is handed `s` at construction).
s = Storage()
ssu = StatsScrapingUtilities(s=s)

In [52]:

# Bring back the objects saved by earlier runs
column_description_dict = s.load_object('column_description_dict')
us_stats_df = s.load_object('us_stats_df')

# Key the countries table by its country code, then replace each country name
# with its entry in ssu.country_name_dict (names without an entry stay as they are)
all_countries_df = s.load_object('all_countries_df')
all_countries_df = all_countries_df.set_index('country_code', drop=True)
all_countries_df['country_name'] = all_countries_df['country_name'].map(
    lambda country_name: ssu.country_name_dict.get(country_name, country_name)
)

# Save the renamed table again, with country_code turned back into a regular column
s.store_objects(all_countries_df=all_countries_df.reset_index(drop=False))

Pickling to C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\pkl\all_countries_df.pkl



----
# Get State/Country CO<sub>2</sub> Emissions Equivalents


## Clean and prepare US States dataset

In [21]:

# Try to scrape the emissions tables directly from the BTS page. The recorded
# run of this cell printed "No tables found" and an empty list, so the data is
# read from a saved CSV file in the next cell instead.
url = 'https://www.bts.gov/browse-statistical-products-and-data/state-transportation-statistics/energy-consumption-and-co2'
tables_list = ssu.get_page_tables(url)

No tables found
[]


In [23]:

# The emissions rows and the census population estimate must describe the same
# year, so both are driven by this one constant.
# NOTE(review): the census file name suggests it only covers 2010-2019 — confirm
# before changing the year.
MEASURE_YEAR = 2017

us_states_df = s.load_csv('Energy_Consumption_and_CO2_Emissions_by_us_state')
us_states_df.columns = ['state_name', 'measure_str', 'sector_str', 'measure_year', 'pivot_list', 'co2_emissions_mmt']

# Keep the all-sector total of every recognised state for the chosen year
mask_series = (
    us_states_df.state_name.isin(ssu.us_states_list)
    & (us_states_df.sector_str == 'Total')
    & (us_states_df.measure_year == MEASURE_YEAR)
)
print(us_states_df[mask_series].shape)
columns_list = ['state_name', 'co2_emissions_mmt']

# Select rows and columns in a single .loc call and take an explicit copy, so that
# adding the per-capita column below writes to an independent frame rather than to
# a slice of the loaded table (chained [mask][columns] indexing followed by an
# assignment is what raises SettingWithCopyWarning)
us_states_df = us_states_df.loc[mask_series, columns_list].copy()
print(us_states_df.co2_emissions_mmt.sum())

# Population estimates keyed by the census NAME column
url = 'https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv'
POPULATION_DICT = pd.read_csv(url, encoding=s.encoding_type).set_index('NAME')[f'POPESTIMATE{MEASURE_YEAR}'].to_dict()

# Fail with the full list of unmatched names instead of a bare KeyError on the first one
missing_states = sorted(set(us_states_df.state_name) - set(POPULATION_DICT))
if missing_states:
    raise KeyError(f'No {MEASURE_YEAR} population estimate for: {missing_states}')


def f(row_series):
    """Return one state's emissions per resident.

    Parameters
    ----------
    row_series : pandas.Series
        A row holding `state_name` and `co2_emissions_mmt`.

    Returns
    -------
    float
        1_000 * co2_emissions_mmt / population. With emissions in millions of
        metric tons this is thousands of metric tons per person, not MMT per
        person as the name of the column it fills suggests.

    Raises
    ------
    KeyError
        If the state has no entry in POPULATION_DICT.
    """
    # NOTE(review): the scraped country table appears to report per-capita values
    # in metric tons; a factor of 1_000_000 would be needed to compare with it.
    # The factor is left unchanged so existing values stay the same.
    return 1_000*row_series.co2_emissions_mmt/POPULATION_DICT[row_series.state_name]


us_states_df['co2_emissions_mmt_per_capita'] = us_states_df.apply(f, axis='columns')

us_states_df.sample(5)

(51, 6)
5166.4


Unnamed: 0,state_name,co2_emissions_mmt,co2_emissions_mmt_per_capita
1758,Maryland,51.9,0.008616
162,Alaska,34.3,0.04637
2094,Mississippi,67.8,0.022687
3438,South Carolina,69.2,0.013781
1926,Michigan,152.7,0.015311


In [25]:

# Drop the District of Columbia row from the states table
mask_series = us_states_df.state_name.eq('District of Columbia')
us_states_df = us_states_df.loc[~mask_series]

In [26]:

# Surface possible misspellings and duplicates among the US state names
# (this cell only displays candidates; nothing is removed here).
# Names present in just one of the two sources are the ones worth comparing.
states_list = sorted(set(us_states_df.state_name) ^ set(ssu.us_stats_df.index))
doubles_df = ssu.check_4_doubles(states_list)

# Show the pairs whose similarity exceeds 0.6, most similar first
columns_list = ['first_item', 'second_item', 'max_similarity']
mask_series = (doubles_df.max_similarity > 0.6)
if mask_series.any():
    display(doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose state_name occurs more than once
mask_series = us_states_df.duplicated(subset=['state_name'], keep=False)
if mask_series.any():
    display(us_states_df.loc[mask_series])


## Clean and prepare Countries dataset

In [28]:

# Scrape the carbon-footprint table with a browser driver. close() sits in a
# finally block so the browser window is released even when the scrape raises.
# NOTE(review): if the driver is a Selenium WebDriver, quit() would also end the
# driver session — confirm against ssu.get_driver before switching.
driver = ssu.get_driver()
try:
    tables_list = ssu.get_page_tables('https://worldpopulationreview.com/country-rankings/carbon-footprint-by-country', driver=driver)
finally:
    driver.close()

Getting the FireFox driver
[(0, (210, 6))]


In [29]:

# Work on a copy so the scraped table held in tables_list stays untouched
countries_df = tables_list[0].copy()
countries_df.columns = ['country_name', 'co2_emissions_mt_2020',  'co2_emissions_mt_2017',
                        'co2_emissions_per_capita_2020', 'co2_emissions_per_capita_2017', 'country_population_2022']

# Convert the four emissions columns to floats
for cn in ['co2_emissions_mt_2020', 'co2_emissions_mt_2017', 'co2_emissions_per_capita_2020', 'co2_emissions_per_capita_2017']:
    # Only text needs scrubbing. Values that are already numbers are passed through
    # untouched: stringifying them first would let the regex strip the minus sign and
    # the exponent marker (str(1e-05) is '1e-05', which would be turned into '105').
    cleaned_series = countries_df[cn].map(lambda x: re.sub(r'[^0-9\.]+', '', x) if isinstance(x, str) else x)
    countries_df[cn] = pd.to_numeric(cleaned_series, errors='coerce', downcast='float')

# Drop rows without a country name; .copy() yields an independent frame so the
# assignment below does not write into a slice of the unfiltered table
mask_series = countries_df.country_name.isnull()
countries_df = countries_df[~mask_series].copy()

# Keep only the text before the first asterisk, trimmed of surrounding whitespace
# (presumably the site uses the asterisk as a footnote marker — TODO confirm)
countries_df.country_name = countries_df.country_name.map(lambda x: str(x).split('*')[0].strip())
countries_df.head(9).T

Unnamed: 0,0,1,2,3,4,5,6,7,8
country_name,China,United States,India,Russia,Japan,Iran,Germany,South Korea,Saudi Arabia
co2_emissions_mt_2020,11680.419922,4535.299805,2411.72998,1674.22998,1061.77002,690.23999,636.880005,621.469971,588.809998
co2_emissions_mt_2017,10877.219727,5107.390137,2454.77002,1764.869995,1320.780029,671.450012,796.530029,673.320007,638.76001
co2_emissions_per_capita_2020,8.2,13.68,1.74,11.64,8.39,8.26,7.72,12.07,16.959999
co2_emissions_per_capita_2017,7.7,15.7,1.8,12.3,10.4,8.3,9.7,13.2,19.4
country_population_2022,1425887337,338289857,1417173173,144713314,123951692,88550570,83369843,51815810,36408820


In [30]:

# Normalise the country names through ssu.country_name_dict (names without an
# entry are kept), then surface possible misspellings and duplicates.
countries_df['country_name'] = countries_df['country_name'].map(
    lambda country_name: ssu.country_name_dict.get(country_name, country_name)
)

# Names present in just one of the two tables are the ones worth comparing
countries_list = sorted(set(countries_df.country_name) ^ set(all_countries_df.country_name))
doubles_df = ssu.check_4_doubles(countries_list)

# Show the pairs whose similarity exceeds 0.6, most similar first
columns_list = ['first_item', 'second_item', 'max_similarity']
mask_series = (doubles_df.max_similarity > 0.6)
if mask_series.any():
    display(doubles_df.loc[mask_series, columns_list].sort_values('max_similarity', ascending=False))

# Show every row whose country_name occurs more than once
mask_series = countries_df.duplicated(subset=['country_name'], keep=False)
if mask_series.any():
    display(countries_df.loc[mask_series])

Unnamed: 0,first_item,second_item,max_similarity
10,Falkland Islands,Åland Islands,0.827586
32,Sint Maarten,St. Martin,0.727273
15,Guernsey,Jersey,0.714286
21,Marshall Islands,Åland Islands,0.62069



## Prepare for and Create Choropleth

In [None]:

# Column names shared with the map-drawing cell that follows
states_target_column_name = 'co2_emissions_mmt'
equivalence_column_name = 'Country_Equivalent_CO2_Emissions'

# Only countries that also appear in all_countries_df are handed over
mask_series = countries_df.country_name.isin(all_countries_df.country_name)
ssu.prepare_for_choroplething(
    countries_df.loc[mask_series],
    'co2_emissions_mt_2017',
    us_states_df,
    st_col_name=states_target_column_name,
    st_col_explanation='CO2 Emissions in Millions of Metric Tons (2017)',
    equivalence_column_name=equivalence_column_name,
    verbose=True,
)

In [32]:

# Build the choropleth helper for the US, using the states table held by ssu
c = ChoroplethUtilities(
    iso_3166_2_code='us',
    one_country_df=ssu.us_stats_df,
    all_countries_df=all_countries_df,
)
c.create_label_line_file()

# Draw the map from the numeric states column and the equivalence column,
# then report where the SVG was written
svg_file_path = c.create_country_colored_labeled_map(
    numeric_column_name=states_target_column_name,
    string_column_name=equivalence_column_name,
    one_country_df=ssu.us_stats_df,
    cmap='summer',
)
print(os.path.abspath(svg_file_path))

C:\Users\daveb\OneDrive\Documents\GitHub\StatsByCountry\saves\svg\US_co2_emissions_mmt_Country_Equivalent_CO2_Emissions.svg
