Visualization babysteps. 
We start with the code of Ch 3 of 'Data Science from scratch' by Joel Grus
But we'll work with real gdp data

In [26]:
# all imports at the top
from matplotlib import pyplot as plt
import ipywidgets as widgets
import pandas as pd
import requests
import zipfile
from io import BytesIO
from pathlib import Path
from IPython.display import clear_output, display
from collections import Counter
import math
import numpy as np


In [27]:
# helper functions
def download_extract_gdp_data(url = 'https://api.worldbank.org/v2/en/indicator/NY.GDP.MKTP.PP.CD?downloadformat=csv', destination_file : Path = Path('../data/gdp_pcap_ppp.csv'))-> None:
    """
    Download and extract GDP data from the World Bank API.
    
    Parameters:
    url (str): The URL to download the data from.
    destination_file (Path): The path to the destination file where it saves the extracted file. Defaults to '../data/gdp_ppp.csv'.
    
    Returns:
    None
    """
    print(f"Downloading data from the World Bank API {url} to {destination_file} ...")
    # download the csv file from the World Bank API
    response = requests.get(url)
    # unzip the contents, find the csv file that starts with API, and save it to the data folder
    with zipfile.ZipFile(BytesIO(response.content)) as z:
        for filename in z.namelist():
            if filename.startswith('API_') and filename.endswith('.csv'):
                with open(destination_file, 'wb') as f:
                    f.write(z.read(filename))
                # save the csv file to the data folder as gdp_ppp.csv
                print(f"Downloaded {filename} to {destination_file}")
                break
            
def add_country_indicator(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add a non-country indicator to the DataFrame.
    
    Parameters:
    df (pd.DataFrame): The DataFrame to modify.
    
    Returns:
    pd.DataFrame: The modified DataFrame with an additional column indicator that the row is dedicated to a country: 1 if it is a country, 0 if it is not.
    """
    #list of non-country codes 
    noncountry_codes = ['AFE','AFW','ARB', 'CEB', 'CSS', 'EAP', 'EAR', 'EAS', 'ECA', 'ECS', 'EMU', 
                    'EUU', 'FCS', 'HIC', 'HPC', 'IBD', 'IBT', 'IDA', 'IDB', 'IDX', 
                    'INX', 'LAC', 'LCN', 'LDC', 'LIC', 'LMC', 'LMY', 'LTE', 'MEA', 
                    'MIC', 'MNA', 'NAC', 'OED', 'OSS', 'PRE', 'PSS', 'PST', 'SAS', 
                    'SSA', 'SSF', 'SST', 'TEA', 'TEC', 'TLA', 'TMN', 'TSA', 'TSS',
                    'UMC', 'WLD']
    # add a non-country indicator to the DataFrame
    df['Is Country'] = df['Country Code'].apply(lambda x: 0 if x in noncountry_codes else 1)
    return df
 
def compute_allowed_bin_numbers(min_value:int, max_value:int, grain:int) -> list:
    """
    Compute allowed bin numbers for histogram.
    
    Parameters:
    min_value (int): The minimum value of the data.
    max_value (int): The maximum value of the data.
    grain (int): The minimum grain allowed.
    
    Returns:
    list: A list of allowed bin numbers.
    """
    # if the range is not divisible by the grain, adjust the min value to be divisible by the grain
    max_value = math.ceil(max_value / grain) * grain
    min_value = math.floor(min_value / grain) * grain
    grained_range = int((max_value - min_value) / grain)
    
    return [d for d in range(1, grained_range + 1) if grained_range % d == 0]

def compute_gdp_change() -> pd.DataFrame:
    """
    Compute the GDP change from 2010 to 2020 for each country.
    
    Returns:
    pd.DataFrame: A DataFrame containing the GDP change for each country.
    """
    # load the data
    df = pd.read_csv('../data/gdp_pcap_ppp.csv', skiprows=4)
    
    # add a non-country indicator to the DataFrame
    df = add_country_indicator(df)
    
    # filter the DataFrame to only include countries
    df = df[df['Is Country'] == 1]
    
    # compute the GDP change from 2010 to 2020
    df['GDP Change'] = df['2020'] - df['2010']
    
    return df[['Country Code', 'Country Name', 'GDP Change']]

def compute_gdp_change(year1:str, year2:str, df:pd.DataFrame)-> pd.DataFrame:
    gdp_change = df[['Country Name', year1, year2]]
    gdp_change = gdp_change.rename(columns={year1: 'Year1', year2: 'Year2'})
    gdp_change = gdp_change.dropna()
    # calculate the percentage change between the two years
    gdp_change['Change (%)'] = ((gdp_change['Year2'] - gdp_change['Year1']) / gdp_change['Year1']) * 100
    # sort the data frame by the percentage change
    gdp_change = gdp_change.sort_values(by='Change (%)', ascending=False)
    return gdp_change

In [28]:
# check if data folder has the gdp-ppp csv file
# if not, download it from the World Bank API

data_folder = Path("../data")
data_folder.mkdir(exist_ok=True)
csv_file = data_folder.joinpath("gdp_pcap_ppp.csv")
#TODO: look at the lates data on the World Bank API and compare its metadata with the one in the csv file, 
# if they are different, download the new csv file
if not csv_file.exists():
    download_extract_gdp_data(destination_file=csv_file) 
# read the csv file into a pandas dataframe
df = pd.read_csv(csv_file, skiprows=4) 


In [29]:

# ask the user for the country name
# get the list of countries from the dataframe
countries = df['Country Name'].unique()
# sort the countries alphabetically
countries = sorted(countries)
# create a dropdown widget for the country name
country_dropdown = widgets.Dropdown(
    options=countries,
    description='Country:',
    disabled=False,
)

output = widgets.Output()

#create a 1-line table all available gdp data for the selected country
def display_country_data(country_name:str, df:pd.DataFrame) -> None:
    """
    Display the GDP data for the selected country.
    
    Parameters:
    country_name (str): The name of the country to display data for.
    
    Returns:
    None
    """
    # filter the dataframe for the first row + the row of the  selected country
    country_data = df[df['Country Name'] == country_name]
    country_data = country_data.iloc[0]
    years = df.columns[4:]
    # filter the dataframe for the years columns
    country_data = country_data[years]
    # rename the columns to be the years
    country_data.columns = years
    #drop the missing values
    country_data = country_data.dropna()
    # plot the data as a line chart
    plt.figure(figsize=(12, 8))
    plt.plot(country_data.index, country_data.values, marker='o', linestyle='-', color='violet')
    plt.title(f"GDP per capita (PPP) in {country_name}")
    plt.xlabel('Year')
    plt.ylabel('GDP per capita (current international $)')
    plt.xticks(rotation=45)
    plt.grid()
    plt.show()
    
    
    
# a function to update the table when the country is changed
def update_country_data(change):
    """
    Update the GDP data for the selected country.
    
    Parameters:
    change (dict): The change event from the dropdown widget.
    
    Returns:
    None
    """
    with output:
        clear_output(wait=True)
        display_country_data(change['new'],df)
# bind the update function to the dropdown widget
country_dropdown.observe(update_country_data, names='value')
# display the dropdown widget
display(country_dropdown, output)

Dropdown(description='Country:', options=('Afghanistan', 'Africa Eastern and Southern', 'Africa Western and Ce…

Output()

In [None]:
#add a widget to select two years and display a hystogram for countiries' gdp change (%) between the selected years

def display_gdp_change(gdp_change:pd.DataFrame, num_bins:int, min_edge:int, max_edge:int)->None:   
    print(f"function display_gdp_change called with num_bins = {num_bins}, min_edge = {min_edge}, max_edge = {max_edge}")
    edges = np.arange(min_edge, max_edge, num_bins + 1)
    #edges = list(range(min_edge, max_edge, grain))
    # create a histogram of the percentage change
    histogram = pd.cut(gdp_change['Change (%)'],bins=edges)
    # Get the counts and sorted bins
    counts = histogram.value_counts().sort_index()
    bins = histogram.cat.categories
    # Compute edges and widths
   # edges = [interval.left for interval in bins] + [bins[-1].right]
    left_edges = edges[:-1]
    widths = [interval.right - interval.left for interval in bins]
    left_edges = [i.left for i in bins]
    plt.bar(left_edges, counts, width=widths, align='edge', color='violet')
    plt.xticks(edges, rotation=45)
    plt.title(f"GDP Change from year1 to year2") 
    plt.ylabel('Country count')
    plt.xlabel('GDP Change persentiles (%)')
    plt.grid()
    plt.show()

# create a dropdown widget for the years
# get the list of years from the dataframe that have data
years = [y for y in df.columns[4:].unique() if df[y].notna().any()]
# sort the years
years = sorted(years)
year1_dropdown = widgets.Dropdown(
    options=years,
    description='Year 1:',
    disabled=False,
)
year2_dropdown = widgets.Dropdown(
    options=years,
    value = years[-1],
    description='Year 2:',
    disabled=False,
)
#create a widget for the grain
grain_slider = widgets.IntSlider(
    value=5,
    min=1,
    max=25,
    step=1,
    description='Grain:',
    continuous_update=False,
    disabled=False,
)

#add a non-country indicator to the DataFrame
df = add_country_indicator(df)
#filter out all non-country rows
df_countries = df[add_country_indicator(df)['Is Country'] == 1]

# compute the filtered dataset for the selected years
#gdp_change = compute_gdp_change(year1_dropdown.value, year2_dropdown.value, df_countries)
#print(f"{gdp_change['Country Name'].count()} countries found with data for the selected years {year1_dropdown.value} and {year2_dropdown.value}.")
#grain = grain_slider.value
# compute the min and max values for the histogram
#min_val = math.floor(gdp_change['Change (%)'].min() / grain) * grain
#max_val = math.ceil(gdp_change['Change (%)'].max() / grain) * grain

#display the min and max values
#print(f"Min value: {min_val}")
#print(f"Max value: {max_val}")
# compute the allowed bin numbers
#allowed_bins = compute_allowed_bin_numbers(min_val, max_val, grain)
allowed_bins = [0]

def update_bin_options(*args):
    # Recompute gdp_change and min/max
    gdp_change = compute_gdp_change(year1_dropdown.value, year2_dropdown.value, df_countries)
    grain = grain_slider.value
    min_val = math.floor(gdp_change['Change (%)'].min() / grain) * grain
    max_val = math.ceil(gdp_change['Change (%)'].max() / grain) * grain
    print(f"Min value: {min_val}")
    print(f"Max value: {max_val}")
    # Compute allowed bin numbers
    allowed_bins = compute_allowed_bin_numbers(min_val, max_val, grain)

    # Update dropdown options
    num_bins_dropdown.options = allowed_bins
    num_bins_dropdown.value = allowed_bins[0]

    # Optional: print stats
    print(f"{len(gdp_change)} countries found")
    print(f"Min: {min_val}, Max: {max_val}")
    return gdp_change, min_val, max_val



# create a dropdown widget for the number of bins
num_bins_dropdown = widgets.Dropdown(
    options=allowed_bins,
    value=allowed_bins[0],
    description='Number of bins:',
    disabled=False,
)

year1_dropdown.observe(update_bin_options, names='value')
year2_dropdown.observe(update_bin_options, names='value')
grain_slider.observe(update_bin_options, names='value')
num_bins_dropdown.observe(
    names='value')


# create a button to display the GDP change
gdp_change_button = widgets.Button(
    description='Display GDP Change',
    button_style='success',
    tooltip='Click to display GDP change',
    icon='check'
)
# bind the button to the display_gdp_change function
def on_button_click(b):
    """
    Handle the button click event to display GDP change.
    
    Parameters:
    b (Button): The button that was clicked.
    
    Returns:
    None
    """
    with output:
        clear_output(wait=True)
        gdp_change, min_val, max_val = update_bin_options()
        display_gdp_change(gdp_change, int(num_bins_dropdown.value), min_val, max_val)
gdp_change_button.on_click(on_button_click)
# display the dropdown widgets and the button
display(year1_dropdown, year2_dropdown, grain_slider, num_bins_dropdown, gdp_change_button, output)

TypeError: HasTraits.observe() missing 1 required positional argument: 'handler'