### Plotly Template

In [17]:
import plotly.graph_objects as go
import plotly.io as pio

# Define the base template
base = go.layout.Template(
    layout=go.Layout(
        paper_bgcolor='#FFF5CC',
        plot_bgcolor='#FFF5CC',
        height=600,
        width=600 * 1.618,
        xaxis=dict(
            anchor='y',
            showgrid=True,
            tickfont=dict(
                size=14,
                family='Open Sans, sans-serif'  # Use Open Sans font
            ),
            titlefont=dict(
                size=16,
                family='Open Sans, sans-serif'  # Use Open Sans font
            ),
            linecolor='#333333',
            linewidth=2  # Adjust the thickness of the x-axis line
        ),
        yaxis=dict(
            anchor='x',
            showgrid=True,
            tickfont=dict(
                size=14,
                family='Open Sans, sans-serif'
            ),
            titlefont=dict(
                size=16,
                family='Open Sans, sans-serif'
            ),
            linecolor='#333333',
            linewidth=3  # Adjust the thickness of the y-axis line
        ),
        font=dict(
            color='#333333',
            size=18,
            family='Open Sans, sans-serif'
        ),
        colorway=["#348273", "#280F3C", "#CC5500", "#333333", "#FF5733", "#2E86C1", "#9B59B6", "#28B463", "#F39C12", "#E74C3C", "#3498DB"],
        title=go.layout.Title(
            text='',
            font=dict(
                size=24,
                color='#333333',
                family='Open Sans, sans-serif'
            ),
            x=0.05,
        )
    ),
    data=dict(
        scatter=[
            go.Scatter(
                line=dict(width=3)  # Set the line width for scatter plots
            )
        ]
    )
)

# Register the base template
pio.templates['base'] = base
pio.templates.default = 'base'


### World Bank Data

In [2]:
import requests
import pandas as pd

# URL for World Bank API to get country metadata
url_metadata = "http://api.worldbank.org/v2/country?format=json&per_page=300"

# Function to fetch country metadata from the World Bank API and return a DataFrame
def fetch_country_metadata(url):
    response = requests.get(url)
    data = response.json()
    
    # Extract relevant data
    records = []
    for entry in data[1]:
        country_id = entry['id']
        country_name = entry['name']
        region = entry['region']['value']
        income_level = entry['incomeLevel']['value']
        lending_type = entry['lendingType']['value']
        
        records.append([country_id, country_name, region, income_level, lending_type])
    
    df = pd.DataFrame(records, columns=["Country Code", "Country Name", "Region", "Income Level", "Lending Type"])
    return df

# Fetch country metadata
df_country_metadata = fetch_country_metadata(url_metadata)

# Display the DataFrame
print(df_country_metadata.head())

# Save the DataFrame to a CSV file
df_country_metadata.to_csv("country_metadata.csv", index=False)


  Country Code                 Country Name                      Region  \
0          ABW                        Aruba  Latin America & Caribbean    
1          AFE  Africa Eastern and Southern                  Aggregates   
2          AFG                  Afghanistan                  South Asia   
3          AFR                       Africa                  Aggregates   
4          AFW   Africa Western and Central                  Aggregates   

  Income Level    Lending Type  
0  High income  Not classified  
1   Aggregates      Aggregates  
2   Low income             IDA  
3   Aggregates      Aggregates  
4   Aggregates      Aggregates  


In [4]:
import requests
import pandas as pd

# URLs for World Bank API to get the data
urls = {
    "Life Expectancy - Male": "http://api.worldbank.org/v2/country/all/indicator/SP.DYN.LE00.MA.IN?format=json&per_page=20000",
    "Life Expectancy - Female": "http://api.worldbank.org/v2/country/all/indicator/SP.DYN.LE00.FE.IN?format=json&per_page=20000",
    "Life Expectancy - Total": "http://api.worldbank.org/v2/country/all/indicator/SP.DYN.LE00.IN?format=json&per_page=20000",
    "Under-five Mortality - Male": "http://api.worldbank.org/v2/country/all/indicator/SH.DYN.MORT.MA?format=json&per_page=20000",
    "Under-five Mortality - Female": "http://api.worldbank.org/v2/country/all/indicator/SH.DYN.MORT.FE?format=json&per_page=20000",
    "Under-five Mortality - Total": "http://api.worldbank.org/v2/country/all/indicator/SH.DYN.MORT?format=json&per_page=20000",
    "Adult Mortality - Male": "http://api.worldbank.org/v2/country/all/indicator/SP.DYN.AMRT.MA?format=json&per_page=20000",
    "Adult Mortality - Female": "http://api.worldbank.org/v2/country/all/indicator/SP.DYN.AMRT.FE?format=json&per_page=20000",
    #"Adult Mortality - Total": "http://api.worldbank.org/v2/country/all/indicator/SP.DYN.AMRT?format=json&per_page=20000",
}

# Function to fetch data from the World Bank API and return a DataFrame
def fetch_data(url, indicator_name, gender=None):
    response = requests.get(url)
    data = response.json()
    
    # Extract relevant data
    records = []
    for entry in data[1]:
        country = entry['country']['value']
        year = entry['date']
        value = entry['value']
        records.append([country, year, value, indicator_name, gender])
    
    df = pd.DataFrame(records, columns=["Country", "Year", "Value", "Indicator", "Gender"])
    return df

# Fetch data for all indicators
dfs = []
for indicator, url in urls.items():
    gender = None
    if "Male" in indicator:
        gender = "Male"
    elif "Female" in indicator:
        gender = "Female"
    elif "Total" in indicator:
        gender = "Total"
    dfs.append(fetch_data(url, indicator.split(" - ")[0], gender))

# Combine all the data into a single DataFrame
df_combined = pd.concat(dfs)

# Convert the Year column to numeric
df_combined['Year'] = pd.to_numeric(df_combined['Year'])

# Pivot the DataFrame to get a better structure for analysis
df_death_timing = df_combined.pivot_table(index=['Country', 'Year'], columns=['Indicator', 'Gender'], values='Value', aggfunc='first').reset_index()

# Flatten the MultiIndex columns
df_death_timing.columns = [' '.join(filter(None, col)).strip() for col in df_death_timing.columns.values]

# Display the DataFrame
print(df_death_timing.head())

# Save the DataFrame to a CSV file
df_death_timing.to_csv("death_timing.csv", index=False)


       Country  Year  Adult Mortality Female  Adult Mortality Male  \
0  Afghanistan  1960                 550.189               601.887   
1  Afghanistan  1961                 543.600               594.812   
2  Afghanistan  1962                 537.703               588.870   
3  Afghanistan  1963                 531.856               583.144   
4  Afghanistan  1964                 526.179               577.178   

   Life Expectancy Female  Life Expectancy Male  Life Expectancy Total  \
0                  33.285                31.870                 32.535   
1                  33.813                32.409                 33.068   
2                  34.297                32.883                 33.547   
3                  34.773                33.346                 34.016   
4                  35.246                33.828                 34.494   

   Under-five Mortality Female  Under-five Mortality Male  \
0                          NaN                        NaN   
1           

In [19]:
df_death_timing.dtypes

Country                         object
Year                             int64
Adult Mortality Female         float64
Adult Mortality Male           float64
Life Expectancy Female         float64
Life Expectancy Male           float64
Life Expectancy Total          float64
Under-five Mortality Female    float64
Under-five Mortality Male      float64
Under-five Mortality Total     float64
dtype: object

In [45]:
import pandas as pd
import numpy as np

# Assuming df_death_timing and df_country_metadata are already loaded

# Calculate Life Expectancy Difference as the difference between Life Expectancy Male and Life Expectancy Female
df_death_timing['Life Expectancy Difference'] = df_death_timing['Life Expectancy Male'] - df_death_timing['Life Expectancy Female']

# Join the dataframes on the 'Country' column from df_death_timing and 'Country Name' column from df_country_metadata
rbt_death_timing = pd.merge(df_death_timing, df_country_metadata, left_on="Country", right_on="Country Name", how="left")

# Calculate average Life Expectancy Total for the periods 2010-2020 and 1960-1970
period_2010_2020 = rbt_death_timing[(rbt_death_timing['Year'] >= 2010) & (rbt_death_timing['Year'] <= 2020)]
period_1960_1970 = rbt_death_timing[(rbt_death_timing['Year'] >= 1960) & (rbt_death_timing['Year'] <= 1970)]

avg_life_expectancy_2010_2020 = period_2010_2020.groupby('Country')['Life Expectancy Total'].mean().reset_index()
avg_life_expectancy_1960_1970 = period_1960_1970.groupby('Country')['Life Expectancy Total'].mean().reset_index()

# Rename columns for clarity before merging
avg_life_expectancy_2010_2020.rename(columns={'Life Expectancy Total': 'Avg Life Expectancy 2010-2020'}, inplace=True)
avg_life_expectancy_1960_1970.rename(columns={'Life Expectancy Total': 'Avg Life Expectancy 1960-1970'}, inplace=True)

# Merge the average life expectancy data back into the main dataframe
rbt_death_timing = pd.merge(rbt_death_timing, avg_life_expectancy_2010_2020, on='Country', how='left')
rbt_death_timing = pd.merge(rbt_death_timing, avg_life_expectancy_1960_1970, on='Country', how='left')

# Calculate the Life Expectancy Delta
rbt_death_timing['Life Expectancy Delta'] = ((rbt_death_timing['Avg Life Expectancy 2010-2020'] - rbt_death_timing['Avg Life Expectancy 1960-1970']) / rbt_death_timing['Avg Life Expectancy 1960-1970']) * 100

# Display the first few rows of the joined DataFrame in a nicely formatted table
print(rbt_death_timing.head().to_string())

# Save the joined DataFrame to a CSV file
rbt_death_timing.to_csv("rbt_death_timing.csv", index=False)


       Country  Year  Adult Mortality Female  Adult Mortality Male  Life Expectancy Female  Life Expectancy Male  Life Expectancy Total  Under-five Mortality Female  Under-five Mortality Male  Under-five Mortality Total  Life Expectancy Difference Country Code Country Name      Region Income Level Lending Type  Avg Life Expectancy 2010-2020  Avg Life Expectancy 1960-1970  Life Expectancy Delta
0  Afghanistan  1960                 550.189               601.887                  33.285                31.870                 32.535                          NaN                        NaN                       354.6                      -1.415          AFG  Afghanistan  South Asia   Low income          IDA                      62.471545                         34.976              78.612607
1  Afghanistan  1961                 543.600               594.812                  33.813                32.409                 33.068                          NaN                        NaN               

In [60]:
import pandas as pd
import plotly.express as px

# Assuming rbt_death_timing is already created and available

def plot_metric(df, metric_column, color_by_column, group_by_column=None, agg_func='mean'):
    """
    Plots the specified metric segmented by the specified group with year on the x-axis.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    metric_column (str): The metric column to plot (e.g., 'Life Expectancy Total', 'Adult Mortality Female', etc.).
    color_by_column (str): The column to color by (e.g., 'Country', 'Region', etc.).
    group_by_column (str): The column to group by (e.g., 'Country', 'Region', etc.). If None, no grouping is applied.
    agg_func (str or function): Aggregation function to apply ('mean', 'sum', 'median', etc.). Ignored if no grouping.

    Returns:
    None
    """
    if group_by_column:
        # Group by the specified column and year, then aggregate
        df_grouped = df.groupby(['Year', group_by_column])[metric_column].agg(agg_func).reset_index()
        color_col = group_by_column
    else:
        df_grouped = df
        color_col = color_by_column

    # Create the line plot using Plotly
    fig = px.line(
        df_grouped,
        x='Year',
        y=metric_column,
        color=color_col,
        title=f'{metric_column} by {color_by_column}' if not group_by_column else f'{metric_column} by {group_by_column}',
        template='base'
    )
    
    # Show the plot
    fig.show()

# Example usage
plot_metric(rbt_death_timing[rbt_death_timing['Country'] == 'World'], 'Life Expectancy Total', 'Country', group_by_column='Country')
plot_metric(rbt_death_timing[rbt_death_timing['Region'] != 'Aggregates'], 'Life Expectancy Total', 'Region', group_by_column='Region')
plot_metric(rbt_death_timing[rbt_death_timing['Region'] != 'Aggregates'], 'Life Expectancy Difference', 'Region', group_by_column='Region')
plot_metric(rbt_death_timing[rbt_death_timing['Region'] != 'Aggregates'], 'Life Expectancy Difference', 'Country', group_by_column='Country')
plot_metric(rbt_death_timing[rbt_death_timing['Region'] != 'Aggregates'], 'Life Expectancy Total', 'Country', group_by_column='Country')


In [59]:
import pandas as pd
import plotly.express as px

# Assuming rbt_death_timing is already created and available

def plot_metric_highlight(df, metric_column, color_by_column, highlight_value, group_by_column=None, agg_func='mean'):
    """
    Plots the specified metric segmented by the specified group with year on the x-axis.
    Highlights all countries in a specific region while coloring the rest in grey.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    metric_column (str): The metric column to plot (e.g., 'Life Expectancy Total', 'Adult Mortality Female', etc.).
    color_by_column (str): The column to color by (e.g., 'Country', 'Region', etc.).
    highlight_value (str): The value to highlight (e.g., specific region).
    group_by_column (str): The column to group by (e.g., 'Country', 'Region', etc.). If None, no grouping is applied.
    agg_func (str or function): Aggregation function to apply ('mean', 'sum', 'median', etc.). Ignored if no grouping.

    Returns:
    None
    """
    if group_by_column:
        # Group by the specified column and year, then aggregate
        df_grouped = df.groupby(['Year', group_by_column])[metric_column].agg(agg_func).reset_index()
        color_col = group_by_column
    else:
        df_grouped = df
        color_col = color_by_column

    # Create a custom color mapping
    highlight_countries = df[df[color_by_column] == highlight_value][group_by_column].unique()
    color_map = {country: '#348273' for country in highlight_countries}
    grey = 'lightgrey'
    for val in df_grouped[color_col].unique():
        if val not in color_map:
            color_map[val] = grey

    # Create the line plot using Plotly
    fig = px.line(
        df_grouped,
        x='Year',
        y=metric_column,
        color=color_col,
        title=f'{metric_column} by {color_by_column}' if not group_by_column else f'{metric_column} by {group_by_column}',
        template='base',
        color_discrete_map=color_map
    )

    # Update the traces to set the line width and style for highlighting
    highlight_traces = []
    other_traces = []
    for trace in fig.data:
        if trace.name in highlight_countries:
            trace.line.width = 4  # Highlighted line width
            highlight_traces.append(trace)
        else:
            trace.line.width = 2  # Default line width
            other_traces.append(trace)

    # Move all highlight traces to the end to ensure they are on top
    fig.data = other_traces + highlight_traces

    # Remove the legend
    fig.update_layout(showlegend=False)

    # Show the plot
    fig.show()

# Example usage
plot_metric_highlight(
    rbt_death_timing[rbt_death_timing['Region'] != 'Aggregates'],
    metric_column='Life Expectancy Difference',
    color_by_column='Country',
    highlight_value='Bangladesh',  # Adjust this value to highlight all countries in a different region
    group_by_column='Country'
)


In [51]:
import pandas as pd

def top_bottom_life_expectancy_delta(df, top_n=5, bottom_n=5, region=None):
    """
    Displays the top and bottom countries by Life Expectancy Delta.
    Optionally filters by a specific region.

    Parameters:
    df (pd.DataFrame): The DataFrame containing the data.
    top_n (int): Number of top countries to display.
    bottom_n (int): Number of bottom countries to display.
    region (str): The region to filter by. If None, considers all regions.

    Returns:
    None
    """
    # Filter the DataFrame for the specified region if provided
    if region:
        df = df[df['Region'] == region]

    # Drop rows with NaN Life Expectancy Delta values
    df = df.dropna(subset=['Life Expectancy Delta'])

    # Group by Country and calculate the mean Life Expectancy Delta
    country_delta = df.groupby('Country')['Life Expectancy Delta'].mean().reset_index()

    # Sort the DataFrame by Life Expectancy Delta
    sorted_df = country_delta.sort_values(by='Life Expectancy Delta', ascending=False)

    # Get the top and bottom n countries
    top_countries = sorted_df.head(top_n)
    bottom_countries = sorted_df.tail(bottom_n)

    # Display the results
    if region:
        print(f"Top {top_n} countries in {region} by Life Expectancy Delta:")
    else:
        print(f"Top {top_n} countries by Life Expectancy Delta:")
    print(top_countries.to_string(index=False))
    print("\n")
    if region:
        print(f"Bottom {bottom_n} countries in {region} by Life Expectancy Delta:")
    else:
        print(f"Bottom {bottom_n} countries by Life Expectancy Delta:")
    print(bottom_countries.to_string(index=False))

# Assuming rbt_death_timing is already created and available

# Example usage
# top_bottom_life_expectancy_delta(rbt_death_timing, top_n=5, bottom_n=5, region='South Asia')
print("\n")
top_bottom_life_expectancy_delta(rbt_death_timing, top_n=10, bottom_n=10)




Top 10 countries by Life Expectancy Delta:
    Country  Life Expectancy Delta
South Sudan              97.082376
Timor-Leste              94.166809
     Bhutan              92.790279
   Maldives              90.965005
       Mali              88.605317
       Oman              84.666423
Yemen, Rep.              84.377584
Afghanistan              78.612607
    Algeria              78.593005
     Malawi              70.230978


Bottom 10 countries by Life Expectancy Delta:
           Country  Life Expectancy Delta
   Slovak Republic               8.871028
            Latvia               6.010874
          Bulgaria               5.550622
         Lithuania               5.433547
          Zimbabwe               5.287691
Russian Federation               5.089666
             Nauru               4.554566
           Belarus               3.514686
           Ukraine               2.421486
           Lesotho               0.709967


### WHO Data

In [84]:
import requests
import pandas as pd
import plotly.express as px

# Function to fetch country metadata from the restcountries API
def fetch_country_metadata():
    url = 'https://restcountries.com/v3.1/all'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        metadata = []
        for country in data:
            country_info = {
                'CountryCode': country.get('cca3', ''),
                'CountryName': country.get('name', {}).get('common', ''),
                'Continent': country.get('continents', [None])[0],
                'Region': country.get('region', ''),
            }
            metadata.append(country_info)
        return pd.DataFrame(metadata)
    else:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

# Fetch the country metadata
country_metadata = fetch_country_metadata()

# Assuming df_hale_60plus is already created and available
def fetch_hale_60plus_data():
    """
    Fetch Healthy Life Expectancy (HALE) data for individuals aged 60+ from the WHO API.

    Returns:
    DataFrame: A pandas DataFrame containing HALE data for individuals aged 60+.
    """
    # WHO API endpoint for Healthy Life Expectancy (HALE) at age 60
    url = 'https://ghoapi.azureedge.net/api/WHOSIS_000007'  # This endpoint should match the specific indicator for HALE at age 60
    
    # Send a GET request to the API
    response = requests.get(url)

    # Check if the request was successful
    if response.status_code == 200:
        data = response.json()['value']
        
        # Convert the JSON data to a pandas DataFrame
        df = pd.DataFrame(data)
        
        # Filter relevant columns
        df_filtered = df[['IndicatorCode', 'SpatialDimType', 'SpatialDim', 'TimeDim', 'Dim1', 'Value']]
        
        # Rename columns for clarity
        df_filtered.columns = ['IndicatorCode', 'RegionType', 'Country', 'Year', 'Sex', 'HALE']
        
        return df_filtered
    else:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        return None

# Fetch the HALE data for 60+
df_hale_60plus = fetch_hale_60plus_data()

# Merge the metadata with the HALE data
df_hale_60plus = pd.merge(df_hale_60plus, country_metadata, left_on='Country', right_on='CountryCode', how='left')

# Filter the DataFrame for Sex = SEX_BTSX
df_hale_60plus_filtered = df_hale_60plus[df_hale_60plus['Sex'] == 'SEX_BTSX']

# Function to plot the data with options to group and colour by specified columns
def plot_hale_data(df, group_by_col, color_by_col):
    # Sort the DataFrame by group and Year
    df_sorted = df.sort_values(by=[group_by_col, 'Year'])

    # Create the line plot using Plotly Express
    fig = px.line(
        df_sorted,
        x='Year',
        y='HALE',
        color=color_by_col,
        line_group=group_by_col,
        title=f'Healthy Life Expectancy (HALE) at Age 60 by {group_by_col}',
        labels={'HALE': 'Healthy Life Expectancy (HALE)', 'Year': 'Year'}
    )

    # Show the plot
    fig.show()

# Example usage
plot_hale_data(df_hale_60plus_filtered, group_by_col='CountryName', color_by_col='Continent')


In [96]:
import pandas as pd

# Function to normalize country names
def normalize_country_name(name):
    if pd.isna(name):
        return ''
    return str(name).strip().lower().replace(' ', '_')

# Normalize country names in both datasets
df_hale_60plus_filtered['NormalizedCountryName'] = df_hale_60plus_filtered['CountryName'].apply(normalize_country_name)
rbt_death_timing['NormalizedCountry'] = rbt_death_timing['Country'].apply(normalize_country_name)

# Debug prints to check normalization
print("Normalized country names in df_hale_60plus_filtered:")
print(df_hale_60plus_filtered[['CountryName', 'NormalizedCountryName']].head())
print("\nNormalized country names in rbt_death_timing:")
print(rbt_death_timing[['Country', 'NormalizedCountry']].head())

# Filter the HALE data for the year 2020
df_hale_2020 = df_hale_60plus_filtered[df_hale_60plus_filtered['Year'] == 2020]

# Filter the rbt_death_timing data for the year 2020
df_death_timing_2020 = rbt_death_timing[rbt_death_timing['Year'] == 2020]

# Merge the two dataframes on NormalizedCountry and NormalizedCountryName
df_death_timing_and_hale = pd.merge(df_death_timing_2020, df_hale_2020, left_on='NormalizedCountry', right_on='NormalizedCountryName', how='left')

# Drop the normalized country name columns for clarity
df_death_timing_and_hale = df_death_timing_and_hale.drop(columns=['NormalizedCountry', 'NormalizedCountryName'])

# Display the first few rows of the new dataframe
print(df_death_timing_and_hale.head())


Normalized country names in df_hale_60plus_filtered:
   CountryName NormalizedCountryName
0     Colombia              colombia
2  South Sudan           south_sudan
3      Moldova               moldova
4      Croatia               croatia
7   Uzbekistan            uzbekistan

Normalized country names in rbt_death_timing:
       Country NormalizedCountry
0  Afghanistan       afghanistan
1  Afghanistan       afghanistan
2  Afghanistan       afghanistan
3  Afghanistan       afghanistan
4  Afghanistan       afghanistan
                     Country_x  Year_x  Adult Mortality Female  \
0                  Afghanistan    2020              210.053000   
1  Africa Eastern and Southern    2020              224.232001   
2   Africa Western and Central    2020              297.189740   
3                      Albania    2020               56.310000   
4                      Algeria    2020               81.699000   

   Adult Mortality Male  Life Expectancy Female  Life Expectancy Male  \
0         



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

