---
title: "Exploring the Relationship Between Healthcare Capacity and HIV Mortality: Two Indicator Approach"
format: dashboard
theme: superhero
---

In [None]:
# Load libraries
import pandas as pd
import country_converter as coco
import plotly.express as px
import seaborn as sns
import geopandas as gpd

First, import the two raw datasets, each containing estimates per country and year. These data were accessed from the Gapminder Foundation at [www.gapminder.org/data](https://www.gapminder.org/data).

1. **Annual HIV Deaths Dataset:** This dataset provides the annual number of deaths attributable to HIV/AIDS (*data/annual_hiv_deaths_number_all_ages.csv*).

2. **Medical Doctors per 1,000 People:** This dataset measures the number of medical doctors per 1,000 people, representing healthcare resource availability at national or subnational levels (*data/medical_doctors_per_1000_people.csv*).


In [None]:
# Load the two raw CSV files; both frames are reshaped from wide to long further below.
hiv_deaths_raw = pd.read_csv("data/annual_hiv_deaths_number_all_ages.csv")
medical_doctors_raw = pd.read_csv("data/medical_doctors_per_1000_people.csv")

# Preview only the first rows instead of rendering the whole raw table.
hiv_deaths_raw.head()

# Data Manipulation


In [None]:
# Reshape the HIV deaths table from wide to long format: every column other
# than "country" is unpivoted into a ("year", "hiv_deaths") pair per row.

hiv_deaths_raw_wide = hiv_deaths_raw

hiv_deaths_long = pd.melt(
    hiv_deaths_raw_wide,
    id_vars=["country"],
    var_name="year",
    value_name="hiv_deaths",
)

In [None]:
# Reshape the doctors table from wide to long format, then add a scaled column.
medical_doctors_raw_wide = medical_doctors_raw
medical_doctors_long = pd.melt(
    medical_doctors_raw_wide,
    id_vars=["country"],
    var_name="year",
    value_name="doctors_per_1000",
)
# NOTE(review): a per-1,000 rate multiplied by 1,000 is a per-1,000,000 rate,
# not a head count as the column name suggests -- confirm the intended meaning.
medical_doctors_long["number_of_doctors"] = medical_doctors_long["doctors_per_1000"].mul(1000)

In [None]:
# Convert abbreviated counts in 'hiv_deaths' (e.g. '11k', '1.2M') to plain floats.
# The previous regex chain never applied the suffix multiplier: the pattern could
# not match a digit followed by 'k', and the follow-up replace then stripped the
# suffix, so '11k' ended up as 11.0 instead of 11000.0.
_SUFFIX_MULTIPLIERS = {"k": 1e3, "m": 1e6, "b": 1e9}


def _parse_abbreviated_number(value):
    """Return ``value`` as a float, expanding a trailing k/M/B suffix.

    Parameters
    ----------
    value : object
        A raw cell: a number, a string such as ``'11k'`` or ``'540'``, or a
        missing value.

    Returns
    -------
    float
        The numeric value; NaN for missing, empty or unparseable input so the
        resulting column always has a float dtype.
    """
    if pd.isna(value):
        return float("nan")
    if isinstance(value, (int, float)):
        return float(value)
    text = str(value).strip().replace(",", "")
    if not text:
        return float("nan")
    # Look at the last character only; anything else is treated as "no suffix".
    multiplier = _SUFFIX_MULTIPLIERS.get(text[-1].lower())
    if multiplier is None:
        multiplier = 1.0
    else:
        text = text[:-1]
    try:
        return float(text) * multiplier
    except ValueError:
        return float("nan")


hiv_deaths_long['hiv_deaths'] = hiv_deaths_long['hiv_deaths'].map(_parse_abbreviated_number)

In [None]:
# The melt stored the former column headers in 'year'; cast them to numbers in
# both long tables. Anything that cannot be parsed becomes NaN.
for long_table in (hiv_deaths_long, medical_doctors_long):
    long_table['year'] = pd.to_numeric(long_table['year'], errors='coerce')

In [None]:
# Step 3: Keep only the years that appear in both long tables
hiv_years = set(hiv_deaths_long['year'])
common_years = hiv_years.intersection(medical_doctors_long['year'])

hiv_in_common = hiv_deaths_long['year'].isin(common_years)
doctors_in_common = medical_doctors_long['year'].isin(common_years)
hiv_deaths_long = hiv_deaths_long[hiv_in_common]
medical_doctors_long = medical_doctors_long[doctors_in_common]

In [None]:
# Step 4: Inner-join the two long tables so that only country-year pairs
# present in both of them remain
merge_keys = ["country", "year"]
merged_data = hiv_deaths_long.merge(medical_doctors_long, how="inner", on=merge_keys)


In [None]:
# Step 5: Remove fully duplicated rows, keeping the first occurrence of each
merged_data = merged_data.drop_duplicates(keep="first")

In [None]:
# Persist the cleaned, merged table (without the index column) as a CSV file
merged_data.to_csv('data/cleaned_merged_dataset.csv', index=False)

# Preview the result; a bare trailing expression renders as a rich table,
# whereas print() only emits plain text
merged_data.head()

# Visuals
## Line Chart for Annual HIV Deaths


In [None]:
import matplotlib.pyplot as plt

# Total HIV deaths per year, summed over every country in the merged table
annual_hiv_deaths = merged_data.groupby('year')['hiv_deaths'].sum()

# Draw the yearly totals as a connected line with a marker on every year
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(annual_hiv_deaths.index, annual_hiv_deaths.values, marker='o', linestyle='-')
ax.set_title('Annual HIV Deaths (All Countries)', fontsize=16)
ax.set_xlabel('Year', fontsize=12)
ax.set_ylabel('HIV Deaths', fontsize=12)
ax.grid(True)
plt.show()

In [None]:
# Value box: the country with the most HIV deaths in the latest year of the data
most_recent_year = merged_data["year"].max()
latest_rows = merged_data[merged_data["year"] == most_recent_year]

# After the descending sort, the first row is the country with the highest count
highest_deaths = latest_rows.sort_values("hiv_deaths", ascending=False).iloc[0]
highest_deaths_country, highest_deaths_value = highest_deaths["country"], highest_deaths["hiv_deaths"]
f"Country with the highest HIV deaths in {most_recent_year}: {highest_deaths_country} ({highest_deaths_value} deaths)"

In [None]:
# Top-10 table: countries ranked by HIV deaths in the most recent year
is_latest = merged_data["year"] == most_recent_year
summary_stats = (
    merged_data.loc[is_latest, ["country", "hiv_deaths"]]
    .sort_values("hiv_deaths", ascending=False)
)
summary_stats.head(10)

## Bar Chart for Medical Doctors

In [None]:
# Year shown in the bar chart. The title is built from this constant so the
# label can no longer disagree with the data (it previously said 2020 while
# the rows were filtered to 2011).
BAR_CHART_YEAR = 2011

# Keep that year's rows that actually have a doctors-per-1000 value
yearly_data = merged_data[merged_data['year'] == BAR_CHART_YEAR].dropna(subset=['doctors_per_1000'])

# Sort descending and keep the ten highest values for a readable chart
yearly_data = yearly_data.sort_values(by='doctors_per_1000', ascending=False).head(10)

# Create bar chart
plt.figure(figsize=(10, 6))
plt.bar(yearly_data['country'], yearly_data['doctors_per_1000'], color='skyblue')
plt.title(f'Top 10 Countries by Doctors per 1000 People ({BAR_CHART_YEAR})', fontsize=16)
plt.xlabel('Country', fontsize=12)
plt.ylabel('Doctors per 1000 People', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.show()


In [None]:
# Filter data for the year 2011. The .copy() makes the rename below edit an
# independent frame rather than a slice of merged_data.
data_2011 = merged_data[merged_data['year'] == 2011].copy()

# Ensure country names match those in the world dataset
# This step may require manual adjustments if there are discrepancies
# NOTE(review): verify these target names against the shapefile's ADMIN column.
data_2011['country'] = data_2011['country'].replace({
    'United States': 'United States of America',
    'Russia': 'Russian Federation',
    # Add more replacements as needed
})

# Load the country polygons in this cell: it runs before the later map cell
# that also reads the shapefile, so `world` would otherwise be undefined on a
# fresh kernel.
world = gpd.read_file("data/ne_110m_admin_0_countries.shp")

# Attach the 2011 values to the polygons. The left join keeps countries that
# have no data so they can be drawn in grey via missing_kwds.
world_2011 = world.merge(
    data_2011[['country', 'doctors_per_1000']],
    left_on='ADMIN',
    right_on='country',
    how='left',
)

# Set up the plot
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

# Plot the country outlines, then fill each country by its doctors-per-1000 value
world_2011.boundary.plot(ax=ax)
world_2011.plot(column='doctors_per_1000', ax=ax, legend=True,
                legend_kwds={'label': "Doctors per 1,000 People",
                             'orientation': "horizontal"},
                cmap='OrRd', missing_kwds={"color": "lightgrey"})

# Add titles and labels
ax.set_title('Global Distribution of Doctors per 1,000 People (2011)', fontsize=16)
ax.set_axis_off()

# Show the plot
plt.show()

## Scatter Plot for Relationship


In [None]:
# Scatter of doctors per 1000 people against HIV deaths: one point per row of
# the merged table, coloured by year
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=merged_data,
    x='doctors_per_1000',
    y='hiv_deaths',
    hue='year',
    palette='viridis',
    ax=ax,
)
ax.set_title('Relationship between Medical Doctors and HIV Deaths', fontsize=16)
ax.set_xlabel('Doctors per 1000 People', fontsize=12)
ax.set_ylabel('HIV Deaths', fontsize=12)
# Anchor the legend outside the axes so it does not cover the points
ax.legend(title='Year', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
plt.show()

In [None]:
import plotly.io as pio

# Select the default Plotly renderer for Jupyter Notebook.
# Use "jupyterlab" if running in JupyterLab
pio.renderers.default = "notebook"


## Map for Medical Doctors

In [None]:
# Year shown on the map. The title is built from this constant so it always
# agrees with the filtered data (it previously said 2020 for 2010 data).
MAP_YEAR = 2010

# Load the shapefile
world = gpd.read_file("data/ne_110m_admin_0_countries.shp")

# Merge the country polygons with the medical doctor data for that year; the
# left join keeps countries without data
map_data = merged_data[merged_data['year'] == MAP_YEAR].dropna(subset=['doctors_per_1000'])
map_data = world.merge(map_data, left_on='ADMIN', right_on='country', how='left')

# Create map
fig = px.choropleth(
    map_data,
    geojson=world.__geo_interface__,
    locations='ADMIN',
    # Match `locations` against each feature's ADMIN property; without this,
    # Plotly compares against the feature `id` and no country gets coloured.
    featureidkey='properties.ADMIN',
    color='doctors_per_1000',
    hover_name='ADMIN',
    title=f'Doctors per 1000 People ({MAP_YEAR})',
    color_continuous_scale='Viridis'
)
fig.update_geos(fitbounds="locations", visible=False)

# Use the default renderer so the figure is embedded in the output instead of
# being opened in a separate browser tab.
fig.show()


## HeatMap for Medical Doctors


In [None]:
# Five countries, one per continent; customize based on your dataset
selected_countries = ["United States", "Nigeria", "Brazil", "Australia", "India"]

# Keep only the rows that belong to the chosen countries
is_selected = merged_data["country"].isin(selected_countries)
filtered_data = merged_data[is_selected]

# Country-by-year grid of doctors-per-1000 values for the heatmap
heatmap_data = filtered_data.pivot_table(
    values="doctors_per_1000", index="country", columns="year"
)

# Draw the grid, writing each cell's value with two decimals
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    heatmap_data,
    ax=ax,
    cmap="YlGnBu",
    annot=True,
    fmt=".2f",
    linewidths=0.5,
    cbar_kws={"label": "Doctors per 1000 People"},
)
ax.set_title(
    "Heatmap of Medical Doctors Per 1000 People (Selected Countries)", fontsize=16
)
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Country", fontsize=12)
fig.tight_layout()
plt.show()

## Scatter Plot for Relationship 5 Countries


In [None]:
# Same five countries as in the heatmap
selected_countries = ['United States', 'Nigeria', 'Brazil', 'Australia', 'India']

# Keep only the rows that belong to those countries
in_selection = merged_data['country'].isin(selected_countries)
filtered_scatter_data = merged_data[in_selection]

# Scatter of doctors per 1000 people against HIV deaths; colour and marker
# shape encode the country, marker size encodes the year
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=filtered_scatter_data,
    x='doctors_per_1000',
    y='hiv_deaths',
    hue='country',
    style='country',
    size='year',
    sizes=(50, 200),
    palette='Set2',
    ax=ax,
)
ax.set_title('Relationship Between Medical Doctors and HIV Deaths (Selected Countries)', fontsize=16)
ax.set_xlabel('Doctors per 1000 People', fontsize=12)
ax.set_ylabel('HIV Deaths', fontsize=12)
# Anchor the legend outside the axes so it does not cover the points
ax.legend(title='Country', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
plt.show()