In [1]:
#pip install dash

In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from dash import Dash, dcc, html, Input, Output

In [3]:
# Load the dataset
data = pd.read_csv('health.csv')

In [4]:
# Display the first few rows of the dataset
print("Initial Data:")
display(data.head())

Initial Data:


Unnamed: 0,GHO (CODE),GHO (DISPLAY),GHO (URL),YEAR (DISPLAY),STARTYEAR,ENDYEAR,REGION (CODE),REGION (DISPLAY),COUNTRY (CODE),COUNTRY (DISPLAY),DIMENSION (TYPE),DIMENSION (CODE),DIMENSION (NAME),Numeric,Value,Low,High
0,TB_hivtest_pos_pct,Tested TB patients HIV-positive (%),https://www.who.int/data/gho/data/indicators/i...,2006,2006,2006,AFR,Africa,KEN,Kenya,,,,52.0,52,,
1,NCD_BMI_30A,"Prevalence of obesity among adults, BMI &Great...",https://www.who.int/data/gho/data/indicators/i...,2000,2000,2000,AFR,Africa,KEN,Kenya,SEX,SEX_MLE,Male,2.17215,2.2 [1.2-3.5],1.22961,3.52131
2,MORT_300,Distribution of causes of death among children...,https://www.who.int/data/gho/data/indicators/i...,2014,2014,2014,AFR,Africa,KEN,Kenya,AGEGROUP,AGEGROUP_MONTHS1-59,1-59 months,0.07214,0.1,,
3,AIR_62,Household and ambient air pollution attributab...,https://www.who.int/data/gho/data/indicators/i...,2015,2015,2015,AFR,Africa,KEN,Kenya,SEX,SEX_BTSX,Both sexes,46.23,46 [39-53],38.606,53.434
4,WHOSIS_000002,Healthy life expectancy (HALE) at birth (years),https://www.who.int/data/gho/data/indicators/i...,2018,2018,2018,AFR,Africa,KEN,Kenya,SEX,SEX_BTSX,Both sexes,57.88585,57.9 [57.1-58.7],57.10911,58.74801


Data Wrangling

In [6]:
## Cleaning the Data

# Handling Missing Values
print("\nHandling Missing Values:")
missing_values = data.isnull().sum()
print("Missing Values per Column:\n", missing_values)

# Option 1: Imputation (for numerical columns, using mean or median)
for column in data.select_dtypes(include=[np.number]).columns:
    if data[column].isnull().any():
        data[column].fillna(data[column].mean(), inplace=True)  # or use median()


Handling Missing Values:
Missing Values per Column:
 GHO (CODE)              0
GHO (DISPLAY)           0
GHO (URL)               0
YEAR (DISPLAY)          0
STARTYEAR               0
ENDYEAR                 0
REGION (CODE)           0
REGION (DISPLAY)        0
COUNTRY (CODE)          0
COUNTRY (DISPLAY)       0
DIMENSION (TYPE)     3544
DIMENSION (CODE)     3544
DIMENSION (NAME)     3589
Numeric              1669
Value                  49
Low                  6668
High                 6668
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mean(), inplace=True)  # or use median()


In [7]:
# Standardizing Formats
print("\nStandardizing Formats:")
# Ensure consistent formatting for dates
data['YEAR (DISPLAY)'] = pd.to_datetime(data['YEAR (DISPLAY)'], format='%Y', errors='coerce')



Standardizing Formats:


In [8]:
# Convert percentage columns to float (if they are stored as strings)
#percentage_columns = ['Numeric Value', 'Low', 'High']
#for col in percentage_columns:
  #  if col in data.columns:
   #     data[col] = data[col].str.replace('%', '').astype(float) / 100  # Convert to decimal

Transforming Variables

In [10]:
## Convert Categorical Variables into Factors
print("\nConverting Categorical Variables into Factors:")
categorical_columns = ['#region+name', '#country+name', '#dimension+name']
for col in categorical_columns:
    if col in data.columns:
        data[col] = data[col].astype('category')


Converting Categorical Variables into Factors:


In [11]:
# Creating a Clean Dataset
print("\nCreating a Clean Dataset:")
cleaned_data = data[['GHO (CODE)', 'GHO (DISPLAY)', 'YEAR (DISPLAY)', 
                      'STARTYEAR', 'ENDYEAR', 'REGION (CODE)', 
                      'REGION (DISPLAY)', 'COUNTRY (CODE)', 
                      'COUNTRY (DISPLAY)', 'DIMENSION (TYPE)', 
                      'DIMENSION (CODE)', 'DIMENSION (NAME)', 
                      'Numeric', 'Growth Rate', 
                      'Value']]


Creating a Clean Dataset:


KeyError: "['Growth Rate'] not in index"

In [None]:
# Display the cleaned dataset
print("Cleaned Dataset:")
display(cleaned_data.head())

In [None]:
# Save the cleaned dataset to a new CSV file (optional)
cleaned_data.to_csv('cleaned_health_data.csv', index=False)

Exploratory Data Analysis (EDA)

In [None]:
## Descriptive Statistics

print("\nDescriptive Statistics:")
stats_summary = cleaned_data.describe()
print(stats_summary)

In [None]:
# Frequency distributions for categorical variables
for col in categorical_columns:
    if col in cleaned_data.columns:
        print(f"\nFrequency Distribution for {col}:")
        print(cleaned_data[col].value_counts())

In [None]:
## Trend Analysis
def plot_trend(indicator_name):
    plt.figure(figsize=(12, 6))
    sns.lineplot(data=cleaned_data[cleaned_data['GHO (DISPLAY)'] == indicator_name],
                 x='YEAR (DISPLAY)', y='Numeric')
    plt.title(f'Trend of {indicator_name} Over Years')
    plt.xlabel('Year')
    plt.ylabel('Value (%)')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
## Regional Comparisons
def plot_regional_comparison(indicator_name):
    plt.figure(figsize=(12, 6))
    sns.barplot(data=cleaned_data[cleaned_data['GHO (DISPLAY)'] == indicator_name],
                 x='REGION (DISPLAY)', y='Numeric')
    plt.title(f'{indicator_name} by Region')
    plt.xlabel('Region')
    plt.ylabel('Value (%)')
    plt.xticks(rotation=45)
    plt.show()

In [None]:
## Confidence Intervals
def confidence_interval(indicator_name):
    data_for_ci = cleaned_data[cleaned_data['GHO (DISPLAY)'] == indicator_name]
    mean_value = data_for_ci['Numeric Value'].mean()
    std_dev = data_for_ci['Numeric Value'].std()
    n = len(data_for_ci)

    confidence_level = 0.95
    z_score = 1.96  # z-score for 95% confidence interval

    margin_of_error = z_score * (std_dev / np.sqrt(n))
    return mean_value - margin_of_error, mean_value + margin_of_error