In [1]:
# Imports
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from dash import Dash, dcc, html, Input, Output, State
import dash_bootstrap_components as dbc

In [2]:
!pip install dash-bootstrap-components



In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv(r'C:\Users\Admin\Desktop\vsasss\TB_Burden_Country.csv')  # Adjust filename as needed

In [4]:
# Display initial info
print("Initial shape:", df.shape)
print("\nMissing values before cleaning:")
print(df.isnull().sum())

Initial shape: (5120, 47)

Missing values before cleaning:
Country or territory name                                                                            0
ISO 2-character country/territory code                                                              24
ISO 3-character country/territory code                                                               0
ISO numeric country/territory code                                                                   0
Region                                                                                               0
Year                                                                                                 0
Estimated total population number                                                                    0
Estimated prevalence of TB (all forms) per 100 000 population                                        0
Estimated prevalence of TB (all forms) per 100 000 population, low bound                            20
Estimated prev

In [5]:
# Fill missing ISO 2-character codes using the 3-character codes as reference
# Create a mapping from ISO3 to ISO2 for existing records
iso_mapping = df.dropna(subset=['ISO 2-character country/territory code', 'ISO 3-character country/territory code'])\
                .set_index('ISO 3-character country/territory code')['ISO 2-character country/territory code'].to_dict()

# Apply the mapping to fill missing ISO2 codes
df['ISO 2-character country/territory code'] = df.apply(
    lambda row: iso_mapping.get(row['ISO 3-character country/territory code'], np.nan) 
    if pd.isna(row['ISO 2-character country/territory code']) 
    else row['ISO 2-character country/territory code'],
    axis=1
)

In [6]:
# Drop columns with excessive missing values or redundant information
columns_to_drop = [
    'Method to derive TBHIV estimates',
    'Method to derive incidence estimates',
    'Estimated prevalence of TB (all forms) per 100 000 population, low bound',
    'Estimated prevalence of TB (all forms) per 100 000 population, high bound',
    'Estimated prevalence of TB (all forms), low bound',
    'Estimated prevalence of TB (all forms), high bound'
]

df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

In [7]:
# Create indicator for HIV data availability
hiv_cols = [col for col in df.columns if 'HIV' in col]
df['HIV_data_available'] = df[hiv_cols[0]].notna()

# Optional: Fill HIV-related missing values with 0 if appropriate
# df[hiv_cols] = df[hiv_cols].fillna(0)

In [8]:
# Fill missing bounds with the main estimate
for bound in ['low bound', 'high bound']:
    main_col = 'Case detection rate (all forms), percent'
    bound_col = f'Case detection rate (all forms), percent, {bound}'
    if bound_col in df.columns:
        df[bound_col] = df[bound_col].fillna(df[main_col])

In [9]:
# For numeric columns, fill with median by country
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
for col in numeric_cols:
    df[col] = df.groupby('Country or territory name')[col].transform(
        lambda x: x.fillna(x.median()) if x.notna().any() else x
    )

# For any remaining missing values in numeric columns, fill with global median
for col in numeric_cols:
    df[col] = df[col].fillna(df[col].median())

In [10]:
# Display cleaning results
print("\nMissing values after cleaning:")
print(df.isnull().sum())

print("\nFinal shape:", df.shape)

# Save cleaned data
df.to_csv('cleaned_tb_data.csv', index=False)


Missing values after cleaning:
Country or territory name                                                                          0
ISO 2-character country/territory code                                                            24
ISO 3-character country/territory code                                                             0
ISO numeric country/territory code                                                                 0
Region                                                                                             0
Year                                                                                               0
Estimated total population number                                                                  0
Estimated prevalence of TB (all forms) per 100 000 population                                      0
Estimated prevalence of TB (all forms)                                                             0
Method to derive prevalence estimates                      

In [11]:
# Columns to KEEP
keep_columns = [
    'Country or territory name',
    'ISO 3-character country/territory code',
    'Region',
    'Year',
    'Estimated number of incident cases (all forms)',
    'Estimated total population number',
    'Estimated prevalence of TB (all forms) per 100 000 population',
    'Estimated incidence (all forms) per 100 000 population',
    'Estimated mortality of TB cases (all forms, excluding HIV) per 100 000 population',
    'Estimated number of deaths from TB (all forms, excluding HIV)',
    'Estimated HIV in incident TB (percent)',
    'Estimated incidence of TB cases who are HIV-positive per 100 000 population',
    'Case detection rate (all forms), percent',
    'HIV_data_available'  # This was created in previous cleaning
]

# Filter the dataframe
df_clean = df[keep_columns].copy()

df_clean = df_clean.rename(columns={
    "Country or territory name": "Country",
    "Region": "Region",
    "Year": "Year",
    "Estimated number of incident cases (all forms)": "TB_Cases",
    "Estimated incidence (all forms) per 100 000 population": "Incidence_per_100k"
})

# Verify
print(f"Original columns: {len(df.columns)}")
print(f"Kept columns: {len(df_clean.columns)}")
print("\nMissing values in cleaned data:")
print(df_clean.isnull().sum())

Original columns: 42
Kept columns: 14

Missing values in cleaned data:
Country                                                                              0
ISO 3-character country/territory code                                               0
Region                                                                               0
Year                                                                                 0
TB_Cases                                                                             0
Estimated total population number                                                    0
Estimated prevalence of TB (all forms) per 100 000 population                        0
Incidence_per_100k                                                                   0
Estimated mortality of TB cases (all forms, excluding HIV) per 100 000 population    0
Estimated number of deaths from TB (all forms, excluding HIV)                        0
Estimated HIV in incident TB (percent)                     

In [12]:
df_clean.head()

Unnamed: 0,Country,ISO 3-character country/territory code,Region,Year,TB_Cases,Estimated total population number,Estimated prevalence of TB (all forms) per 100 000 population,Incidence_per_100k,"Estimated mortality of TB cases (all forms, excluding HIV) per 100 000 population","Estimated number of deaths from TB (all forms, excluding HIV)",Estimated HIV in incident TB (percent),Estimated incidence of TB cases who are HIV-positive per 100 000 population,"Case detection rate (all forms), percent",HIV_data_available
0,Afghanistan,AFG,EMR,1990,22000.0,11731193,306.0,189.0,37.0,4300.0,0.06,0.11,20.0,True
1,Afghanistan,AFG,EMR,1991,24000.0,12612043,343.0,191.0,46.0,5800.0,0.07,0.13,96.0,True
2,Afghanistan,AFG,EMR,1992,26000.0,13811876,371.0,191.0,54.0,7400.0,0.08,0.16,47.0,True
3,Afghanistan,AFG,EMR,1993,29000.0,15175325,392.0,189.0,60.0,9100.0,0.1,0.19,47.0,True
4,Afghanistan,AFG,EMR,1994,31000.0,16485018,410.0,188.0,65.0,11000.0,0.11,0.21,47.0,True


In [13]:
# Add dummy 2024 data by copying from 2013 for now
if 2024 not in df_clean['Year'].values:
    df_temp = df_clean[df_clean['Year'] == 2013].copy()
    df_temp['Year'] = 2024
    df_clean = pd.concat([df_clean, df_temp], ignore_index=True)

# Precompute global trend for line chart
df_global = df_clean.groupby('Year', as_index=False)['TB_Cases'].sum()

# -----------------------
# Initialize Dash App
# -----------------------
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# -----------------------
# App Layout
# -----------------------
app.layout = dbc.Container([

    html.H1("Tuberculosis Dashboard (1990–2024)", className="text-center my-3"),

    dbc.Row([
        dbc.Col([
            html.Label("Select Year:"),
            dcc.Slider(
                id='year-slider',
                min=1990, max=2024, step=1, value=2024,
                marks={str(year): str(year) for year in range(1990, 2025)},
                tooltip={"placement": "bottom"}
            )
        ])
    ], className="mb-4"),

    dbc.Row([
        dbc.Col([
            html.Label("Select Region:"),
            dcc.Dropdown(
                id='region-dropdown',
                options=[{'label': r, 'value': r} for r in sorted(df_clean['Region'].dropna().unique())],
                placeholder="All Regions",
                clearable=True
            )
        ], md=6),

        dbc.Col([
            html.Label("Select Country:"),
            dcc.Dropdown(id='country-dropdown', placeholder="All Countries", clearable=True)
        ], md=6),
    ], className="mb-4"),

    dbc.Row([
        dbc.Col(html.Div(id='kpi-total-cases'), md=6),
        dbc.Col(html.Div(id='kpi-incidence-rate'), md=6)
    ], className="mb-4"),

    dbc.Row([
        dbc.Col([
            html.Label("Top 10 Bar Metric:"),
            dcc.RadioItems(
                id='bar-metric',
                options=[
                    {"label": "TB Cases", "value": "TB_Cases"},
                    {"label": "Incidence per 100k", "value": "Incidence_per_100k"},
                ],
                value="TB_Cases",
                inline=True
            )
        ])
    ], className="mb-3"),

    dbc.Row([
        dbc.Col(dcc.Graph(id='global-trend'), md=12),
    ]),

    dbc.Row([
        dbc.Col(dcc.Graph(id='top-countries'), md=6),
        dbc.Col(dcc.Graph(id='pie-distribution'), md=6),
    ], className="mb-4"),

    dbc.Row([
        dbc.Col([
            html.Button("Download Filtered Data", id="download-btn", className="btn btn-primary"),
            dcc.Download(id="download-data")
        ])
    ], className="mb-4")
], fluid=True)

# -----------------------
# Callbacks
# -----------------------

@app.callback(
    Output('country-dropdown', 'options'),
    Input('region-dropdown', 'value')
)
def update_country_dropdown(selected_region):
    if selected_region:
        countries = df_clean[df_clean["Region"] == selected_region]["Country"].unique()
    else:
        countries = df_clean["Country"].unique()
    return [{'label': c, 'value': c} for c in sorted(countries)]


@app.callback(
    [
        Output('kpi-total-cases', 'children'),
        Output('kpi-incidence-rate', 'children'),
        Output('global-trend', 'figure'),
        Output('top-countries', 'figure'),
        Output('pie-distribution', 'figure'),
    ],
    [
        Input('year-slider', 'value'),
        Input('region-dropdown', 'value'),
        Input('country-dropdown', 'value'),
        Input('bar-metric', 'value')
    ]
)
def update_dashboard(year, region, country, metric):
    df_filtered = df_clean[df_clean['Year'] == year]

    if region:
        df_filtered = df_filtered[df_filtered['Region'] == region]
    if country:
        df_filtered = df_filtered[df_filtered['Country'] == country]

    total_cases = int(df_filtered['TB_Cases'].sum()) if not df_filtered.empty else 0
    avg_incidence = round(df_filtered['Incidence_per_100k'].mean(), 2) if not df_filtered.empty else 0.0

    kpi1 = dbc.Card([
        dbc.CardBody([
            html.H5("Total TB Cases", className="card-title"),
            html.H3(f"{total_cases:,}", className="text-danger")
        ])
    ], color="light")

    kpi2 = dbc.Card([
        dbc.CardBody([
            html.H5("Avg. Incidence per 100k", className="card-title"),
            html.H3(f"{avg_incidence}", className="text-primary")
        ])
    ], color="light")

    fig1 = px.line(df_global, x='Year', y='TB_Cases', title='Global TB Cases Over Time', markers=True)

    top10 = df_filtered.sort_values(by=metric, ascending=False).head(10)
    fig2 = px.bar(top10, x='Country', y=metric,
                  title=f'Top 10 Countries by {metric.replace("_", " ")} ({year})',
                  color=metric)

    fig3 = px.pie(top10, names='Country', values='TB_Cases',
                  title=f'TB Case Distribution in Top 10 Countries ({year})')

    return kpi1, kpi2, fig1, fig2, fig3


@app.callback(
    Output("download-data", "data"),
    Input("download-btn", "n_clicks"),
    State("year-slider", "value"),
    State("region-dropdown", "value"),
    State("country-dropdown", "value"),
    prevent_initial_call=True
)
def download_filtered_data(n_clicks, year, region, country):
    df_filtered = df_clean[df_clean['Year'] == year]
    if region:
        df_filtered = df_filtered[df_filtered["Region"] == region]
    if country:
        df_filtered = df_filtered[df_filtered["Country"] == country]
    return dcc.send_data_frame(df_filtered.to_csv, filename=f"TB_data_{year}.csv", index=False)

# -----------------------
# Run the app
# -----------------------
if __name__ == '__main__':
    app.run(debug=True, use_reloader=False)
