In [30]:
# import libraries 
import os
import sys
import pandas as pd
import numpy as np
from dash import Dash, dcc, html, Input, Output
import dash_bootstrap_components as dbc
import plotly.express as px
import threading, time, socket
from IPython.display import IFrame, display, clear_output

In [31]:
# Load the cleaned survey responses. The path is relative, so the CSV must sit
# in the notebook's working directory.
df = pd.read_csv('cleaned_data.csv')

# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Age,Gender,Country,State,Self Employed,Family History,Treatment,Work Interfere,Num Employees,Remote Work,...,Anonymity,Leave,Mental Health Consequence,Phys Health Consequence,Coworkers,Supervisor,Mental Health Interview,Phys Health Interview,Mental vs Physical,Obs Consequence
0,46,Male,United States,MD,Yes,Yes,No,Sometimes,1-5,Yes,...,Yes,Very easy,No,No,Yes,Yes,No,Yes,Yes,Yes
1,36,Male,France,,Yes,Yes,No,Unknown,6-25,Yes,...,Yes,Somewhat easy,No,No,Some of them,Some of them,Maybe,Maybe,Don't know,No
2,29,Male,United States,NY,No,Yes,Yes,Sometimes,100-500,No,...,No,Somewhat difficult,Maybe,No,Some of them,Some of them,No,No,No,No
3,31,Male,United States,NC,Yes,No,No,Never,1-5,Yes,...,Yes,Somewhat difficult,No,No,Some of them,Some of them,No,Maybe,Yes,No
4,46,Male,United States,MA,No,No,Yes,Often,26-100,Yes,...,Don't know,Don't know,Maybe,No,Some of them,Yes,No,Maybe,No,No


In [32]:
# Count missing entries per column; the bare expression on the last line
# renders the resulting Series as the cell output.
missing_values = df.isna().sum()
missing_values

Age                            0
Gender                         0
Country                        0
State                        506
Self Employed                  0
Family History                 0
Treatment                      0
Work Interfere                 0
Num Employees                  0
Remote Work                    0
Tech Company                   0
Benefits                       0
Care Options                   0
Wellness Program               0
Seek Help                      0
Anonymity                      0
Leave                          0
Mental Health Consequence      0
Phys Health Consequence        0
Coworkers                      0
Supervisor                     0
Mental Health Interview        0
Phys Health Interview          0
Mental vs Physical             0
Obs Consequence                0
dtype: int64

In [33]:
# Print the distinct values found in every column, one column per paragraph.
for col_name in df.columns:
    distinct_values = df[col_name].unique()
    print(f"{col_name}: {distinct_values}\n")

Age: [46 36 29 31 41 33 35 34 37 32 30 42 40 27 38 50 24 18 28 26 22 44 23 19
 25 39 45 21 43 56 60 54 55 48 20 57 58 47 62 51 65 49 53 61 72]

Gender: ['Male' 'Female' 'Other']

Country: ['United States' 'France' 'United Kingdom' 'Canada' 'Portugal'
 'Netherlands' 'Switzerland' 'Poland' 'Australia' 'Germany' 'Russia'
 'Mexico' 'Brazil' 'Slovenia' 'Costa Rica' 'Austria' 'Ireland' 'India'
 'South Africa' 'Italy' 'Bulgaria' 'Sweden' 'Colombia' 'Latvia' 'Romania'
 'Belgium' 'New Zealand' 'Spain' 'Finland' 'Uruguay' 'Israel'
 'Bosnia and Herzegovina' 'Hungary' 'Singapore' 'Japan' 'Nigeria'
 'Croatia' 'Norway' 'Thailand' 'Denmark' 'Greece' 'Moldova' 'Georgia'
 'China' 'Czech Republic' 'Philippines']

State: ['MD' nan 'NY' 'NC' 'MA' 'IA' 'CA' 'TN' 'OH' 'PA' 'WA' 'WI' 'IN' 'TX' 'MI'
 'IL' 'UT' 'NM' 'OR' 'FL' 'MN' 'MO' 'AZ' 'CT' 'CO' 'GA' 'DC' 'NE' 'WV'
 'OK' 'KS' 'VA' 'NH' 'KY' 'AL' 'NV' 'NJ' 'SC' 'VT' 'SD' 'ID' 'MS' 'RI'
 'WY' 'LA' 'ME']

Self Employed: ['Yes' 'No']

Family History: ['Yes' '

In [None]:
# Distribution of remote respondents across company-size bins: of the rows with
# Remote Work == 'Yes', what percentage fall into each 'Num Employees' bin.
# The values sum to 100% across bins.
# NOTE(review): this is not the remote-work rate *within* each company size;
# that would need a group-by on 'Num Employees'. Confirm which metric is intended.
remote_percent = df[df['Remote Work'] == 'Yes']['Num Employees'].value_counts(normalize=True) * 100
print(remote_percent)

Num Employees
1-5               26.027397
6-25              23.835616
26-100            21.917808
More than 1000    11.780822
100-500           11.780822
500-1000           4.657534
Name: proportion, dtype: float64


In [48]:
# Company-size bins (by number of employees), listed from smallest to largest.
company_size = ['1-5', '6-25', '26-100', '100-500', '500-1000', 'More than 1000']


# Helper functions
def sorted_company_bins(values):
    """Return the distinct non-null entries of ``values`` in company-size order.

    Bins that appear in ``company_size`` come first, in that list's order.
    Any other distinct values follow, in order of first appearance.
    """
    seen = list(pd.Series(values).dropna().unique())
    known_bins = [size for size in company_size if size in seen]
    other_bins = [value for value in seen if value not in company_size]
    return known_bins + other_bins

def apply_filters(df, countries=None, gender=None, age_range=None, age_bounds=(18, 72)):
    """Return the rows of ``df`` that match the selected dashboard filters.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey responses; the 'Country', 'Gender' and 'Age' columns are read
        only when the corresponding filter is active.
    countries : str or list-like of str, optional
        Keep rows whose 'Country' is one of these. A single string is treated
        as a one-element list. ``None`` or empty disables the filter.
    gender : str, optional
        Keep rows whose 'Gender' equals this value. ``None``, empty or 'All'
        disables the filter.
    age_range : (lo, hi), optional
        Inclusive age window. ``None`` or empty disables the filter.
    age_bounds : (min_age, max_age), default (18, 72)
        Inclusive plausibility bounds applied together with ``age_range``;
        only used when ``age_range`` is given.

    Returns
    -------
    pandas.DataFrame
        The filtered rows. When no filter is active, ``df`` itself is returned.
    """
    d = df
    if countries:
        if isinstance(countries, str):
            # A bare string would make ``isin`` raise; treat it as one country.
            countries = [countries]
        d = d[d['Country'].isin(countries)]
    if gender and gender != 'All':
        d = d[d['Gender'] == gender]
    if age_range:
        lo, hi = age_range
        min_age, max_age = age_bounds
        # Coerce once; non-numeric ages become NaN and fail both range checks.
        ages = pd.to_numeric(d['Age'], errors='coerce')
        d = d[ages.between(min_age, max_age) & ages.between(lo, hi)]
    return d

In [49]:
# Colour sequence handed to plotly for the stacked bars; colours are taken in
# list order.
# NOTE(review): the first five entries look like the IBM Design colour-blind-safe
# palette; the last three (teal, cyan, violet) do not appear to be part of that
# published set — confirm they keep the charts colour-blind friendly.
ibm_palette = [
    "#648fff",  # blue
    "#785ef0",  # purple
    "#dc267f",  # magenta
    "#fe6100",  # orange
    "#ffb000",  # yellow
    "#19a979",  # teal
    "#06a0c9",  # cyan
    "#92278f",  # violet
]

In [71]:
# Choices offered by the filter controls, derived from the data and sorted
# alphabetically; the gender list is prefixed with an 'All' entry.
country_options = sorted(df['Country'].dropna().unique())
gender_options = ['All'] + sorted(df['Gender'].dropna().unique())

# initialize app
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

app.layout = dbc.Container([
    html.H3("Exploring Mental Health in the Tech Industry", className="mb-2"),
    dcc.Markdown(
        """
 \n \n \n
**Dashboard Overview** \n
This app utilizes data collected by **OSMH (Open Source Mental Health)**, a non-profit organization dedicated 
to raising awareness and reducing stigma surrounding mental illness within the tech and developer community.

The OSMI Mental Health in Tech Survey is conducted annually and the data is made publicly available on the 
OSMH website. This specific dataset is from the 2014 study and contains over 1200 responses from individuals 
working in the tech industry. It includes questions regarding mental health history, employer support, perceived 
consequences, and workplace culture.

Dataset: https://osmhhelp.org/research.html

**Usage** \n
Use the filters to narrow respondents by **Country**, **Gender**, **Age** (18–72), and **Work Mode**.
Charts will update automatically to reflect selected filters.

**KPIs**
- **Respondents** — number of respondents (after filters are applied)
- **Treatment rate** — percentage of respondents who are receiving or have received mental health treatment

**Charts**
- **Mental Health Benefits by Company Size** — respondents who are provided mental health benefits by their employer
- **Treatment by Company Size** — respondents who are receiving or have received mental health treatment
- **Work Interference by Company Size** — respondents who report that their mental health conditions interfere with their work
- **Perceived Consequence by Company Size** — respondents who believe that disclosing their mental health conditions to their employer would have negative consequences
        """,
        className="mb-3"
    ),


    # create filters
    dbc.Row([
        dbc.Col(
            dbc.Card(
                dbc.CardBody([
                    # dropdown menu for country of residence
                    dbc.Label("Country"),
                    dcc.Dropdown(
                        id='country_dd',
                        options=[{'label': c, 'value': c} for c in country_options],
                        value=None, multi=True, placeholder="Select country/countries"
                    ),
                    html.Br(),

                    # radio buttons for respondent gender
                    dbc.Label("Gender"),
                    dcc.RadioItems(
                        id='gender_radio',
                        options=[{'label': g, 'value': g} for g in gender_options],
                        value='All',
                        inline=True,
                        style={"display": "flex", "gap": "25px"},
                        inputStyle={"margin-right": "5px"}
                    ),
                    html.Br(),

                    # slider for respondent age, ranging from 18 to 72
                    dbc.Label("Age range"),
                    dcc.RangeSlider(
                        id='age_slider',
                        min=18, max=72,
                        step=1,  # whole-year increments
                        value=[18, 72],
                        marks={i: str(i) for i in range(20, 71, 5)},
                        tooltip={"always_visible": True, "placement": "bottom"},
                        dots=False,
                        allowCross=False
                    ),
                    html.Br(),

                    # radio buttons for work mode, remote or in-office
                    dbc.Label("Work mode"),
                    dcc.RadioItems(
                        id='workmode_radio',
                        options=[
                            {'label': 'All', 'value': 'All'},
                            {'label': 'Remote', 'value': 'Remote'},
                            {'label': 'In-office', 'value': 'In-office'},
                        ],
                        value='All',
                        inline=True,
                        style={"display": "flex", "gap": "25px"},
                        inputStyle={"margin-right": "5px"}
                    ),
                ]),
                style={"minHeight": "300px", "padding": "12px"},
                className="shadow-sm"
            ),
            width=12
        )
    ], className="mb-3"),

    # KPI row
    dbc.Row([
        dbc.Col(
            dbc.Card(dbc.CardBody([
                html.Div("Respondents", className="text-muted small"),
                html.H3(id='kpi_count', className='mb-0')
            ]), className="shadow-sm"),
            md=6
        ),
        dbc.Col(
            dbc.Card(dbc.CardBody([
                html.Div("Treatment rate", className="text-muted small"),
                html.H3(id='kpi_treat', className='mb-0')
            ]), className="shadow-sm"),
            md=6
        ),
    ], className="mb-3 g-3"),

    # plots, in display order. Each id names the figure the callback sends to
    # it, so keep this list, the Output list and the callback's return tuple
    # in the same order.
    dcc.Graph(id='fig_benefits_by_size',  style={"height": "480px"}),
    dcc.Graph(id='fig_treatment_by_size', style={"height": "480px"}, className="mb-3"),
    dcc.Graph(id='fig_interfere_by_size', style={"height": "480px"}, className="mb-3"),
    dcc.Graph(id='fig_mhc_by_size',       style={"height": "480px"}, className="mb-3"),
], fluid=True, className='p-4')


def _stacked_by_size(data, target_col, title, order_map=None):
    """Build a 100%-stacked bar chart of ``target_col`` per company size.

    Parameters
    ----------
    data : pandas.DataFrame
        Filtered respondents; needs 'Num Employees' and ``target_col``.
    target_col : str
        Column whose answer shares are stacked within each company-size bar.
    title : str
        Figure title.
    order_map : list of str, optional
        Preferred legend/stack order for the answers in ``target_col``.
        Entries that do not occur in the data are ignored.

    Returns
    -------
    plotly figure
        The stacked bar chart, or an empty scatter titled "No data for <title>"
        when there is nothing to plot.
    """
    if 'Num Employees' not in data.columns or target_col not in data.columns:
        return px.scatter(title=f"No data for {title}")

    # Only rows with both a company size and an answer can be counted.
    valid = data.dropna(subset=['Num Employees', target_col])
    if valid.empty:
        return px.scatter(title=f"No data for {title}")

    size_order = sorted_company_bins(valid['Num Employees'])
    counts = (
        valid.groupby(['Num Employees', target_col])
             .size()
             .reset_index(name='n')
    )
    # Share of each answer within its company-size bin.
    counts['percent'] = 100 * counts['n'] / counts.groupby('Num Employees')['n'].transform('sum')

    cat_orders = {'Num Employees': size_order}
    if order_map is not None:
        answers_present = set(counts[target_col].unique())
        cat_orders[target_col] = [answer for answer in order_map if answer in answers_present]

    fig = px.bar(
        counts, x='Num Employees', y='percent', color=target_col,
        category_orders=cat_orders,
        labels={'Num Employees': 'Company size', 'percent': 'Percent', target_col: target_col},
        title=title, barmode='stack',
        color_discrete_sequence=ibm_palette  # use IBM palette
    )
    fig.update_layout(yaxis_range=[0, 100], yaxis_ticksuffix='%', bargap=0.15, legend_title_text=target_col)
    return fig


# callback to update KPIs and charts
@app.callback(
    Output('kpi_count','children'),
    Output('kpi_treat','children'),
    Output('fig_benefits_by_size','figure'),
    Output('fig_treatment_by_size','figure'),
    Output('fig_interfere_by_size','figure'),
    Output('fig_mhc_by_size','figure'),
    Input('country_dd','value'),
    Input('gender_radio','value'),
    Input('age_slider','value'),
    Input('workmode_radio','value')
)
def update(countries, gender, age_range, workmode):
    """Recompute both KPIs and all four charts for the current filter values.

    Returns a tuple in the same order as the ``Output`` declarations above:
    respondent count, treatment rate, then the benefits, treatment,
    work-interference and perceived-consequence figures.
    """
    # base filters
    sub = apply_filters(df, countries=countries, gender=gender, age_range=age_range)

    # apply work-mode filter
    if workmode == 'Remote':
        sub = sub[sub['Remote Work'].eq('Yes')]
    elif workmode == 'In-office':
        sub = sub[sub['Remote Work'].eq('No')]

    count = len(sub)
    # The mean of an empty selection is undefined; show a dash instead.
    treated_pct = (sub['Treatment'].eq('Yes').mean() * 100) if count > 0 else float('nan')

    fig_benefits = _stacked_by_size(
        sub, 'Benefits', 'Mental Health Benefits by Company Size',
        order_map=['Yes', 'No', "Don't know"]
    )
    fig_treatment = _stacked_by_size(
        sub, 'Treatment', 'Treatment by Company Size',
        order_map=['Yes', 'No']
    )
    # 'Unknown' is listed last explicitly so its position in the stack does not
    # depend on row order.
    fig_interfere = _stacked_by_size(
        sub, 'Work Interfere', 'Work Interference by Company Size',
        order_map=['Never', 'Rarely', 'Sometimes', 'Often', 'Unknown']
    )
    fig_mhc = _stacked_by_size(
        sub, 'Mental Health Consequence', 'Perceived Mental Health Consequence by Company Size',
        order_map=['Yes', 'Maybe', 'No']
    )

    return (
        f"{count:,}",
        (f"{treated_pct:.1f}%" if pd.notna(treated_pct) else "—"),
        fig_benefits,
        fig_treatment,
        fig_interfere,
        fig_mhc
    )

In [72]:
# Runner configuration: the local port the Dash server listens on and the
# loopback URL embedded in the notebook output.
PORT = 8056
URL  = f"http://127.0.0.1:{PORT}"

def _port_in_use(port: int) -> bool:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(("127.0.0.1", port)) == 0

def _stop_server(port: int):
    try:
        # try a conventional shutdown endpoint if you added one; otherwise ignore errors
        requests.get(f"http://127.0.0.1:{port}/_shutdown", timeout=0.5)
        time.sleep(0.3)
    except Exception:
        pass

def _run_server():
    """Start the Dash app on ``PORT`` with debug mode and the reloader off.

    Used as the target of the background thread created by the runner cell.
    """
    # Dash/Flask server
    app.run(port=PORT, debug=False, use_reloader=False)

# stop any old server on this port (best effort)
if _port_in_use(PORT):
    _stop_server(PORT)
    time.sleep(0.5)

# If something is still listening, a new server could not bind the port, so
# starting one would only fail inside the thread. Remember this to warn below.
stale_server = _port_in_use(PORT)

if not stale_server:
    # start fresh
    server_thread = threading.Thread(target=_run_server, daemon=True)
    server_thread.start()

    # Wait until the server accepts connections (up to 10 s) rather than
    # sleeping for a fixed time and hoping start-up has finished.
    deadline = time.monotonic() + 10.0
    while not _port_in_use(PORT) and time.monotonic() < deadline:
        time.sleep(0.1)

clear_output(wait=True)
if stale_server:
    print(
        f"Port {PORT} is still in use by a server that could not be stopped; "
        "the frame below shows that server, which may not reflect the current app. "
        "Restart the kernel or change PORT to start a fresh one."
    )
display(IFrame(src=URL, width="100%", height=720))

