# Problem Statement

The company is facing mental health issues among its employees and is launching a program to address them. A survey has been conducted among the company’s staff, but the results are complex and not straightforward to interpret. The goal is to group respondents based on their answers and to provide visual aids that assist cluster interpretation.

# Objectives and Goals

1. Build an unsupervised machine learning model to categorize the survey’s participants based on their responses.
2. Create visualizations to gain insights into each cluster of participants and their key traits.

# Notebook Imports

In [None]:
!pip install prince

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer 
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from umap import UMAP

from prince import MCA

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold

from kmodes.kmodes import KModes

from yellowbrick.cluster import KElbowVisualizer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics import rand_score, fowlkes_mallows_score

import warnings
warnings.filterwarnings('ignore')

# Step 1: Data Preprocessing and Exploration

In [None]:
# Load the raw survey export. `df` keeps the handle returned by the reader;
# `data` is the frame that the following cells rename and prune in place.
df = pd.read_csv(
    '/kaggle/input/mental-health-in-tech-2016/mental-heath-in-tech-2016_20161114.csv'
)
data = pd.DataFrame(df)

# Number of survey questions (columns) before any cleaning.
data.shape[1]

In [None]:
# Count of missing answers for every question.
data.isnull().sum()

In [None]:
# Summary (count / unique / top / freq) of the text-valued columns only.
data.describe(include=['object'])

### Renaming Columns

In [None]:
# Maps each full survey question (the raw CSV header) to the short column name
# used throughout the rest of the notebook.
# NOTE: several short names are misspelled ('Discuission', 'Anonimity',
# 'Freinds', 'Proffesional', ...). Later cells reference them verbatim, so they
# must not be corrected here without updating every use.
# NOTE(review): 'Why or why not?.1' is presumably the name the CSV reader gave
# to the second, duplicated 'Why or why not?' header - confirm against the file.
dict_new_columns = {
    'Are you self-employed?': 'Self Employed',
    'How many employees does your company or organization have?': 'Number of employees',
    'Is your employer primarily a tech company/organization?': 'Tech Company',
    'Does your employer provide mental health benefits as part of healthcare coverage?': 'MH Coverage',
    'Do you know the options for mental health care available under your employer-provided coverage?': 'MH Coverage Awareness',
    'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?': 'MH Discussion',
    'Does your employer offer resources to learn more about mental health concerns and options for seeking help?': 'MH Resources Provided',
    'Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources provided by your employer?': 'MH Anonimity',
    'If a mental health issue prompted you to request a medical leave from work, asking for that leave would be:': 'Medical Leave',
    'Do you think that discussing a mental health disorder with your employer would have negative consequences?': 'MH Discuission Neg Impact',
    'Would you feel comfortable discussing a mental health disorder with your coworkers?': 'MH Discuission Coworkers',
    'Would you feel comfortable discussing a mental health disorder with your direct supervisor(s)?': 'MH Discuission Supervisor(s)',
    'Do you feel that your employer takes mental health as seriously as physical health?': 'MH as PH',
    'Have you heard of or observed negative consequences for co-workers who have been open about mental health issues in your workplace?': 'MH Coworker Reveal Neg Impact',
    'Do you feel that being identified as a person with a mental health issue would hurt your career?': 'MH Bad Impact on Career',
    'Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?': 'MH Neg View of Coworkers',
    'How willing would you be to share with friends and family that you have a mental illness?': 'MH Sharing Freinds/Family',
    'Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?': 'MH Bad Response',
    'Do you have a family history of mental illness?': 'MH Family History',
    'Have you had a mental health disorder in the past?': 'MH Disorder Past',
    'Do you currently have a mental health disorder?': 'Current MH Disorder',
    'Have you been diagnosed with a mental health condition by a medical professional?': 'MH Diagnos Proffesional',
    'Have you ever sought treatment for a mental health issue from a mental health professional?': 'MH Sought Proffes Treat',
    'What is your age?': 'Age',
    'What is your gender?': 'Gender',
    'What country do you live in?': 'Country',
    'What US state or territory do you live in?': 'US State',
    'What country do you work in?': 'Work Country',
    'What US state or territory do you work in?': 'Work US State',
    'Which of the following best describes your work position?': 'Work Position',
    'Do you work remotely?': 'Remote', 
    'Do you think that discussing a physical health issue with your employer would have negative consequences?': 'PH Discuission Neg Impact',
    'Do you have previous employers?': 'Prev Employers',
    'Have your previous employers provided mental health benefits?': 'Prev MH Benefits',
    'Were you aware of the options for mental health care provided by your previous employers?': 'MH Awarness Prev',
    'Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?': 'Prev MH Discussion',
    'Did your previous employers provide resources to learn more about mental health issues and how to seek help?': 'Prev MH Resources Provided',
    'Was your anonymity protected if you chose to take advantage of mental health or substance abuse treatment resources with previous employers?': 'Prev MH Anonimity',
    'Do you think that discussing a mental health disorder with previous employers would have negative consequences?': 'Prev MH Discuission Neg Impact',
    'Do you think that discussing a physical health issue with previous employers would have negative consequences?': 'Prev PH Discuission Neg Impact',
    'Would you have been willing to discuss a mental health issue with your previous co-workers?': 'Prev MH Discuission Coworkers',
    'Would you have been willing to discuss a mental health issue with your direct supervisor(s)?': 'Prev MH Discuission Supervisor(s)',
    'Did you feel that your previous employers took mental health as seriously as physical health?': 'Prev MH as PH',
    'Did you hear of or observe negative consequences for co-workers with mental health issues in your previous workplaces?': 'Prev MH Coworker Reveal Neg Impact',
    'Would you be willing to bring up a physical health issue with a potential employer in an interview?': 'PH in Interview',
    'Why or why not?': 'Why/why not',
    'Would you bring up a mental health issue with a potential employer in an interview?': 'MH in Interview',
    'Why or why not?.1': 'Why/why not (1)',
    'Have your observations of how another individual who discussed a mental health disorder made you less likely to reveal a mental health issue yourself in your current workplace?': 'Less Reveal MH after observation',
    'If you have a mental health issue, do you feel that it interferes with your work when being treated effectively?': 'MH Effective Treatment',
    'If you have a mental health issue, do you feel that it interferes with your work when NOT being treated effectively?': 'MH NOT Effective Treatment'
}

# Apply the short names in place; headers that are not keys of the mapping are
# left unchanged.
data.rename(columns=dict_new_columns, inplace=True)

In [None]:
# Boolean flag per column: True when the question has 500 or more missing
# answers. Flagged columns are removed from `data` in place.
missing_per_column = data.isna().sum()
cols_to_drop = list(missing_per_column >= 500)
data.drop(columns=data.columns[cols_to_drop], inplace=True)

# Number of columns that remain.
data.shape[1]

### Dealing with Missing Values

In [None]:
# Fill every remaining missing answer with the most frequent value of its
# column, then wrap the imputer's array output back into a labelled frame.
mode_imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputed_data = mode_imputer.fit_transform(data)
imputed_df = pd.DataFrame(imputed_data, columns=data.columns)

### Gender

In [None]:
# Free-text gender answers are collapsed into three labels: Male, Female, Other.
# The replacement is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `imputed_df['Gender']`: the chained in-place
# form operates on an intermediate object and does not update the frame under
# pandas Copy-on-Write.

# Male
male = ['Male', 'male', 'Male ', 'M', 'm', 'man', 'Cis male', 'Male.', 'male 9:1 female, roughly', 'Male (cis)', 'Man', 'Sex is male',
       'cis male', 'Malr', 'Dude', 'mail', 'M|', 'Male/genderqueer', 'male ', 'Cis Male', 'cisdude', 'cis man', 'MALE',
       "I'm a man why didn't you make this a drop down question. You should of asked sex? And I would of answered yes please. Seriously how much text can this take? "]

# Female
female = ['Female', 'female', 'female ',  'F', 'Woman', 'fm', 'f', 'Cis female ', 'Transitioned, M2F', ' Female', 'Cis-woman', 'AFAB',
          'Genderfluid (born female)',  'Female ', 'woman', 'female/woman', 'Cisgender Female', 'fem', 'female-bodied; no feelings about gender',
          'Female (props for making this a freeform field, though)', 'I identify as female.', 'Female assigned at birth ', 'Female or Multi-Gender Femme']

# Non-binary
other = ['non-binary', 'Agender', 'Nonbinary', 'Queer', 'nb masculine', 'mtf', 'human', 'genderqueer woman', 'genderqueer', 'Unicorn', 
         'Transgender woman', 'Bigender', 'Enby', 'Other', 'Other/Transfeminine', 'Human', 'Genderqueer', 'Genderflux demi-girl', 'Genderfluid', 
         'Fluid', 'Enby Bigender', 'Androgynous', 'none of your business', 'Male (trans, FtM)']

# Apply the three groups in the original order (Male, Female, Other).
for gender_label, variants in (('Male', male), ('Female', female), ('Other', other)):
    imputed_df['Gender'] = imputed_df['Gender'].replace(to_replace=variants, value=gender_label)

imputed_df[['Gender']].value_counts()

### Age

In [None]:
# Sanity check on Age: print the number of nulls, then show how often each
# age occurs.
age_frame = imputed_df[['Age']]
print(age_frame.isna().sum(), '\n')
age_frame.value_counts()

In [None]:
# weird ages: 3, 15, 99, 323
# Ages outside 18-75 are treated as entry errors and replaced with the rounded
# mean of the plausible ages. The plausible range must be combined with AND:
# `(age >= 18) | (age <= 75)` is true for every age, which let the outliers
# leak into the replacement value. The mask is also built from `imputed_df`
# only, not from the pre-imputation `data` frame.
ages = pd.to_numeric(imputed_df['Age'])
plausible = ages.between(18, 75)  # inclusive on both ends

replace = ages[~plausible].tolist()       # the outlier values being replaced
value = round(ages[plausible].mean())     # typical age of valid respondents

# Assign back rather than using a chained in-place replace, which does not
# update the frame under pandas Copy-on-Write.
imputed_df['Age'] = imputed_df['Age'].where(plausible, value)

### Country

In [None]:
# Group the countries that have fewer than 20 respondents under 'Other' and
# shorten the two longest country names.
countries = ['Sweden', 'France', 'Ireland', 'Switzerland', 'Brazil', 'India',
             'New Zealand', 'Russia', 'Denmark', 'Bulgaria','Finland','Italy', 'Belgium', 
             'Spain', 'Austria', 'South Africa', 'Poland ', 'Romania', 'Pakistan', 
             'Norway', 'Czech Republic', 'Chile', 'Lithuania', 'Other', 'Mexico',
             'Afghanistan', 'Japan', 'Colombia', 'Bosnia and Herzegovina', 'Estonia',
             'Israel', 'Venezuela', 'Argentina', 'Bangladesh', 'Taiwan', 'Brunei',
             'China', 'Serbia', 'Slovakia', 'Costa Rica', 'Ecuador', 'Greece', 'Guatemala', 
             'Hungary', 'Algeria', 'Iran', 'Vietnam', 'Poland']

# Assign the result back to the column: a chained
# `imputed_df['Country'].replace(..., inplace=True)` does not update the frame
# under pandas Copy-on-Write.
imputed_df['Country'] = (
    imputed_df['Country']
    .replace(to_replace=countries, value='Other')
    .replace({'United States of America': 'USA', 'United Kingdom': 'UK'})
)

imputed_df[['Country']].value_counts()

In [None]:
# Two more small countries are appended to the 'Other' group in place, before
# the list is used for the work-country column.
countries += ["United Arab Emirates", "Turkey"]

In [None]:
# Same clean-up as for 'Country': shorten the USA/UK names and fold the small
# countries into 'Other'. The result is assigned back because a chained
# in-place replace does not update the frame under pandas Copy-on-Write.
imputed_df['Work Country'] = (
    imputed_df['Work Country']
    .replace({'United States of America': 'USA', 'United Kingdom': 'UK'})
    .replace(to_replace=countries, value='Other')
)

In [None]:
# Number of respondents per (grouped) work country.
work_country_frame = imputed_df[['Work Country']]
work_country_frame.value_counts()

### Why / Why not

In [None]:
# Keyword table for the free-text "Why or why not?" answers. Each category
# name maps to a list of lowercase word fragments; the insertion order of the
# dict is the order in which categories are tried.
categories = {
    "Negative Impact on Hiring Chances": [
        "depend", "support", "same", "bad", "deter", "relate", "qualif",
        "underst", "hurt", "chance", "crazy", "doubt", "weak", "product",
        "liability", "nothing", "allude", "reject", "respect", "eliminat",
        "pass", "leave", "filter", "expect", "complain",
    ],
    "Fear of Discrimination or Prejudice": [
        "stigma", "discrimination", "cost", "serious", "IQ", "believe",
        "disqualify", "discount", "prejudice", "taboo", "judge", "fear",
        "disability", "unstable", "unreliable",
    ],
    "Privacy and Personal Concerns": [
        "personal", "priva", "accommodation", "situation", "trust", "problem",
        "shar", "sensitive", "never", "feel", "business", "reflect",
        "embarass", "obligat", "concern", "condition",
    ],
    "Uncertainty about Employer Reaction": [
        "reaction", "recept", "sure", "issue", "offer", "know", "neg",
        "afraid", "refus", "necessary", "vulnerable", "stupid", "crap",
    ],
    "Concerns about Job Performance or Suitability": [
        "map", "bias", "impact", "compromise", "uncomfortable", "relate",
        "job", "against", "sign", "discuss", "ffect", "no",
    ],
    "Legal and Rights Considerations": [
        "manageable", "suicide", "dismis", "depression", "anxiety", "ocd",
        "burn", "consider",
    ],
    "Desire for Transparency and Alignment": [
        "support", "relevant", "reaction", "prompt", "doubt", "perceive",
        "risk", "accept", "honest", "perception", "option", "important",
        "open", "protect",
    ],
    "Lack of Benefit in Disclosing": [
        "appl", "above", "part", "topical", "stance", "aware", "roadblock",
        "won't", "harm", "position", "benef", "previous", "physical",
        "potential", "as", "see", "disclos",
    ],
}

# Named handles onto the individual keyword lists (the same list objects that
# are stored in `categories`).
negative_impact_conditions = categories["Negative Impact on Hiring Chances"]
discrimination_conditions = categories["Fear of Discrimination or Prejudice"]
privacy_conditions = categories["Privacy and Personal Concerns"]
uncertainty_conditions = categories["Uncertainty about Employer Reaction"]
job_performance_conditions = categories["Concerns about Job Performance or Suitability"]
legal_conditions = categories["Legal and Rights Considerations"]
transparency_conditions = categories["Desire for Transparency and Alignment"]
lack_of_benefit_conditions = categories["Lack of Benefit in Disclosing"]

In [None]:
responses = imputed_df['Why/why not (1)'].unique()

for response in responses:
    for word in response.lower().split():
        for key, values in categories.items():
            if any(value in word for value in values):
                imputed_df['Why/why not (1)'].replace(response, value=key, inplace=True) 
                break

for response in responses:
    for key, values in categories.items():
        if key != response:
            imputed_df['Why/why not (1)'].replace(response, value="Other", inplace=True)
                                                      
imputed_df['Why/why not (1)'].unique()

In [None]:
responses_2 = imputed_df['Why/why not'].unique()

for response in responses_2:
    for word in response.lower().split():
        for key, values in categories.items():
            if any(value in word for value in values):
                imputed_df['Why/why not'].replace(response, value=key, inplace=True)
                break

for response in responses_2:
    for key, values in categories.items():
        if key != response:
            imputed_df['Why/why not'].replace(response, value="Other", inplace=True)
                                                      
imputed_df['Why/why not'].unique()

### Work Position

In [None]:
# Distinct raw role strings, captured before they are overwritten.
job_roles = imputed_df['Work Position'].unique()

# Role group -> fragments that identify it (matched case-insensitively).
categories_2 = {
    "IT Development": ["Back", "Developer", "Front"],
    "Management": ["Supervisor","Team", "Lead", "Leadership"],
    "DevOps": ["DevOps", "SysAdmin"],
    "Advocacy": ["Evangelist", "Advocate"],
    "Support": ["Support"],
    "Design": ["Design"], 
    "Sales": ["Sales"],
    "Other": ["Other"],
    "One-person shop":["One","person", "shop"],
    "HR": ['HR']
}


def _group_role(role):
    """Return the role group for one 'Work Position' answer.

    Words of the answer are scanned in order and groups are tried in the order
    of `categories_2`; the first fragment found inside a word decides the
    group. Answers with no matching fragment are returned unchanged.
    """
    for word in str(role).lower().split():
        for group, fragments in categories_2.items():
            if any(fragment.lower() in word for fragment in fragments):
                return group
    return role


# Assign back instead of a chained in-place replace, which does not update the
# frame under pandas Copy-on-Write.
imputed_df['Work Position'] = imputed_df['Work Position'].map(_group_role)

imputed_df['Work Position'].value_counts()

## Encoding

In [None]:
# Build the model-ready frame: text columns become integer labels, numeric
# answers stay numeric.
columns_ = imputed_df.columns
labeled_df = imputed_df.copy()

le = LabelEncoder()

for column in columns_:
    if labeled_df[column].dtype != 'object':
        continue
    try:
        # `imputed_df` is rebuilt from the imputer's array output, so numeric
        # answers such as Age may arrive as object dtype. Label-encoding their
        # string form would order them lexicographically and discard the
        # actual magnitudes, so convert them back to numbers instead.
        labeled_df[column] = pd.to_numeric(labeled_df[column])
    except (ValueError, TypeError):
        # Genuinely categorical text: encode each distinct string as an int.
        labeled_df[column] = le.fit_transform(labeled_df[column].astype(str))

labeled_df.head()

## Standardizing

In [None]:
# Standardise the Age column (fit and transform in one step) and write the
# scaled values back into the frame.
scaler = StandardScaler()
scaled_age = scaler.fit_transform(labeled_df[['Age']])
labeled_df["Age"] = scaled_age

labeled_df.head()

## Removing Highly Correlated Features

In [None]:
# Heatmap of the pairwise correlations. The upper triangle (including the
# diagonal) is masked so each pair is drawn once.
plt.figure(figsize=(20, 16))

pairwise_corr = labeled_df.corr()
mask = np.triu(np.ones_like(pairwise_corr, dtype=bool))

sns.heatmap(
    pairwise_corr,
    mask=mask,
    vmin=-1,
    vmax=1,
    annot=True,
    annot_kws={'size': 6},
    linewidths=0.7,
)

In [None]:
# Remove redundant features: for every pair whose absolute correlation exceeds
# `threshold`, keep the earlier column and drop the later one.
#
# Dropping both members of a pair throws the shared signal away entirely, and
# dropping inside the loop raises KeyError as soon as one column takes part in
# two correlated pairs. The columns are therefore collected first and dropped
# once.
corr_matrix = labeled_df.corr()
threshold = 0.7

redundant = []
for position, column in enumerate(corr_matrix.columns):
    # Only compare against earlier columns that are still being kept.
    kept_before = [c for c in corr_matrix.columns[:position] if c not in redundant]
    if (corr_matrix.loc[column, kept_before].abs() > threshold).any():
        redundant.append(column)

labeled_df.drop(columns=redundant, inplace=True)

labeled_df.shape

## Visualizations and Insights

### Number of Responders by Country and by Company Size

> According to the survey responses, the most common company size is '6-25', with about 533 responses from tech companies and 46 from non-tech ones, while the '1-5' company size has the fewest responses, with 52 from tech companies and 8 from non-tech ones. It is worth mentioning that non-tech companies tend to have more than 1,000 employees.

> In terms of country distribution, the United States of America has the highest number of responses from both types of companies, followed by the United Kingdom with 144 responses from tech companies and 36 from non-tech ones, while Australia has the lowest number of responses from both types of companies, with 30 and 5, respectively.


In [None]:
# Two side-by-side count plots, both split by whether the employer is a tech
# company: respondents per company size (left) and per country (right).
plt.figure(figsize=(15, 5))

# Left panel: company size
plt.subplot(1, 2, 1)
ax = sns.countplot(
    x=imputed_df['Number of employees'],
    hue=imputed_df['Tech Company'],
    order=['1-5', '6-25', '26-100', '100-500', '500-1000', 'More than 1000'],
    palette=["#FFD9B7", "#7EAA92"],
)
# Print the count on top of each bar of both hue groups.
for group_index in (0, 1):
    ax.bar_label(ax.containers[group_index])

plt.title('No. of Respondents by Company Size')
plt.xlabel('Company Size')
plt.ylabel('No. of Responders')
plt.legend(['Not Tech', 'Tech'])

# Right panel: country
plt.subplot(1, 2, 2)
ax = sns.countplot(
    x=imputed_df['Country'],
    hue=imputed_df['Tech Company'],
    palette=["#FFD9B7", "#7EAA92"],
)
for group_index in (0, 1):
    ax.bar_label(ax.containers[group_index])

plt.title('No. of Respondents by Country')
plt.ylabel('No. of Responders')
plt.legend(['Not Tech', 'Tech'])

# plt.savefig("company_country.png")

### Gender by Country

> According to the charts, the majority of participants from all countries are male, and approximately one quarter are female.


In [None]:
# Gender split per country (bars) and overall (pie).
colors = ["#7EAA92", "#FFD9B7" ,"#A1CCD1"]
gender_labels = ['Male', 'Female', 'Other']

plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)

# `hue_order` pins the bar groups to the same order as the legend labels and
# colours; without it the order depends on the data and the hard-coded legend
# can end up naming the wrong bars.
ax = sns.countplot(x=imputed_df['Country'], hue=imputed_df['Gender'],
                   hue_order=gender_labels, palette=colors)
for container in ax.containers:
    ax.bar_label(container)

plt.ylabel("Number of Responders")
plt.legend(gender_labels)

# Pie chart

# Look the counts up by label, so every slice matches its label and colour;
# positional access such as values[0] on a label-indexed Series relies on the
# sort order of the counts.
values = imputed_df['Gender'].value_counts()
sizes = [values.get(label, 0) for label in gender_labels]

plt.subplot(1, 2, 2)
plt.pie(sizes, colors=colors, labels=gender_labels, autopct='%1.f%%', startangle=150, pctdistance=0.85)

# plt.savefig("gender.png")
plt.show()

### Mental Health Disorder 

> **About 40%** of respondents confirmed that they currently have a mental health disorder, while half of them had a mental health disorder in the past. 

> Approximately half of the participants have a family history of mental illness and **34.1%** do not.

> When it comes to sharing their mental illness with family or friends, **around 62%** of respondents expressed openness or some degree of openness, while **about 20%** did not.


In [None]:
# Shorten the long "not applicable" answer so it fits as a pie label. The
# result is assigned back because a chained in-place replace does not update
# the frame under pandas Copy-on-Write.
imputed_df['MH Sharing Freinds/Family'] = imputed_df['MH Sharing Freinds/Family'].replace(
    to_replace='Not applicable to me (I do not have a mental illness)',
    value='No mental illness',
)

In [None]:
# 2x2 grid of donut charts: current disorder, past disorder, family history
# and willingness to share with friends/family.
def _count_answers(column, answers):
    """Count the rows of `imputed_df` whose `column` equals each answer, in order."""
    return [int((imputed_df[column] == answer).sum()) for answer in answers]


colors = ["#7EAA92", "#FFD9B7", "#A1CCD1"]
explode = (0.05,0.05,0.05)
colors_4 = ["#7EAA92", "#FFD9B7" ,"#A1CCD1", '#445069', '#FCBAAD', '#916DB3']
explode_4 = (0.05, 0.05, 0.05, 0.05, 0.05, 0.05)

# First Pie: Current MH Disorder
labels = ['No', 'Yes', 'Maybe']
no_mhd, yes_mhd, maybe_mhd = sizes = _count_answers('Current MH Disorder', labels)

# Second Pie: Past MH Disorder
labels_2 = ['No', 'Yes', 'Maybe']
no_mhd_2, yes_mhd_2, maybe_mhd_2 = sizes_2 = _count_answers('MH Disorder Past', labels_2)

# Third Pie Chart: MH Family History
labels_3 = ['No', 'Yes', "I don't know"]
no_mhd_3, yes_mhd_3, maybe_mhd_3 = sizes_3 = _count_answers('MH Family History', labels_3)

# Fourth Pie Chart: MH Sharing Freinds/Family
labels_4 = ['Somewhat open', 'Neutral', 'No mental illness', 
            'Very open', 'Not open at all', 'Somewhat not open']
som_open, neutral, no_mhd, open_, no_open, som_no_open = sizes_4 = _count_answers(
    'MH Sharing Freinds/Family', labels_4
)

fig, axs = plt.subplots(2, 2, figsize=(14, 10))
plt.suptitle('Mental Health Disorder', fontsize=15)

# (axes, counts, slice labels, palette, explode offsets, title) per panel.
donut_panels = [
    (axs[0, 0], sizes, labels, colors, explode,
     "Do you currently have a mental health disorder?"),
    (axs[0, 1], sizes_2, labels_2, colors, explode,
     "Have you had a mental health disorder in the past?"),
    (axs[1, 0], sizes_3, labels_3, colors, explode,
     "Do you have a family history of mental illness?"),
    (axs[1, 1], sizes_4, labels_4, colors_4, explode_4,
     "How willing would you be to share with friends and family that you have a mental illness?"),
]

for panel_ax, counts, slice_labels, palette, offsets, panel_title in donut_panels:
    panel_ax.pie(counts, colors=palette, labels=slice_labels, autopct='%1.1f%%',
                 startangle=90, pctdistance=0.85, explode=offsets)

    # A white disc over the middle turns the pie into a donut.
    centre_circle = plt.Circle((0.0125, 0.0125), 0.6, color='white')
    panel_ax.add_artist(centre_circle)

    panel_ax.set_title(panel_title)

# plt.savefig("mhd.png")

plt.show()

### Mental Health Disorder and Career

> Around **45%** of respondents think that having a mental health disorder can harm their career, even though **45.8%** have never experienced or observed a bad response to one in the workplace.

In [None]:
# Shorten three of the 'MH Bad Impact on Career' answers so they fit as pie
# labels. `to_replace[i]` becomes `new_values[i]`.
to_replace = ["No, I don't think it would", "Yes, I think it would", 'No, it has not']
new_values = ["I don't think so", "I think yes", 'No']

# One mapping-based replace, assigned back to the column: a chained in-place
# replace does not update the frame under pandas Copy-on-Write.
imputed_df['MH Bad Impact on Career'] = imputed_df['MH Bad Impact on Career'].replace(
    dict(zip(to_replace, new_values))
)

In [None]:
# Two pies: perceived career impact of a mental health disorder (right) and
# observed/experienced bad responses at work (left).
colors = ["#7EAA92", "#FFD9B7", "#A1CCD1", '#445069', '#FCBAAD']

# First Pie: MH Bad Impact on Career
labels = ['No', 'Yes, it has', 'Maybe', "I don't think so", 'I think yes']
career_answers = imputed_df['MH Bad Impact on Career']
no_mhd, yes_mhd, maybe_mhd, think_no, think_yes = sizes = [
    int((career_answers == answer).sum()) for answer in labels
]
explode = (0.025, 0.025, 0.025, 0.025, 0.025)

# Second Pie: MH Bad Response
labels_2 = ['No', 'Yes, I experienced', 'Maybe/Not sure', 'Yes, I observed']
response_answers = imputed_df['MH Bad Response']
no_mhd_2, yes_mhd_2, maybe_mhd_2, yes_obsv = sizes_2 = [
    int((response_answers == answer).sum()) for answer in labels_2
]
explode_2 = (0.025, 0.025, 0.025, 0.025)

plt.figure(figsize=(14, 6))
plt.suptitle('MHD Impact on Career', fontsize=15)

# Right-hand slot: career impact.
plt.subplot(1, 2, 2)
plt.pie(sizes, colors=colors, labels=labels, autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode=explode)
plt.title("Do you think that being a person with MHD can harm your career?")

# Left-hand slot: bad responses (only the first four colours are needed).
plt.subplot(1, 2, 1)
plt.pie(sizes_2, colors=colors[:4], labels=labels_2, autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode=explode_2)
plt.title("Have you observed or experienced a bad response to MHD?")

# plt.savefig("career.png")

plt.show()

### Interviews

> According to the survey, almost one-third of the participants **(30.8%)** would not discuss their physical health issues with a potential employer due to privacy, fear of the negative impact on their chances of being hired, and uncertainty about the employer's reaction. And **61.8%** of participants would hesitate to discuss their mental health issues for the same reasons. 

> On the other hand, only **7.8%** of participants would be open to talking about their mental health issues, while **25.1%** would be comfortable discussing their physical health issues.


In [None]:
# Physical health in an interview: share of No/Yes/Maybe answers (left pie)
# and the categorised reasons behind them (right count plot).
labels = ['No', 'Yes', 'Maybe']
colors = ["#7EAA92", "#FFD9B7", "#A1CCD1"]

plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 2)

# Pass the frame via `data=` and the columns by name; a positional frame is
# not accepted by every seaborn version.
sns.countplot(data=imputed_df, x='Why/why not', hue='PH in Interview',
              hue_order=labels, palette=colors)
plt.xticks(rotation=90)

# The reasons are on the x axis and the counts on the y axis (the labels were
# previously swapped).
plt.xlabel("Reasons")
plt.ylabel("No. of Responders")
plt.title("Why/why not")

# Second Pie: PH in Interview

no_mhd = imputed_df[imputed_df['PH in Interview'] == "No"].shape[0]
yes_mhd = imputed_df[imputed_df['PH in Interview'] == "Yes"].shape[0]
maybe_mhd = imputed_df[imputed_df['PH in Interview'] == "Maybe"].shape[0]

sizes = [no_mhd, yes_mhd, maybe_mhd]
explode = (0.025, 0.025, 0.025)

plt.subplot(1, 2, 1)

plt.pie(sizes, colors=colors, labels=labels, autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode=explode)
plt.title("Physical Health in an Interview")

# plt.savefig("ph_interview.png")

plt.show()

In [None]:
# Mental health in an interview: share of No/Yes/Maybe answers (left pie) and
# the categorised reasons behind them (right count plot).

# Defined here so the cell does not depend on variables left behind by the
# previous plotting cell.
labels = ['No', 'Yes', 'Maybe']
colors = ["#7EAA92", "#FFD9B7", "#A1CCD1"]
explode = (0.025, 0.025, 0.025)

# First Chart

no_mhd_2 = imputed_df[imputed_df['MH in Interview'] == "No"].shape[0]
yes_mhd_2 = imputed_df[imputed_df['MH in Interview'] == "Yes"].shape[0]
maybe_mhd_2 = imputed_df[imputed_df['MH in Interview'] == "Maybe"].shape[0]

sizes_2 = [no_mhd_2, yes_mhd_2, maybe_mhd_2]

plt.figure(figsize=(14, 5))
plt.subplot(1, 2, 1)
plt.pie(sizes_2, colors=colors, labels=labels, autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode=explode)

plt.title("Mental Health in an Interview")

# Second Chart
plt.subplot(1, 2, 2)

# Pass the frame via `data=` and the columns by name; a positional frame is
# not accepted by every seaborn version.
sns.countplot(data=imputed_df, x='Why/why not (1)', hue='MH in Interview',
              hue_order=labels, palette=colors)
plt.xticks(rotation=90)

# The reasons are on the x axis and the counts on the y axis (the labels were
# previously swapped).
plt.title('Why/why not')
plt.xlabel('Reasons')
plt.ylabel("No. of Responders")

# plt.savefig("mh_interview.png")

plt.show()

### Mental Health Discussion in the Current Workplace

> Currently, only **16%** of respondents report that their employer has formally discussed mental health, while **almost two-thirds** say it has not.

> **30.6%** of employees do not believe that discussing mental health issues with their employers can lead to negative consequences, while roughly **15.4%** do.

> **19.2%** of employees feel comfortable discussing mental health issues with their coworkers and **almost half** with their supervisors, while **27.4%** and **23.4%** of them, respectively, do not.

In [None]:
# 2x2 grid of pies about discussing mental health in the current workplace.
def _tally(column, answers):
    """Count the rows of `imputed_df` whose `column` equals each answer, in order."""
    return [int((imputed_df[column] == answer).sum()) for answer in answers]


colors = ["#7EAA92", "#FFD9B7", "#A1CCD1"]
explode = (0.025, 0.025, 0.025)

# First Pie: MH Discussion
labels = ['No', 'Yes', "I don't know"]
no_mhd, yes_mhd, dont_know = sizes = _tally('MH Discussion', labels)

# Second Pie: MH Discuission Neg Impact
labels_2 = ['No', 'Yes', 'Maybe']
no_mhd_2, yes_mhd_2, maybe_mhd_2 = sizes_2 = _tally('MH Discuission Neg Impact', labels_2)

# Third Pie: MH Discuission Coworkers
no_mhd_3, yes_mhd_3, maybe_mhd_3 = sizes_3 = _tally('MH Discuission Coworkers', labels_2)

# Fourth Pie: MH Discuission Supervisor(s)
no_mhd_4, yes_mhd_4, maybe_mhd_4 = sizes_4 = _tally('MH Discuission Supervisor(s)', labels_2)

plt.figure(figsize=(14, 10))
plt.suptitle('Mental Health Discussion in the Current Workplace', fontsize=15)

# (grid position, counts, slice labels, title) per panel.
pie_panels = [
    (1, sizes, labels, "Has your employer ever formally discussed mental health?"),
    (2, sizes_2, labels_2, "Will discussing MHD with your employer have negative consequences?"),
    (3, sizes_3, labels_2, "Would you feel comfortable discussing a MHD with your coworkers?"),
    (4, sizes_4, labels_2, "Would you feel comfortable discussing a MHD with your supervisor(s)?"),
]

for grid_position, counts, slice_labels, panel_title in pie_panels:
    plt.subplot(2, 2, grid_position)
    plt.pie(counts, colors=colors, labels=slice_labels, autopct='%1.1f%%',
            startangle=90, pctdistance=0.85, explode=explode)
    plt.title(panel_title)

# plt.savefig('current.png')

plt.show()

### Mental Health Discussion in the Previous Workplace

> **Almost 20%** of respondents report that all or some of their previous employers formally discussed mental health, while **almost two-thirds** say that none of them did.

> **Approximately 70%** of participants think that discussing a mental health disorder with previous employers would have had negative consequences, while only **7.9%** think it would not.

> **Approximately 70%** of employees felt comfortable discussing mental health issues with coworkers at some or all of their previous employers, and **63.9%** with their previous supervisors, while **30%** and **29%** of them, respectively, did not.

In [None]:
# Pie charts for the previous-workplace questions, drawn on one 2x2 figure.
labels = ['No', 'Yes', "I don't know", "Some of them"]
colors = ["#7EAA92", "#FFD9B7", "#A1CCD1", '#445069']
explode = (0.025, 0.025, 0.025, 0.025)

# Pin every answer label to one colour so a slice keeps its colour in all four
# pies, including the three-slice pie that has no "I don't know" answer.
color_by_label = dict(zip(labels, colors))


def count_answers(column, answers):
    """Return how many rows of imputed_df hold each of `answers` in `column`, in order."""
    return [imputed_df[imputed_df[column] == answer].shape[0] for answer in answers]


plt.figure(figsize=(14, 10))
plt.suptitle('Mental Health Discussion in the Previous Workplace', fontsize=15)

# First Pie: Prev MH Discussion
sizes = count_answers(
    'Prev MH Discussion',
    ["None did", "Yes, they all did", "I don't know", "Some did"],
)

plt.subplot(2, 2, 1)
plt.pie(sizes, colors=colors, labels=labels, autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode=explode)
plt.title("Had your employer ever formally discussed mental health?")

# Second Pie: Prev MH Discuission Neg Impact
sizes_2 = count_answers(
    'Prev MH Discuission Neg Impact',
    ["None of them", "Yes, all of them", "I don't know", "Some of them"],
)

plt.subplot(2, 2, 2)
plt.pie(sizes_2, colors=colors, labels=labels, autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode=explode)
plt.title("Was discussing MHD with your employer had negative consequences?")

# Third Pie: Prev MH Discuission Coworkers (no "I don't know" slice is counted here)
labels_3 = ['No', 'Yes', "Some of them"]
explode_3 = (0.025, 0.025, 0.025)
colors_3 = [color_by_label[label] for label in labels_3]

sizes_3 = count_answers(
    'Prev MH Discuission Coworkers',
    [
        "No, at none of my previous employers",
        "Yes, at all of my previous employers",
        "Some of my previous employers",
    ],
)

plt.subplot(2, 2, 3)
plt.pie(sizes_3, colors=colors_3, labels=labels_3, autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode=explode_3)
plt.title("Did you feel comfortable discussing a MHD with your coworkers?")

# Fourth Pie: Prev MH Discuission Supervisor(s)
sizes_4 = count_answers(
    'Prev MH Discuission Supervisor(s)',
    [
        "No, at none of my previous employers",
        "Yes, at all of my previous employers",
        "I don't know",
        "Some of my previous employers",
    ],
)

plt.subplot(2, 2, 4)
plt.pie(sizes_4, colors=colors, labels=labels, autopct='%1.1f%%', startangle=90, pctdistance=0.85, explode=explode)
plt.title("Did you feel comfortable discussing a MHD with your supervisor(s)?")

# plt.savefig('past.png')

plt.show()

### Mental Health Coverage Awareness / Resources

> According to the survey’s results, **more than half** of the respondents have mental health benefits covered by their healthcare plan, while around **15%** do not.

> **54%** of participants are not aware of their mental health care choices, and **57.1%** report that their employers do not offer any mental health information or support.

In [None]:
# MH coverage / awareness / resources pies, drawn side by side on one 1x3 figure.
labels = ['No', 'Yes', "I don't know", "N/A"]
labels_2 = ['No', 'Yes', 'Not Sure']
colors = ["#7EAA92", "#FFD9B7", "#A1CCD1", '#445069']
explode = (0.025, 0.025, 0.025, 0.025)
explode_2 = (0.025, 0.025, 0.025)

# Formatting shared by the three pies.
pie_options = {'autopct': '%1.1f%%', 'startangle': 90, 'pctdistance': 0.85}

plt.figure(figsize=(18, 6))
plt.suptitle('MH Coverage Awareness / Resources', fontsize=15)

# First Pie: MH Coverage
coverage_answers = ["No", "Yes", "I don't know", "Not eligible for coverage / N/A"]
sizes = [
    imputed_df[imputed_df['MH Coverage'] == answer].shape[0]
    for answer in coverage_answers
]

plt.subplot(1, 3, 1)
plt.pie(sizes, colors=colors, labels=labels, explode=explode, **pie_options)
plt.title("Are MH benefits included as part of healthcare coverage?")

# Second Pie: MH Coverage Awareness
awareness_answers = ["No", "Yes", "I am not sure"]
sizes_2 = [
    imputed_df[imputed_df['MH Coverage Awareness'] == answer].shape[0]
    for answer in awareness_answers
]

plt.subplot(1, 3, 2)
plt.pie(sizes_2, colors=colors[:3], labels=labels_2, explode=explode_2, **pie_options)
plt.title("Are you aware of MH care choices?")

# Third Pie: MH Resources Provided (reuses the first three labels of the first pie)
resources_answers = ["No", "Yes", "I don't know"]
sizes_3 = [
    imputed_df[imputed_df['MH Resources Provided'] == answer].shape[0]
    for answer in resources_answers
]

plt.subplot(1, 3, 3)
plt.pie(sizes_3, colors=colors[:3], labels=labels[:3], explode=explode_2, **pie_options)
plt.title("Does your employer offer resources for MH information and support?")

# Save before show() so the figure is written while it is still the current figure.
plt.savefig('awareness.png')

plt.show()

## 

# Step 2: Dimensionality Reduction and Clustering

## Multiple Correspondence Analysis (MCA)

In [None]:
# Fit a 100-component MCA on every column of labeled_df except 'Age'.
mca = MCA(
    n_components=100,
    n_iter=3,
    copy=True,
    check_input=True,
    random_state=42,
    one_hot=True,
)
mca_input = labeled_df.drop(columns=['Age'])
mca_components = mca.fit(mca_input)

In [None]:
# Show the fitted MCA's eigenvalues summary as the cell output.
mca_components.eigenvalues_summary

In [None]:
# Total contribution of every MCA feature across all components, largest first.
contribution_totals = (
    mca_components.column_contributions_
    .sum(axis=1)
    .sort_values(ascending=False)
)

# Feature labels and their totals, both in the same descending order.
features_names = contribution_totals.keys()
features_values = list(contribution_totals)

In [None]:
# Horizontal bar chart: one bar per MCA feature, bar length = summed contribution.
plt.figure(figsize=(20, 25))

plt.barh(features_names, features_values)

plt.xlabel("Sum of Feature Contribution")
plt.ylabel("Feature Names")
plt.title("MCA as Feature Selection", size=16)

## Feature Selection

In [None]:
# Map the 33 highest-contributing MCA features back to the labeled_df columns they
# were derived from.

def _source_column(feature, columns):
    """Return the column that a one-hot MCA feature label belongs to, or None.

    A feature label is assumed to start with the name of its source column
    (pandas.get_dummies-style naming) -- confirm for the installed prince version.
    The longest matching column wins, so 'MH Coverage Awareness_1' maps to
    'MH Coverage Awareness' and never to 'MH Coverage'. A plain substring test
    would also match unrelated columns such as 'MH Anonimity' inside
    'Prev MH Anonimity_1'.
    """
    feature = str(feature)
    owners = [column for column in columns if feature.startswith(str(column))]
    return max(owners, key=lambda column: len(str(column))) if owners else None


top_feature_sources = {
    _source_column(feature, labeled_df.columns) for feature in features_names[:33]
}

# Keep labeled_df's own column order so the selection is identical on every run
# (iterating a set of strings has no stable order between interpreter sessions).
selected_features = [
    column for column in labeled_df.columns if column in top_feature_sources
]

len(selected_features)

In [None]:
# Keep only the selected columns; this is the feature matrix used for clustering.
reduced_df = labeled_df.loc[:, selected_features]
reduced_df.head()

## Choosing k value for K-means

### Elbow-Visualizer

In [None]:
# Create a K-means model and an elbow visualizer for it.
# random_state is fixed, as it is for the final K-means model, so the elbow curve
# is the same on every run.
model = KMeans(n_init=10, init="k-means++", max_iter=200, random_state=0)

visualizer = KElbowVisualizer(model, timings=False)

# Fit the visualizer on the reduced feature set and show the plot.
plt.figure(figsize=(10, 6))
visualizer.fit(reduced_df)
visualizer.show()

## Kmeans

In [None]:
# Final K-means model: five clusters fitted on the reduced feature set.
n_clusters = 5

kmeans = KMeans(
    n_clusters=n_clusters,
    init='k-means++',
    n_init='auto',
    max_iter=200,
    random_state=0,
)
clustering = kmeans.fit_predict(reduced_df)

# cluster label of every row of reduced_df
labels = kmeans.labels_

# coordinates of the cluster centroids, one entry per cluster
centers = kmeans.cluster_centers_

### Analyzing Clusters

In [None]:
# Size of every cluster and the mean distance of its members to the centroid.

def _describe_cluster(cluster_id):
    """Return the id, member count and mean member-to-centroid distance of one cluster."""
    members = reduced_df[labels == cluster_id]
    distances = np.linalg.norm(members - centers[cluster_id], axis=1)
    return {
        "Cluster": cluster_id,
        "Size": len(members),
        "Avg Distance": np.mean(distances),
    }


cluster_characteristics = [
    _describe_cluster(cluster_id) for cluster_id in range(n_clusters)
]

cluster_characteristics

In [None]:
# Cluster-quality metrics. Every score is computed on reduced_df, the feature
# matrix K-means was fitted on, so all of them describe the same feature space
# as the model's inertia.

# Inertia
inertia_score = kmeans.inertia_
print("Inertia:", inertia_score)

# Silhouette Score
silhouette_avg = silhouette_score(reduced_df, labels)
print("Silhouette Avg:", silhouette_avg)

# Calinski Harabasz Score
cal_score = calinski_harabasz_score(reduced_df, labels)
print("Calinski Harabasz Score:", cal_score)

# Davies Bouldin Score
d_score = davies_bouldin_score(reduced_df, labels)
print("Davies Bouldin Score:", d_score)

### Visualization of Kmeans with TSNE and UMAP

In [None]:
# Project reduced_df to two dimensions with t-SNE and draw one scatter series per
# K-means cluster.
tsne = TSNE(n_components=2, random_state=0)
projections = tsne.fit_transform(reduced_df)

plt.figure(figsize=(8, 6))
for cluster_id in range(n_clusters):
    in_cluster = labels == cluster_id
    # The legend numbers clusters from 1, while the labels themselves start at 0.
    plt.scatter(
        projections[in_cluster, 0],
        projections[in_cluster, 1],
        label=f'Cluster {cluster_id + 1}',
    )
plt.title('K-means Clustering with t-SNE')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
plt.show()

In [None]:
# Project reduced_df to two dimensions with UMAP and draw one scatter series per
# K-means cluster.
umap = UMAP(n_components=2, random_state=0, init='random')
umap_projections = umap.fit_transform(reduced_df)

plt.figure(figsize=(8, 6))
for cluster_id in range(n_clusters):
    in_cluster = labels == cluster_id
    # The legend numbers clusters from 1, while the labels themselves start at 0.
    plt.scatter(
        umap_projections[in_cluster, 0],
        umap_projections[in_cluster, 1],
        label=f'Cluster {cluster_id + 1}',
    )
plt.title('K-means Clustering with UMAP')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.legend()
plt.show()

# Step 3: Results

## Cluster Profiling

In [None]:
# Attach the K-means cluster labels to the imputed survey answers.
# NOTE(review): assumes imputed_df rows are in the same order as the rows K-means
# was fitted on -- confirm.
imputed_df['Clusters'] = labels

In [None]:
def cluster_profile(data):
    """Return the most frequent value of every column in one cluster's rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to profile. A column named 'Clusters' is skipped.

    Returns
    -------
    dict
        Maps '<column> Frequent' to the column's most common non-null value,
        or to None when the column has no non-null values.
    """
    profile = {}

    for column_name in data.columns:
        if column_name == 'Clusters':
            continue

        # value_counts() is ordered by descending count and leaves out nulls, so
        # it is empty for an all-null column; guard before taking the first label.
        counts = data[column_name].value_counts()
        profile[f'{column_name} Frequent'] = counts.index[0] if len(counts) else None

    return profile

In [None]:
def _with_clusters(columns):
    """Return the given imputed_df columns followed by the 'Clusters' column."""
    return imputed_df[[*columns, 'Clusters']]


# Who the respondent is and where they work.
employee_info = _with_clusters([
    'Tech Company',
    'Self Employed',
    'Number of employees',
    'Gender',
    'Work Position',
    'Remote',
])

# Personal mental health history and treatment.
employee_mh_info = _with_clusters([
    'MH Sharing Freinds/Family',
    'MH Bad Response',
    'MH Family History',
    'MH Disorder Past',
    'Current MH Disorder',
    'MH Diagnos Proffesional',
    'MH Sought Proffes Treat',
    'MH Effective Treatment',
    'MH NOT Effective Treatment',
])

# Discussing mental health at the current and previous workplaces.
mh_discussion = _with_clusters([
    'MH Discuission Neg Impact',
    'MH Discuission Coworkers',
    'MH Discuission Supervisor(s)',
    'Prev MH Discussion',
    'Prev MH Discuission Neg Impact',
    'Prev MH Discuission Coworkers',
    'Prev MH Discuission Supervisor(s)',
])

# Interviews and career impact.
career = _with_clusters([
    'PH in Interview',
    'Why/why not',
    'MH in Interview',
    'Why/why not (1)',
    'MH Bad Impact on Career',
])

# Coverage, resources and anonymity at the current and previous employers.
mh_resources = _with_clusters([
    'MH Coverage',
    'MH Coverage Awareness',
    'MH Resources Provided',
    'MH Anonimity',
    'Prev MH Benefits',
    'MH Awarness Prev',
    'Prev MH Resources Provided',
    'Prev MH Anonimity',
])

In [None]:
# Most frequent answer per cluster for the employee-info columns: one profile dict
# per cluster, expanded into one table row per cluster.
employee_info_profiles = employee_info.groupby('Clusters').apply(cluster_profile)
employee_info_profiles_df = pd.DataFrame(
    list(employee_info_profiles),
    index=employee_info_profiles.index,
)
employee_info_profiles_df

In [None]:
# Most frequent answer per cluster for the personal mental-health columns: one
# profile dict per cluster, expanded into one table row per cluster.
employee_mh_info_profiles = employee_mh_info.groupby('Clusters').apply(cluster_profile)
employee_mh_info_profiles_df = pd.DataFrame(
    list(employee_mh_info_profiles),
    index=employee_mh_info_profiles.index,
)
employee_mh_info_profiles_df

In [None]:
# Most frequent answer per cluster for the workplace-discussion columns: one
# profile dict per cluster, expanded into one table row per cluster.
mh_discussion_profiles = mh_discussion.groupby('Clusters').apply(cluster_profile)
mh_discussion_profiles_df = pd.DataFrame(
    list(mh_discussion_profiles),
    index=mh_discussion_profiles.index,
)
mh_discussion_profiles_df

In [None]:
# Most frequent answer per cluster for the interview / career columns: one profile
# dict per cluster, expanded into one table row per cluster.
career_profiles = career.groupby('Clusters').apply(cluster_profile)
career_profiles_df = pd.DataFrame(
    list(career_profiles),
    index=career_profiles.index,
)
career_profiles_df

In [None]:
# Most frequent answer per cluster for the coverage / resources columns: one
# profile dict per cluster, expanded into one table row per cluster.
mh_resources_profiles = mh_resources.groupby('Clusters').apply(cluster_profile)
mh_resources_profiles_df = pd.DataFrame(
    list(mh_resources_profiles),
    index=mh_resources_profiles.index,
)
mh_resources_profiles_df

### Visualization

In [None]:
# Count plot of every employee-info column, split by cluster, on a 2x3 grid.
plt.figure(figsize=(16, 10))
for position, column in enumerate(employee_info.columns, start=1):
    if column == 'Clusters':
        continue  # the cluster label is the hue, not a plotted column
    plt.subplot(2, 3, position)
    sns.countplot(x=employee_info[column], hue=employee_info['Clusters'])
    plt.xticks(rotation=90)

In [None]:
# Count plot of every personal mental-health column, split by cluster, on a 3x3 grid.
plt.figure(figsize=(15, 15))
for position, column in enumerate(employee_mh_info.columns, start=1):
    if column == 'Clusters':
        continue  # the cluster label is the hue, not a plotted column
    plt.subplot(3, 3, position)
    sns.countplot(x=employee_mh_info[column], hue=employee_mh_info['Clusters'])
    plt.xticks(rotation=90)

# Same spacing for the whole figure, so it is set once after the panels exist.
plt.subplots_adjust(wspace=0.2)

In [None]:
# Count plot of every workplace-discussion column, split by cluster, on a 3x3 grid.
plt.figure(figsize=(15, 15))
for position, column in enumerate(mh_discussion.columns, start=1):
    if column == 'Clusters':
        continue  # the cluster label is the hue, not a plotted column
    plt.subplot(3, 3, position)
    sns.countplot(x=mh_discussion[column], hue=mh_discussion['Clusters'])
    plt.xticks(rotation=90)

# Extra vertical room for the rotated tick labels.
plt.subplots_adjust(wspace=0.2, hspace=0.5)

In [None]:
# Count plot of every interview / career column, split by cluster, on a 2x3 grid.
plt.figure(figsize=(20, 15))
for position, column in enumerate(career.columns, start=1):
    if column == 'Clusters':
        continue  # the cluster label is the hue, not a plotted column
    plt.subplot(2, 3, position)
    sns.countplot(x=career[column], hue=career['Clusters'])
    plt.xticks(rotation=90)

# Same spacing for the whole figure, so it is set once after the panels exist.
plt.subplots_adjust(wspace=0.2, hspace=0.8)

In [None]:
# Count plot of every coverage / resources column, split by cluster, on a 3x3 grid.
plt.figure(figsize=(20, 20))
for position, column in enumerate(mh_resources.columns, start=1):
    if column == 'Clusters':
        continue  # the cluster label is the hue, not a plotted column
    plt.subplot(3, 3, position)
    sns.countplot(x=mh_resources[column], hue=mh_resources['Clusters'])
    plt.xticks(rotation=90)

# Same spacing for the whole figure, so it is set once after the panels exist.
plt.subplots_adjust(wspace=0.2, hspace=0.8)