In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

import seaborn as sns
import matplotlib.pyplot as plt

import re

# Dataset Documentation
## This dataset is from a 2014 survey that measures attitudes towards mental health and frequency of mental health disorders in the tech workplace. You are also encouraged to analyze data from the ongoing 2016 survey found here.

## Content:

Timestamp

Age

*18 to 100*

Gender

*F=1, M=0*

Country

*'United States' else 'Other'*

state: If you live in the United States, which state or territory do you live in?

*drop*

self_employed: Are you self-employed?

*boolean*

family_history: Do you have a family history of mental illness?

*boolean*

treatment: Have you sought treatment for a mental health condition?

*boolean*

work_interfere: If you have a mental health condition, do you feel that it interferes with your work?

*{'Never': 1, 'Rarely': 2, 'Sometimes': 3, 'Often': 4}*

no_employees: How many employees does your company or organization have?

*{'1-5': '1', '6-25': '2', '26-100': '3', 'More than 100': '4', '100-500': '5', '500-1000': '6', 'More than 1000': '7'}*

remote_work: Do you work remotely (outside of an office) at least 50% of the time?

*boolean*

tech_company: Is your employer primarily a tech company/organization?

*boolean*

benefits: Does your employer provide mental health benefits?

*categorical*

care_options: Do you know the options for mental health care your employer provides?

*categorical*

wellness_program: Has your employer ever discussed mental health as part of an employee wellness program?

*categorical*

seek_help: Does your employer provide resources to learn more about mental health issues and how to seek help?

*categorical*

anonymity: Is your anonymity protected if you choose to take advantage of mental health or substance abuse treatment resources?

*drop*

leave: How easy is it for you to take medical leave for a mental health condition?

*{'Very easy': 1, 'Somewhat easy': 2, "Don't know": 3, 'Somewhat difficult': 4, 'Very difficult': 5}*

mental_health_consequence: Do you think that discussing a mental health issue with your employer would have negative consequences?

*Yes=3, No=1, Maybe=2*

phys_health_consequence: Do you think that discussing a physical health issue with your employer would have negative consequences?

*Yes=3, No=1, Maybe=2*

coworkers: Would you be willing to discuss a mental health issue with your coworkers?

*'Yes': 3, 'No': 1, 'Some of them':2*

supervisor: Would you be willing to discuss a mental health issue with your direct supervisor(s)?

*'Yes': 3, 'No': 1, 'Some of them':2*

mental_health_interview: Would you bring up a mental health issue with a potential employer in an interview?

*Yes=3, No=1, Maybe=2*

phys_health_interview: Would you bring up a physical health issue with a potential employer in an interview?

*Yes=3, No=1, Maybe=2*

mental_vs_physical: Do you feel that your employer takes mental health as seriously as physical health?

*categorical*

obs_consequence: Have you heard of or observed negative consequences for coworkers with mental health conditions in your workplace?

*boolean*

comments: Any additional notes or comments

https://www.kaggle.com/osmi/mental-health-in-tech-survey

# STEP 1 - Cleanning

In [2]:
data_folder = Path('/Users/beatrizrenault/Documents/IH_DATA/DATASETS/')
file_to_open = data_folder / "MentalHealthTech.csv"
ds = pd.read_csv(file_to_open)

In [3]:
#ds.head()
#ds.info()
#ds.shape
#ds.describe()
#ds.columns

## Age

In [4]:
ds['Age'].unique()
# Get values: 18 < VALUE > 100
indexNames = ds[(ds['Age'] < 18) | (ds['Age'] > 100)].index
# Delete these row indexes from dataFrame
ds.drop(indexNames, inplace=True)
#ds.sort_values('Age', ascending = False)

In [5]:
'''
IN CASE I WANT TO BIN THE AGES IN THE FUTURE
(I WILL TRY WITH PLOTS FIRST)

labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)
df.head(10)'''

'\nIN CASE I WANT TO BIN THE AGES IN THE FUTURE\n(I WILL TRY WITH PLOTS FIRST)\n\nlabels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]\ndf[\'group\'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)\ndf.head(10)'

## Gender

In [6]:
ds['Gender'] = ds.Gender.str.lower()
ds['Gender'] = ds.Gender.str.strip()

# Get values: != of f, m, female and male. And delete them. They are just a few.
indexNames_gender = ds[ds['Gender'].map(len) > 6].index
ds.drop(indexNames_gender, inplace=True)

indexNames_gender_ = ds[  (ds['Gender'] != 'm')
                        & (ds['Gender'] != 'male')  
                        & (ds['Gender'] != 'female')   
                        & (ds['Gender'] != 'f')].index

ds.drop(indexNames_gender_, inplace=True)

ds['Gender'] = ds.Gender.str.replace('female','f').str.replace('male','m')
ds['Gender'] = ds.Gender.str.upper()

#ds.groupby(ds['Gender']).count()

## Country & State

In [7]:
# Get values: != of f, m, female and male. And delete them. They are just a few.
ds['Country']  = [row if row == 'United States' else 'Other' for row in ds['Country']]
ds = ds.drop(['state'], axis=1)
#extra drop
ds = ds.drop(['anonymity'], axis=1)
ds = ds.drop(['Timestamp'], axis=1)

In [8]:
#ds.head(10)
#ds.info()

## Boolean Columns

In [9]:
#Visualization
ds.groupby(ds['self_employed']).count()
ds.groupby(ds['family_history']).count()
ds.groupby(ds['treatment']).count()
ds.groupby(ds['remote_work']).count()

Unnamed: 0_level_0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,tech_company,benefits,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
remote_work,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
No,851,851,851,839,851,851,664,851,851,851,...,851,851,851,851,851,851,851,851,851,95
Yes,356,356,356,350,356,356,289,356,356,356,...,356,356,356,356,356,356,356,356,356,55


In [10]:
#Alteration
ds['self_employed'] = ds.self_employed.map(dict(Yes=1, No=0)).astype(float)
ds['family_history'] = ds.family_history.map(dict(Yes=1, No=0)).astype(float)
ds['treatment'] = ds.treatment.map(dict(Yes=1, No=0)).astype(float)
ds['remote_work'] = ds.remote_work.map(dict(Yes=1, No=0)).astype(float)
ds['obs_consequence'] = ds.obs_consequence.map(dict(Yes=1, No=0)).astype(float)
ds['tech_company'] = ds.tech_company.map(dict(Yes=1, No=0)).astype(float)
ds['Gender'] = ds.Gender.map(dict(F=1, M=0)).astype(float)

ds

Unnamed: 0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,37,1.0,United States,,0.0,1.0,Often,6-25,0.0,1.0,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,0.0,
1,44,0.0,United States,,0.0,0.0,Rarely,More than 1000,0.0,0.0,...,Don't know,Maybe,No,No,No,No,No,Don't know,0.0,
2,32,0.0,Other,,0.0,0.0,Rarely,6-25,0.0,1.0,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,0.0,
3,31,0.0,Other,,1.0,1.0,Often,26-100,0.0,1.0,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,1.0,
4,31,0.0,United States,,0.0,0.0,Never,100-500,1.0,1.0,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,26,0.0,Other,0.0,0.0,1.0,,26-100,0.0,1.0,...,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,0.0,
1255,32,0.0,United States,0.0,1.0,1.0,Often,26-100,1.0,1.0,...,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,0.0,
1256,34,0.0,United States,0.0,1.0,1.0,Sometimes,More than 1000,0.0,1.0,...,Somewhat difficult,Yes,Yes,No,No,No,No,No,0.0,
1257,46,1.0,United States,0.0,0.0,0.0,,100-500,1.0,1.0,...,Don't know,Yes,No,No,No,No,No,No,0.0,


## Scales

In [11]:
#Visualization
ds.groupby(ds['work_interfere']).count()
ds.groupby(ds['no_employees']).count()
ds.groupby(ds['leave']).count()

#Alteration
ds['work_interfere'].replace({'Never': 1, 'Rarely': 2, 'Sometimes': 3, 'Often': 4}, inplace = True)
ds['no_employees'].replace({'1-5': 1, '6-25': 2, '26-100': 3, 'More than 100': 4, '100-500': 5, '500-1000': 6, 'More than 1000': 7}, inplace = True)
ds['leave'].replace({'Very easy': 1, 'Somewhat easy': 2, "Don't know": 3, 'Somewhat difficult': 4, 'Very difficult': 5}, inplace = True)

ds.dtypes

Age                            int64
Gender                       float64
Country                       object
self_employed                float64
family_history               float64
treatment                    float64
work_interfere               float64
no_employees                   int64
remote_work                  float64
tech_company                 float64
benefits                      object
care_options                  object
wellness_program              object
seek_help                     object
leave                          int64
mental_health_consequence     object
phys_health_consequence       object
coworkers                     object
supervisor                    object
mental_health_interview       object
phys_health_interview         object
mental_vs_physical            object
obs_consequence              float64
comments                      object
dtype: object

## Maybe / Don't Know

In [12]:
#visualization

#maybe
ds.groupby(ds['mental_health_consequence']).count()
ds.groupby(ds['phys_health_consequence']).count()
#some of them
ds.groupby(ds['coworkers']).count()
ds.groupby(ds['supervisor']).count()
#maybe
ds.groupby(ds['mental_health_interview']).count()
ds.groupby(ds['phys_health_interview']).count()
#Don't know
ds.groupby(ds['mental_vs_physical']).count()



Unnamed: 0_level_0,Age,Gender,Country,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,...,seek_help,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,obs_consequence,comments
mental_vs_physical,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Don't know,559,559,559,551,559,559,425,559,559,559,...,559,559,559,559,559,559,559,559,559,63
No,323,323,323,316,323,323,272,323,323,323,...,323,323,323,323,323,323,323,323,323,35
Yes,325,325,325,322,325,325,256,325,325,325,...,325,325,325,325,325,325,325,325,325,52


In [13]:

#Alteration

#some of them
ds['coworkers'].replace({'Yes': 3, 'No': 1, 'Some of them':2}, inplace = True)
ds['supervisor'].replace({'Yes': 3, 'No': 1, 'Some of them':2}, inplace = True)

#Don't know  - KEEP IT CATEGORICAL (but if I didn't, how to manage NaN as strings?)
#ds['leave'].replace({'Yes': 0, "Don't know": 'NaN', 'No': 1}, inplace = True)
#ds['mental_vs_physical'].replace({'Yes': 0, "Don't know": 'NaN', 'No': 1}, inplace = True)
#ds['benefits'].replace({'Yes': 0, "Don't know": 'NaN', 'No': 1}, inplace = True)
#ds['wellness_program'].replace({'Yes': 0, "Don't know": 'NaN', 'No': 1}, inplace = True)
#ds['seek_help'].replace({'Yes': 0, "Don't know": 'NaN', 'No': 1}, inplace = True)

#Not sure - KEEP IT CATEGORICAL (but if I didn't, how to manage NaN as strings?)
#ds['care_options'].replace({'Yes': 0, "Not sure": 'NaN', 'No': 1}, inplace = True)

#Maybe
ds['mental_health_consequence'] = ds.mental_health_consequence.map(dict(Yes=3, No=1, Maybe=2))
ds['phys_health_consequence'] = ds.phys_health_consequence.map(dict(Yes=3, No=1, Maybe=2))
ds['mental_health_interview'] = ds.mental_health_interview.map(dict(Yes=3, No=1, Maybe=2))
ds['phys_health_interview'] = ds.phys_health_interview.map(dict(Yes=3, No=1, Maybe=2))


# STEP 2 - Categorical

In [14]:
#String: 
#Countries

#Categorical with order:
#ds['benefits'] - No, Don't know, Yes
#ds['care_options'] - No, Not sure, Yes
#ds['wellness_program'] - No, Don't know, Yes
#ds['seek_help'] - No, Don't know, Yes
#ds['mental_vs_physical'] - No, Don't know, Yes

In [15]:
ds["benefits"] = pd.Categorical(ds["benefits"], categories=["No", "Don't know", "Yes"], ordered=True)
ds["care_options"] = pd.Categorical(ds["care_options"], categories=["No", "Don't know", "Yes"], ordered=True)

ds["wellness_program"] = pd.Categorical(ds["wellness_program"], categories=["No", "Don't know", "Yes"], ordered=True)
ds["seek_help"] = pd.Categorical(ds["seek_help"], categories=["No", "Don't know", "Yes"], ordered=True)
ds["mental_vs_physical"] = pd.Categorical(ds["mental_vs_physical"], categories=["No", "Don't know", "Yes"], ordered=True)

In [16]:
#ds.info()

In [17]:
ds.to_csv('MentalHealthTech_Clean.csv')

# Step 3 - Statistical experiments

In [18]:
#first glance
corr = ds.corr()
corr

Unnamed: 0,Age,Gender,self_employed,family_history,treatment,work_interfere,no_employees,remote_work,tech_company,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,obs_consequence
Age,1.0,-0.081498,0.08695,0.009443,0.079066,-0.048429,0.119219,0.157882,-0.0623,0.043783,0.058696,0.090457,0.000965,0.013898,-0.031287,-0.061775,0.062278
Gender,-0.081498,1.0,-0.049789,0.147478,0.193688,0.094559,0.089976,-0.005467,-0.066217,0.028698,0.092558,0.080864,-0.059001,-0.090229,-0.127394,-0.13027,0.071987
self_employed,0.08695,-0.049789,1.0,-0.002479,0.01175,0.058139,-0.364521,0.309804,0.078601,0.02394,-0.066853,0.023046,0.073208,0.034685,0.08614,0.027017,0.073446
family_history,0.009443,0.147478,-0.002479,1.0,0.382545,0.272004,0.035511,0.021135,-0.047278,0.023401,0.095601,0.04621,-0.000645,0.000747,-0.070714,-0.041612,0.132097
treatment,0.079066,0.193688,0.01175,0.382545,1.0,0.485044,0.015311,0.033657,-0.026108,0.080195,0.124576,0.039736,0.072619,-0.027905,-0.064436,0.004844,0.154739
work_interfere,-0.048429,0.094559,0.058139,0.272004,0.485044,1.0,-0.072618,0.046151,-0.014996,0.10153,0.146085,0.026364,0.015934,-0.035015,-0.066839,0.072904,0.144351
no_employees,0.119219,0.089976,-0.364521,0.035511,0.015311,-0.072618,1.0,-0.218559,-0.253155,0.0325,0.106965,0.017313,-0.134532,-0.096306,-0.130473,-0.082044,0.045675
remote_work,0.157882,-0.005467,0.309804,0.021135,0.033657,0.046151,-0.218559,1.0,0.127758,-0.011288,-0.000721,0.027843,0.071181,0.020628,0.084101,0.027403,-0.045377
tech_company,-0.0623,-0.066217,0.078601,-0.047278,-0.026108,-0.014996,-0.253155,0.127758,1.0,-0.04342,-0.119813,-0.054089,0.073915,0.049913,0.08426,0.003649,-0.068367
leave,0.043783,0.028698,0.02394,0.023401,0.080195,0.10153,0.0325,-0.011288,-0.04342,1.0,0.378417,0.221406,-0.22034,-0.291403,-0.144314,-0.013211,0.197705


In [19]:
#corr.replace({1.000000: 0}, inplace = True) #just to check the higher correlations(except 1)
#corr.max()
#corr.min()

## size of sample : in treatment + in tech

In [20]:
#ds.groupby(ds['treatment']).count()
filter_ = (ds['tech_company']>0) & (ds['treatment']>0)
ds[filter_].count()

Age                          491
Gender                       491
Country                      491
self_employed                483
family_history               491
treatment                    491
work_interfere               487
no_employees                 491
remote_work                  491
tech_company                 491
benefits                     491
care_options                 390
wellness_program             491
seek_help                    491
leave                        491
mental_health_consequence    491
phys_health_consequence      491
coworkers                    491
supervisor                   491
mental_health_interview      491
phys_health_interview        491
mental_vs_physical           491
obs_consequence              491
comments                      77
dtype: int64

## size of sample: companies with MH policies

In [21]:
#ds['filter_'] = (ds['benefits'] == 'Yes') & (ds['wellness_program'] == 'Yes') & (ds['seek_help'] == 'Yes')
#ds['filter_'].value_counts()
#not enough

In [22]:
#ds['filter_'] = (ds['benefits'] == 'Yes') #& (ds['wellness_program'] == 'Yes') #& (ds['seek_help'] == 'Yes')
#ds['filter_'].value_counts()
#can work

In [23]:
#ds['filter_'] =  (ds['seek_help'] == 'Yes')
#ds['filter_'].value_counts()
#not enough

In [24]:
ds['filter_'] =  (ds['tech_company'] == 1) & (ds['benefits'] == 'Yes')
ds['filter_'].value_counts()

False    854
True     353
Name: filter_, dtype: int64