# Data analysis

In [2]:
# import pandas, scipy, numpy, matplotlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats

## Data Import

In [5]:
# Load the survey CSV into a DataFrame; the path is relative to the notebook's working directory.
# NOTE(review): the variable name `os` would shadow the standard-library `os` module if it is
# ever imported in this notebook -- consider renaming it (e.g. `survey`) across all cells.
os = pd.read_csv('OSMI_Survey_Data.csv')

In [6]:
# (number of rows, number of columns) of the loaded frame
os.shape

(60186, 28)

In [8]:
# Print each column's name, non-null count and dtype to spot missing data early
os.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60186 entries, 0 to 60185
Data columns (total 28 columns):
 #   Column                                                                                      Non-Null Count  Dtype  
---  ------                                                                                      --------------  -----  
 0   index                                                                                       60186 non-null  int64  
 1   ResponseID                                                                                  60186 non-null  object 
 2   Are you selfemployed                                                                        60186 non-null  bool   
 3   How many employees does your company or organization have                                   48132 non-null  object 
 4   Is your employer primarily a tech companyorganization                                       48132 non-null  object 
 5   Is your primary role within your compan

In [14]:
# Replace the long survey-question headers with short, analysis-friendly names.
# The assignment is positional: one name per existing column, in the original column order.
os.columns = [
    # identifiers
    'index',
    'ResponseID',
    # employment
    "Self_employed",
    "Size_company",
    "Tech_company",
    "TechIT_Job",
    "Previous_employer",
    # mental-health history
    "Family_hist",
    "Past_MentalHealth",
    "Current_MentalHealth",
    "Yes_Current_Conditions",
    "Maybe_Believed_Conditions",
    "Med_Prof_Diagnose",
    "Diagnosed_Conditions",
    "Sought_Treatment",
    # demographics and location
    "Age",
    "Gender",
    "Age_Group",
    "Country_live",
    "US_State/territory_live",
    "Country_work",
    "US_State/territory_work",
    "Work_description",
    "Remotely",
    # survey question / answer fields
    "Question Group",
    "Speak_openly",
    "Question",
    "Response",
]

In [19]:
# Display the column labels to confirm the rename took effect
os.columns

Index(['index', 'ResponseID', 'Self_employed', 'Size_company', 'Tech_company',
       'TechIT_Job', 'Previous_employer', 'Family_hist', 'Past_MentalHealth',
       'Current_MentalHealth', 'Yes_Current_Conditions',
       'Maybe_Believed_Conditions', 'Med_Prof_Diagnose',
       'Diagnosed_Conditions', 'Sought_Treatment', 'Age', 'Gender',
       'Age_Group', 'Country_live', 'US_State/territory_live', 'Country_work',
       'US_State/territory_work', 'Work_description', 'Remotely',
       'Question Group', 'Speak_openly', 'Question', 'Response'],
      dtype='object')

In [20]:
# Preview the first five rows of the renamed frame
os.head(n=5)

Unnamed: 0,index,ResponseID,Self_employed,Size_company,Tech_company,TechIT_Job,Previous_employer,Family_hist,Past_MentalHealth,Current_MentalHealth,...,Country_live,US_State/territory_live,Country_work,US_State/territory_work,Work_description,Remotely,Question Group,Speak_openly,Question,Response
0,0,r00000,False,26-100,True,,True,No,Yes,No,...,United Kingdom,,United Kingdom,,Back-end Developer,Sometimes,Resources for employees with mental health dis...,No,Does your employer provide mental health benef...,Not eligible for coverage / N/A
1,1,r00000,False,26-100,True,,True,No,Yes,No,...,United Kingdom,,United Kingdom,,Back-end Developer,Sometimes,Resources for employees with mental health dis...,No,Do you know the options for mental health care...,
2,2,r00000,False,26-100,True,,True,No,Yes,No,...,United Kingdom,,United Kingdom,,Back-end Developer,Sometimes,Safe and supportive workplce for those with me...,No,Has your employer ever formally discussed ment...,No
3,3,r00000,False,26-100,True,,True,No,Yes,No,...,United Kingdom,,United Kingdom,,Back-end Developer,Sometimes,Resources for employees with mental health dis...,No,Does your employer offer resources to learn mo...,No
4,4,r00000,False,26-100,True,,True,No,Yes,No,...,United Kingdom,,United Kingdom,,Back-end Developer,Sometimes,Resources for employees with mental health dis...,No,Is your anonymity protected if you choose to t...,I don't know


## Categorical data

In [17]:
# Preview the first 5 values of Med_Prof_Diagnose
# ("Have you ever been diagnosed with a mental health disorder by a medical professional?")
os['Med_Prof_Diagnose'].head()

0    True
1    True
2    True
3    True
4    True
Name: Med_Prof_Diagnose, dtype: bool

- What are the two categories and the counts of these for medical professional diagnosis of a mental health disorder?

In [18]:
# Count how many rows fall into each distinct value of Med_Prof_Diagnose.
# NOTE(review): the head() preview shows the same ResponseID on consecutive rows (one row per
# survey question), so these look like row counts rather than respondent counts -- TODO confirm,
# and de-duplicate on ResponseID first if per-respondent counts are intended.
os['Med_Prof_Diagnose'].value_counts()

Med_Prof_Diagnose
False    30114
True     30072
Name: count, dtype: int64

- Here are the percentages

In [22]:
# Share of rows in each diagnosis category, expressed as a percentage:
# normalize=True yields proportions, which are then scaled by 100.
os['Med_Prof_Diagnose'].value_counts(normalize=True).mul(100)

Med_Prof_Diagnose
False    50.034892
True     49.965108
Name: proportion, dtype: float64

- Look at the counts for tech company vs non tech

In [24]:
# Count rows per Tech_company value.
# value_counts() skips missing values by default, so rows with no Tech_company answer
# are not included in these totals.
os['Tech_company'].value_counts()

Tech_company
True     37086
False    11046
Name: count, dtype: int64

In [25]:
# Percentage of rows per Tech_company value; missing answers are excluded by
# value_counts(), so the percentages are relative to the non-null rows only.
os['Tech_company'].value_counts(normalize=True).mul(100)

Tech_company
True     77.050611
False    22.949389
Name: proportion, dtype: float64

- Now we will compare the counts of diagnosis related to whether or not they work for a tech company

In [34]:
# Contingency table: rows are Tech_company values, columns are Med_Prof_Diagnose values,
# and each cell holds the number of rows with that combination.
pd.crosstab(index=os['Tech_company'], columns=os['Med_Prof_Diagnose'])


Med_Prof_Diagnose,False,True
Tech_company,Unnamed: 1_level_1,Unnamed: 2_level_1
False,5460,5586
True,18858,18228


- Here is another comparison of two categorical variables: Country you work in and Mental Health diagnosis

In [38]:
# Count rows per country of work, most frequent first
os['Country_work'].value_counts()

Country_work
United States of America    35742
United Kingdom               7686
Canada                       3108
Germany                      2436
Netherlands                  1974
Australia                    1428
Sweden                        840
Ireland                       630
France                        588
Switzerland                   420
Brazil                        420
India                         378
Russia                        378
New Zealand                   378
Denmark                       294
Bulgaria                      294
Finland                       294
Belgium                       210
South Africa                  168
Poland                        168
Austria                       168
Czech Republic                126
Italy                         126
Chile                         126
Norway                        126
Romania                       126
Spain                         126
Other                          84
Bosnia and Herzegovina         84
A

In [39]:
# Contingency table of country of work (rows) against Tech_company (columns);
# each cell is the number of rows with that combination.
pd.crosstab(index=os['Country_work'], columns=os['Tech_company'])

Tech_company,False,True
Country_work,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0,42
Argentina,0,42
Australia,210,882
Austria,0,42
Bangladesh,0,42
Belgium,0,126
Bosnia and Herzegovina,42,42
Brazil,168,210
Bulgaria,0,252
Canada,378,2184


- Here is a frequency table of mental health diagnosis in the different countries

In [40]:
# Contingency table of country of work (rows) against professional diagnosis (columns);
# each cell is the number of rows with that combination.
pd.crosstab(index=os['Country_work'], columns=os['Med_Prof_Diagnose'])

Med_Prof_Diagnose,False,True
Country_work,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0,84
Argentina,42,0
Australia,462,966
Austria,42,126
Bangladesh,0,42
Belgium,126,84
Bosnia and Herzegovina,84,0
Brazil,252,168
Brunei,42,0
Bulgaria,252,42


- This table also lists the countries and the diagnosis but limits it to only participants who work at tech companies

In [41]:
# Same country-by-diagnosis layout, but each cell now sums the Tech_company flag for the
# rows in that cell, i.e. it counts the rows where Tech_company is True.
# The rows are not filtered: a cell shows NaN when no row has that country/diagnosis
# combination, and 0.0 when such rows exist but none is flagged as a tech company.
pd.crosstab(
    index=os['Country_work'],
    columns=os['Med_Prof_Diagnose'],
    values=os['Tech_company'],
    aggfunc='sum',
)

Med_Prof_Diagnose,False,True
Country_work,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,,42.0
Argentina,42.0,
Australia,294.0,588.0
Austria,0.0,42.0
Bangladesh,,42.0
Belgium,84.0,42.0
Bosnia and Herzegovina,42.0,
Brazil,168.0,42.0
Brunei,0.0,
Bulgaria,210.0,42.0


## Comparative numerical analysis

- First, here are some statistics on age

In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for Age.
# NOTE(review): the extremes (min 15, max 99) may be data-entry outliers -- TODO confirm
# before relying on the mean and standard deviation.
os['Age'].describe()

count    60102.000000
mean        34.106219
std          8.283055
min         15.000000
25%         28.000000
50%         33.000000
75%         39.000000
max         99.000000
Name: Age, dtype: float64

- Next we will compare age and professional mental health diagnosis

In [36]:
# Average age within each Med_Prof_Diagnose group (False / True)
os['Age'].groupby(os['Med_Prof_Diagnose']).mean()

Med_Prof_Diagnose
False    33.537709
True     34.675524
Name: Age, dtype: float64

- The mean ages look about the same, so age and diagnosis may not be associated

## Numerical data analysis

- Since my dataset has only one numerical column, I cannot compare two numerical variables