# <center>COMP 562 Final Project</center>
## <center>By: Elise Dixon</center>

    Data Exploration and Machine Learning Analysis of mental health dataset gathered by Open Sourcing Mental Illness(OSMI) through their Mental Health in Tech Survey. This survey was gathered in the year 2016 with over 1400 entries. This notebook will look into the question of whether a person will seek treatment taking into consideration different life factors.

### Setup

In [149]:
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45, rotation_mode='anchor', ha = 'right')
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    #plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

In [150]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn import preprocessing
from sklearn.preprocessing import binarize, LabelEncoder, MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

%matplotlib inline

### Load Data

In [84]:
train_df=pd.read_csv('mental-health-in-tech-2016/mental-health-in-tech-2016_cleaned.csv')
train_df.head()

Unnamed: 0,self_employed,num_employees,tech_company,role,mental_health_benefits,knowledge_of_mental_healthcare_coverage,employer_discussed_mental_health,employer_offer_resources,anonymity_protected,asking_for_leave,...,bad_response_to_mental_health_at_work,family_history,had_disorder_in_past,have_disorder,been_diagnosed,sought_treatment,interferes_with_work_if_treated_effectively,age,gender,work_remotely
0,0,26-100,1.0,,Not eligible for coverage / N/A,,No,No,I don't know,Very easy,...,No,No,Yes,No,Yes,0,Not applicable to me,39,Male,Sometimes
1,0,6-25,1.0,,No,Yes,Yes,Yes,Yes,Somewhat easy,...,No,Yes,Yes,Yes,Yes,1,Rarely,29,male,Never
2,0,6-25,1.0,,No,,No,No,I don't know,Neither easy nor difficult,...,Maybe/Not sure,No,Maybe,No,No,1,Not applicable to me,38,Male,Always
3,1,,,,,,,,,,...,No,No,Yes,Yes,Yes,1,Sometimes,43,male,Sometimes
4,0,6-25,0.0,1.0,Yes,Yes,No,No,No,Neither easy nor difficult,...,"Yes, I experienced",Yes,Yes,Yes,Yes,1,Sometimes,43,Female,Sometimes


In [85]:
features=(list(train_df))
print(len(features))
train_df.shape

37


(1433, 37)

### Clean Data
We will first remove entries that null then standardize the data

In [86]:
# look for unanswered questions -> indicates missing data
num_missing = train_df.isnull().sum().sort_values(ascending=False)
percent_missing = (train_df.isnull().sum()/train_df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([num_missing, percent_missing], axis=1, keys=['Number Missing', 'Percent Missing'])
missing_data.head(20)
print(missing_data)

                                                    Number Missing  \
negative_impact_of_revealing_diagnosis                        1289   
percentage_work_time_affected                                 1229   
role                                                          1170   
reveal_diagnosis_with_collegues                               1146   
medical_coverage                                              1146   
know_of_resources_for_help                                    1146   
productivity_affected_mental_health                           1146   
knowledge_of_mental_healthcare_coverage                        420   
discuss_mental_health_disorder_with_employer                   287   
negative_consequences_at_work                                  287   
discuss_mental_health_disorder_with_coworkers                  287   
believe_negative_consequences_of_employer_discu...             287   
asking_for_leave                                               287   
discuss_mental_healt

In [87]:
# remove significantly unanswered questions -> state and comments
count=0
not_answered=[]
for col in features:
    if(sum(pd.isnull(train_df[col]))>500):
        count=count+1
        not_answered.append(col)
print(not_answered)
train_df.drop([i for i in not_answered],axis=1,inplace=True)

['role', 'medical_coverage', 'know_of_resources_for_help', 'reveal_diagnosis_with_collegues', 'negative_impact_of_revealing_diagnosis', 'productivity_affected_mental_health', 'percentage_work_time_affected']


In [88]:
# check remaining missing data
num_missing = train_df.isnull().sum().sort_values(ascending=False)
percent_missing = (train_df.isnull().sum()/train_df.isnull().count()).sort_values(ascending=False)
missing_data_2 = pd.concat([num_missing, percent_missing], axis=1, keys=['Number Missing', 'Percent Missing'])
missing_data_2.head(20)
print(missing_data_2)

                                                    Number Missing  \
knowledge_of_mental_healthcare_coverage                        420   
negative_consequences_at_work                                  287   
employer_mental_health_serious                                 287   
num_employees                                                  287   
tech_company                                                   287   
mental_health_benefits                                         287   
employer_discussed_mental_health                               287   
employer_offer_resources                                       287   
anonymity_protected                                            287   
asking_for_leave                                               287   
discuss_mental_health_disorder_with_employer                   287   
believe_negative_consequences_of_employer_discu...             287   
discuss_mental_health_disorder_with_coworkers                  287   
discuss_mental_healt

In [89]:
# remove NaN from self_employed
count=0

for index,col in enumerate(list(train_df)):
    is_null=train_df.index[train_df[col].isnull()]
    if(len(is_null)==287):
        k=is_null
        count+=1

train_df.drop(k,inplace=True)

train_df.shape

(1146, 30)

In [90]:
# Similar to above, we will remove the common rows with null entries
count=0

for index,col in enumerate(list(train_df)):
    is_null=train_df.index[train_df[col].isnull()]
    if(len(is_null)==133):
        k=is_null
        count+=1

train_df.drop(k,inplace=True)

train_df.shape

(1013, 30)

In [91]:
# check remaining missing data
num_missing = train_df.isnull().sum().sort_values(ascending=False)
percent_missing = (train_df.isnull().sum()/train_df.isnull().count()).sort_values(ascending=False)
missing_data_3 = pd.concat([num_missing, percent_missing], axis=1, keys=['Number Missing', 'Percent Missing'])
missing_data_3.head(20)
print(missing_data_3)

                                                    Number Missing  \
bad_response_to_mental_health_at_work                           45   
gender                                                           2   
work_remotely                                                    0   
employer_mental_health_serious                                   0   
num_employees                                                    0   
tech_company                                                     0   
mental_health_benefits                                           0   
knowledge_of_mental_healthcare_coverage                          0   
employer_discussed_mental_health                                 0   
employer_offer_resources                                         0   
anonymity_protected                                              0   
asking_for_leave                                                 0   
discuss_mental_health_disorder_with_employer                     0   
believe_negative_con

In [92]:
# Last entry removable
count=0

for index,col in enumerate(list(train_df)):
    is_null=train_df.index[train_df[col].isnull()]
    if(len(is_null)==45):
        k=is_null
        count+=1

train_df.drop(k,inplace=True)

train_df.shape

(968, 30)

In [93]:
# Last entry removable
count=0

for index,col in enumerate(list(train_df)):
    is_null=train_df.index[train_df[col].isnull()]
    if(len(is_null)==2):
        k=is_null
        count+=1

train_df.drop(k,inplace=True)

train_df.shape

(966, 30)

In [94]:
#check number of null entries
train_df.isnull().sum().max()

0

In [95]:
# Look at the gender column for entry types
train_df['gender'].unique()

array(['male', 'Female', 'Male', 'M', 'female', 'm',
       'I identify as female.', 'non-binary', 'Female assigned at birth ',
       'F', 'Woman', 'man', 'Male ', 'fm', 'f', 'Cis female ',
       'Transitioned, M2F', 'Genderfluid (born female)',
       'Female or Multi-Gender Femme', 'female/woman', 'Cis male',
       'Male.', 'Androgynous', 'Male (cis)', 'Other', 'Female ',
       'nb masculine', 'Cisgender Female', 'Man', 'Sex is male',
       'none of your business', 'genderqueer', 'cis male', 'Human',
       'Genderfluid', 'Enby', 'genderqueer woman', 'female ', 'Dude',
       'woman', 'mail', 'Male/genderqueer', 'fem', 'male ',
       'Female (props for making this a freeform field, though)',
       ' Female', 'Cis Male', 'Cis-woman', 'Genderqueer', 'cisdude',
       'Genderflux demi-girl', 'cis man', 'Transgender woman'],
      dtype=object)

In [96]:
# Sort the responses to their appropriate section. Move into groups: Male, Female, Genderqueer&Other
train_df['gender'] = train_df['gender'].replace([
    'male', 'Male ', 'M', 'm', 'man', 'Cis male',
    'Male.', 'Male (cis)', 'Man', 'Sex is male',
    'cis male', 'Dude',
    'mail', 'M|', 'male ', 'Cis Male',
    'cisdude', 'cis man'], 'Male')

train_df['gender'] = train_df['gender'].replace([
    'Female', 'female', 'I identify as female.', 'female ',
    'Female assigned at birth ', 'F', 'Woman', 'fm', 'f',
    'Cis female', 'Transitioned, M2F', 'Female or Multi-Gender Femme',
    'Female ', 'woman', 'female/woman', 'Cisgender Female', 
    'mtf', 'fem', 'Female (props for making this a freeform field, though)',
    ' Female', 'Cis-woman', 'Transgender woman',
    'Cis female '], 'Female')

train_df['gender'] = train_df['gender'].replace([
    'Bigender', 'non-binary,','non-binary', 'Genderfluid (born female)',
    'Other/Transfeminine', 'Androgynous', 'male 9:1 female, roughly',
    'nb masculine', 'genderqueer', 'Human', 'Genderfluid',
    'Enby', 'genderqueer woman', 'Queer', 'Agender', 'Fluid',
    'Genderflux demi-girl', 'female-bodied; no feelings about gender',
    'non-binary', 'Male/genderqueer', 'Nonbinary', 'Other', 'none of your business',
    'Unicorn', 'human', 'Genderqueer'], 'Genderqueer/Other')

In [97]:
train_df['gender'].value_counts()

Male                 710
Female               239
Genderqueer/Other     17
Name: gender, dtype: int64

In [98]:
# Look at the age column for entry types
train_df['age'].unique()

array([ 29,  43,  42,  30,  37,  44,  28,  34,  35,  52,  32,  31,  25,
        33,  27,  36,  40,  46,  41,  45,  38,  21,  24,  26,  23,  39,
        51,  55,  22,  49,  20,  54,  47,  56,  50,  99,  57,  61,  19,
       323,  48,  62,  53,  58,   3,  66,  59,  63,  74,  70])

In [99]:
# set the anomoly ages to the mean -> 3, 99, 323
mean_age = math.floor(train_df['age'].mean())

train_df.loc[(train_df['age'] > 80), 'age'] = mean_age
train_df.loc[(train_df['age'] < 5), 'age'] = mean_age

In [100]:
train_df.head()

Unnamed: 0,self_employed,num_employees,tech_company,mental_health_benefits,knowledge_of_mental_healthcare_coverage,employer_discussed_mental_health,employer_offer_resources,anonymity_protected,asking_for_leave,discuss_mental_health_disorder_with_employer,...,bad_response_to_mental_health_at_work,family_history,had_disorder_in_past,have_disorder,been_diagnosed,sought_treatment,interferes_with_work_if_treated_effectively,age,gender,work_remotely
1,0,6-25,1.0,No,Yes,Yes,Yes,Yes,Somewhat easy,No,...,No,Yes,Yes,Yes,Yes,1,Rarely,29,Male,Never
4,0,6-25,0.0,Yes,Yes,No,No,No,Neither easy nor difficult,Yes,...,"Yes, I experienced",Yes,Yes,Yes,Yes,1,Sometimes,43,Female,Sometimes
5,0,More than 1000,1.0,Yes,I am not sure,No,Yes,Yes,Somewhat easy,Yes,...,"Yes, I experienced",No,No,Yes,No,1,Not applicable to me,42,Male,Sometimes
6,0,26-100,1.0,I don't know,No,No,No,I don't know,Somewhat easy,No,...,No,No,No,No,No,0,Not applicable to me,30,Male,Sometimes
7,0,More than 1000,1.0,Yes,Yes,No,Yes,Yes,Very easy,No,...,"Yes, I observed",Yes,Yes,Yes,Yes,1,Sometimes,37,Female,Always


### Encode Data

In [101]:
dictionary = {}
for feature in train_df:
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_df[feature])
    encoder_dict = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    train_df[feature] = encoder.transform(train_df[feature])
    label = [*encoder_dict]
    dictionary[feature] = label
    
for key, value in dictionary.items():     
    print(key, value)

self_employed [0]
believe_negative_consequences_of_employer_discussion ['No', 'Yes', 'Maybe']
discuss_mental_health_disorder_with_coworkers ['No', 'Yes', 'Maybe']
asking_for_leave ['Somewhat difficult', 'Somewhat easy', "I don't know", 'Very easy', 'Very difficult', 'Neither easy nor difficult']
gender ['Male', 'Female', 'Genderqueer/Other']
discuss_mental_health_disorder_with_supervisor ['No', 'Yes', 'Maybe']
mental_health_benefits ['No', 'Yes', 'Not eligible for coverage / N/A', "I don't know"]
sought_treatment [0, 1]
hurt_career ['No, it has not', 'Yes, I think it would', 'Maybe', 'Yes, it has', "No, I don't think it would"]
interferes_with_work_if_treated_effectively ['Not applicable to me', 'Often', 'Sometimes', 'Rarely', 'Never']
bad_response_to_mental_health_at_work ['No', 'Yes, I experienced', 'Yes, I observed', 'Maybe/Not sure']
knowledge_of_mental_healthcare_coverage ['No', 'I am not sure', 'Yes']
work_remotely ['Always', 'Sometimes', 'Never']
num_employees ['100-500', '26-10

In [102]:
train_df.head()

Unnamed: 0,self_employed,num_employees,tech_company,mental_health_benefits,knowledge_of_mental_healthcare_coverage,employer_discussed_mental_health,employer_offer_resources,anonymity_protected,asking_for_leave,discuss_mental_health_disorder_with_employer,...,bad_response_to_mental_health_at_work,family_history,had_disorder_in_past,have_disorder,been_diagnosed,sought_treatment,interferes_with_work_if_treated_effectively,age,gender,work_remotely
1,0,4,1,1,2,2,2,2,3,1,...,1,2,2,2,1,1,3,10,2,1
4,0,4,0,3,2,1,1,1,1,2,...,2,2,2,2,1,1,4,24,0,2
5,0,5,1,3,0,1,2,2,3,2,...,2,1,1,2,0,1,1,23,2,2
6,0,2,1,0,1,1,1,0,3,1,...,1,1,1,1,0,0,1,11,2,2
7,0,5,1,3,2,1,2,2,5,1,...,3,2,2,2,1,1,4,18,0,0


### Scale Data

In [103]:
scaler = MinMaxScaler()
for i in range(len(list(train_df))):
    train_df[(list(train_df)[i])] = scaler.fit_transform(train_df[[(list(train_df)[i])]])

train_df.head()

Unnamed: 0,self_employed,num_employees,tech_company,mental_health_benefits,knowledge_of_mental_healthcare_coverage,employer_discussed_mental_health,employer_offer_resources,anonymity_protected,asking_for_leave,discuss_mental_health_disorder_with_employer,...,bad_response_to_mental_health_at_work,family_history,had_disorder_in_past,have_disorder,been_diagnosed,sought_treatment,interferes_with_work_if_treated_effectively,age,gender,work_remotely
1,0.0,0.8,1.0,0.333333,1.0,1.0,1.0,1.0,0.6,0.5,...,0.333333,1.0,1.0,1.0,1.0,1.0,0.75,0.217391,1.0,0.5
4,0.0,0.8,0.0,1.0,1.0,0.5,0.5,0.5,0.2,1.0,...,0.666667,1.0,1.0,1.0,1.0,1.0,1.0,0.521739,0.0,1.0
5,0.0,1.0,1.0,1.0,0.0,0.5,1.0,1.0,0.6,1.0,...,0.666667,0.5,0.5,1.0,0.0,1.0,0.25,0.5,1.0,1.0
6,0.0,0.4,1.0,0.0,0.5,0.5,0.5,0.0,0.6,0.5,...,0.333333,0.5,0.5,0.5,0.0,0.0,0.25,0.23913,1.0,1.0
7,0.0,1.0,1.0,1.0,1.0,0.5,1.0,1.0,1.0,0.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.391304,0.0,0.0


### Models

#### Random Forest Tree

In [104]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [105]:
target=train_df['sought_treatment']
train_df.drop(['sought_treatment'],axis=1,inplace=True)

In [113]:
model = RandomForestClassifier(max_depth=14, random_state=0)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=14, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [114]:
model_predict = model.predict(X_test)

In [115]:
X=train_df
X_train, X_test, y_train, y_test = train_test_split(X,target, test_size=0.2, random_state=0)

In [145]:
c_matrix = confusion_matrix(y_test, model_predict)
                            
false_p = c_matrix.sum(axis=0) - np.diag(c_matrix)  
false_n = c_matrix.sum(axis=1) - np.diag(c_matrix)
true_p = np.diag(c_matrix)
true_n = sum(c_matrix) - (FP + FN + TP)

true_positive_rate = true_p/(true_p+false_n)
true_negative_rate = true_n/(true_n+false_p) 
false_positive_rate = false_p/(false_p+TN)
false_negative_rate  = false_n/(true_p+false_n)

In [146]:
print("======= Confusion Matrix =======")
print(confusion_matrix(y_test, model_predict), '\n')
print("======= Classification Report =======")
print(classification_report(y_test, model_predict), '\n')
print("======= Accuracy =======")
print('True Positive Rate: ', true_positive_rate)
print('False Positive Rate: ', false_positive_rate)
print('True Negative Rate: ', true_negative_rate)
print('False Negative Rate: ', false_negative_rate )
print('Mean Accuracy: ', model.score(X_test,y_test))

[[ 52  14]
 [ 11 117]] 

              precision    recall  f1-score   support

         0.0       0.83      0.79      0.81        66
         1.0       0.89      0.91      0.90       128

    accuracy                           0.87       194
   macro avg       0.86      0.85      0.85       194
weighted avg       0.87      0.87      0.87       194
 

True Positive Rate:  [0.78787879 0.9140625 ]
False Positive Rate:  [-3.66666667  4.66666667]
True Negative Rate:  [ 4.66666667 -3.66666667]
False Negative Rate:  [0.21212121 0.0859375 ]
Mean Accuracy:  0.8711340206185567
