In [131]:
#Importing required libraries
import pandas as pd
import numpy as np
import hvplot.pandas
import hvplot.dask
from pathlib import Path
from collections import Counter
import matplotlib as mpl
from matplotlib import style
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [132]:
# Load the dataset

df = pd.read_csv(Path('./Resources/Mental_Health_Data.csv'))
df.head(10)

Unnamed: 0,ID,AGE,GENDER,COUNTRY_WORKING_IN,WFH,MENTAL_HEALTH_COVERAGE,TECH_COMPANY,MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE,EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES,MEDICAL_COVERAGE_INCLUDING_MENTAL_HEALTH_TREATMENT,PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS,AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS,PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES,MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER,CURRENT_MENTAL_HEALTH_DISORDER
0,0,39,Male,United Kingdom,Sometimes,Not eligible for coverage,1.0,,No,,"No, none did",N/A (not currently aware),None did,Maybe,No
1,1,29,male,United States of America,Never,No,1.0,Yes,Yes,,"Yes, they all did",I was aware of some,Some did,"No, I don't think it would",Yes
2,2,38,Male,United Kingdom,Always,No,1.0,,No,,"No, none did",N/A (not currently aware),Some did,Maybe,No
3,3,43,male,United Kingdom,Sometimes,,,,,1.0,Some did,N/A (not currently aware),None did,"Yes, I think it would",Yes
4,4,43,Female,United States of America,Sometimes,Yes,0.0,Yes,No,,I don't know,N/A (not currently aware),None did,"Yes, I think it would",Yes
5,5,42,Male,United Kingdom,Sometimes,Yes,1.0,I am not sure,Yes,,"No, none did","Yes, I was aware of all of them",None did,"Yes, I think it would",Yes
6,6,30,M,United States of America,Sometimes,I don't know,1.0,No,No,,Some did,I was aware of some,Some did,"Yes, I think it would",No
7,7,37,female,United States of America,Always,Yes,1.0,Yes,Yes,,Some did,I was aware of some,Some did,Maybe,Yes
8,8,44,Female,United States of America,Sometimes,I don't know,0.0,No,No,,I don't know,N/A (not currently aware),None did,Maybe,Yes
9,9,30,Male,United States of America,Always,,,,,1.0,Some did,I was aware of some,None did,Maybe,Yes


In [133]:
# Basic Information

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1433 entries, 0 to 1432
Data columns (total 15 columns):
 #   Column                                                 Non-Null Count  Dtype  
---  ------                                                 --------------  -----  
 0   ID                                                     1433 non-null   int64  
 1   AGE                                                    1433 non-null   int64  
 2   GENDER                                                 1430 non-null   object 
 3   COUNTRY_WORKING_IN                                     1433 non-null   object 
 4   WFH                                                    1433 non-null   object 
 5   MENTAL_HEALTH_COVERAGE                                 1146 non-null   object 
 6   TECH_COMPANY                                           1146 non-null   float64
 7   MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE                   1013 non-null   object 
 8   EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES          

In [134]:
# Describe the data

df.describe()

Unnamed: 0,ID,AGE,TECH_COMPANY,MEDICAL_COVERAGE_INCLUDING_MENTAL_HEALTH_TREATMENT
count,1433.0,1433.0,1146.0,287.0
mean,716.0,34.286113,0.770506,0.644599
std,413.81578,11.290931,0.420691,0.479471
min,0.0,3.0,0.0,0.0
25%,358.0,28.0,1.0,0.0
50%,716.0,33.0,1.0,1.0
75%,1074.0,39.0,1.0,1.0
max,1432.0,323.0,1.0,1.0


In [135]:
# Check for duplicates

df.duplicated().sum()

0

In [136]:
df.shape

(1433, 15)

In [137]:
df.columns

Index(['ID', 'AGE', 'GENDER', 'COUNTRY_WORKING_IN', 'WFH',
       'MENTAL_HEALTH_COVERAGE', 'TECH_COMPANY',
       'MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE',
       'EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES',
       'MEDICAL_COVERAGE_INCLUDING_MENTAL_HEALTH_TREATMENT',
       'PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS',
       'AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS',
       'PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES',
       'MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER',
       'CURRENT_MENTAL_HEALTH_DISORDER'],
      dtype='object')

In [138]:
# Find null values

df.isnull().sum()

ID                                                          0
AGE                                                         0
GENDER                                                      3
COUNTRY_WORKING_IN                                          0
WFH                                                         0
MENTAL_HEALTH_COVERAGE                                    287
TECH_COMPANY                                              287
MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE                      420
EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES                    287
MEDICAL_COVERAGE_INCLUDING_MENTAL_HEALTH_TREATMENT       1146
PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS             169
AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS     169
PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES               169
MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER                     0
CURRENT_MENTAL_HEALTH_DISORDER                              0
dtype: int64

In [139]:
# Drop MEDICAL_COVERAGE_INCLUDING_MENTAL_HEALTH_TREATMENT column

df.drop(['MEDICAL_COVERAGE_INCLUDING_MENTAL_HEALTH_TREATMENT'], axis=1, inplace=True)

In [140]:
# Replace null values in Gender column
df["GENDER"].fillna("NA", inplace = True)
df

Unnamed: 0,ID,AGE,GENDER,COUNTRY_WORKING_IN,WFH,MENTAL_HEALTH_COVERAGE,TECH_COMPANY,MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE,EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES,PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS,AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS,PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES,MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER,CURRENT_MENTAL_HEALTH_DISORDER
0,0,39,Male,United Kingdom,Sometimes,Not eligible for coverage,1.0,,No,"No, none did",N/A (not currently aware),None did,Maybe,No
1,1,29,male,United States of America,Never,No,1.0,Yes,Yes,"Yes, they all did",I was aware of some,Some did,"No, I don't think it would",Yes
2,2,38,Male,United Kingdom,Always,No,1.0,,No,"No, none did",N/A (not currently aware),Some did,Maybe,No
3,3,43,male,United Kingdom,Sometimes,,,,,Some did,N/A (not currently aware),None did,"Yes, I think it would",Yes
4,4,43,Female,United States of America,Sometimes,Yes,0.0,Yes,No,I don't know,N/A (not currently aware),None did,"Yes, I think it would",Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1428,1428,34,Female,United States of America,Sometimes,,,,,"Yes, they all did",I was aware of some,Some did,Maybe,No
1429,1429,56,MALE,Afghanistan,Sometimes,,,,,,,,"No, it has not",No
1430,1430,52,Male,United States of America,Sometimes,Yes,1.0,Yes,Yes,Some did,I was aware of some,Some did,"Yes, it has",Maybe
1431,1431,30,Female,United States of America,Sometimes,I don't know,0.0,I am not sure,Yes,"No, none did",N/A (not currently aware),None did,"No, I don't think it would",Yes


In [141]:
# Replace null values in MENTAL_HEALTH_COVERAGE column
df["MENTAL_HEALTH_COVERAGE"].fillna("NA", inplace = True)

In [142]:
# Replace null values in TECH_COMPANY column
df["TECH_COMPANY"].fillna("NA", inplace = True)

In [143]:
# Replace null values in MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE column
df["MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE"].fillna("NA", inplace = True)

In [144]:
# Replace null values in EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES  column
df["EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES"].fillna("NA", inplace = True)

In [145]:
# Replace null values in PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS column
df["PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS"].fillna("NA", inplace = True)

In [146]:
# Replace null values in AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS column
df["AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS"].fillna("NA", inplace = True)

In [147]:
# Replace null values in PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES column
df["PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES"].fillna("NA", inplace = True)

In [148]:
# Find null values
df.isnull().sum()

ID                                                       0
AGE                                                      0
GENDER                                                   0
COUNTRY_WORKING_IN                                       0
WFH                                                      0
MENTAL_HEALTH_COVERAGE                                   0
TECH_COMPANY                                             0
MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE                     0
EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES                   0
PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS            0
AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS    0
PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES              0
MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER                  0
CURRENT_MENTAL_HEALTH_DISORDER                           0
dtype: int64

In [149]:
# Check datatypes
df.dtypes

ID                                                        int64
AGE                                                       int64
GENDER                                                   object
COUNTRY_WORKING_IN                                       object
WFH                                                      object
MENTAL_HEALTH_COVERAGE                                   object
TECH_COMPANY                                             object
MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE                     object
EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES                   object
PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS            object
AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS    object
PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES              object
MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER                  object
CURRENT_MENTAL_HEALTH_DISORDER                           object
dtype: object

In [150]:
df.hvplot.line(x='MENTAL_HEALTH_COVERAGE', y='ID')

In [113]:
df.hvplot(x='MENTAL_HEALTH_COVERAGE', y=['ID', 'TECH_COMPANY', 'COUNTRY_WORKING_IN'],
         value_label='MENTAL_HEALTH_COVERAGE')

DataError: Dimensions may not reference duplicated DataFrame columns (found duplicate 'MENTAL_HEALTH_COVERAGE' columns). If you want to plot a column against itself simply declare two dimensions with the same name. 

PandasInterface expects tabular data, for more information on supported datatypes see http://holoviews.org/user_guide/Tabular_Datasets.html

In [151]:
df.hvplot.table(columns=['MENTAL_HEALTH_COVERAGE'], sortable=True, selectable=True)

In [157]:
df['MENTAL_HEALTH_COVERAGE'] = df['MENTAL_HEALTH_COVERAGE'].replace(['Yes','No','NA',"Not eligible for coverage","I don't know","Not eligible for coverage "], [1,0,0,0,0,0])

In [158]:
df.head(10)

Unnamed: 0,ID,AGE,GENDER,COUNTRY_WORKING_IN,WFH,MENTAL_HEALTH_COVERAGE,TECH_COMPANY,MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE,EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES,PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS,AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS,PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES,MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER,CURRENT_MENTAL_HEALTH_DISORDER
0,0,39,Male,United Kingdom,Sometimes,0,1.0,,No,"No, none did",N/A (not currently aware),None did,Maybe,No
1,1,29,male,United States of America,Never,0,1.0,Yes,Yes,"Yes, they all did",I was aware of some,Some did,"No, I don't think it would",Yes
2,2,38,Male,United Kingdom,Always,0,1.0,,No,"No, none did",N/A (not currently aware),Some did,Maybe,No
3,3,43,male,United Kingdom,Sometimes,0,,,,Some did,N/A (not currently aware),None did,"Yes, I think it would",Yes
4,4,43,Female,United States of America,Sometimes,1,0.0,Yes,No,I don't know,N/A (not currently aware),None did,"Yes, I think it would",Yes
5,5,42,Male,United Kingdom,Sometimes,1,1.0,I am not sure,Yes,"No, none did","Yes, I was aware of all of them",None did,"Yes, I think it would",Yes
6,6,30,M,United States of America,Sometimes,0,1.0,No,No,Some did,I was aware of some,Some did,"Yes, I think it would",No
7,7,37,female,United States of America,Always,1,1.0,Yes,Yes,Some did,I was aware of some,Some did,Maybe,Yes
8,8,44,Female,United States of America,Sometimes,0,0.0,No,No,I don't know,N/A (not currently aware),None did,Maybe,Yes
9,9,30,Male,United States of America,Always,0,,,,Some did,I was aware of some,None did,Maybe,Yes


In [159]:
df.hvplot.table(columns=['MENTAL_HEALTH_COVERAGE'], sortable=True, selectable=True)

In [160]:
df.dtypes

ID                                                        int64
AGE                                                       int64
GENDER                                                   object
COUNTRY_WORKING_IN                                       object
WFH                                                      object
MENTAL_HEALTH_COVERAGE                                    int64
TECH_COMPANY                                             object
MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE                     object
EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES                   object
PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS            object
AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS    object
PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES              object
MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER                  object
CURRENT_MENTAL_HEALTH_DISORDER                           object
dtype: object

In [161]:
application_cat = df.dtypes[df.dtypes == "object"].index.tolist()
application_cat

['GENDER',
 'COUNTRY_WORKING_IN',
 'WFH',
 'TECH_COMPANY',
 'MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE',
 'EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES',
 'PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS',
 'AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS',
 'PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES',
 'MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER',
 'CURRENT_MENTAL_HEALTH_DISORDER']

In [162]:
df[application_cat].head(10)

Unnamed: 0,GENDER,COUNTRY_WORKING_IN,WFH,TECH_COMPANY,MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE,EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES,PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS,AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS,PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES,MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER,CURRENT_MENTAL_HEALTH_DISORDER
0,Male,United Kingdom,Sometimes,1.0,,No,"No, none did",N/A (not currently aware),None did,Maybe,No
1,male,United States of America,Never,1.0,Yes,Yes,"Yes, they all did",I was aware of some,Some did,"No, I don't think it would",Yes
2,Male,United Kingdom,Always,1.0,,No,"No, none did",N/A (not currently aware),Some did,Maybe,No
3,male,United Kingdom,Sometimes,,,,Some did,N/A (not currently aware),None did,"Yes, I think it would",Yes
4,Female,United States of America,Sometimes,0.0,Yes,No,I don't know,N/A (not currently aware),None did,"Yes, I think it would",Yes
5,Male,United Kingdom,Sometimes,1.0,I am not sure,Yes,"No, none did","Yes, I was aware of all of them",None did,"Yes, I think it would",Yes
6,M,United States of America,Sometimes,1.0,No,No,Some did,I was aware of some,Some did,"Yes, I think it would",No
7,female,United States of America,Always,1.0,Yes,Yes,Some did,I was aware of some,Some did,Maybe,Yes
8,Female,United States of America,Sometimes,0.0,No,No,I don't know,N/A (not currently aware),None did,Maybe,Yes
9,Male,United States of America,Always,,,,Some did,I was aware of some,None did,Maybe,Yes


In [163]:
df['TECH_COMPANY'] = df['TECH_COMPANY'].replace([1.0,0.0], ['Yes','No'])

In [164]:
df.head(10)

Unnamed: 0,ID,AGE,GENDER,COUNTRY_WORKING_IN,WFH,MENTAL_HEALTH_COVERAGE,TECH_COMPANY,MENTAL_HEALTH_OPTIONS_UNDER_COVERAGE,EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES,PREV_EMPLOYERS_PROVIDE_MENTAL_HEALTH_BENEFITS,AWARENESS_OF_MENTAL_HEALTH_COVERAGE_BY_PREV_EMPLOYERS,PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES,MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER,CURRENT_MENTAL_HEALTH_DISORDER
0,0,39,Male,United Kingdom,Sometimes,0,Yes,,No,"No, none did",N/A (not currently aware),None did,Maybe,No
1,1,29,male,United States of America,Never,0,Yes,Yes,Yes,"Yes, they all did",I was aware of some,Some did,"No, I don't think it would",Yes
2,2,38,Male,United Kingdom,Always,0,Yes,,No,"No, none did",N/A (not currently aware),Some did,Maybe,No
3,3,43,male,United Kingdom,Sometimes,0,,,,Some did,N/A (not currently aware),None did,"Yes, I think it would",Yes
4,4,43,Female,United States of America,Sometimes,1,No,Yes,No,I don't know,N/A (not currently aware),None did,"Yes, I think it would",Yes
5,5,42,Male,United Kingdom,Sometimes,1,Yes,I am not sure,Yes,"No, none did","Yes, I was aware of all of them",None did,"Yes, I think it would",Yes
6,6,30,M,United States of America,Sometimes,0,Yes,No,No,Some did,I was aware of some,Some did,"Yes, I think it would",No
7,7,37,female,United States of America,Always,1,Yes,Yes,Yes,Some did,I was aware of some,Some did,Maybe,Yes
8,8,44,Female,United States of America,Sometimes,0,No,No,No,I don't know,N/A (not currently aware),None did,Maybe,Yes
9,9,30,Male,United States of America,Always,0,,,,Some did,I was aware of some,None did,Maybe,Yes


In [165]:
# Create a OneHotEncoder instance

enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list

encode_df = pd.DataFrame(enc.fit_transform(df[application_cat]))

# Add the encoded variable names to the dataframe

encode_df.columns = enc.get_feature_names_out(application_cat)
encode_df.head()

Unnamed: 0,GENDER_ Female,GENDER_AFAB,GENDER_Agender,GENDER_Androgynous,GENDER_Bigender,GENDER_Cis Male,GENDER_Cis female,GENDER_Cis male,GENDER_Cis-woman,GENDER_Cisgender Female,...,PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES_Some did,"PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES_Yes, they all did",MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER_Maybe,"MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER_No, I don't think it would","MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER_No, it has not","MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER_Yes, I think it would","MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER_Yes, it has",CURRENT_MENTAL_HEALTH_DISORDER_Maybe,CURRENT_MENTAL_HEALTH_DISORDER_No,CURRENT_MENTAL_HEALTH_DISORDER_Yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [169]:
application_df = df.merge(encode_df,left_index=True, right_index=True)
application_df = application_df.drop(application_cat,1)
application_df.head()

Unnamed: 0,ID,AGE,MENTAL_HEALTH_COVERAGE,GENDER_ Female,GENDER_AFAB,GENDER_Agender,GENDER_Androgynous,GENDER_Bigender,GENDER_Cis Male,GENDER_Cis female,...,PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES_Some did,"PREV_EMPLOYER_OFFER_MENTAL_HEALTH_RESOURCES_Yes, they all did",MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER_Maybe,"MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER_No, I don't think it would","MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER_No, it has not","MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER_Yes, I think it would","MENTAL_HEALTH_IDENTITY_HURT_YOUR_CAREER_Yes, it has",CURRENT_MENTAL_HEALTH_DISORDER_Maybe,CURRENT_MENTAL_HEALTH_DISORDER_No,CURRENT_MENTAL_HEALTH_DISORDER_Yes
0,0,39,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,29,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,38,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3,43,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,4,43,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [175]:
# Create our features

X = application_df.drop(columns="MENTAL_HEALTH_COVERAGE").values

# Create our target
y =application_df['MENTAL_HEALTH_COVERAGE'].values

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42, stratify=y)

Counter(y_train)

Counter({1: 398, 0: 676})

In [176]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [177]:
# Fitting the model

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
   max_iter=200,
   random_state=1)

classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [178]:
# Making predictions using the testing data.

y_pred = classifier.predict(X_test)

In [180]:
# Calculated the balanced accuracy score

from sklearn.metrics import accuracy_score

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, y_pred)
acc_score

0.8328690807799443

In [182]:
# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,204,22
Actual 1,38,95


In [185]:
# Print the imbalanced classification report

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       226
           1       0.81      0.71      0.76       133

    accuracy                           0.83       359
   macro avg       0.83      0.81      0.82       359
weighted avg       0.83      0.83      0.83       359

