In [23]:
# Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# Machine Learning Preprocessing and Scoring Metrics
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score, roc_curve, auc
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Machine Learning Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

In [24]:
# Load the survey data from the project's data directory into a DataFrame
data_path = 'data/depression_data.csv'
df = pd.read_csv(data_path)

In the previous notebook we examined each of the null cases to ensure we can impute an appropriate value. We found that:
* Nulls in 'academic pressure' are attributed to non-students.
* Nulls in 'work pressure' are attributed to non-working professionals.
* Nulls in 'study satisfaction' are attributed to non-students.
* Nulls in 'job satisfaction' are attributed to non-working professionals.

Because of this we will impute:
* 1 for academic pressure, meaning that those that did not respond do not experience academic pressure.
* 1 for work pressure, meaning that those that did not respond do not experience work pressure.
* 3 for study satisfaction, meaning that those that did not respond will be neutral.
* 3 for job satisfaction, meaning that those that did not respond will be neutral.

In [25]:
# Normalise the column labels (strip the trailing ' ?', swap spaces for
# underscores, lower-case everything) and discard the 'name' column.
df = (
    df.rename(columns=lambda label: label.replace(' ?', '').replace(' ', '_').lower())
      .drop(columns=['name'])
)

In [26]:
# Impute values as specified above.
#
# The result is assigned back to `df` rather than calling
# `df.loc[:, col].fillna(..., inplace=True)`: an in-place fill on a selected
# column operates on an intermediate Series, which pandas flags as chained
# assignment and which no longer updates `df` once Copy-on-Write is enabled.
imputed_defaults = {
    'academic_pressure': 1,   # non-respondents treated as no academic pressure
    'work_pressure': 1,       # non-respondents treated as no work pressure
    'study_satisfaction': 3,  # non-respondents treated as neutral
    'job_satisfaction': 3,    # non-respondents treated as neutral
}
df = df.fillna(value=imputed_defaults)

# Check all values filled as expected
df.isna().sum()

gender                                    0
age                                       0
city                                      0
working_professional_or_student           0
profession                              673
academic_pressure                         0
work_pressure                             0
cgpa                                   2054
study_satisfaction                        0
job_satisfaction                          0
sleep_duration                            0
dietary_habits                            0
degree                                    0
have_you_ever_had_suicidal_thoughts       0
work/study_hours                          0
financial_stress                          0
family_history_of_mental_illness          0
depression                                0
dtype: int64

Neither Profession nor CGPA was statistically significant in our previous notebook. Both contain many null values with no clear best imputation strategy. For these reasons we will drop them from our predictive model.

In [27]:
# Remove the two columns excluded from modelling, then preview the frame
columns_to_drop = ['profession', 'cgpa']
df = df.drop(columns=columns_to_drop)
df.head()

Unnamed: 0,gender,age,city,working_professional_or_student,academic_pressure,work_pressure,study_satisfaction,job_satisfaction,sleep_duration,dietary_habits,degree,have_you_ever_had_suicidal_thoughts,work/study_hours,financial_stress,family_history_of_mental_illness,depression
0,Female,37,Ghaziabad,Working Professional,1.0,2.0,3.0,4.0,7-8 hours,Moderate,MA,No,6,2,No,No
1,Male,60,Kalyan,Working Professional,1.0,4.0,3.0,3.0,5-6 hours,Unhealthy,B.Com,Yes,0,4,Yes,No
2,Female,42,Bhopal,Working Professional,1.0,2.0,3.0,3.0,5-6 hours,Moderate,M.Com,No,0,2,No,No
3,Female,44,Thane,Working Professional,1.0,3.0,3.0,5.0,7-8 hours,Healthy,MD,Yes,1,2,Yes,No
4,Male,48,Indore,Working Professional,1.0,4.0,3.0,3.0,7-8 hours,Moderate,BE,Yes,6,5,Yes,No


At first glance it looks like we can encode:
* gender
* working_professional_or_student
* sleep_duration
* dietary_habits
* have_you_ever_had_suicidal_thoughts
* family_history_of_mental_illness
* depression

In [28]:
# Check current values of every column we intend to encode
columns_to_inspect = [
    'gender',
    'working_professional_or_student',
    'sleep_duration',
    'dietary_habits',
    'have_you_ever_had_suicidal_thoughts',
    'family_history_of_mental_illness',
    'depression',
]
for column in columns_to_inspect:
    print(f"{df[column].value_counts()}\n")

gender
Male      1333
Female    1223
Name: count, dtype: int64

working_professional_or_student
Working Professional    2054
Student                  502
Name: count, dtype: int64

sleep_duration
7-8 hours            658
Less than 5 hours    648
5-6 hours            628
More than 8 hours    622
Name: count, dtype: int64

dietary_habits
Unhealthy    882
Healthy      842
Moderate     832
Name: count, dtype: int64

have_you_ever_had_suicidal_thoughts
No     1307
Yes    1249
Name: count, dtype: int64

family_history_of_mental_illness
No     1311
Yes    1245
Name: count, dtype: int64

depression
No     2101
Yes     455
Name: count, dtype: int64



In [29]:
# Frequency of each sleep-duration category (displayed as the cell output)
sleep_duration_counts = df['sleep_duration'].value_counts()
sleep_duration_counts

sleep_duration
7-8 hours            658
Less than 5 hours    648
5-6 hours            628
More than 8 hours    622
Name: count, dtype: int64

In [30]:
# Encode for model ingestion
#
# Every column is mapped explicitly and assigned back to `df`. The previous
# `df.loc[:, col].replace(..., inplace=True)` form modified an intermediate
# Series (chained assignment), which pandas warns about and which stops
# updating `df` once Copy-on-Write is enabled; it also depended on `replace`
# implicitly downcasting object columns to integers.

yes_no_codes = {'No': 0, 'Yes': 1}

label_encodings = {
    # Gender
    'gender': {'Female': 0, 'Male': 1},
    # Working/Student
    'working_professional_or_student': {'Student': 0, 'Working Professional': 1},
    # Sleep Duration (ordered from shortest to longest)
    'sleep_duration': {
        'Less than 5 hours': 0,
        '5-6 hours': 1,
        '7-8 hours': 2,
        'More than 8 hours': 3,
    },
    # Diet (ordered from least to most healthy)
    'dietary_habits': {'Unhealthy': 0, 'Moderate': 1, 'Healthy': 2},
    # Previous thoughts of suicide
    'have_you_ever_had_suicidal_thoughts': yes_no_codes,
    # Family History of Mental Illness
    'family_history_of_mental_illness': yes_no_codes,
    # Depression
    'depression': yes_no_codes,
}

for column, codes in label_encodings.items():
    # Already numeric means the cell was run before; skipping keeps it re-runnable.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue

    encoded = df[column].map(codes)

    # `map` yields NaN for any label missing from `codes`; fail loudly rather
    # than leave a partially encoded column for the models to choke on.
    unexpected = df.loc[encoded.isna(), column].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unmapped values in '{column}': {list(unexpected)}")

    df[column] = encoded.astype('int64')

# Check
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 16 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   gender                               2556 non-null   int64  
 1   age                                  2556 non-null   int64  
 2   city                                 2556 non-null   object 
 3   working_professional_or_student      2556 non-null   int64  
 4   academic_pressure                    2556 non-null   float64
 5   work_pressure                        2556 non-null   float64
 6   study_satisfaction                   2556 non-null   float64
 7   job_satisfaction                     2556 non-null   float64
 8   sleep_duration                       2556 non-null   int64  
 9   dietary_habits                       2556 non-null   int64  
 10  degree                               2556 non-null   object 
 11  have_you_ever_had_suicidal_tho

In [None]:
# Isolate categorical columns
cat_cols = ['city', 'degree']

# One Hot Encode categorical columns (first level of each column is dropped)
# NOTE(review): the encoder is fitted on every row before the train/test
# split; consider fitting it on the training rows only — confirm intent.
cat_df = df[cat_cols]
ohe = OneHotEncoder(drop='first', dtype=int, sparse_output=False)
ohe_data = ohe.fit_transform(cat_df)

# Reuse df's index so the encoded rows stay aligned with their source rows.
# A default RangeIndex would only line up by coincidence, and an index merge
# would silently drop rows if df's index were ever not 0..n-1.
ohe_df = pd.DataFrame(
    ohe_data,
    columns=ohe.get_feature_names_out(cat_df.columns),
    index=df.index,
)

# Merge encoded data frames together (column-wise, aligned on the shared index)
full_df = pd.concat([df.drop(columns=cat_cols), ohe_df], axis=1)

# Sanity check: encoding must not add or remove rows
assert len(full_df) == len(df), "row count changed while one-hot encoding"

In [54]:
# Separate the target from the predictor columns
target_column = 'depression'
y = full_df[target_column]
X = full_df.drop(columns=target_column)

# Hold out the rows not used for training (80% train); seed fixed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    random_state=50,
)