<a href="https://colab.research.google.com/github/digital0923RJ/Data-Science---Kaggle-Project/blob/main/Exploring_Mental_Health.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
!pip install catboost



In [67]:
!pip install optuna



In [68]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import optuna
from optuna.samplers import TPESampler

In [69]:
# Progress Tracking
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook

In [70]:
!pip install category_encoders



In [71]:
# Feature Engineering and Processing
from sklearn.preprocessing import (
    LabelEncoder,
    StandardScaler,
    MinMaxScaler,
    PowerTransformer,
    FunctionTransformer,
    RobustScaler
)
from category_encoders import (
    OneHotEncoder,
    OrdinalEncoder,
    CountEncoder,
    CatBoostEncoder,
    TargetEncoder
)
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [72]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [73]:
DATA_PATH = "/content/drive/MyDrive/Kaggle/Exploring Mental Health Data"

In [74]:
train = pd.read_csv(f"{DATA_PATH}/train.csv") # 학습데이터
test = pd.read_csv(f"{DATA_PATH}/test.csv") # 테스트 데이터
ori = pd.read_csv(f"{DATA_PATH}/final_depression_dataset_1.csv")

Column Descriptions
*   Name: Identifier for participants (anonymized)
*   Gender: Participant's gender identity
*   Age: Participant's age
*   City: Location of residence
*   Working Professional or Student: Current occupation category
*   Profession: Specific profession/field of work
*   Degree: Educational qualification
*   CGPA: Academic performance measure
*   Academic Pressure: Level of pressure from academic responsibilities
*   Work Pressure: Level of pressure from work responsibilities
*   Study Satisfaction: Level of satisfaction with studies
*   Job Satisfaction: Level of satisfaction with current job
*   Work/Study Hours: Daily hours spent on work/study
*   Sleep Duration: Average daily sleep hours
*   Dietary Habits: Eating patterns and food preferences
*   Have you ever had suicidal thoughts ?: History of suicidal ideation (Yes/No)
*   Financial Stress: Level of stress related to financial situation
*   Family History of Mental Illness: Presence of mental illness in family (Yes/No)
*   Target Variable: Depression, Binary indicator (1 = Yes, 0 = No) of depression risk

Evaluation Metric

*   The model performance is evaluated using Accuracy Score
*   Accuracy = (Number of Correct Predictions) / (Total Number of Predictions)

Objectives

*   Build a machine learning model to predict depression risk based on various life factors
*   Identify key contributors to mental health challenges
*   Analyze the relationship between different life aspects (academic, professional, personal) and depression risk
*   Create a reliable predictive model for early depression risk assessment
*   Understand the impact of lifestyle factors on mental health

In [75]:
ori['Depression'] = ori['Depression'].map({"Yes":1,"No":0}) #change Yes to 1 , No to 0 - Part of binary encoding
train = pd.concat([train, ori], ignore_index=True)

In [79]:
# Drop ID columns if present
if 'id' in train.columns:
    train.drop(columns=['id'], inplace=True)
if 'id' in test.columns:
    test.drop(columns=['id'], inplace=True)

In [80]:
def display_object_columns(df):
    object_columns = df.select_dtypes(include=['object']).columns
    print("Object 타입 열들:")
    for col in object_columns:
        print(f"열 이름: {col}")
        print(df[col].head())
        print()
# 함수 호출 (train에 대해)
display_object_columns(train)
display_object_columns(test)

Object 타입 열들:
열 이름: Name
0    Aaradhya
1       Vivan
2      Yuvraj
3      Yuvraj
4        Rhea
Name: Name, dtype: object

열 이름: Gender
0    Female
1      Male
2      Male
3      Male
4    Female
Name: Gender, dtype: object

열 이름: City
0         Ludhiana
1         Varanasi
2    Visakhapatnam
3           Mumbai
4           Kanpur
Name: City, dtype: object

열 이름: Working Professional or Student
0    Working Professional
1    Working Professional
2                 Student
3    Working Professional
4    Working Professional
Name: Working Professional or Student, dtype: object

열 이름: Profession
0                Chef
1             Teacher
2                 NaN
3             Teacher
4    Business Analyst
Name: Profession, dtype: object

열 이름: Sleep Duration
0    More than 8 hours
1    Less than 5 hours
2            5-6 hours
3    Less than 5 hours
4            5-6 hours
Name: Sleep Duration, dtype: object

열 이름: Dietary Habits
0      Healthy
1    Unhealthy
2      Healthy
3     Moderate
4    Un

In [81]:
categorical_columns = train.select_dtypes(include=['object']).columns
categorical_columns

Index(['Name', 'Gender', 'City', 'Working Professional or Student',
       'Profession', 'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?',
       'Family History of Mental Illness'],
      dtype='object')

In [None]:
for columns in categorical_columns: # 범주형 열(column) 이름이 담긴 리스트임. #루프를 사용하여 리스트에 있는 각 열에 대해 반복적으로 작업을 수행함.
    plt.figure(figsize= (10, 4)) # 그림 크기
    sns.countplot(x=columns, data = train) # 빈도수 #x는 해당 범주형 열을 설정 # 데이터 가져오기
    sns.histplot(x=columns, data = train) #
    plt.title(f'Distribution of {columns}')
    plt.xticks(rotation=90)
    plt.show()

    # 빈도 계산
    counts = df_train[columns].value_counts(normalize=True) * 100  # 퍼센트로 변환
    counts = counts.reset_index()
    counts.columns = [columns, 'Percentage']

    # 그래프 그리기
    sns.barplot(x=columns, y='Percentage', data=counts)

    plt.title(f'Distribution of {columns} (Percentage)')
    plt.xticks(rotation=90)
    plt.show()

In [42]:
!pip install tensorflow



In [43]:
import tensorflow as tf
# Check for GPU availability
global device
gpus = tf.config.list_physical_devices('GPU')

In [44]:
if gpus:
    print("GPU is available")
    device = 'gpu'
    # Enable memory growth for GPU
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    print("GPU is not available")
    device = 'cpu'


GPU is not available


In [46]:
# Create copies for backup
train_copy = train.copy()
test_copy = test.copy()
original_copy = ori.copy()

In [51]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143256 entries, 0 to 143255
Data columns (total 19 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   Name                                   143256 non-null  object 
 1   Gender                                 143256 non-null  object 
 2   Age                                    143256 non-null  float64
 3   City                                   143256 non-null  object 
 4   Working Professional or Student        143256 non-null  object 
 5   Profession                             105953 non-null  object 
 6   Academic Pressure                      28399 non-null   float64
 7   Work Pressure                          114836 non-null  float64
 8   CGPA                                   28400 non-null   float64
 9   Study Satisfaction                     28399 non-null   float64
 10  Job Satisfaction                       114844 non-null  

In [63]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
scaler = StandardScaler()
Onehot = OneHotEncoder()
Label = LabelEncoder()

In [65]:
# 수치형 열과 범주형 열 구분
numeric_columns = train.select_dtypes(include=[np.number]).columns
categorical_columns = train.select_dtypes(include=['object']).columns

In [62]:
def display_object_float(df):
    float64_columns = df.select_dtypes(include=['float64']).columns
    print("float64 타입 열들:")
    for col in float64_columns:
        print(f"열 이름: {col}")
        print(df[col].head())
        print()
display_object_float(train)
display_object_float(test)

float64 타입 열들:
열 이름: Age
0    49.0
1    26.0
2    33.0
3    22.0
4    30.0
Name: Age, dtype: float64

열 이름: Academic Pressure
0    3.139829
1    3.139829
2    5.000000
3    3.139829
4    3.139829
Name: Academic Pressure, dtype: float64

열 이름: Work Pressure
0    5.000000
1    4.000000
2    2.999408
3    5.000000
4    1.000000
Name: Work Pressure, dtype: float64

열 이름: CGPA
0    7.657031
1    7.657031
2    8.970000
3    7.657031
4    7.657031
Name: CGPA, dtype: float64

열 이름: Study Satisfaction
0    2.947252
1    2.947252
2    2.000000
3    2.947252
4    2.947252
Name: Study Satisfaction, dtype: float64

열 이름: Job Satisfaction
0    2.000000
1    3.000000
2    2.975131
3    1.000000
4    1.000000
Name: Job Satisfaction, dtype: float64

열 이름: Work/Study Hours
0     1.0
1     7.0
2     3.0
3    10.0
4     9.0
Name: Work/Study Hours, dtype: float64

열 이름: Financial Stress
0    2.0
1    3.0
2    1.0
3    1.0
4    4.0
Name: Financial Stress, dtype: float64

float64 타입 열들:
열 이름: Age
0    53.0
1

In [54]:
# 결측치를 평균값으로 대체
for col in train.select_dtypes(include=['float64']).columns:
    train[col].fillna(train[col].mean(), inplace=True)

for col in test.select_dtypes(include=['float64']).columns:
    test[col].fillna(test[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mean(), inplace=True)


In [55]:
print(train.isnull().sum())
print(test.isnull().sum())

Name                                         0
Gender                                       0
Age                                          0
City                                         0
Working Professional or Student              0
Profession                               37303
Academic Pressure                            0
Work Pressure                                0
CGPA                                         0
Study Satisfaction                           0
Job Satisfaction                             0
Sleep Duration                               0
Dietary Habits                               4
Degree                                       2
Have you ever had suicidal thoughts ?        0
Work/Study Hours                             0
Financial Stress                             0
Family History of Mental Illness             0
Depression                                   0
dtype: int64
Name                                         0
Gender                                       0


Unnamed: 0,Name,Gender,Age,City,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,...,Profession_Student,Profession_Teacher,Profession_Travel Consultant,Profession_UX/UI Designer,Profession_Unemployed,Profession_Unveil,Profession_Visakhapatnam,Profession_Working Professional,Profession_Yogesh,Profession_Yuvraj
0,Aaradhya,Female,49.0,Ludhiana,3.139829,5.0,7.657031,2.947252,2.0,More than 8 hours,...,False,False,False,False,False,False,False,False,False,False
1,Vivan,Male,26.0,Varanasi,3.139829,4.0,7.657031,2.947252,3.0,Less than 5 hours,...,False,True,False,False,False,False,False,False,False,False
2,Yuvraj,Male,33.0,Visakhapatnam,5.0,2.999408,8.97,2.0,2.975131,5-6 hours,...,False,False,False,False,False,False,False,False,False,False
3,Yuvraj,Male,22.0,Mumbai,3.139829,5.0,7.657031,2.947252,1.0,Less than 5 hours,...,False,True,False,False,False,False,False,False,False,False
4,Rhea,Female,30.0,Kanpur,3.139829,1.0,7.657031,2.947252,1.0,5-6 hours,...,False,False,False,False,False,False,False,False,False,False


In [61]:
print(train.isnull().sum())
print(test.isnull().sum())

Name                                         0
Gender                                       0
Age                                          0
City                                         0
Working Professional or Student              0
Profession                               37303
Academic Pressure                            0
Work Pressure                                0
CGPA                                         0
Study Satisfaction                           0
Job Satisfaction                             0
Sleep Duration                               0
Dietary Habits                               4
Degree                                       2
Have you ever had suicidal thoughts ?        0
Work/Study Hours                             0
Financial Stress                             0
Family History of Mental Illness             0
Depression                                   0
dtype: int64
Name                                         0
Gender                                       0
