## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
from sklearn.metrics import accuracy_score
import xgboost as xgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
import optuna

import warnings
warnings.filterwarnings('ignore')

## Read Data

In [2]:
path = os.getcwd() # Assuming the data and the notebook have the same path
X_train = pd.read_csv(path + '/train.csv')
X_test = pd.read_csv(path + '/test.csv')
original = pd.read_csv(path + '/original.csv')

# Replace spaces by underscores in the columns' titles
X_train.columns = X_train.columns.str.replace(' ','_')
original.columns = original.columns.str.replace(' ','_')
X_test.columns = X_test.columns.str.replace(' ','_')

id_test = X_test.index

In [3]:
# Drop id column
X_test = X_test.drop(['id'], axis=1)
X_train = X_train.drop(['id'], axis=1)

In [4]:
original['Depression'] = original['Depression'].map({'Yes': 1, 'No': 0})

In [5]:
X_train = pd.concat([X_train,original])

## EDA

In [6]:
# Dataset dimensions
X_train.shape

(143256, 19)

In [7]:
# Data types of each column
X_train.dtypes

Name                                      object
Gender                                    object
Age                                      float64
City                                      object
Working_Professional_or_Student           object
Profession                                object
Academic_Pressure                        float64
Work_Pressure                            float64
CGPA                                     float64
Study_Satisfaction                       float64
Job_Satisfaction                         float64
Sleep_Duration                            object
Dietary_Habits                            object
Degree                                    object
Have_you_ever_had_suicidal_thoughts_?     object
Work/Study_Hours                         float64
Financial_Stress                         float64
Family_History_of_Mental_Illness          object
Depression                                 int64
dtype: object

In [8]:
print(f'% of depressed individuals: {100*round(X_train[X_train['Depression']==1]['Depression'].sum()/len(X_train['Depression']),4)}')
X_train['Depression'].value_counts()

% of depressed individuals: 18.16


Depression
0    117234
1     26022
Name: count, dtype: int64

In [9]:
# report = ProfileReport(X_train)
# report

## Preprocessing

### Duplicated Values

In [10]:
# Nuber of duplicated entries
X_train[X_train.duplicated()].shape[0]

0

### NaN values

In [11]:
# Count of nan values per column
X_train.isna().sum()

Name                                          0
Gender                                        0
Age                                           0
City                                          0
Working_Professional_or_Student               0
Profession                                37303
Academic_Pressure                        114857
Work_Pressure                             28420
CGPA                                     114856
Study_Satisfaction                       114857
Job_Satisfaction                          28412
Sleep_Duration                                0
Dietary_Habits                                4
Degree                                        2
Have_you_ever_had_suicidal_thoughts_?         0
Work/Study_Hours                              0
Financial_Stress                              4
Family_History_of_Mental_Illness              0
Depression                                    0
dtype: int64

##### Profession

There are students with professions, and we can see right away a high rate of depression for this cases (82% compared with 18% for the whole dataset). However, based on column 'Degree', this 'students' already have some form of education:

In [12]:
print(X_train[(X_train['Working_Professional_or_Student']=='Student')&(~X_train['Profession'].isna())]['Depression'].value_counts())
print(X_train[(X_train['Working_Professional_or_Student']=='Student')&(~X_train['Profession'].isna())]['Degree'].value_counts())

Depression
1    28
0     6
Name: count, dtype: int64
Degree
MSc        4
B.Ed       4
MD         4
MA         3
BHM        3
BSc        3
MCA        2
BBA        1
M.Com      1
B.Arch     1
MBBS       1
BCA        1
LLM        1
BA         1
M.Pharm    1
BE         1
PhD        1
B.Pharm    1
Name: count, dtype: int64


Profession NaN values be labeled 'Unemployed' if they are professionals without jobs or 'Student' if they are pursuing some degree

In [13]:
X_train['Profession'] = X_train.apply(
    lambda row: 'Unemployed' if row['Working_Professional_or_Student'] == 'Working Professional' and pd.isna(row['Profession'])
    else 'Student' if row['Working_Professional_or_Student'] == 'Student' and pd.isna(row['Profession'])
    else row['Profession'],
    axis=1
)

X_test['Profession'] = X_test.apply(
    lambda row: 'Unemployed' if row['Working_Professional_or_Student'] == 'Working Professional' and pd.isna(row['Profession'])
    else 'Student' if row['Working_Professional_or_Student'] == 'Student' and pd.isna(row['Profession'])
    else row['Profession'],
    axis=1
)

##### Academic Pressure

Excluding the following cases, Academic Pressure is only applicable to students:

In [14]:
X_train[(X_train['Working_Professional_or_Student']=='Working Professional')&(~X_train['Academic_Pressure'].isna())]

Unnamed: 0,Name,Gender,Age,City,Working_Professional_or_Student,Profession,Academic_Pressure,Work_Pressure,CGPA,Study_Satisfaction,Job_Satisfaction,Sleep_Duration,Dietary_Habits,Degree,Have_you_ever_had_suicidal_thoughts_?,Work/Study_Hours,Financial_Stress,Family_History_of_Mental_Illness,Depression
18752,Aarti,Female,18.0,Kalyan,Working Professional,Unemployed,2.0,,8.14,5.0,,Less than 5 hours,Healthy,Class 12,Yes,7.0,1.0,Yes,1
41915,Advait,Male,18.0,Kolkata,Working Professional,Unemployed,4.0,,8.04,3.0,,5-6 hours,Healthy,Class 12,Yes,1.0,5.0,Yes,1
55827,Prachi,Female,19.0,Kalyan,Working Professional,Unemployed,3.0,,8.11,2.0,,5-6 hours,Unhealthy,Class 12,Yes,4.0,4.0,No,1
99062,Tanisha,Female,24.0,Surat,Working Professional,Content Writer,4.0,,5.42,4.0,,Less than 5 hours,Moderate,B.Ed,Yes,1.0,5.0,No,1
101189,Keshav,Male,34.0,Rajkot,Working Professional,Unemployed,4.0,,8.24,3.0,,More than 8 hours,Moderate,MD,No,11.0,1.0,No,0


However, there are a few cases of NaN values for Academic Pressure for students:

In [15]:
X_train[(X_train['Working_Professional_or_Student']=='Student')&(X_train['Academic_Pressure'].isna())].shape[0]

9

NaN values in students will be labeled with the mean of the Academic Pressure for each Degree, and Working Professionals will be labeled with 0

In [16]:
ap_median = X_train['Academic_Pressure'].median()

X_train['Academic_Pressure'] = X_train.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 'Working Professional' and pd.isna(row['Academic_Pressure'])
    else ap_median if row['Working_Professional_or_Student'] == 'Student' and pd.isna(row['Academic_Pressure'])
    else row['Academic_Pressure'],
    axis=1
)

X_test['Academic_Pressure'] = X_test.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 'Working Professional' and pd.isna(row['Academic_Pressure'])
    else ap_median if row['Working_Professional_or_Student'] == 'Student' and pd.isna(row['Academic_Pressure'])
    else row['Academic_Pressure'],
    axis=1
)

##### Work_Pressure

Besides the students, most NaN values of Work Pressure are mostly for unemployed professionals

In [17]:
print(X_train[(X_train['Working_Professional_or_Student']=='Working Professional')&(X_train['Work_Pressure'].isna())].shape[0])
X_train[(X_train['Working_Professional_or_Student']=='Working Professional')&(X_train['Work_Pressure'].isna())]['Profession'].value_counts()

20


Profession
Unemployed             18
Mechanical Engineer     1
Content Writer          1
Name: count, dtype: int64

We will fill with 0 if profession is 'Not Applicable' (Students) or 'Unemployed'. Else we will fill with the median

In [18]:
wp_median = X_train['Work_Pressure'].median()

X_train['Work_Pressure'] = X_train.apply(
    lambda row: 0 if row['Profession'] in ['Not Applicable', 'Unemployed'] and pd.isna(row['Work_Pressure'])
    else wp_median if pd.isna(row['Work_Pressure'])
    else row['Work_Pressure'],
    axis=1
)

X_test['Work_Pressure'] = X_test.apply(
    lambda row: 0 if row['Profession'] in ['Not Applicable', 'Unemployed'] and pd.isna(row['Work_Pressure'])
    else wp_median if pd.isna(row['Work_Pressure'])
    else row['Work_Pressure'],
    axis=1
)

##### CGPA

Most cases of NaN CGPA values are for Working Professionals. We will label these cases as 0 and as the median for the students 

In [19]:
X_train[X_train['CGPA'].isna()]['Working_Professional_or_Student'].value_counts()

Working_Professional_or_Student
Working Professional    114847
Student                      9
Name: count, dtype: int64

In [20]:
cgpa_median = X_train['CGPA'].median()


X_train['CGPA'] = X_train.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 'Working Professional' and pd.isna(row['CGPA'])
    else cgpa_median if pd.isna(row['CGPA'])
    else row['CGPA'],
    axis=1
)

X_test['CGPA'] = X_test.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 'Working Professional' and pd.isna(row['CGPA'])
    else cgpa_median if pd.isna(row['CGPA'])
    else row['CGPA'],
    axis=1
)

##### Study Satisfaction

In [21]:
X_train[X_train['Study_Satisfaction'].isna()]['Working_Professional_or_Student'].value_counts()

Working_Professional_or_Student
Working Professional    114847
Student                     10
Name: count, dtype: int64

Again, NaN values for Students will be replaced by the median, and with 0 for the Working Professionals

In [22]:
study_median = X_train['Study_Satisfaction'].median()


X_train['Study_Satisfaction'] = X_train.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 'Working Professional' and pd.isna(row['Study_Satisfaction'])
    else study_median if pd.isna(row['Study_Satisfaction'])
    else row['Study_Satisfaction'],
    axis=1
)

X_test['Study_Satisfaction'] = X_test.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 'Working Professional' and pd.isna(row['Study_Satisfaction'])
    else study_median if pd.isna(row['Study_Satisfaction'])
    else row['Study_Satisfaction'],
    axis=1
)

##### Job Satisfaction

In [23]:
X_train[X_train['Job_Satisfaction'].isna()]['Working_Professional_or_Student'].value_counts()

Working_Professional_or_Student
Student                 28395
Working Professional       17
Name: count, dtype: int64

The other way around:

In [24]:
job_median = X_train['Job_Satisfaction'].median()


X_train['Job_Satisfaction'] = X_train.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 'Working Professional' and pd.isna(row['Job_Satisfaction'])
    else job_median if pd.isna(row['Job_Satisfaction'])
    else row['Job_Satisfaction'],
    axis=1
)

X_test['Job_Satisfaction'] = X_test.apply(
    lambda row: 0 if row['Working_Professional_or_Student'] == 'Working Professional' and pd.isna(row['Job_Satisfaction'])
    else job_median if pd.isna(row['Job_Satisfaction'])
    else row['Job_Satisfaction'],
    axis=1
)

##### Remaining columns - Mode if categorical, Median if numerical

More than 99% of column 'Dietary_Habits' are either 'Moderate', 'Unhealthy' or 'Healthy'. The other values are probably inputation mistakes and will be replaced by NaN and then by the mode

In [25]:
dietary_values = ['Unhealthy', 'Moderate', 'Healthy']
X_train['Dietary_Habits'] = np.where(X_train['Dietary_Habits'].isin(dietary_values), 
                                X_train['Dietary_Habits'], 
                                np.nan)

dietary_values = ['Unhealthy', 'Moderate', 'Healthy']
X_test['Dietary_Habits'] = np.where(X_test['Dietary_Habits'].isin(dietary_values), 
                                X_test['Dietary_Habits'], 
                                np.nan)

There are some errors in the name of the Degree for a few cases

In [26]:
X_train['Degree'] = X_train['Degree'].replace('BTech', 'B.Tech')
X_train['Degree'] = X_train['Degree'].replace('BPharm', 'B.Pharm')
X_train['Degree'] = X_train['Degree'].replace('LL B.Ed', 'B.Ed')
X_train['Degree'] = X_train['Degree'].replace('B B.Tech', 'B.Tech')
X_train['Degree'] = X_train['Degree'].replace('MPharm', 'M.Pharm')

X_test['Degree'] = X_test['Degree'].replace('BTech', 'B.Tech')
X_test['Degree'] = X_test['Degree'].replace('BPharm', 'B.Pharm')
X_test['Degree'] = X_test['Degree'].replace('LL B.Ed', 'B.Ed')
X_test['Degree'] = X_test['Degree'].replace('B B.Tech', 'B.Tech')
X_test['Degree'] = X_test['Degree'].replace('MPharm', 'M.Pharm')

For sleep duration column, there are 4 values that account for more than 99% of the column. we will change the sleep duration to <5 hours, 5-7 hours, 7-8 hours, or >8 hours, and remove other rows

In [27]:
X_train['Sleep_Duration'] = X_train['Sleep_Duration'].replace('5-6 hours', '5-7 hours')
X_train['Sleep_Duration'] = X_train['Sleep_Duration'].replace('6-7 hours', '5-7 hours')
X_train['Sleep_Duration'] = X_train['Sleep_Duration'].replace('3-4 hours', 'Less than 5 hours')
X_train['Sleep_Duration'] = X_train['Sleep_Duration'].replace('4-5 hours', 'Less than 5 hours')
X_train['Sleep_Duration'] = X_train['Sleep_Duration'].replace('2-3 hours', 'Less than 5 hours')
X_train['Sleep_Duration'] = X_train['Sleep_Duration'].replace('4-6 hours', 'Less than 5 hours')
X_train['Sleep_Duration'] = X_train['Sleep_Duration'].replace('6-8 hours', '7-8 hours')
X_train['Sleep_Duration'] = X_train['Sleep_Duration'].replace('1-6 hours', 'Less than 5 hours')
X_train['Sleep_Duration'] = X_train['Sleep_Duration'].replace('8-9 hours', 'More than 8 hours')

X_test['Sleep_Duration'] = X_test['Sleep_Duration'].replace('5-6 hours', '5-7 hours')
X_test['Sleep_Duration'] = X_test['Sleep_Duration'].replace('6-7 hours', '5-7 hours')
X_test['Sleep_Duration'] = X_test['Sleep_Duration'].replace('3-4 hours', 'Less than 5 hours')
X_test['Sleep_Duration'] = X_test['Sleep_Duration'].replace('4-5 hours', 'Less than 5 hours')
X_test['Sleep_Duration'] = X_test['Sleep_Duration'].replace('2-3 hours', 'Less than 5 hours')
X_test['Sleep_Duration'] = X_test['Sleep_Duration'].replace('4-6 hours', 'Less than 5 hours')
X_test['Sleep_Duration'] = X_test['Sleep_Duration'].replace('6-8 hours', '7-8 hours')
X_test['Sleep_Duration'] = X_test['Sleep_Duration'].replace('1-6 hours', 'Less than 5 hours')
X_test['Sleep_Duration'] = X_test['Sleep_Duration'].replace('8-9 hours', 'More than 8 hours')

In [28]:
sleep_values = ['Less than 5 hours','5-7 hours','7-8 hours','More than 8 hours']

X_train['Sleep_Duration'] = np.where(X_train['Sleep_Duration'].isin(sleep_values), 
                                X_train['Sleep_Duration'], 
                                np.nan)

X_test['Sleep_Duration'] = np.where(X_test['Sleep_Duration'].isin(sleep_values), 
                                X_test['Sleep_Duration'], 
                                np.nan)

Replace NaN values with mode for categorical columns and with median for numerical columns

In [29]:
def replace_nan(X_train, column_name):
    if X_train[column_name].isna().sum() == 0:
        return # Exits the function if there are no NaN values in the column

    if X_train[column_name].dtype=='O':
        X_train[column_name] = X_train.apply(
            lambda row:  X_train[column_name].mode()[0] if pd.isna(row[column_name])
            else row[column_name],
            axis=1
        )
    if X_train[column_name].dtype=='float64':
        col_median = X_train[column_name].median()
        X_train[column_name] = X_train.apply(
            lambda row:  col_median if pd.isna(row[column_name])
            else row[column_name],
            axis=1
        )

In [30]:
replace_nan(X_train, 'Dietary_Habits')
replace_nan(X_train, 'Degree')
replace_nan(X_train, 'Financial_Stress')
replace_nan(X_train, 'Sleep_Duration')

replace_nan(X_test, 'Dietary_Habits')
replace_nan(X_test, 'Degree')
replace_nan(X_test, 'Financial_Stress')
replace_nan(X_test, 'Sleep_Duration')

## Feature Engineering

Columns Sleep_Duration and Dietary_Habits are ordinal data, and can be converted to integers

In [None]:
def map_sleep(X):
    sleep_duration_mapping = {
        'Less than 5 hours': 0,
        '5-7 hours': 1,
        '7-8 hours': 2,
        'More than 8 hours': 3
    }

    X['Sleep_Duration'] = X['Sleep_Duration'].map(sleep_duration_mapping)

map_sleep(X_train)
map_sleep(X_test)

In [None]:
def map_diet(X):
    dietary_mapping = {
        'Unhealthy': 0,
        'Moderate': 1,
        'Healthy': 2
    }

    X['Dietary_Habits'] = X['Dietary_Habits'].map(dietary_mapping)

map_diet(X_train)
map_diet(X_test)

In [None]:
unique_counts = X_train.nunique()

# Get the data types of each column
column_dtypes = X_train.dtypes

# Combine both into a DataFrame for easier viewing
summary = pd.DataFrame({
    'Unique Values': unique_counts,
    'Data Type': column_dtypes
})

# Display the summary
print(summary)

                                       Unique Values Data Type
Name                                             422    object
Gender                                             2    object
Age                                               43   float64
City                                              98    object
Working_Professional_or_Student                    2    object
Profession                                        64    object
Academic_Pressure                                  6   float64
Work_Pressure                                      6   float64
CGPA                                             332   float64
Study_Satisfaction                                 6   float64
Job_Satisfaction                                   6   float64
Sleep_Duration                                     4     int64
Dietary_Habits                                     3     int64
Degree                                           112    object
Have_you_ever_had_suicidal_thoughts_?              2   

## Model Implementation

In [34]:
y_train = X_train.iloc[:,-1]
X_train = X_train.iloc[:,:-1]

In [35]:
X_train.dtypes

Name                                      object
Gender                                    object
Age                                      float64
City                                      object
Working_Professional_or_Student           object
Profession                                object
Academic_Pressure                        float64
Work_Pressure                            float64
CGPA                                     float64
Study_Satisfaction                       float64
Job_Satisfaction                         float64
Sleep_Duration                             int64
Dietary_Habits                             int64
Degree                                    object
Have_you_ever_had_suicidal_thoughts_?     object
Work/Study_Hours                         float64
Financial_Stress                         float64
Family_History_of_Mental_Illness          object
dtype: object

In [36]:
# Convert categorical columns to 'category' dtype for the XGBoost model
categorical_columns = ['Name','Gender', 'City', 'Working_Professional_or_Student', 'Profession', 
                       'Degree', 'Have_you_ever_had_suicidal_thoughts_?', 'Family_History_of_Mental_Illness']

for col in categorical_columns:
    X_train[col] = X_train[col].astype('category')

for col in categorical_columns:
    X_test[col] = X_test[col].astype('category')

### CatBoost

Here we apply CatBoost with optuna to perform a grid search. The results of the grid search are available below

In [38]:
# Define the Optuna objective function for CatBoost
def objective(trial):
    # Suggest values for the hyperparameters for CatBoost
    param = {
        'objective': 'Logloss',
        'eval_metric': 'Accuracy',
        'depth': trial.suggest_categorical('depth', [3, 6, 10]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.03, 0.1, 0.3]),
        'iterations': trial.suggest_categorical('iterations', [800, 1100, 1300, 1500]),
        'random_seed': 42,
        'verbose': False
    }

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    accuracy_scores = []

    # Perform stratified k-fold cross-validation
    for train_idx, val_idx in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Initialize the CatBoost model with the parameters
        model = CatBoostClassifier(**param)

        # Train the model
        model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), cat_features=categorical_columns, verbose=0)

        # Predict probabilities on the validation fold
        preds = model.predict(X_val_fold)

        # Calculate accuracy and store the result
        fold_accuracy = accuracy_score(y_val_fold, preds)
        accuracy_scores.append(fold_accuracy)

    # Return the mean accuracy across all folds
    print(accuracy_scores)
    return np.mean(accuracy_scores)

# Set up the Optuna study and start the tuning process
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30, show_progress_bar=True)

[I 2024-12-02 16:00:24,362] A new study created in memory with name: no-name-d342ca73-3083-4be8-9745-63b44003b7bb


  0%|          | 0/30 [00:00<?, ?it/s]

[0.9407371213178836, 0.9416445623342176, 0.9414351528689097, 0.9392712550607287, 0.9459723579505793, 0.9387826329750105, 0.9418499127399651, 0.9386387434554974, 0.9415008726003491, 0.9405235602094241]
[I 2024-12-02 16:29:23,473] Trial 0 finished with value: 0.9410356171512566 and parameters: {'depth': 10, 'learning_rate': 0.03, 'iterations': 1500}. Best is trial 0 with value: 0.9410356171512566.
[0.9396202708362418, 0.9417143654893201, 0.9405975150076783, 0.9377355856484713, 0.9449951137791428, 0.9396202708362418, 0.9415706806282722, 0.9384991273996509, 0.9422687609075043, 0.9405933682373473]
[I 2024-12-02 16:40:37,436] Trial 1 finished with value: 0.9407215058769871 and parameters: {'depth': 3, 'learning_rate': 0.03, 'iterations': 1500}. Best is trial 0 with value: 0.9410356171512566.
[0.9396900739913444, 0.9420633812648331, 0.9417143654893201, 0.9382242077341896, 0.9461119642607846, 0.9407371213178836, 0.9415008726003491, 0.9400349040139616, 0.9419895287958115, 0.9407329842931937]
[I

Result of the grid search:
```
{'depth': 3, 'learning_rate': 0.3, 'iterations': 1300}
```

In [59]:
# Step 1: Retrieve the best hyperparameters from both studies
best_params = study.best_params  # The first grid search results
best_params = {'depth': 3,
 'learning_rate': 0.3,
 'iterations': 1100,
 'objective': 'Logloss',
 'eval_metric': 'Logloss',
 'verbose': 0}

# Add the fixed parameters that weren't part of the grid search
best_params['objective'] = 'Logloss'  # CatBoost equivalent of binary logistic loss
best_params['eval_metric'] = 'Logloss'  # Evaluation metric for binary classification
best_params['verbose'] = 0  # To reduce output during training

# Step 2: Train the model on the full training set using the combined best parameters
# Define the CatBoostClassifier with the best parameters
final_model = CatBoostClassifier(**best_params)

# Fit the final model on the full training set
final_model.fit(X_train, y_train, cat_features=categorical_columns)

# Step 3: Prepare the test set and make predictions
# Predict continuous probabilities for train and test sets
y_train_pred_cont = final_model.predict_proba(X_train)[:, 1]  # Probability for class 1
y_test_pred_cont = final_model.predict_proba(X_test)[:, 1]  


In [None]:
max_score = 0
score_list = []
n_list = []
for n in np.arange(0.3, 0.7, 0.001):
    y_train_pred = (y_train_pred_cont >= n).astype(int)
    scoring = accuracy_score(y_train, y_train_pred)
    n_list.append(n)
    score_list.append(scoring)

    if scoring > max_score:
        max_score = scoring
        best_n = n
print(f'{best_n}    {max_score}')  
# pd.DataFrame({'n':n_list,'score':score_list}) # Dataframe with scores for each threshold

In [None]:
# Convert probabilities to binary class predictions with a threshold
y_train_pred = (y_train_pred_cont >= best_n).astype(int)
y_test_pred = (y_test_pred_cont >= best_n).astype(int)

In [65]:
final = pd.DataFrame({"id":id_test, "Depression":y_test_pred})
final['id'] = final['id'] + 140700
final.to_csv("final.csv", index=False)