# **Predicting Depression: Machine Learning Challenge**


CatBoost is a powerful gradient boosting algorithm designed for handling categorical data efficiently. Unlike traditional gradient boosting methods, CatBoost automatically handles categorical features by using innovative techniques like ordered boosting and target-based encoding, which reduce overfitting and improve model performance. It is particularly effective for datasets with a mix of numerical and categorical features, as it eliminates the need for extensive preprocessing like one-hot encoding. CatBoost also supports GPU acceleration, making it faster and more scalable for large datasets. With built-in support for missing value handling and robust hyperparameter tuning, CatBoost is widely used in competitions and real-world applications for its accuracy, speed, and ease of use.

*Let's import the relevant modules*

In [1]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import optuna
from optuna.samplers import TPESampler
import shap

import warnings
warnings.filterwarnings("ignore")

### **Load the Dataset**

In [2]:
# Competition training set. `id` is used as the row index here and is discarded
# by the `ignore_index=True` concat below.
df = pd.read_csv('../data/extracted_data/train.csv', index_col='id')

# Extra rows from createdData.csv. Their label is textual and is recoded to 0/1;
# any label value other than 'No'/'Yes' becomes NaN under `map`.
label_codes = {'No': 0, 'Yes': 1}
genDf = pd.read_csv('../data/extracted_data/createdData.csv')
genDf['Depression'] = genDf['Depression'].map(label_codes)

# Stack both sources into a single training frame with a fresh 0..n-1 index.
train = pd.concat([df, genDf], ignore_index=True)

# Test set: missing cells become the literal string 'None' and every column
# (whatever its original dtype) is cast to str.
test = pd.read_csv('../data/extracted_data/test.csv').fillna('None').astype(str)

### **Model for Working Professionals**

In [3]:
# Everything that is not explicitly labelled 'Student' goes to this segment
# (rows where the column is missing compare as != 'Student' and land here too).
train_workingprofessional = train[train['Working Professional or Student'] != 'Student']

# Define the target variable.
# It is extracted BEFORE the frame is stringified so the labels stay numeric
# (0/1) instead of turning into strings, and so a missing label is reported
# here rather than silently becoming a third class named 'None'.
y = train_workingprofessional['Depression']
if y.isna().any():
    raise ValueError(
        f"{int(y.isna().sum())} working-professional rows have no 'Depression' label"
    )
y = y.astype(int)

# Features are all treated as categorical strings; missing cells become 'None'.
train_workingprofessional = train_workingprofessional.fillna('None').astype(str)
test_workingprofessional = test[test['Working Professional or Student'] != 'Student'].copy()

X = train_workingprofessional.drop(['Depression'], axis=1)



In [4]:
# Replace the row labels left over from filtering with a clean 0..n-1 range so
# that label-based and position-based lookups agree in the training cell.
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

In [5]:
# Fixed CatBoost settings for the working-professional model.
# `verbose` here is overridden by the `verbose=500` passed to fit() below.
catboost_params1 = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'learning_rate': 0.08114394459649094,
    'iterations': 1000,
    'depth': 6,
    'random_strength': 0,
    'l2_leaf_reg': 0.7047064221215757,
    'min_data_in_leaf': 2,
    'task_type': 'CPU',
    'random_seed': 42,
    'verbose': False,
}

# Every feature column is declared categorical (the frames are cast to str
# before this cell).
cat_features = list(X.columns)

cv = StratifiedKFold(3, shuffle=True, random_state=0)
scores1 = []      # validation accuracy per fold
test_preds1 = []  # P(class 1) on the test segment per fold

# Select exactly the training columns, in training order, so the test pool can
# never carry extra columns (for example an identifier) or a different layout.
X_test_pool = Pool(test_workingprofessional[cat_features], cat_features=cat_features)

for train_idx, val_idx in cv.split(X, y):
    # StratifiedKFold yields positional indices, so use iloc: this stays
    # correct even if X / y do not have a 0..n-1 index.
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=cat_features)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=cat_features)

    # The validation fold doubles as the early-stopping set.
    model = CatBoostClassifier(**catboost_params1)
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=500, early_stopping_rounds=200)

    val_pred = model.predict(X_valid_pool)
    scores1.append(accuracy_score(y_val_fold, val_pred))
    test_preds1.append(model.predict_proba(X_test_pool)[:, 1])

# After the loop `model` is the estimator fitted on the LAST fold only.
print(f'workingprofessional:{np.mean(scores1):.4f};')


0:	test: 0.9467281	best: 0.9467281 (0)	total: 460ms	remaining: 7m 39s
500:	test: 0.9721970	best: 0.9722054 (497)	total: 1m 58s	remaining: 1m 58s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9722053508
bestIteration = 497

Shrink model to first 498 iterations.
0:	test: 0.9516507	best: 0.9516507 (0)	total: 305ms	remaining: 5m 4s
500:	test: 0.9712759	best: 0.9712950 (493)	total: 2m 56s	remaining: 2m 55s
999:	test: 0.9717552	best: 0.9717708 (951)	total: 6m 6s	remaining: 0us

bestTest = 0.9717707596
bestIteration = 951

Shrink model to first 952 iterations.
0:	test: 0.9514719	best: 0.9514719 (0)	total: 352ms	remaining: 5m 51s
500:	test: 0.9699581	best: 0.9699671 (495)	total: 3m 15s	remaining: 3m 14s
999:	test: 0.9703600	best: 0.9704381 (944)	total: 6m 30s	remaining: 0us

bestTest = 0.9704381387
bestIteration = 944

Shrink model to first 945 iterations.
workingprofessional:0.9622;


In [6]:
# Persist the estimator currently bound to `model` as a CatBoost binary ("cbm") file.
# NOTE(review): when this cell runs right after the cross-validation cell,
# `model` is the estimator fitted on the last fold only -- not an ensemble of
# the folds and not a refit on all rows. Confirm that is the intended artifact.
# The ../models directory is assumed to exist already.
model.save_model("../models/catboost_model_wf.cbm", format="cbm")

### **Model for Student**

In [7]:
# Rows explicitly labelled 'Student' form this segment.
train_student = train[train['Working Professional or Student'] == 'Student']

# The target is extracted BEFORE the frame is stringified so the labels stay
# numeric (0/1) instead of turning into strings, and so a missing label is
# reported here rather than silently becoming a third class named 'None'.
y = train_student['Depression']
if y.isna().any():
    raise ValueError(
        f"{int(y.isna().sum())} student rows have no 'Depression' label"
    )
y = y.astype(int)

# Features are all treated as categorical strings; missing cells become 'None'.
train_student = train_student.fillna('None').astype(str)
test_student = test[test['Working Professional or Student'] == 'Student'].copy()

X = train_student.drop(['Depression'], axis=1)

# Clean 0..n-1 index on both, replacing the labels left over from filtering.
X = X.reset_index(drop=True)
y = y.reset_index(drop=True)

In [8]:
# Fixed CatBoost settings for the student model.
# `verbose` here is overridden by the `verbose=500` passed to fit() below.
catboost_params2 = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'learning_rate': 0.08114394459649094,
    'iterations': 1000,
    'depth': 6,
    'random_strength': 0,
    'l2_leaf_reg': 0.7047064221215757,
    'min_data_in_leaf': 2,
    'task_type': 'CPU',
    'random_seed': 42,
    'verbose': False,
}

# Every feature column is declared categorical (the frames are cast to str
# before this cell).
cat_features = list(X.columns)

cv = StratifiedKFold(3, shuffle=True, random_state=0)
scores2 = []      # validation accuracy per fold
test_preds2 = []  # P(class 1) on the test segment per fold

# Select exactly the training columns, in training order, so the test pool can
# never carry extra columns (for example an identifier) or a different layout.
X_test_pool = Pool(test_student[cat_features], cat_features=cat_features)

for fold_no, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    # StratifiedKFold yields positional indices, so use iloc: this stays
    # correct even if X / y do not have a 0..n-1 index.
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
    X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=cat_features)
    X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=cat_features)

    # The validation fold doubles as the early-stopping set.
    model = CatBoostClassifier(**catboost_params2)
    model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=500, early_stopping_rounds=200)

    val_pred = model.predict(X_valid_pool)
    score = accuracy_score(y_val_fold, val_pred)
    scores2.append(score)
    test_preds2.append(model.predict_proba(X_test_pool)[:, 1])
    print(f'Fold {fold_no} accuracy_score: {score}')

# After the loop `model` is the estimator fitted on the LAST fold only.
print(f'student:{np.mean(scores2):.4f};')

0:	test: 0.8976608	best: 0.8976608 (0)	total: 105ms	remaining: 1m 44s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9226903992
bestIteration = 290

Shrink model to first 291 iterations.
Fold 1 accuracy_score: 0.8537177862272919
0:	test: 0.8999773	best: 0.8999773 (0)	total: 225ms	remaining: 3m 44s
500:	test: 0.9242224	best: 0.9244949 (323)	total: 1m 53s	remaining: 1m 52s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9244948661
bestIteration = 323

Shrink model to first 324 iterations.
Fold 2 accuracy_score: 0.854034643008027
0:	test: 0.8939818	best: 0.8939818 (0)	total: 261ms	remaining: 4m 20s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9211415482
bestIteration = 213

Shrink model to first 214 iterations.
Fold 3 accuracy_score: 0.8433505862469631
student:0.8504;


In [9]:
# Persist the estimator currently bound to `model` as a CatBoost binary ("cbm") file.
# NOTE(review): when this cell runs right after the student cross-validation
# cell, `model` is the estimator fitted on the last fold only -- not an ensemble
# of the folds and not a refit on all rows. Confirm that is the intended artifact.
# The ../models directory is assumed to exist already.
model.save_model("../models/catboost_model_s.cbm", format="cbm")

### **Hyperparameter Tuning**

In [None]:

# Define the objective function for Optuna
def objective(trial):
    """Return the mean 3-fold validation AUC of one sampled CatBoost configuration.

    The function reads the module-level ``X`` and ``y``, so it tunes whichever
    segment those names currently hold (the student subset when the notebook
    is run top to bottom).

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over the three stratified validation folds.
    """
    catboost_params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'learning_rate': trial.suggest_float('learning_rate', 0.05, 0.15, log=True),
        'iterations': trial.suggest_int('iterations', 500, 2000, step=500),
        'depth': trial.suggest_int('depth', 3, 10),
        'random_strength': trial.suggest_int('random_strength', 0, 5),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.1, 10, log=True),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 10),
        'random_seed': 42,
        'verbose': False,
        # NOTE(review): tuning runs on GPU while the training cells use CPU.
        'task_type': 'GPU',
    }

    # Every feature column is declared categorical, as in the training cells.
    cat_features = list(X.columns)
    cv = StratifiedKFold(3, shuffle=True, random_state=0)

    scores = []
    for train_idx, val_idx in cv.split(X, y):
        # StratifiedKFold yields positional indices, so use iloc: this stays
        # correct even if X / y do not have a 0..n-1 index.
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        X_train_pool = Pool(X_train_fold, y_train_fold, cat_features=cat_features)
        X_valid_pool = Pool(X_val_fold, y_val_fold, cat_features=cat_features)

        model = CatBoostClassifier(**catboost_params)
        model.fit(X=X_train_pool, eval_set=X_valid_pool, verbose=0, early_stopping_rounds=100)
        val_pred = model.predict_proba(X_valid_pool)[:, 1]
        scores.append(roc_auc_score(y_val_fold, val_pred))  # Use AUC as optimization metric

    return np.mean(scores)  # Return mean AUC


# Run Optuna study.
# The sampler is seeded so the sequence of suggested configurations can be
# repeated from run to run (as far as the objective values themselves repeat).
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=20)  # Run 20 trials

# Print best hyperparameters
print("Best hyperparameters:", study.best_params)
print("Best AUC score:", study.best_value)


### **Submission**

In [None]:
# submission
# Give both test segments a fresh 0..n-1 index. `drop=True` discards the old
# index instead of inserting it as an extra column, so the cell can be re-run
# any number of times without accumulating 'index' / 'level_0' columns.
test_workingprofessional = test_workingprofessional.reset_index(drop=True)
test_student = test_student.reset_index(drop=True)

# Average the per-fold probabilities, round to the nearest class and store the
# result as integers so the CSV contains 0/1 rather than 0.0/1.0.
preds_workingprofessional = np.round(np.mean(test_preds1, axis=0)).astype(int)
preds_student = np.round(np.mean(test_preds2, axis=0)).astype(int)

submission_workingprofessional = pd.DataFrame({
    'id': test_workingprofessional['id'],
    'Depression': preds_workingprofessional,
})
submission_student = pd.DataFrame({
    'id': test_student['id'],
    'Depression': preds_student,
})

# Recombine the two segments into one file.
# NOTE: the test frame was cast to str when it was loaded, so this sort is
# lexicographic on the id text, not numeric.
submission = pd.concat([submission_student, submission_workingprofessional], axis=0)
submission = submission.sort_values(by='id', ascending=True)
submission.to_csv('../predictions/catboost.csv', index=False)