In [53]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint, uniform
from scipy.stats import rv_discrete, rv_continuous
import os

In [54]:
# Load the data
data = pd.read_csv('./../../../' + './data/processed/train_imputed_average.csv')

# Show dtypes and examplle data from first row
print(data.dtypes)

Gender                                    object
Age                                      float64
City                                      object
Working Professional or Student           object
Profession                                object
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                            object
Dietary Habits                            object
Degree                                    object
Have you ever had suicidal thoughts ?     object
Work/Study Hours                         float64
Financial Stress                         float64
Family History of Mental Illness          object
Depression                                 int64
dtype: object


In [55]:
# Define categorical features
categorical_features = ['Gender', 'City', 'Working Professional or Student', 'Profession',
                        'Dietary Habits', 'Degree',
                        'Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
text_features = ['Sleep Duration']
target_features = ['Depression']

# Define target and features
X = data.drop('Depression', axis=1)
y = data['Depression'].astype(int)  # Ensure the target is integer-encoded

# Train/test split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [56]:
# Define CatBoost Pool (to handle categorical features)
train_pool = Pool(  
    X_train, 
    y_train,
    cat_features=categorical_features,
    text_features=text_features
)
valid_pool = Pool(  
    X_valid, 
    y_valid, 
    cat_features=categorical_features,
    text_features=text_features
)

# Define parameter distribution for randomized search
cross_val = 3
param_grid = {
    'depth': [4, 7],
    'learning_rate': [0.01, 0.05],
    'iterations': [100, 500],
    'l2_leaf_reg': [1, 5],
    'border_count': [32, 64, 128],
    'bagging_temperature': [0, 1, 2],
    'random_strength': [0, 1, 2],
    'random_seed': [42],
}

test_param_grid = {
    'depth': [4],
    'learning_rate': [0.01],
    'iterations': [100],
    'l2_leaf_reg': [1],
    'border_count': [32],
    'bagging_temperature': [0],
    'random_strength': [0],
    'random_seed': [42],
}

In [57]:
import time

# Initialize and train CatBoost model
timed_model = CatBoostClassifier(
    iterations=500,  # Can be increased for a better model if runtime allows
    learning_rate=0.1,
    depth=7,
    loss_function='Logloss',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=50
)

# Train the model and measure time
start_time = time.time()
timed_model.fit(train_pool, eval_set=valid_pool, use_best_model=True)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Training completed in {elapsed_time:.2f} seconds.")

# Predict total grid search time
total_combinations = np.prod([len(values) for values in param_grid.values()]) * 3
expected_time_hours = (total_combinations * elapsed_time) / 3600  # Convert seconds to hours
print("Total combinations: {}".format(total_combinations))
print("Expected total time: {:.2f} hours".format(expected_time_hours))


0:	learn: 0.9250533	test: 0.9247690	best: 0.9247690 (0)	total: 56.4ms	remaining: 28.1s
50:	learn: 0.9399343	test: 0.9389481	best: 0.9390903 (49)	total: 2.42s	remaining: 21.3s
100:	learn: 0.9421642	test: 0.9390547	best: 0.9391969 (91)	total: 4.64s	remaining: 18.3s
150:	learn: 0.9436834	test: 0.9391613	best: 0.9391969 (91)	total: 6.99s	remaining: 16.2s
200:	learn: 0.9450071	test: 0.9392679	best: 0.9396233 (194)	total: 9.29s	remaining: 13.8s
250:	learn: 0.9464108	test: 0.9394812	best: 0.9398010 (244)	total: 11.6s	remaining: 11.5s
300:	learn: 0.9475924	test: 0.9396233	best: 0.9398010 (244)	total: 13.9s	remaining: 9.17s
350:	learn: 0.9487385	test: 0.9393035	best: 0.9398010 (244)	total: 16.2s	remaining: 6.87s
400:	learn: 0.9497246	test: 0.9393390	best: 0.9398010 (244)	total: 18.5s	remaining: 4.56s
450:	learn: 0.9509240	test: 0.9394456	best: 0.9398010 (244)	total: 20.7s	remaining: 2.25s
499:	learn: 0.9518213	test: 0.9394101	best: 0.9398010 (244)	total: 23s	remaining: 0us

bestTest = 0.9398009

In [58]:
model = CatBoostClassifier(
    iterations=500,  # Can be increased for a better model if runtime allows
    learning_rate=0.1,
    depth=7,
    loss_function='Logloss',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=50
)

# Perform grid search
grid_search_result = model.grid_search(
    param_grid=param_grid,
    X=train_pool,
    y=None,  # Target labels are in the pool
    cv=cross_val,  # 3-fold cross-validation
    partition_random_seed=42,
    calc_cv_statistics=True,
    search_by_train_test_split=False,  # Use CV instead of a train-test split
    refit=True,  # Refit the model on the entire training set
    shuffle=True,
    stratified=True,  # Stratify folds
    train_size=0.8,  # Fraction of training data in each fold
    verbose=True,
    plot=True,  # Plot results (if environment supports visualization)
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/3]
0:	learn: 0.9129653	test: 0.9149809	best: 0.9149809 (0)	total: 47ms	remaining: 4.65s
50:	learn: 0.9219872	test: 0.9247888	best: 0.9266811 (30)	total: 2.05s	remaining: 1.97s
99:	learn: 0.9241195	test: 0.9274273	best: 0.9275073 (98)	total: 3.96s	remaining: 0us

bestTest = 0.9275072626
bestIteration = 98

Training on fold [1/3]
0:	learn: 0.9207489	test: 0.9180704	best: 0.9180704 (0)	total: 46.7ms	remaining: 4.62s
50:	learn: 0.9264792	test: 0.9249467	best: 0.9250000 (47)	total: 2.08s	remaining: 2s
99:	learn: 0.9253598	test: 0.9235341	best: 0.9250000 (47)	total: 4.04s	remaining: 0us

bestTest = 0.925
bestIteration = 47

Training on fold [2/3]
0:	learn: 0.9201636	test: 0.9192942	best: 0.9192942 (0)	total: 39.7ms	remaining: 3.93s
50:	learn: 0.9240948	test: 0.9213465	best: 0.9248381 (32)	total: 2.06s	remaining: 1.98s
99:	learn: 0.9259205	test: 0.9235321	best: 0.9248381 (32)	total: 4.05s	remaining: 0us

bestTest = 0.924838082
bestIteration = 32

0:	loss: 0.9248312	best: 0

In [59]:
# Extract the best parameters from the grid search result
best_params = grid_search_result['params']

# Initialize the best model with the optimal parameters and train
best_model = CatBoostClassifier(**best_params)
best_model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

# Make predictions on the validation set
y_pred = best_model.predict(X_valid)  # Predictions

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_valid, y_pred)

print(f"Accuracy: {accuracy:.4f}")

0:	learn: 0.6136491	test: 0.6133260	best: 0.6133260 (0)	total: 54.7ms	remaining: 27.3s
1:	learn: 0.5482553	test: 0.5477601	best: 0.5477601 (1)	total: 88.9ms	remaining: 22.1s
2:	learn: 0.4979007	test: 0.4971907	best: 0.4971907 (2)	total: 130ms	remaining: 21.5s
3:	learn: 0.4523913	test: 0.4514646	best: 0.4514646 (3)	total: 177ms	remaining: 21.9s
4:	learn: 0.4140131	test: 0.4129476	best: 0.4129476 (4)	total: 214ms	remaining: 21.2s
5:	learn: 0.3853917	test: 0.3842493	best: 0.3842493 (5)	total: 273ms	remaining: 22.5s
6:	learn: 0.3603252	test: 0.3590650	best: 0.3590650 (6)	total: 312ms	remaining: 22s
7:	learn: 0.3365693	test: 0.3351167	best: 0.3351167 (7)	total: 346ms	remaining: 21.2s
8:	learn: 0.3174840	test: 0.3160252	best: 0.3160252 (8)	total: 382ms	remaining: 20.8s
9:	learn: 0.3016936	test: 0.3001695	best: 0.3001695 (9)	total: 419ms	remaining: 20.5s
10:	learn: 0.2869009	test: 0.2852539	best: 0.2852539 (10)	total: 454ms	remaining: 20.2s
11:	learn: 0.2777907	test: 0.2761588	best: 0.2761588

In [60]:
import os
import json

# Define the save paths
save_path = './../../../saved_model/catboost/catboost_model.cbm'
params_save_path = './../../../saved_model/catboost/best_params.json'

# Ensure the directories exist
os.makedirs(os.path.dirname(save_path), exist_ok=True)

# Save the best model
best_model.save_model(save_path)  # Save in CatBoost's .cbm format

# Save the best hyperparameters
best_params = grid_search_result['params']  # Access best parameters
with open(params_save_path, 'w') as f:
    json.dump(best_params, f, indent=4)

print(f"Best CatBoost model saved to {save_path}")
print(f"Best hyperparameters saved to {params_save_path}")

Best CatBoost model saved to ./../../../saved_model/catboost/catboost_model.cbm
Best hyperparameters saved to ./../../../saved_model/catboost/best_params.json
