In [1]:
import pandas as pd
from collections import Counter
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score
import optuna
import catboost as cb

from utils import clean_with_real_data

### Data cleaning
For data cleaning, we will refer to the real dataset and retain only those values that appear there. For everything that's not supported by the real data, we will impute with mean/median, as these values most likely do not make sense and are just artifacts of synthetic data.

In [5]:
train = pd.read_csv("data/synth/train.csv", index_col="id")
real = pd.read_csv("data/real/final_depression_dataset_1.csv")
real.Depression = real.Depression.map({"No": 0, "Yes": 1})

In [6]:
cat_vars = [c for c in train.columns if train.dtypes[c]=='object']
train[cat_vars] = train[cat_vars].fillna("None")
real[cat_vars] = real[cat_vars].fillna("None")
train, cat_vars = clean_with_real_data(train, real)
train = pd.concat([train, real])
train, cat_vars = clean_with_real_data(train, real)

In [7]:
from sklearn.model_selection import train_test_split

y = train["Depression"]
X = train.drop("Depression", axis=1)

### XGBoost with optuna

In [None]:
def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    param = {
        "silent": 1,
        'nthread' : -1 ,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "eta": trial.suggest_float("eta", 1e-8, 1, log=True),
        "lambda": trial.suggest_float("lambda", 1e-8, 10.0, log=True),
        "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
        'max_depth':trial.suggest_int('max_depth', 2, 25),
        'min_child_weight':trial.suggest_int('min_child_weight', 0, 10),
        "max_delta_step": trial.suggest_int("max_delta_step", 0, 10),
        'gamma':trial.suggest_int('gamma', 0, 10),
    }
    
    model = XGBClassifier(
        enable_categorical = True,
        num_parallel_tree=8,
        booster="dart",
        eta=0.3,
        gamma=param["gamma"],
        max_depth=param["max_depth"], 
        min_child_weight=param["min_child_weight"],
        max_delta_step=param["max_delta_step"]
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
print(study.best_params)

In [None]:
{'eta': 9.082009677263025e-06, 'lambda': 1.9370194698350048e-07, 'alpha': 2.2273330090867337e-06, 'max_depth': 3, 'min_child_weight': 5, 'max_delta_step': 6, 'gamma': 2}

In [None]:
best_model = XGBClassifier(
        enable_categorical = True,
        #num_parallel_tree=8,
        booster="dart",
        eta=0.3,
        gamma=2,
        max_depth=3, 
        min_child_weight=5,
        max_delta_step=6
    )
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

best_model.fit(X, y)
# y_pred = best_model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)

In [14]:
test = pd.read_csv("data/synth/test.csv", index_col="id")
test, _ = clean_with_real_data(test, real)

In [None]:
y_test = best_model.predict(test)

In [None]:
submission = pd.DataFrame({"id": test.index, "Depression":y_test})
submission.to_csv("data/submissions/optuna_xgboost_fulldata.csv", index=None)

### Catboost with optuna

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
gbm = cb.CatBoostClassifier(cat_features=cat_vars)

gbm.fit(X, y, eval_set=[(X_test, y_test)], verbose=0, early_stopping_rounds=100)

y_pred = gbm.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.9496370235934665


In [17]:
y_hat = gbm.predict(test)
submission = pd.DataFrame({"id": test.index, "Depression": y_hat})
submission.to_csv("data/submissions/catboost_fulldata.csv", index=None)

In [22]:
def objective_catboost(trial):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = cb.CatBoostClassifier(**param, cat_features=cat_vars)

    gbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0, early_stopping_rounds=100)
    
    y_pred = gbm.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [23]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_catboost, n_trials=10)

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-11-26 10:52:56,180] A new study created in memory with name: no-name-ecea2340-23b8-4471-967c-20454130740d
[I 2024-11-26 10:58:52,391] Trial 0 finished with value: 0.9408418260505375 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.08536539774964477, 'depth': 11, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.462107204940981}. Best is trial 0 with value: 0.9408418260505375.
[I 2024-11-26 11:04:10,984] Trial 1 finished with value: 0.9420284796872819 and parameters: {'objective': 'Logloss', 'colsample_bylevel': 0.08738951429298226, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.9420284796872819.
[I 2024-11-26 11:06:18,229] Trial 2 finished with value: 0.9401437944995114 and parameters: {'objective': 'CrossEntropy', 'colsample_bylevel': 0.07284270879659789, 'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9087605835615459}. Best is trial 1 with value: 

Number of finished trials:  10
Best trial:
  Value: 0.9420284796872819
  Params: 
    objective: Logloss
    colsample_bylevel: 0.08738951429298226
    depth: 10
    boosting_type: Ordered
    bootstrap_type: MVS


In [24]:
print(study.best_params)

{'objective': 'Logloss', 'colsample_bylevel': 0.08738951429298226, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}


In [25]:
best_model = cb.CatBoostClassifier(**study.best_params, cat_features=cat_vars)
best_model.fit(X, y)

Learning rate set to 0.085821
0:	learn: 0.6275867	total: 288ms	remaining: 4m 47s
1:	learn: 0.5679803	total: 549ms	remaining: 4m 33s
2:	learn: 0.4838832	total: 908ms	remaining: 5m 1s
3:	learn: 0.4274434	total: 1.22s	remaining: 5m 3s
4:	learn: 0.3828272	total: 1.55s	remaining: 5m 9s
5:	learn: 0.3499543	total: 2.11s	remaining: 5m 49s
6:	learn: 0.3121945	total: 2.47s	remaining: 5m 50s
7:	learn: 0.2821934	total: 3.08s	remaining: 6m 22s
8:	learn: 0.2653721	total: 3.41s	remaining: 6m 15s
9:	learn: 0.2609987	total: 3.7s	remaining: 6m 5s
10:	learn: 0.2600161	total: 3.96s	remaining: 5m 56s
11:	learn: 0.2567600	total: 4.26s	remaining: 5m 50s
12:	learn: 0.2473486	total: 4.6s	remaining: 5m 49s
13:	learn: 0.2398627	total: 4.9s	remaining: 5m 44s
14:	learn: 0.2323673	total: 5.58s	remaining: 6m 6s
15:	learn: 0.2228860	total: 6.19s	remaining: 6m 20s
16:	learn: 0.2212577	total: 6.48s	remaining: 6m 14s
17:	learn: 0.2211647	total: 6.7s	remaining: 6m 5s
18:	learn: 0.2179553	total: 7.13s	remaining: 6m 8s
19:

<catboost.core.CatBoostClassifier at 0x27941617910>

In [26]:
y_hat = gbm.predict(test)
submission = pd.DataFrame({"id": test.index, "Depression": y_hat})
submission.to_csv("data/submissions/optuna_catboost_fulldata.csv", index=None)