In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import catboost as cat
import optuna
import warnings
from sklearn.metrics import matthews_corrcoef
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv").drop(["id","Name"],axis =1) 
original = pd.read_csv("/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv").drop("Name",axis = 1)
original["Depression"] = original["Depression"].map({"No": 0, "Yes": 1})
train = pd.concat([train,original])
test = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")
test = test.drop(["id","Name"],axis = 1)

In [3]:
def calculate_medians(df):

    medians = {}
    for column in df.columns:
        if df[column].dtype != 'object': 
            medians[column] = df[column].median()
    return medians

def fill_missing_values(df, medians):

    df_filled = df.copy()
    for column in df_filled.columns:
        if df_filled[column].dtype == 'object':
            df_filled[column].fillna('None', inplace=True)
        else:
            if column in medians: 
                df_filled[column].fillna(medians[column], inplace=True)
    return df_filled

def find_categorical_columns(df):
  
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object' or df[col].dtype.name == 'category']
    return categorical_columns

In [4]:
cat_cols = find_categorical_columns(train)
train_medians = calculate_medians(train)

train = fill_missing_values(train, train_medians)
test = fill_missing_values(test, train_medians)

X = train.iloc[:,:-1]
X[cat_cols] = X[cat_cols].astype("string")
y = train.iloc[:,-1]

In [5]:
N_SPLITS = 5
def objective(trial):

    
    cat_params = {"objective": "CrossEntropy",
                  "eval_metric": "MCC",
                  "learning_rate": trial.suggest_float("learning_rate",0.001,1, log = True),
                  "random_seed": 42,
                  "use_best_model": True,
                  "task_type":"GPU",
                  "max_depth": trial.suggest_int("max_depth",3,10),
                  "l2_leaf_reg": trial.suggest_float('l2_leaf_reg', 0.001, 10, log=True)
                 }
    
    skf = StratifiedKFold(n_splits=N_SPLITS,random_state = 0, shuffle = True)
    scores = []
    for fold,(train_idx,val_idx) in enumerate(skf.split(X,y)):
        X_train,X_val = X.iloc[train_idx],X.iloc[val_idx]
        y_train,y_val = y.iloc[train_idx],y.iloc[val_idx]
        dtrain = cat.Pool(X_train,label = y_train,cat_features = cat_cols)
        dval = cat.Pool(X_val,label = y_val,cat_features = cat_cols)

        model = cat.train(params = cat_params,
                          pool = dtrain,
                          verbose = 0,
                          eval_set=[dval],
                          early_stopping_rounds=50,
                          num_boost_round = 2000
                         )
        preds = np.round(model.predict(X_val,prediction_type = "Probability")[:,1]).astype(int)
        score = matthews_corrcoef(y_val,preds)
        scores.append(score)

    return np.mean(scores)

study = optuna.create_study(direction="maximize",
                            pruner=optuna.pruners.HyperbandPruner(),
                            sampler=optuna.samplers.TPESampler())
study.optimize(objective, n_trials=100)

print(study.best_trial)
print()
print(study.best_params)

[I 2024-11-25 11:49:28,100] A new study created in memory with name: no-name-a8d65baf-6de9-4dfb-9b37-01a931551f89
[I 2024-11-25 11:50:15,626] Trial 0 finished with value: 0.7916519211187476 and parameters: {'learning_rate': 0.019906548984239554, 'max_depth': 5, 'l2_leaf_reg': 6.298263012766542}. Best is trial 0 with value: 0.7916519211187476.
[I 2024-11-25 11:52:01,626] Trial 1 finished with value: 0.7850017840866725 and parameters: {'learning_rate': 0.00593967489276428, 'max_depth': 8, 'l2_leaf_reg': 0.0248860283962132}. Best is trial 0 with value: 0.7916519211187476.
[I 2024-11-25 11:52:25,635] Trial 2 finished with value: 0.7612087404631824 and parameters: {'learning_rate': 0.013881514275914707, 'max_depth': 4, 'l2_leaf_reg': 0.09763062000393523}. Best is trial 0 with value: 0.7916519211187476.
[I 2024-11-25 11:52:35,426] Trial 3 finished with value: 0.7437305815454197 and parameters: {'learning_rate': 0.008637711454962792, 'max_depth': 4, 'l2_leaf_reg': 0.05131646564616684}. Best i

FrozenTrial(number=68, state=TrialState.COMPLETE, values=[0.7963058578842899], datetime_start=datetime.datetime(2024, 11, 25, 12, 16, 52, 535512), datetime_complete=datetime.datetime(2024, 11, 25, 12, 17, 5, 839669), params={'learning_rate': 0.3503544344869301, 'max_depth': 4, 'l2_leaf_reg': 0.049221745524122555}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=1.0, log=True, low=0.001, step=None), 'max_depth': IntDistribution(high=10, log=False, low=3, step=1), 'l2_leaf_reg': FloatDistribution(high=10.0, log=True, low=0.001, step=None)}, trial_id=68, value=None)

{'learning_rate': 0.3503544344869301, 'max_depth': 4, 'l2_leaf_reg': 0.049221745524122555}
