In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import catboost as cat
import optuna
import warnings
from sklearn.metrics import matthews_corrcoef
warnings.filterwarnings("ignore")

In [2]:
# Training rows from the playground-series CSV; "id" and "Name" are dropped
# so they are not used as features.
train = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv").drop(["id", "Name"], axis=1)

# Second dataset that gets appended to the training rows below.
original = pd.read_csv(
    "/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv"
).drop("Name", axis=1)
# Its target is stored as "No"/"Yes" text; map it to 0/1. Any value outside
# this mapping becomes NaN.
original["Depression"] = original["Depression"].map({"No": 0, "Yes": 1})

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both inputs contribute their own 0-based labels, leaving duplicate index
# labels that make any later label-based alignment ambiguous.
train = pd.concat([train, original], ignore_index=True)

test = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv")
test = test.drop(["id", "Name"], axis=1)

In [3]:
def calculate_medians(df):
    """Compute the median of every non-textual, non-categorical column.

    Columns with ``object``, pandas ``string`` or ``category`` dtype are
    skipped: a median is not defined for them and asking for one raises.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise. It is not modified.

    Returns
    -------
    dict
        Mapping of column name to that column's median.
    """
    medians = {}
    for column in df.columns:
        series = df[column]
        dtype = series.dtype
        # Skip everything that find_categorical_columns-style logic would
        # treat as categorical; `.median()` fails on these dtypes.
        if (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            continue
        medians[column] = series.median()
    return medians

def fill_missing_values(df, medians):
    """Return a copy of ``df`` with missing values imputed.

    Text columns (``object`` or pandas ``string`` dtype) have missing entries
    replaced by the literal string ``'None'``. Any other column is filled
    with ``medians[column]`` when the column has an entry in ``medians``;
    columns without an entry are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is not modified.
    medians : dict
        Mapping of column name to fill value, e.g. the result of
        ``calculate_medians`` on the training frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with the fills applied.
    """
    df_filled = df.copy()
    for column in df_filled.columns:
        dtype = df_filled[column].dtype
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the in-place form operates on a temporary and does not update the
        # frame when pandas Copy-on-Write is active.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            df_filled[column] = df_filled[column].fillna('None')
        elif column in medians:
            df_filled[column] = df_filled[column].fillna(medians[column])
    return df_filled

def find_categorical_columns(df):
    """List the columns of ``df`` that hold categorical/text data.

    A column qualifies when its dtype is ``object``, pandas ``string`` or
    ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    list
        Qualifying column names, in the frame's column order.
    """
    categorical_columns = []
    for col in df.columns:
        dtype = df[col].dtype
        if (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
            or isinstance(dtype, pd.CategoricalDtype)
        ):
            categorical_columns.append(col)
    return categorical_columns

In [4]:
# Categorical column names and fill values are both derived from the
# training frame only, then applied to train and test alike.
cat_cols = find_categorical_columns(train)
train_medians = calculate_medians(train)

train = fill_missing_values(train, train_medians)
test = fill_missing_values(test, train_medians)

# Features are every column except the last one; the last column is the target.
# .copy() makes X an independent frame, so the dtype assignment below does not
# write into a slice of `train`.
X = train.iloc[:, :-1].copy()
X[cat_cols] = X[cat_cols].astype("string")
test[cat_cols] = test[cat_cols].astype("string")
y = train.iloc[:, -1]

In [5]:
N_SPLITS = 5

# CatBoost parameters. "use_best_model" is combined with the eval set and
# early stopping passed to cat.train below.
# NOTE(review): the fractional learning_rate / l2_leaf_reg values look like the
# output of a hyper-parameter search - confirm their provenance.
cat_params = {"objective": "CrossEntropy",
              "eval_metric": "MCC",
              "random_seed": 42,
              "use_best_model": True,
              'learning_rate': 0.3503544344869301,
              'max_depth': 4,
              'l2_leaf_reg': 0.049221745524122555,
             # "task_type":"GPU"
             }

skf = StratifiedKFold(n_splits=N_SPLITS, random_state=0, shuffle=True)

scores = []                            # per-fold MCC on the validation split
oof_pred_probs = np.zeros(len(y))      # out-of-fold probability for every training row
test_predictions = []                  # one array of test probabilities per fold

# The test pool is the same for every fold, so build it once outside the loop.
dtest = cat.Pool(test, cat_features=cat_cols)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    dtrain = cat.Pool(X_train, label=y_train, cat_features=cat_cols)
    dval = cat.Pool(X_val, label=y_val, cat_features=cat_cols)

    model = cat.train(params=cat_params,
                      pool=dtrain,
                      verbose=0,
                      eval_set=[dval],
                      early_stopping_rounds=50,
                      num_boost_round=2000
                     )

    # Predict on the already-built pools instead of re-wrapping the DataFrames.
    # Column 1 of the "Probability" output is the positive-class probability.
    pred_probs = model.predict(dval, prediction_type="Probability")[:, 1]
    preds = np.round(pred_probs).astype(int)
    test_pred_probs = model.predict(dtest, prediction_type="Probability")[:, 1]

    # Each row belongs to exactly one validation fold, so plain assignment is
    # the intended operation (no accumulation).
    oof_pred_probs[val_idx] = pred_probs

    test_predictions.append(test_pred_probs)

    # scikit-learn's signature is (y_true, y_pred).
    score = matthews_corrcoef(y_val, preds)
    print(f"Fold {fold+1} Score:", score)
    scores.append(score)

# This is the mean of the per-fold scores, not a single score computed over
# the pooled out-of-fold predictions.
print("\nMean OOF Score:", np.mean(scores))

Fold 1 Score: 0.7958941098499239
Fold 2 Score: 0.7944668248712918
Fold 3 Score: 0.7998662417365
Fold 4 Score: 0.795092831585525
Fold 5 Score: 0.7914418099228262

Mean OOF Score: 0.7953523635932134


In [6]:
# One column of test probabilities per trained fold. The column names are
# derived from the number of collected predictions, so this keeps working
# when the number of folds is changed.
fold_columns = [f"Fold{i + 1}" for i in range(len(test_predictions))]
test_results = pd.DataFrame(np.array(test_predictions).T, columns=fold_columns)
print(test_results.head())

# Average the folds once and reuse the result for both the hard labels and
# the saved probabilities.
mean_test_probs = test_results.mean(axis=1).values
labels = np.round(mean_test_probs).astype(int)

# Persist the out-of-fold and mean test probabilities as plain-text files.
np.savetxt("oof_pred_probs_catboost.txt", oof_pred_probs)
np.savetxt("mean_test_probs_catboost.txt", mean_test_probs)

      Fold1     Fold2     Fold3     Fold4     Fold5
0  0.000292  0.000049  0.000025  0.000044  0.000172
1  0.000194  0.000036  0.000156  0.000038  0.000115
2  0.026980  0.030704  0.031859  0.041796  0.027031
3  0.980000  0.986270  0.983938  0.981406  0.984071
4  0.015254  0.017343  0.013607  0.018508  0.013460


In [7]:
# Load the sample submission and set its "Depression" column to the predicted
# labels, then write the result without the DataFrame index.
# NOTE(review): assumes the sample submission rows are in the same order as
# test.csv - confirm.
submission = pd.read_csv(
    "/kaggle/input/playground-series-s4e11/sample_submission.csv"
).assign(Depression=labels)
submission.to_csv("submission.csv", index=False)