In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import catboost as cat
import optuna
import warnings
from sklearn.metrics import matthews_corrcoef
# Blanket "ignore" filter: from this point on no warning of any category is
# shown, including pandas/CatBoost warnings raised by later cells.
warnings.filterwarnings("ignore")

In [2]:
# Competition training split, without the "id" and "Name" columns.
train = pd.read_csv("/kaggle/input/playground-series-s4e11/train.csv").drop(columns=["id", "Name"])

# External survey dataset appended as extra training rows. Its target is
# stored as "No"/"Yes" text and is mapped to 0/1 before stacking (any other
# value becomes NaN under Series.map).
original = pd.read_csv(
    "/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv"
).drop(columns="Name")
original["Depression"] = original["Depression"].map({"No": 0, "Yes": 1})

# ignore_index=True gives the stacked frame a fresh 0..n-1 index. Without it
# both inputs contribute their own 0-based labels and the result carries
# duplicate index labels, which makes any label-based lookup ambiguous.
train = pd.concat([train, original], ignore_index=True)

# Competition test split, with the same two columns removed.
test = pd.read_csv("/kaggle/input/playground-series-s4e11/test.csv").drop(columns=["id", "Name"])

In [3]:
# Replace spaces in column names with underscores so the columns can be
# reached with attribute access (e.g. X.Sleep_Duration) in later cells.
train.columns = [name.replace(" ", "_") for name in train.columns]
test.columns = [name.replace(" ", "_") for name in test.columns]

# Missing values become the literal category "None"; every column is then
# cast to pandas' string dtype, so all features are handled as text.
train = train.fillna("None").astype("string")
test = test.fillna("None").astype("string")

# Features are every column except the last; the last column is the target.
# .copy() detaches X from `train`: later cells assign cleaned columns back
# onto X, and doing that on a bare slice is a chained assignment.
X = train.iloc[:, :-1].copy()
y = train.iloc[:, -1].astype(int)

In [4]:
# Frequency of each Sleep_Duration value in train next to its frequency in
# test. The outer join keeps values seen in only one split (count filled
# with 0); rows are ordered by train frequency, most common first.
sleep = (
    X["Sleep_Duration"].value_counts().to_frame()
    .join(
        test["Sleep_Duration"].value_counts(),
        how="outer",
        lsuffix="_train",
        rsuffix="_test",
    )
    .fillna(0)
    .sort_values(by="count_train", ascending=False)
)
sleep.head(5)

Unnamed: 0_level_0,count_train,count_test
Sleep_Duration,Unnamed: 1_level_1,Unnamed: 2_level_1
Less than 5 hours,39432,25661
7-8 hours,37627,24491
More than 8 hours,33348,22190
5-6 hours,32770,21404
3-4 hours,12,3


In [5]:
# Frequency of each Profession value in train next to its frequency in test
# (outer join, absent values counted as 0, sorted by train frequency).
profession = (
    X["Profession"].value_counts().to_frame()
    .join(
        test["Profession"].value_counts(),
        how="outer",
        lsuffix="_train",
        rsuffix="_test",
    )
    .fillna(0)
    .sort_values(by="count_train", ascending=False)
)

# A profession counts as rare when it has fewer than 200 rows in train AND
# fewer than 200 rows in test.
is_rare = (profession["count_train"] < 200) & (profession["count_test"] < 200)
rare_profession = list(profession[is_rare].index)

profession.head(5)

Unnamed: 0_level_0,count_train,count_test
Profession,Unnamed: 1_level_1,Unnamed: 2_level_1
,37303,24632
Teacher,25228,16385
Content Writer,7930,5187
Architect,4443,2982
Consultant,4301,2920


In [6]:
# Frequency of each Dietary_Habits value in train next to its frequency in
# test (outer join, absent values counted as 0, sorted by train frequency).
habits = (
    X["Dietary_Habits"].value_counts().to_frame()
    .join(
        test["Dietary_Habits"].value_counts(),
        how="outer",
        lsuffix="_train",
        rsuffix="_test",
    )
    .fillna(0)
    .sort_values(by="count_train", ascending=False)
)
habits.head(5)

Unnamed: 0_level_0,count_train,count_test
Dietary_Habits,Unnamed: 1_level_1,Unnamed: 2_level_1
Moderate,50537,33018
Unhealthy,47109,30786
Healthy,45583,29966
,4,5
No,2,6


In [7]:
# --- Sleep_Duration -------------------------------------------------------
# Each canonical bucket is listed with the raw values that are folded into
# it. "None" is the same placeholder string used for missing values.
_sleep_sources_by_bucket = {
    "Less than 5 hours": [
        "4-5 hours", "2-3 hours", "1-6 hours", "1-2 hours",
        "1-3 hours", "3-6 hours", "3-4 hours", "20-21 hours",
    ],
    "5-6 hours": ["4-6 hours", "35-36 hours", "6 hours"],
    "7-8 hours": ["6-8 hours", "8 hours"],
    "More than 8 hours": [
        "10-11 hours", "9-11 hours", "55-66 hours",
        "8-9 hours", "60-65 hours", "9-10 hours",
    ],
    "None": [
        "No", "Unhealthy", "45", "40-45 hours", "Moderate", "Indore",
        "10-6 hours", "9-6 hours", "Pune", "than 5 hours", "49 hours",
        "Work_Study_Hours", "45-48 hours", "9-5", "9-5 hours",
        "Sleep_Duration", "Meerut", "Vivan",
        "Have_you_ever_had_suicidal_thoughts", "8-89 hours",
        "50-75 hours", "0",
    ],
}

# Flatten to the raw-value -> bucket lookup that Series.replace expects.
sleep_duration_map = {
    raw_value: bucket
    for bucket, raw_values in _sleep_sources_by_bucket.items()
    for raw_value in raw_values
}

for frame in (X, test):
    frame["Sleep_Duration"] = frame["Sleep_Duration"].replace(sleep_duration_map)

# --- Profession -----------------------------------------------------------
# Professions flagged as rare in the earlier cell are collapsed into "None".
rare_profession_map = dict.fromkeys(rare_profession, "None")

for frame in (X, test):
    frame["Profession"] = frame["Profession"].replace(rare_profession_map)

# --- Dietary_Habits -------------------------------------------------------
# Step 1: rewrite the known near-synonyms to one of the standard labels.
dietary_habits_map = {
    "Less than Healthy": "Unhealthy",
    "More Healthy": "Healthy",
    "Less Healthy": "Unhealthy",
    "No Healthy": "Unhealthy",
}

for frame in (X, test):
    frame["Dietary_Habits"] = frame["Dietary_Habits"].replace(dietary_habits_map)

# Step 2: anything still outside the standard labels (in either split) is
# collapsed into "None". This must run after step 1 on both frames.
standard_habits = ["Healthy", "Unhealthy", "Moderate", "None"]
observed_habits = set(X["Dietary_Habits"].unique()) | set(test["Dietary_Habits"].unique())
rare_habits = [value for value in observed_habits if value not in standard_habits]
rare_habits_map = dict.fromkeys(rare_habits, "None")

for frame in (X, test):
    frame["Dietary_Habits"] = frame["Dietary_Habits"].replace(rare_habits_map)

In [8]:
N_SPLITS = 5

# CatBoost settings shared by every fold. MCC is the evaluation metric that
# early stopping and use_best_model act on.
cat_params = {
    "objective": "CrossEntropy",
    "eval_metric": "MCC",
    "random_seed": 42,
    "use_best_model": True,
    "learning_rate": 0.34045595894844477,
    "max_depth": 6,
    "l2_leaf_reg": 0.049221745524122555,
    "task_type": "GPU",
}

# Every feature column is handed to CatBoost as a categorical feature.
cat_cols = X.columns.values

skf = StratifiedKFold(n_splits=N_SPLITS, random_state=0, shuffle=True)

scores = []                           # per-fold validation MCC
oof_pred_probs = np.zeros(len(y))     # out-of-fold P(class 1), one slot per training row
test_predictions = []                 # per-fold P(class 1) on the test set

# The test data does not change between folds, so its Pool is built once
# instead of being re-converted from the DataFrame on every predict call.
dtest = cat.Pool(test, cat_features=cat_cols)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # skf.split yields positional indices, hence .iloc.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    dtrain = cat.Pool(X_train, label=y_train, cat_features=cat_cols)
    dval = cat.Pool(X_val, label=y_val, cat_features=cat_cols)

    model = cat.train(
        params=cat_params,
        pool=dtrain,
        verbose=0,
        eval_set=[dval],
        early_stopping_rounds=50,
        num_boost_round=2000,
    )

    # Column 1 of the probability output is P(class 1). Predict on the Pools
    # already built above rather than on the raw frames.
    pred_probs = model.predict(dval, prediction_type="Probability")[:, 1]
    preds = np.round(pred_probs).astype(int)
    test_pred_probs = model.predict(dtest, prediction_type="Probability")[:, 1]

    # Each training row belongs to exactly one validation fold, so every
    # slot of oof_pred_probs is written once.
    oof_pred_probs[val_idx] += pred_probs

    test_predictions.append(test_pred_probs)

    # scikit-learn's signature is (y_true, y_pred).
    score = matthews_corrcoef(y_val, preds)
    print(f"Fold {fold+1} Score:", score)
    scores.append(score)

# Average of the per-fold validation scores.
print("\nMean OOF Score:", np.mean(scores))

Fold 1 Score: 0.7933239350484856
Fold 2 Score: 0.7951594615173696
Fold 3 Score: 0.7954948210429407
Fold 4 Score: 0.7962586613319232
Fold 5 Score: 0.7914546891784096

Mean OOF Score: 0.7943383136238257


In [9]:
# One column of test-set probabilities per trained fold. The column names
# are derived from the number of folds actually collected, so changing
# N_SPLITS does not require editing this cell.
fold_columns = [f"Fold{i + 1}" for i in range(len(test_predictions))]
test_results = pd.DataFrame(np.array(test_predictions).T, columns=fold_columns)
print(test_results.head())

# Average the fold probabilities once and reuse the result for both the
# hard labels and the saved probability file.
mean_test_probs = test_results.mean(axis=1).values
labels = np.round(mean_test_probs).astype(int)

# Persist the out-of-fold and averaged test probabilities as plain text.
np.savetxt("oof_pred_probs_catboost.txt", oof_pred_probs)
np.savetxt("mean_test_probs_catboost.txt", mean_test_probs)

      Fold1     Fold2     Fold3     Fold4     Fold5
0  0.001417  0.002063  0.001333  0.001233  0.001794
1  0.002591  0.001646  0.001207  0.001932  0.001787
2  0.069210  0.078326  0.094881  0.100563  0.087103
3  0.979970  0.984107  0.978240  0.975934  0.976955
4  0.017179  0.024658  0.017789  0.020057  0.031229


In [10]:
# Start from the competition's sample submission and set its "Depression"
# column to the predicted labels; any other columns in the sample file are
# left untouched. The result is written without the DataFrame index.
submission = pd.read_csv(
    "/kaggle/input/playground-series-s4e11/sample_submission.csv"
).assign(Depression=labels)
submission.to_csv("submission.csv", index=False)